├── .astro-registry.yaml ├── .astro └── config.yaml ├── .astrocloud └── config.yaml ├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── dags ├── bigquery_examples │ └── simple_bigquery.py ├── firebolt_examples │ └── simple_firebolt.py ├── great_expectations │ ├── great_expectations_bigquery.py │ ├── great_expectations_mlflow.py │ ├── great_expectations_pandas_df.py │ ├── great_expectations_redshift.py │ ├── great_expectations_snowflake.py │ └── great_expectations_snowflake_write_audit_publish.py ├── redshift_examples │ ├── simple_redshift_1.py │ ├── simple_redshift_2.py │ └── simple_redshift_3.py ├── snowflake_examples │ ├── complex_snowflake_transform.py │ ├── simple_snowflake.py │ ├── snowflake_dynamic_write_audit_publish.py │ ├── snowflake_write_audit_publish.py │ └── taxi_snowflake.py └── sql_examples │ ├── sql_check.py │ └── sql_check_redshift_etl.py ├── include ├── data │ ├── yellow_tripdata_sample_2019-01.csv │ └── yellow_tripdata_sample_2019-02.csv ├── forestfire_checks │ └── checks.py ├── gcs_xcom_backend.py ├── great_expectations │ ├── .gitignore │ ├── checkpoints │ │ ├── mlflow │ │ │ ├── feature_chk.yml │ │ │ └── preprocess_chk.yml │ │ └── taxi │ │ │ ├── fail │ │ │ └── chk.yml │ │ │ └── pass │ │ │ └── chk.yml │ ├── configs │ │ ├── bigquery_configs.py │ │ ├── mlflow_checkpoint_config.py │ │ ├── redshift_configs.py │ │ ├── s3_configs.py │ │ └── snowflake_configs.py │ ├── expectations │ │ ├── .ge_store_backend_id │ │ ├── mlflow │ │ │ ├── census_adult_income_features.json │ │ │ └── census_adult_income_preprocess.json │ │ ├── taxi │ │ │ ├── demo.json │ │ │ └── demo_fail.json │ │ └── test_suite.json │ ├── great_expectations.yml │ ├── notebooks │ │ ├── pandas │ │ │ └── validation_playground.ipynb │ │ ├── spark │ │ │ └── validation_playground.ipynb │ │ └── sql │ │ │ └── validation_playground.ipynb │ └── plugins │ │ └── custom_data_docs │ │ └── styles │ │ └── data_docs_custom_styles.css ├── grid_configs.py ├── libs │ └── schema_reg │ │ ├── __init__.py │ │ └── base_schema_transforms.py ├── metrics.py ├── sample_data │ ├── cost_data │ │ └── cost_data.csv │ ├── forestfire_data │ │ ├── forestfires.csv │ │ ├── forestfires_corrupt.csv │ │ └── forestfires_invalid.csv │ └── yellow_trip_data │ │ ├── yellow_tripdata_sample_2019-01.csv │ │ └── yellow_tripdata_sample_2019-02.csv ├── sql │ ├── bigquery_examples │ │ ├── load_bigquery_forestfire_data.sql │ │ └── row_quality_bigquery_forestfire_check.sql │ ├── dbt_examples │ │ └── copy_store_failures.sql │ ├── firebolt_examples │ │ ├── create_table.sql │ │ ├── drop_table.sql │ │ ├── load_forestfire_data.sql │ │ └── quality_check_template.sql │ ├── great_expectations_examples │ │ ├── copy_yellow_tripdata_snowflake_staging.sql │ │ ├── create_snowflake_yellow_tripdata_stage.sql │ │ ├── create_yellow_tripdata_redshift_table.sql │ │ ├── create_yellow_tripdata_snowflake_table.sql │ │ ├── delete_yellow_tripdata_table.sql │ │ └── table_schemas │ │ │ └── tripdata_schema.json │ ├── redshift_examples │ │ ├── create_redshift_forestfire_table.sql │ │ ├── drop_redshift_forestfire_table.sql │ │ ├── row_quality_redshift_forestfire_check.sql │ │ └── validate_redshift_forestfire_load.sql │ ├── snowflake_examples │ │ ├── copy_forestfire_snowflake_audit.sql │ │ ├── create_cost_table.sql │ │ ├── create_forestfire_cost_table.sql │ │ ├── create_forestfire_table.sql │ │ ├── create_snowflake_yellow_tripdata_stage.sql │ │ ├── create_snowflake_yellow_tripdata_table.sql │ │ ├── delete_forestfire_table.sql │ │ ├── delete_snowflake_table.sql │ │ ├── load_cost_data.sql 
│ │ ├── load_forestfire_cost_data.sql │ │ ├── load_snowflake_forestfire_data.sql │ │ ├── load_yellow_tripdata.sql │ │ ├── row_quality_snowflake_forestfire_check.sql │ │ ├── row_quality_yellow_tripdata_check.sql │ │ ├── row_quality_yellow_tripdata_template.sql │ │ ├── table_schemas │ │ │ └── forestfire_schema.json │ │ └── transform_forestfire_cost_table.sql │ └── sql_examples │ │ ├── create_redshift_yellow_tripdata_table.sql │ │ ├── drop_redshift_yellow_tripdata_table.sql │ │ └── row_quality_yellow_tripdata_check.sql └── validation │ └── forestfire_validation.json ├── packages.txt ├── plugins ├── firebolt_operator_test.py └── snowflake_check_operators.py └── requirements.txt /.astro-registry.yaml: -------------------------------------------------------------------------------- 1 | # These categories will be applied to all DAGs in the repo. 2 | categories: 3 | - ETL/ELT 4 | - Data Quality 5 | - Big Data and Analytics 6 | - Databases 7 | # List of DAGs that should be published to the Astronomer Registry. 8 | dags: 9 | - path: dags/bigquery_examples/simple_bigquery.py 10 | - path: dags/dbt_examples/copy_store_failures_bigquery.py 11 | - path: dags/dbt_examples/copy_store_failures_redshift.py 12 | - path: dags/dbt_examples/copy_store_failures_snowflake.py 13 | - path: dags/firebolt_examples/simple_firebolt.py 14 | - path: dags/great_expectations/v2/simple_great_expectations_bigquery_el_v2.py 15 | - path: dags/great_expectations/v2/simple_great_expectations_example_v2.py 16 | - path: dags/redshift_examples/simple_redshift_1.py 17 | - path: dags/redshift_examples/simple_redshift_2.py 18 | - path: dags/redshift_examples/simple_redshift_3.py 19 | - path: dags/snowflake_examples/complex_snowflake_transform.py 20 | - path: dags/snowflake_examples/simple_snowflake.py 21 | - path: dags/snowflake_examples/snowflake_dynamic_write_audit_publish.py 22 | - path: dags/snowflake_examples/snowflake_write_audit_publish.py 23 | - path: dags/snowflake_examples/taxi_snowflake.py 24 | - path: dags/sql_examples/sql_check_redshift_etl.py 25 | - path: dags/sql_examples/sql_check.py -------------------------------------------------------------------------------- /.astro/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: airflow-data-quality-demo 3 | -------------------------------------------------------------------------------- /.astrocloud/config.yaml: -------------------------------------------------------------------------------- 1 | project: 2 | name: airflow-data-quality-demo 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .astro 2 | .astrocloud 3 | .github 4 | .git 5 | .gitignore 6 | venv/ 7 | tests/ 8 | .env 9 | airflow_settings.yaml 10 | logs/ 11 | venv/ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .env 3 | airflow_settings.yaml 4 | *.pyc 5 | */__pycache__/* 6 | .vim/ 7 | Pip* 8 | *DS_Store 9 | include/gcloud_key/* 10 | venv/* 11 | dag_graph_generator.py 12 | task_dependency_tree.json 13 | include/openlineage* 14 | include/openlineage/* 15 | *.code-workspace -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM 
quay.io/astronomer/astro-runtime:7.3.0 2 | ENV AIRFLOW__CORE__ENABLE_XCOM_PICKLING=True 3 | 4 | USER root 5 | # Required for some ML/DS dependencies 6 | RUN apt-get update -y 7 | RUN apt-get install libgomp1 -y 8 | RUN apt-get install -y git 9 | USER astro 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Quality Demo 2 | This repo contains DAGs to demonstrate a variety of data quality and integrity checks. 3 | All DAGs can be found under the dags/ folder, which is partitioned by backend data store 4 | or provider. Specific data stores need connections and may require accounts with cloud providers. Further details are provided in the data store specific sections below. 5 | 6 | ### Requirements 7 | The Astronomer CLI and Docker installed locally are needed to run all DAGs in this repo. Additional requirements per project are listed below. 8 | Provider packages are listed in the `requirements.txt` file. 9 | 10 | #### Redshift DAGs: 11 | - An AWS account 12 | - An S3 bucket 13 | - An active Redshift cluster 14 | 15 | #### BigQuery DAGs: 16 | - A GCP account 17 | - A service role with create, modify, and delete privileges on BigQuery 18 | - An active GCP project with BigQuery 19 | 20 | #### Snowflake DAGs: 21 | - A Snowflake account 22 | 23 | #### Firebolt DAGs: 24 | - A Firebolt account 25 | 26 | #### Great Expectations DAGs: 27 | - An account with service roles and tables as specified in one of the data stores above 28 | 29 | #### SQL DAGs: 30 | - A running SQL database 31 | 32 | ### Getting Started 33 | The easiest way to run these example DAGs is to use the Astronomer CLI to get an Airflow instance up and running locally: 34 | 1. [Install the Astronomer CLI](https://www.astronomer.io/docs/cloud/stable/develop/cli-quickstart). 35 | 2. Clone this repo locally and navigate into it. 36 | 3. Start Airflow locally by running `astro dev start`. 37 | 4. Create all necessary connections and variables - see below for specific DAG cases. 38 | 5. Navigate to localhost:8080 in your browser and you should see the tutorial DAGs there. 39 | 40 | #### Redshift DAGs: 41 | In addition to the Getting Started steps, connections to AWS and Postgres (for Redshift) are needed to upload files to S3 and load to Redshift. 42 | Under `Admin -> Connections` in the Airflow UI, add a new connection named `aws_default`. The `Conn Type` is `Amazon Web Services`. In the `Login` field, enter your AWS Access Key associated with your account. In the `Password` field, enter the corresponding AWS Secret Access Key. Press `Save` at the bottom. 43 | Add another connection named `redshift_default`. The `Conn Type` is `Postgres`. The host is your Redshift host name, something like `cluster-name.XXXXXXXXXXXX.region.redshift.amazonaws.com`. The `Schema` is your Redshift schema name. `Login` is the Redshift username. `Password` is the corresponding password to access the cluster. `Port` should be 5439 (the Redshift default). Make sure your IP address is whitelisted in Redshift, and that Redshift is accepting connections outside of your VPC! 44 | 45 | Variables needed are specified in each DAG and can be set under `Admin -> Variables` in the UI. 46 | 47 | #### BigQuery DAGs: 48 | In addition to the Getting Started steps, connections to GCP and BigQuery are needed to create BigQuery Datasets, tables, and insert and delete data there. 
49 | Under `Admin -> Connections` in the Airflow UI, add a new connection with Conn ID as `google_cloud_default`. The connection type is `Google Cloud`. A GCP key associated with a service account that has access to BigQuery is needed; for more information on generating a key, [follow the instructions in this guide](https://cloud.google.com/iam/docs/creating-managing-service-account-keys). The key can be added either as a path in the Keyfile Path field, or the JSON can be copied and pasted directly into the Keyfile JSON field. In the case of the Keyfile Path, a relative path is allowed; if using Astronomer, the recommended location is under the `include/` directory, as Docker will mount all files and directories under it. Make sure the file name is included in the path. Finally, add the project ID to the Project ID field. No scopes should be needed. 50 | 51 | Variables needed are specified in each DAG and can be set under `Admin -> Variables` in the UI. 52 | 53 | #### Snowflake DAGs: 54 | In addition to the Getting Started steps, a connection to Snowflake is needed to run DAGs. Under `Admin -> Connections` in the Airflow UI, add a new connection with Conn ID as `snowflake_default`. The connection type is `Snowflake`. The host field should be the full URL that you use to log into Snowflake, for example `https://[account].[region].snowflakecomputing.com`. Fill out the `Login`, `Password`, `Schema`, `Account`, `Database`, `Region`, `Role`, and `Warehouse` fields with your information. 55 | 56 | #### Firebolt DAGs: 57 | In addition to the Getting Started steps, a connection to Firebolt is needed to run DAGs. Under `Admin -> Connections` in the Airflow UI, add a new connection with Conn ID as `firebolt_default`. The connection type is `Firebolt`. The host field should be `api.app.firebolt.com`. Fill in the `Login` and `Password` fields with your account login and password. In the `Advanced Connection Properties` field, enter at least an engine name in a dictionary, for example: `{"engine_name": "firebolt_test_general_purpose"}`. 58 | 59 | #### Great Expectations DAGs: 60 | 61 | For `airflow-provider-great-expectations<=0.1.5`: 62 | In addition to the Getting Started steps, Great Expectations requires its own connections (separate from the Airflow Connections used by other tasks in the DAG) when using outside data sources. These connections can be made in the file located at `include/great_expectations/uncommitted/config_variables.yml`. Note: you will have to create this file yourself; it is not included in the repository. Example connections in YAML are of the form: 63 | 64 | ``` 65 | my_bigquery_db: 66 | bigquery://[gcp-id]/[dataset] 67 | my_snowflake_db: 68 | snowflake://[username]:[password]@[account].[region]/[database]/[schema]?warehouse=[warehouse]&role=[role] 69 | my_redshift_db: 70 | postgresql+psycopg2://[username]:[password]@[database_uri]:5439/[default_db] 71 | ``` 72 | 73 | See the Great Expectations docs for more information on [BigQuery](https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/database/bigquery/), [Redshift](https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/database/redshift/), or [Snowflake](https://docs.greatexpectations.io/docs/guides/connecting_to_your_data/database/snowflake/). More connections can be added via the Great Expectations CLI tool. 74 | 75 | Files related to the Great Expectations DAGs can be found under `include/great_expectations/`, and the referenced SQL queries under `include/sql/great_expectations_examples/`.
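Several of the example DAGs, including the Great Expectations Redshift and Snowflake examples, also read an `aws_configs` Airflow Variable. As a sketch (the bucket, key prefix, and table values below are placeholders, not values defined by this repo), its value is a JSON object of the form:

```
{
    "s3_bucket": "my-example-bucket",
    "s3_key_prefix": "data-quality-demo",
    "redshift_table": "my_example_table"
}
```

The `gcp_project_id` Variable used by the BigQuery examples is a plain string containing your GCP project ID.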
76 | 77 | Variables needed are specified in each DAG and can be set under `Admin -> Variables` in the UI. 78 | 79 | #### SQL DAGs: 80 | In addition to the Getting Started steps, a SQL database (sqlite, Postgres, MySQL, etc...) needs to be up and running. This database may be local or cloud-hosted. An Airflow Connection to the database is needed. 81 | -------------------------------------------------------------------------------- /dags/bigquery_examples/simple_bigquery.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple Extract/Load Pipeline with Data Quality Checks Using BigQuery 3 | 4 | Before running the DAG, set the following in an Airflow or Environment Variable: 5 | - key: gcp_project_id 6 | value: [gcp_project_id] 7 | Fully replacing [gcp_project_id] with the actual ID. 8 | 9 | Ensure you have a connection to GCP, using a role with access to BigQuery 10 | and the ability to create, modify, and delete datasets and tables. 11 | 12 | What makes this a simple data quality case is: 13 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 14 | 2. No transformations or business logic. 15 | 3. Exact values of data to quality check are known. 16 | """ 17 | 18 | import json 19 | 20 | from airflow import DAG 21 | from airflow.models.baseoperator import chain 22 | from airflow.operators.dummy_operator import DummyOperator 23 | from airflow.providers.google.cloud.operators.bigquery import ( 24 | BigQueryCheckOperator, BigQueryCreateEmptyDatasetOperator, 25 | BigQueryCreateEmptyTableOperator, BigQueryDeleteDatasetOperator, 26 | BigQueryInsertJobOperator, BigQueryValueCheckOperator) 27 | from airflow.providers.google.cloud.sensors.bigquery import \ 28 | BigQueryTableExistenceSensor 29 | from airflow.utils.dates import datetime 30 | from airflow.utils.task_group import TaskGroup 31 | 32 | DATASET = "simple_bigquery_example_dag" 33 | TABLE = "forestfires" 34 | 35 | with DAG( 36 | "simple_bigquery", 37 | start_date=datetime(2021, 1, 1), 38 | description="Example DAG showcasing loading and data quality checking with BigQuery.", 39 | doc_md=__doc__, 40 | schedule_interval=None, 41 | template_searchpath="/usr/local/airflow/include/sql/bigquery_examples/", 42 | catchup=False, 43 | ) as dag: 44 | 45 | """ 46 | #### BigQuery dataset creation 47 | Create the dataset to store the sample data tables. 48 | """ 49 | create_dataset = BigQueryCreateEmptyDatasetOperator( 50 | task_id="create_dataset", dataset_id=DATASET 51 | ) 52 | 53 | """ 54 | #### BigQuery table creation 55 | Create the table to store sample forest fire data. 
56 | """ 57 | create_table = BigQueryCreateEmptyTableOperator( 58 | task_id="create_table", 59 | dataset_id=DATASET, 60 | table_id=TABLE, 61 | schema_fields=[ 62 | {"name": "id", "type": "INTEGER", "mode": "REQUIRED"}, 63 | {"name": "y", "type": "INTEGER", "mode": "NULLABLE"}, 64 | {"name": "month", "type": "STRING", "mode": "NULLABLE"}, 65 | {"name": "day", "type": "STRING", "mode": "NULLABLE"}, 66 | {"name": "ffmc", "type": "FLOAT", "mode": "NULLABLE"}, 67 | {"name": "dmc", "type": "FLOAT", "mode": "NULLABLE"}, 68 | {"name": "dc", "type": "FLOAT", "mode": "NULLABLE"}, 69 | {"name": "isi", "type": "FLOAT", "mode": "NULLABLE"}, 70 | {"name": "temp", "type": "FLOAT", "mode": "NULLABLE"}, 71 | {"name": "rh", "type": "FLOAT", "mode": "NULLABLE"}, 72 | {"name": "wind", "type": "FLOAT", "mode": "NULLABLE"}, 73 | {"name": "rain", "type": "FLOAT", "mode": "NULLABLE"}, 74 | {"name": "area", "type": "FLOAT", "mode": "NULLABLE"}, 75 | ], 76 | ) 77 | 78 | """ 79 | #### BigQuery table check 80 | Ensure that the table was created in BigQuery before inserting data. 81 | """ 82 | check_table_exists = BigQueryTableExistenceSensor( 83 | task_id="check_for_table", 84 | project_id="{{ var.value.gcp_project_id }}", 85 | dataset_id=DATASET, 86 | table_id=TABLE, 87 | ) 88 | 89 | """ 90 | #### Insert data 91 | Insert data into the BigQuery table using an existing SQL query (stored in 92 | a file under dags/sql). 93 | """ 94 | load_data = BigQueryInsertJobOperator( 95 | task_id="insert_query", 96 | configuration={ 97 | "query": { 98 | "query": "{% include 'load_bigquery_forestfire_data.sql' %}", 99 | "useLegacySql": False, 100 | } 101 | }, 102 | ) 103 | 104 | """ 105 | #### Row-level data quality check 106 | Run data quality checks on a few rows, ensuring that the data in BigQuery 107 | matches the ground truth in the correspoding JSON file. 108 | """ 109 | with open("include/validation/forestfire_validation.json") as ffv: 110 | with TaskGroup(group_id="row_quality_checks") as quality_check_group: 111 | ffv_json = json.load(ffv) 112 | for id, values in ffv_json.items(): 113 | values["id"] = id 114 | values["dataset"] = DATASET 115 | values["table"] = TABLE 116 | BigQueryCheckOperator( 117 | task_id=f"check_row_data_{id}", 118 | sql="row_quality_bigquery_forestfire_check.sql", 119 | use_legacy_sql=False, 120 | params=values, 121 | ) 122 | 123 | """ 124 | #### Table-level data quality check 125 | Run a row count check to ensure all data was uploaded to BigQuery properly. 126 | """ 127 | check_bq_row_count = BigQueryValueCheckOperator( 128 | task_id="check_row_count", 129 | sql=f"SELECT COUNT(*) FROM {DATASET}.{TABLE}", 130 | pass_value=9, 131 | use_legacy_sql=False, 132 | ) 133 | 134 | """ 135 | #### Delete test dataset and table 136 | Clean up the dataset and table created for the example. 
137 | """ 138 | delete_dataset = BigQueryDeleteDatasetOperator( 139 | task_id="delete_dataset", dataset_id=DATASET, delete_contents=True 140 | ) 141 | 142 | begin = DummyOperator(task_id="begin") 143 | end = DummyOperator(task_id="end") 144 | 145 | chain( 146 | begin, 147 | create_dataset, 148 | create_table, 149 | check_table_exists, 150 | load_data, 151 | [quality_check_group, check_bq_row_count], 152 | delete_dataset, 153 | end, 154 | ) 155 | -------------------------------------------------------------------------------- /dags/firebolt_examples/simple_firebolt.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | """Example Firebolt Data Quality DAG 19 | 20 | This DAG starts the specified Firebolt engine, creates a sample table, loads sample data into the table, 21 | runs the quality checks in the CHECKS dictionary, then deletes the table and stops the engine. 22 | 23 | Checks work by running a MIN() function over the specific aggregate check, where the aggregate 24 | check is contained in a CASE statement. The CASE statement checks the result of the condition; 25 | if true, the CASE statement returns 1, else 0. Then MIN() will return 0 if any row returns a 26 | false result, and 1 otherwise. 27 | 28 | Note: the Firebolt operator currently does not support templated SQL queries.
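For illustration only, a sketch of what the rendered check is expected to look like (the actual
quality_check_template.sql under include/sql/firebolt_examples/ may differ):

    SELECT MIN(CASE WHEN <check_statement> THEN 1 ELSE 0 END) AS <col>_check
    FROM <table>;

SQLCheckOperator fails the task if the single value returned is 0 (falsy).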
29 | """ 30 | 31 | from datetime import datetime 32 | 33 | from airflow import DAG 34 | from airflow.models.baseoperator import chain 35 | from airflow.operators.sql import SQLCheckOperator 36 | from airflow.utils.task_group import TaskGroup 37 | from firebolt_provider.operators.firebolt import (FireboltOperator, 38 | FireboltStartEngineOperator, 39 | FireboltStopEngineOperator) 40 | 41 | FIREBOLT_CONN_ID = "firebolt_default" 42 | FIREBOLT_SAMPLE_TABLE = "forest_fire" 43 | FIREBOLT_DATABASE = "firebolt_test" 44 | FIREBOLT_ENGINE = "firebolt_test_general_purpose" 45 | 46 | CHECKS = {"id": "'column' IS NOT NULL", "ffmc": "MAX(ffmc) < 100"} 47 | 48 | with DAG( 49 | "simple_firebolt", 50 | schedule_interval=None, 51 | start_date=datetime(2021, 1, 1), 52 | doc_md=__doc__, 53 | default_args={ 54 | "conn_id": FIREBOLT_CONN_ID, 55 | "firebolt_conn_id": FIREBOLT_CONN_ID, 56 | "database": FIREBOLT_DATABASE, 57 | "engine_name": FIREBOLT_ENGINE, 58 | }, 59 | template_searchpath="/usr/local/airflow/include/sql/firebolt_examples/", 60 | catchup=False, 61 | ) as dag: 62 | 63 | start_engine = FireboltStartEngineOperator(task_id="start_engine") 64 | 65 | create_table = FireboltOperator( 66 | task_id="create_table", 67 | sql="create_table.sql", 68 | params={"table": FIREBOLT_SAMPLE_TABLE}, 69 | ) 70 | 71 | load_data = FireboltOperator( 72 | task_id="load_data", 73 | sql="load_forestfire_data.sql", 74 | params={"table": FIREBOLT_SAMPLE_TABLE}, 75 | ) 76 | 77 | with TaskGroup(group_id="aggregate_quality_checks") as check_group: 78 | for name, statement in CHECKS.items(): 79 | check = SQLCheckOperator( 80 | task_id=f"check_{name}", 81 | sql="quality_check_template.sql", 82 | params={ 83 | "col": name, 84 | "check_statement": statement, 85 | "table": FIREBOLT_SAMPLE_TABLE, 86 | }, 87 | ) 88 | 89 | drop_table = FireboltOperator( 90 | task_id="drop_table", 91 | sql="drop_table.sql", 92 | params={"table": FIREBOLT_SAMPLE_TABLE}, 93 | ) 94 | 95 | stop_engine = FireboltStopEngineOperator(task_id="stop_engine") 96 | 97 | chain( 98 | start_engine, 99 | create_table, 100 | load_data, 101 | check_group, 102 | drop_table, 103 | stop_engine, 104 | ) 105 | -------------------------------------------------------------------------------- /dags/great_expectations/great_expectations_bigquery.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Quality Checks Using BigQuery and Great Expectations 3 | 4 | Before running the DAG, set the following in an Airflow or Environment Variable: 5 | - key: gcp_project_id 6 | value: [gcp_project_id] 7 | Fully replacing [gcp_project_id] with the actual ID. 8 | 9 | Ensure you have a connection to GCP, using a role with access to BigQuery 10 | and the ability to create, modify, and delete datasets and tables. 11 | 12 | What makes this a simple data quality case is: 13 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 14 | 2. No transformations or business logic. 15 | 3. Exact values of data to quality check are known. 
16 | """ 17 | 18 | import os 19 | from datetime import datetime 20 | from pathlib import Path 21 | 22 | from airflow import DAG 23 | from airflow.models.baseoperator import chain 24 | from airflow.providers.google.cloud.operators.bigquery import ( 25 | BigQueryCreateEmptyDatasetOperator, BigQueryCreateEmptyTableOperator, 26 | BigQueryDeleteDatasetOperator) 27 | from airflow.providers.google.cloud.transfers.gcs_to_bigquery import \ 28 | GCSToBigQueryOperator 29 | from airflow.providers.google.cloud.transfers.local_to_gcs import \ 30 | LocalFilesystemToGCSOperator 31 | from great_expectations_provider.operators.great_expectations import \ 32 | GreatExpectationsOperator 33 | 34 | base_path = Path(__file__).parents[2] 35 | data_file = os.path.join( 36 | base_path, 37 | "include", 38 | "sample_data/yellow_trip_data/yellow_tripdata_sample_2019-01.csv", 39 | ) 40 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 41 | 42 | # In a production DAG, the global variables below should be stored as Airflow 43 | # or Environment variables. 44 | bq_dataset = "great_expectations_bigquery_example" 45 | bq_table = "taxi" 46 | gcp_bucket = "great-expectations-demo" 47 | gcp_data_dest = "data/yellow_tripdata_sample_2019-01.csv" 48 | 49 | with DAG( 50 | "great_expectations.bigquery", 51 | description="Example DAG showcasing loading and data quality checking with BigQuery and Great Expectations.", 52 | doc_md=__doc__, 53 | schedule_interval=None, 54 | start_date=datetime(2021, 1, 1), 55 | catchup=False, 56 | ) as dag: 57 | 58 | """ 59 | #### BigQuery dataset creation 60 | Create the dataset to store the sample data tables. 61 | """ 62 | create_dataset = BigQueryCreateEmptyDatasetOperator( 63 | task_id="create_dataset", dataset_id=bq_dataset 64 | ) 65 | 66 | """ 67 | #### Upload taxi data to GCS 68 | Upload the test data to GCS so it can be transferred to BigQuery. 
69 | """ 70 | upload_taxi_data = LocalFilesystemToGCSOperator( 71 | task_id="upload_taxi_data", 72 | src=data_file, 73 | dst=gcp_data_dest, 74 | bucket=gcp_bucket, 75 | ) 76 | 77 | """ 78 | #### Create Temp Table for GE in BigQuery 79 | """ 80 | create_temp_table = BigQueryCreateEmptyTableOperator( 81 | task_id="create_temp_table", 82 | dataset_id=bq_dataset, 83 | table_id=f"{bq_table}_temp", 84 | schema_fields=[ 85 | {"name": "vendor_id", "type": "INTEGER", "mode": "REQUIRED"}, 86 | {"name": "pickup_datetime", "type": "DATETIME", "mode": "NULLABLE"}, 87 | {"name": "dropoff_datetime", "type": "DATETIME", "mode": "NULLABLE"}, 88 | {"name": "passenger_count", "type": "INTEGER", "mode": "NULLABLE"}, 89 | {"name": "trip_distance", "type": "FLOAT", "mode": "NULLABLE"}, 90 | {"name": "rate_code_id", "type": "INTEGER", "mode": "NULLABLE"}, 91 | {"name": "store_and_fwd_flag", "type": "STRING", "mode": "NULLABLE"}, 92 | {"name": "pickup_location_id", "type": "INTEGER", "mode": "NULLABLE"}, 93 | {"name": "dropoff_location_id", "type": "INTEGER", "mode": "NULLABLE"}, 94 | {"name": "payment_type", "type": "INTEGER", "mode": "NULLABLE"}, 95 | {"name": "fare_amount", "type": "FLOAT", "mode": "NULLABLE"}, 96 | {"name": "extra", "type": "FLOAT", "mode": "NULLABLE"}, 97 | {"name": "mta_tax", "type": "FLOAT", "mode": "NULLABLE"}, 98 | {"name": "tip_amount", "type": "FLOAT", "mode": "NULLABLE"}, 99 | {"name": "tolls_amount", "type": "FLOAT", "mode": "NULLABLE"}, 100 | {"name": "improvement_surcharge", "type": "FLOAT", "mode": "NULLABLE"}, 101 | {"name": "total_amount", "type": "FLOAT", "mode": "NULLABLE"}, 102 | {"name": "congestion_surcharge", "type": "FLOAT", "mode": "NULLABLE"}, 103 | ], 104 | ) 105 | 106 | """ 107 | #### Transfer data from GCS to BigQuery 108 | Moves the data uploaded to GCS in the previous step to BigQuery, where 109 | Great Expectations can run a test suite against it. 
110 | """ 111 | transfer_taxi_data = GCSToBigQueryOperator( 112 | task_id="taxi_data_gcs_to_bigquery", 113 | bucket=gcp_bucket, 114 | source_objects=[gcp_data_dest], 115 | skip_leading_rows=1, 116 | destination_project_dataset_table="{}.{}".format(bq_dataset, bq_table), 117 | schema_fields=[ 118 | {"name": "vendor_id", "type": "INTEGER", "mode": "REQUIRED"}, 119 | {"name": "pickup_datetime", "type": "DATETIME", "mode": "NULLABLE"}, 120 | {"name": "dropoff_datetime", "type": "DATETIME", "mode": "NULLABLE"}, 121 | {"name": "passenger_count", "type": "INTEGER", "mode": "NULLABLE"}, 122 | {"name": "trip_distance", "type": "FLOAT", "mode": "NULLABLE"}, 123 | {"name": "rate_code_id", "type": "INTEGER", "mode": "NULLABLE"}, 124 | {"name": "store_and_fwd_flag", "type": "STRING", "mode": "NULLABLE"}, 125 | {"name": "pickup_location_id", "type": "INTEGER", "mode": "NULLABLE"}, 126 | {"name": "dropoff_location_id", "type": "INTEGER", "mode": "NULLABLE"}, 127 | {"name": "payment_type", "type": "INTEGER", "mode": "NULLABLE"}, 128 | {"name": "fare_amount", "type": "FLOAT", "mode": "NULLABLE"}, 129 | {"name": "extra", "type": "FLOAT", "mode": "NULLABLE"}, 130 | {"name": "mta_tax", "type": "FLOAT", "mode": "NULLABLE"}, 131 | {"name": "tip_amount", "type": "FLOAT", "mode": "NULLABLE"}, 132 | {"name": "tolls_amount", "type": "FLOAT", "mode": "NULLABLE"}, 133 | {"name": "improvement_surcharge", "type": "FLOAT", "mode": "NULLABLE"}, 134 | {"name": "total_amount", "type": "FLOAT", "mode": "NULLABLE"}, 135 | {"name": "congestion_surcharge", "type": "FLOAT", "mode": "NULLABLE"}, 136 | ], 137 | source_format="CSV", 138 | create_disposition="CREATE_IF_NEEDED", 139 | write_disposition="WRITE_TRUNCATE", 140 | allow_jagged_rows=True, 141 | ) 142 | 143 | """ 144 | #### Great Expectations suite 145 | Run the Great Expectations suite on the table. 146 | """ 147 | ge_bigquery_validation = GreatExpectationsOperator( 148 | task_id="ge_bigquery_validation", 149 | data_context_root_dir=ge_root_dir, 150 | conn_id="bigquery_default", 151 | expectation_suite_name="taxi.demo", 152 | data_asset_name=bq_table, 153 | fail_task_on_validation_failure=False, 154 | ) 155 | 156 | """ 157 | #### Delete test dataset and table 158 | Clean up the dataset and table created for the example. 159 | """ 160 | delete_dataset = BigQueryDeleteDatasetOperator( 161 | task_id="delete_dataset", 162 | project_id="{{ var.value.gcp_project_id }}", 163 | dataset_id=bq_dataset, 164 | delete_contents=True, 165 | ) 166 | 167 | chain( 168 | create_dataset, 169 | create_temp_table, 170 | upload_taxi_data, 171 | transfer_taxi_data, 172 | ge_bigquery_validation, 173 | delete_dataset, 174 | ) 175 | -------------------------------------------------------------------------------- /dags/great_expectations/great_expectations_pandas_df.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Quality Checks Using Pandas and Great Expectations 3 | 4 | A simple example of performing data quality checks on a Pandas dataframe using Great Expectations. 5 | 6 | What makes this a simple data quality case is: 7 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 8 | 2. No transformations or business logic. 9 | 3. Exact values of data to quality check are known. 
10 | """ 11 | 12 | import os 13 | from datetime import datetime 14 | from pathlib import Path 15 | 16 | import pandas as pd 17 | from airflow import DAG 18 | from great_expectations_provider.operators.great_expectations import \ 19 | GreatExpectationsOperator 20 | 21 | base_path = Path(__file__).parents[2] 22 | data_file = os.path.join( 23 | base_path, 24 | "include", 25 | "sample_data/yellow_trip_data/yellow_tripdata_sample_2019-01.csv", 26 | ) 27 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 28 | 29 | 30 | with DAG( 31 | "great_expectations.pandas_df", 32 | start_date=datetime(2021, 1, 1), 33 | description="Example DAG showcasing loading and data quality checking with Pandas and Great Expectations.", 34 | doc_md=__doc__, 35 | schedule_interval=None, 36 | catchup=False, 37 | ) as dag: 38 | 39 | """ 40 | #### Great Expectations suite 41 | Run the Great Expectations suite on the table. 42 | """ 43 | ge_pandas_df_validation = GreatExpectationsOperator( 44 | task_id="ge_pandas_df_validation", 45 | data_context_root_dir=ge_root_dir, 46 | dataframe_to_validate=pd.read_csv(filepath_or_buffer=data_file, header=0), 47 | execution_engine="PandasExecutionEngine", 48 | expectation_suite_name="taxi.demo", 49 | data_asset_name="yellow_tripdata_sample_2019-01", 50 | ) 51 | -------------------------------------------------------------------------------- /dags/great_expectations/great_expectations_redshift.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Quality Checks Using Redshift and Great Expectations 3 | 4 | Use Great Expectations to check data quality in Redshift. 5 | 6 | Before running the DAG, set the following in an Airflow or Environment Variable: 7 | - key: aws_configs 8 | - value: { "s3_bucket": [bucket_name], "s3_key_prefix": [key_prefix], "redshift_table": [table_name]} 9 | Fully replacing [bucket_name], [key_prefix], and [table_name]. 10 | 11 | What makes this a simple data quality case is: 12 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 13 | 2. No transformations or business logic. 14 | 3. Exact values of data to quality check are known. 
15 | """ 16 | 17 | import os 18 | from datetime import datetime 19 | from pathlib import Path 20 | 21 | from airflow import DAG 22 | from airflow.models.baseoperator import chain 23 | from airflow.providers.amazon.aws.operators.redshift_sql import \ 24 | RedshiftSQLOperator 25 | from airflow.providers.amazon.aws.transfers.local_to_s3 import \ 26 | LocalFilesystemToS3Operator 27 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import \ 28 | S3ToRedshiftOperator 29 | from great_expectations_provider.operators.great_expectations import \ 30 | GreatExpectationsOperator 31 | 32 | table = "yellow_tripdata" 33 | base_path = Path(__file__).parents[2] 34 | data_file = os.path.join( 35 | base_path, 36 | "include", 37 | "sample_data/yellow_trip_data/yellow_tripdata_sample_2019-01.csv", 38 | ) 39 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 40 | 41 | with DAG( 42 | "great_expectations.redshift", 43 | start_date=datetime(2021, 1, 1), 44 | description="Example DAG showcasing loading and data quality checking with Redshift and Great Expectations.", 45 | doc_md=__doc__, 46 | schedule_interval=None, 47 | template_searchpath=f"{base_path}/include/sql/great_expectations_examples/", 48 | catchup=False, 49 | ) as dag: 50 | 51 | upload_to_s3 = LocalFilesystemToS3Operator( 52 | task_id="upload_to_s3", 53 | filename=data_file, 54 | dest_key="{{ var.json.aws_configs.s3_key_prefix }}/yellow_tripdata_sample_2019-01.csv", 55 | dest_bucket="{{ var.json.aws_configs.s3_bucket }}", 56 | aws_conn_id="aws_default", 57 | replace=True, 58 | ) 59 | 60 | """ 61 | #### Create Redshift Table 62 | For demo purposes, create a Redshift table to store the yellow taxi trip data. 63 | The database is not automatically destroyed at the end of the example; ensure 64 | this is done manually to avoid unnecessary costs. Additionally, setup may 65 | need to be done in Airflow connections to allow access to Redshift. 66 | """ 67 | create_redshift_table = RedshiftSQLOperator( 68 | task_id="create_redshift_table", 69 | sql="{% include 'create_yellow_tripdata_redshift_table.sql' %}", 70 | parameters={"table_name": "yellow_tripdata"}, 71 | redshift_conn_id="redshift_default", 72 | ) 73 | 74 | """ 75 | #### Second load task 76 | Loads the S3 data from the previous load to a Redshift table (specified 77 | in the Airflow Variables backend). 78 | """ 79 | load_to_redshift = S3ToRedshiftOperator( 80 | task_id="load_to_redshift", 81 | s3_bucket="{{ var.json.aws_configs.s3_bucket }}", 82 | s3_key="{{ var.json.aws_configs.s3_key_prefix }}/yellow_tripdata_sample_2019-01.csv", 83 | schema="PUBLIC", 84 | table=table, 85 | copy_options=["csv", "ignoreheader 1"], 86 | ) 87 | 88 | """ 89 | #### Great Expectations suite 90 | Run the Great Expectations suite on the table. 91 | """ 92 | ge_redshift_validation = GreatExpectationsOperator( 93 | task_id="ge_redshift_validation", 94 | data_context_root_dir=ge_root_dir, 95 | conn_id="redshift_default", 96 | expectation_suite_name="taxi.demo", 97 | data_asset_name=table, 98 | fail_task_on_validation_failure=False, 99 | ) 100 | 101 | """ 102 | #### Drop Redshift table 103 | Drops the Redshift table if it exists already. This is to make sure that the 104 | data in the success and failure cases do not interfere with each other during 105 | the data quality check.
106 | """ 107 | drop_redshift_table = RedshiftSQLOperator( 108 | task_id="drop_table", 109 | sql="delete_yellow_tripdata_table.sql", 110 | redshift_conn_id="redshift_default", 111 | parameters={"table_name": table}, 112 | ) 113 | 114 | chain( 115 | upload_to_s3, 116 | create_redshift_table, 117 | load_to_redshift, 118 | ge_redshift_validation, 119 | drop_redshift_table, 120 | ) 121 | -------------------------------------------------------------------------------- /dags/great_expectations/great_expectations_snowflake.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Quality Checks Using Snowflake and Great Expectations 3 | 4 | A simple example of performing data quality checks in Snowflake using Great Expectations. 5 | 6 | Ensure a Snowflake Warehouse, Database, Schema, Role, and S3 Key and Secret 7 | exist for the Snowflake connection, named `snowflake_default`. Access to S3 8 | is needed for this example. An 'aws_configs' variable is needed in Variables, 9 | see the Redshift Examples in the README section for more information. 10 | 11 | What makes this a simple data quality case is: 12 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 13 | 2. No transformations or business logic. 14 | 3. Exact values of data to quality check are known. 15 | """ 16 | 17 | import os 18 | from datetime import datetime 19 | from pathlib import Path 20 | 21 | import pandas as pd 22 | from airflow import DAG 23 | from airflow.models.baseoperator import chain 24 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 25 | from great_expectations_provider.operators.great_expectations import \ 26 | GreatExpectationsOperator 27 | 28 | # This table variable is a placeholder, in a live environment, it is better 29 | # to pull the table info from a Variable in a template 30 | table = "YELLOW_TRIPDATA" 31 | base_path = Path(__file__).parents[2] 32 | data_file = os.path.join( 33 | base_path, 34 | "include", 35 | "sample_data/yellow_trip_data/yellow_tripdata_sample_2019-01.csv", 36 | ) 37 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 38 | 39 | SNOWFLAKE_CONN_ID = "snowflake_default" 40 | 41 | with DAG( 42 | "great_expectations.snowflake", 43 | start_date=datetime(2021, 1, 1), 44 | description="Example DAG showcasing loading and data quality checking with Snowflake and Great Expectations.", 45 | doc_md=__doc__, 46 | schedule_interval=None, 47 | template_searchpath="/usr/local/airflow/include/sql/snowflake_examples/", 48 | catchup=False, 49 | ) as dag: 50 | 51 | """ 52 | #### Snowflake table creation 53 | Create the table to store sample forest fire data. 54 | """ 55 | create_table = SnowflakeOperator( 56 | task_id="create_table", 57 | sql="{% include 'create_snowflake_yellow_tripdata_table.sql' %}", 58 | params={"table_name": table}, 59 | ) 60 | 61 | """ 62 | #### Insert data 63 | Insert data into the Snowflake table using an existing SQL query (stored in 64 | the include/sql/snowflake_examples/ directory). 65 | """ 66 | load_data = SnowflakeOperator( 67 | task_id="insert_query", 68 | sql="{% include 'load_yellow_tripdata.sql' %}", 69 | params={"table_name": table}, 70 | ) 71 | 72 | """ 73 | #### Delete table 74 | Clean up the table created for the example. 
75 | """ 76 | delete_table = SnowflakeOperator( 77 | task_id="delete_table", 78 | sql="{% include 'delete_snowflake_table.sql' %}", 79 | params={"table_name": table}, 80 | ) 81 | 82 | """ 83 | #### Great Expectations suite 84 | Run the Great Expectations suite on the table. 85 | """ 86 | ge_snowflake_validation = GreatExpectationsOperator( 87 | task_id="ge_snowflake_validation", 88 | data_context_root_dir=ge_root_dir, 89 | conn_id=SNOWFLAKE_CONN_ID, 90 | expectation_suite_name="taxi.demo", 91 | data_asset_name=table, 92 | fail_task_on_validation_failure=False, 93 | ) 94 | 95 | ge_snowflake_query_validation = GreatExpectationsOperator( 96 | task_id="ge_snowflake_query_validation", 97 | data_context_root_dir=ge_root_dir, 98 | conn_id=SNOWFLAKE_CONN_ID, 99 | query_to_validate="SELECT *", 100 | expectation_suite_name="taxi.demo", 101 | data_asset_name=table, 102 | fail_task_on_validation_failure=False, 103 | ) 104 | 105 | chain( 106 | create_table, 107 | load_data, 108 | [ 109 | ge_snowflake_validation, 110 | ge_snowflake_query_validation, 111 | ], 112 | delete_table, 113 | ) 114 | -------------------------------------------------------------------------------- /dags/great_expectations/great_expectations_snowflake_write_audit_publish.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Write-Audit-Publish Pattern EL Pipeline with Data Quality Checks Using Snowflake and Great Expectations 3 | 4 | Use the Write-Audit-Publish pattern with Great Expectaitons and Snowflake. 5 | 6 | Ensure a Snowflake Warehouse, Database, Schema, Role, and S3 Key and Secret 7 | exist for the Snowflake connection, named `snowflake_default`. Access to S3 8 | is needed for this example. An 'aws_configs' variable is needed in Variables, 9 | see the Redshift Examples in the README section for more information. 10 | 11 | The write-audit-publish pattern writes data to a staging table, audits the 12 | data quality through quality checks, then publishes correct data to a 13 | production table. In this example incorrect data is discarded, and the DAG 14 | is failed on data quality check failure. 15 | 16 | What makes this a simple data quality case is: 17 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 18 | 2. No transformations or business logic. 19 | 3. Exact values of data to quality check are known. 20 | """ 21 | 22 | import json 23 | import os 24 | from datetime import datetime 25 | from pathlib import Path 26 | 27 | from airflow import DAG 28 | from airflow.models.baseoperator import chain 29 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 30 | from great_expectations_provider.operators.great_expectations import \ 31 | GreatExpectationsOperator 32 | 33 | from include.libs.schema_reg.base_schema_transforms import \ 34 | snowflake_load_column_string 35 | 36 | # These variables are a placeholder. In a live environment, it is better 37 | # to pull the info from a Variable. 
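# As a sketch, something like the following could be used instead (the Variable key
# "tripdata_table_name" is hypothetical, not defined by this repo):
#   from airflow.models import Variable
#   table = Variable.get("tripdata_table_name", default_var="YELLOW_TRIPDATA")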
38 | table = "YELLOW_TRIPDATA" 39 | snowflake_conn = "snowflake_default" 40 | base_path = Path(__file__).parents[2] 41 | table_schema_path = ( 42 | f"{base_path}/include/sql/great_expectations_examples/table_schemas/" 43 | ) 44 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 45 | 46 | with DAG( 47 | "great_expectations.snowflake_write_audit_publish", 48 | start_date=datetime(2022, 1, 1), 49 | description="Example DAG showcasing a write-audit-publish data quality pattern with Snowflake and Great Expectations.", 50 | doc_md=__doc__, 51 | schedule_interval=None, 52 | template_searchpath=f"{base_path}/include/sql/snowflake_examples/", 53 | catchup=False, 54 | ) as dag: 55 | 56 | """ 57 | #### Snowflake table creation 58 | Creates the tables to store sample data 59 | """ 60 | create_snowflake_audit_table = SnowflakeOperator( 61 | task_id="create_snowflake_audit_table", 62 | sql="{% include 'create_snowflake_yellow_tripdata_table.sql' %}", 63 | params={"table_name": f"{table}_AUDIT"}, 64 | ) 65 | 66 | create_snowflake_table = SnowflakeOperator( 67 | task_id="create_snowflake_table", 68 | sql="{% include 'create_snowflake_yellow_tripdata_table.sql' %}", 69 | params={"table_name": table}, 70 | ) 71 | 72 | """ 73 | #### Insert data 74 | Insert data into the Snowflake table using an existing SQL query (stored in 75 | the include/sql/snowflake_examples/ directory). 76 | """ 77 | load_data = SnowflakeOperator( 78 | task_id="load_data", 79 | sql="{% include 'load_yellow_tripdata.sql' %}", 80 | params={"table_name": f"{table}_AUDIT"}, 81 | ) 82 | 83 | """ 84 | #### Delete table 85 | Cleans up the tables created for the example 86 | """ 87 | delete_snowflake_audit_table = SnowflakeOperator( 88 | task_id="delete_snowflake_audit_table", 89 | sql="{% include 'delete_snowflake_table.sql' %}", 90 | params={"table_name": f"{table}_AUDIT"}, 91 | trigger_rule="all_success", 92 | ) 93 | 94 | delete_snowflake_table = SnowflakeOperator( 95 | task_id="delete_snowflake_table", 96 | sql="{% include 'delete_snowflake_table.sql' %}", 97 | params={"table_name": table}, 98 | trigger_rule="all_success", 99 | ) 100 | 101 | """ 102 | #### Great Expectations suite 103 | Runs the Great Expectations suite on the table 104 | """ 105 | ge_snowflake_validation = GreatExpectationsOperator( 106 | task_id="ge_snowflake_validation", 107 | data_context_root_dir=ge_root_dir, 108 | conn_id=snowflake_conn, 109 | expectation_suite_name="taxi.demo", 110 | schema="SCHEMA", # set this to your schema 111 | data_asset_name=f"{table}_AUDIT", 112 | #fail_task_on_validation_failure=False, 113 | ) 114 | 115 | with open( 116 | f"{table_schema_path}/tripdata_schema.json", 117 | "r", 118 | ) as f: 119 | table_schema = json.load(f).get("yellow_tripdata") 120 | table_props = table_schema.get("properties") 121 | table_dimensions = table_schema.get("dimensions") 122 | table_metrics = table_schema.get("metrics") 123 | 124 | col_string = snowflake_load_column_string(table_props) 125 | 126 | """ 127 | #### Snowflake audit to production task 128 | Loads the data from the audit table to the production table 129 | """ 130 | copy_snowflake_audit_to_production_table = SnowflakeOperator( 131 | task_id="copy_snowflake_audit_to_production_table", 132 | sql="{% include 'copy_yellow_tripdata_snowflake_staging.sql' %}", 133 | params={ 134 | "table_name": table, 135 | "audit_table_name": f"{table}_AUDIT", 136 | "table_schema": table_props, 137 | "col_string": col_string, 138 | }, 139 | ) 140 | 141 | chain( 142 | [create_snowflake_table, 
create_snowflake_audit_table], 143 | load_data, 144 | ge_snowflake_validation, 145 | copy_snowflake_audit_to_production_table, 146 | [delete_snowflake_table, delete_snowflake_audit_table], 147 | ) 148 | -------------------------------------------------------------------------------- /dags/redshift_examples/simple_redshift_1.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Integrity Check 3 | 4 | A simple DAG showing a minimal EL data pipeline with a data 5 | integrity check. using MD5 hashes. 6 | 7 | A single file is uploaded to S3, then its ETag is verified 8 | against the MD5 hash of the local file. The two should match, which will 9 | allow the DAG to flow along the "happy path". To see the "sad path", change 10 | `CSV_FILE_PATH` to `CSV_CORRUPT_FILE_PATH` in the `validate_etag` task. 11 | 12 | Before running the DAG, set the following in an Airflow or Environment Variable: 13 | - key: aws_configs 14 | - value: { "s3_bucket": [bucket_name], "s3_key_prefix": [key_prefix]} 15 | Fully replacing [bucket_name] and [key_prefix]. 16 | 17 | What makes this a simple data quality case is: 18 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 19 | 2. Single-step data pipeline: no business logic to complicate things. 20 | 3. Single metric to validate. 21 | 22 | This demo works well in the case of validating data that is read from S3, such 23 | as other data pipelines that will read from S3, or Athena. It would not be 24 | helpful for data that is read from Redshift, as there is another load step 25 | that should be validated separately. 26 | """ 27 | 28 | import hashlib 29 | 30 | from airflow import DAG, AirflowException 31 | from airflow.decorators import task 32 | from airflow.models import Variable 33 | from airflow.operators.dummy_operator import DummyOperator 34 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 35 | from airflow.providers.amazon.aws.transfers.local_to_s3 import \ 36 | LocalFilesystemToS3Operator 37 | from airflow.utils.dates import datetime 38 | 39 | # The file(s) to upload shouldn't be hardcoded in a production setting, this is just for demo purposes. 40 | CSV_FILE_NAME = "forestfires.csv" 41 | CSV_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_FILE_NAME}" 42 | CSV_CORRUPT_FILE_NAME = "forestfires_corrupt.csv" 43 | CSV_CORRUPT_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_CORRUPT_FILE_NAME}" 44 | 45 | with DAG( 46 | "simple_redshift_1", 47 | start_date=datetime(2021, 7, 7), 48 | description="A sample Airflow DAG to load data from csv files to S3, then check that all data was uploaded properly.", 49 | doc_md=__doc__, 50 | schedule_interval=None, 51 | catchup=False, 52 | ) as dag: 53 | 54 | upload_file = LocalFilesystemToS3Operator( 55 | task_id="upload_to_s3", 56 | filename=CSV_FILE_PATH, 57 | dest_key="{{ var.json.aws_configs.s3_key_prefix }}/" + CSV_FILE_PATH, 58 | dest_bucket="{{ var.json.aws_configs.s3_bucket }}", 59 | aws_conn_id="aws_default", 60 | replace=True, 61 | ) 62 | 63 | @task 64 | def validate_etag(): 65 | """ 66 | #### Validation task 67 | Check the destination ETag against the local MD5 hash to ensure the file 68 | was uploaded without errors. 
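Note: this comparison assumes a single-part upload; for multipart uploads the S3 ETag
is not a plain MD5 of the file, so the check would need to be adapted.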
69 | """ 70 | s3 = S3Hook() 71 | aws_configs = Variable.get("aws_configs", deserialize_json=True) 72 | obj = s3.get_key( 73 | key=f"{aws_configs.get('s3_key_prefix')}/{CSV_FILE_PATH}", 74 | bucket_name=aws_configs.get("s3_bucket"), 75 | ) 76 | obj_etag = obj.e_tag.strip('"') 77 | # Change `CSV_FILE_PATH` to `CSV_CORRUPT_FILE_PATH` for the "sad path". 78 | file_hash = hashlib.md5(open(CSV_FILE_PATH).read().encode("utf-8")).hexdigest() 79 | if obj_etag != file_hash: 80 | raise AirflowException( 81 | f"Upload Error: Object ETag in S3 did not match hash of local file." 82 | ) 83 | 84 | validate_file = validate_etag() 85 | 86 | begin = DummyOperator(task_id="begin") 87 | end = DummyOperator(task_id="end") 88 | 89 | begin >> upload_file >> validate_file >> end 90 | -------------------------------------------------------------------------------- /dags/redshift_examples/simple_redshift_2.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Extract and Load Pipeline with Data Integrity Check 3 | 4 | A single file is uploaded to S3, then its ETag is verified 5 | against the MD5 hash of the local file. The two should match, which will 6 | allow the DAG to flow along the "happy path". 7 | 8 | To see the "sad path", change`CSV_FILE_PATH` to `CSV_CORRUPT_FILE_PATH` in the `validate_etag` task. If the 9 | "happy path" is continued, a second data load from S3 to Redshift is triggered, 10 | which is followed by another data integrity check. A similar "happy/sad path" 11 | branch ends the DAG. 12 | 13 | Before running the DAG, set the following in an Airflow or Environment Variable: 14 | - key: aws_configs 15 | - value: { "s3_bucket": [bucket_name], "s3_key_prefix": [key_prefix], "redshift_table": [table_name]} 16 | Fully replacing [bucket_name], [key_prefix], and [table_name]. 17 | 18 | What makes this a simple data quality case is: 19 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 20 | 2. No transformations or business logic. 21 | 3. Single metric to validate (whether the uploads were successful). 22 | 23 | This demo solves the issue the Simple EL Pipeline with Data Integrity Check DAG left open: validating an 24 | upload to Redshift. However, it only validates that the data matches the 25 | source file; it does not guarantee that the source file's data is actually 26 | valid with respect to expectations about that data. 27 | """ 28 | 29 | import hashlib 30 | 31 | from airflow import DAG, AirflowException 32 | from airflow.decorators import task 33 | from airflow.models import Variable 34 | from airflow.models.baseoperator import chain 35 | from airflow.operators.dummy_operator import DummyOperator 36 | from airflow.operators.sql import SQLCheckOperator 37 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 38 | from airflow.providers.amazon.aws.transfers.local_to_s3 import \ 39 | LocalFilesystemToS3Operator 40 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import \ 41 | S3ToRedshiftOperator 42 | from airflow.providers.postgres.operators.postgres import PostgresOperator 43 | from airflow.utils.dates import datetime 44 | 45 | # The file(s) to upload shouldn't be hardcoded in a production setting, this is just for demo purposes. 
46 | CSV_FILE_NAME = "forestfires.csv" 47 | CSV_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_FILE_NAME}" 48 | CSV_CORRUPT_FILE_NAME = "forestfires_corrupt.csv" 49 | CSV_CORRUPT_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_CORRUPT_FILE_NAME}" 50 | 51 | with DAG( 52 | "simple_redshift_2", 53 | start_date=datetime(2021, 7, 7), 54 | description="A sample Airflow DAG to load data from csv files to S3 and then Redshift, with data integrity checks.", 55 | doc_md=__doc__, 56 | schedule_interval=None, 57 | template_searchpath="/usr/local/airflow/include/sql/redshift_examples/", 58 | catchup=False, 59 | ) as dag: 60 | 61 | upload_file = LocalFilesystemToS3Operator( 62 | task_id="upload_to_s3", 63 | filename=CSV_FILE_PATH, 64 | dest_key="{{ var.json.aws_configs.s3_key_prefix }}/" + CSV_FILE_PATH, 65 | dest_bucket="{{ var.json.aws_configs.s3_bucket }}", 66 | aws_conn_id="aws_default", 67 | replace=True, 68 | ) 69 | 70 | @task 71 | def validate_etag(): 72 | """ 73 | #### Validation task 74 | Check the destination ETag against the local MD5 hash to ensure the file 75 | was uploaded without errors. 76 | """ 77 | s3 = S3Hook() 78 | aws_configs = Variable.get("aws_configs", deserialize_json=True) 79 | obj = s3.get_key( 80 | key=f"{aws_configs.get('s3_key_prefix')}/{CSV_FILE_PATH}", 81 | bucket_name=aws_configs.get("s3_bucket"), 82 | ) 83 | obj_etag = obj.e_tag.strip('"') 84 | # Change `CSV_FILE_PATH` to `CSV_CORRUPT_FILE_PATH` for the "sad path". 85 | file_hash = hashlib.md5(open(CSV_FILE_PATH).read().encode("utf-8")).hexdigest() 86 | if obj_etag != file_hash: 87 | raise AirflowException( 88 | f"Upload Error: Object ETag in S3 did not match hash of local file." 89 | ) 90 | 91 | validate_file = validate_etag() 92 | 93 | """ 94 | #### Create Redshift Table 95 | For demo purposes, create a Redshift table to store the forest fire data to. 96 | The database is not automatically destroyed at the end of the example; ensure 97 | this is done manually to avoid unnecessary costs. Additionally, set-up may 98 | need to be done in Airflow connections to allow access to Redshift. 99 | """ 100 | create_redshift_table = PostgresOperator( 101 | task_id="create_table", 102 | sql="create_redshift_forestfire_table.sql", 103 | postgres_conn_id="redshift_default", 104 | ) 105 | 106 | """ 107 | #### Second load task 108 | Loads the S3 data from the previous load to a Redshift table (specified 109 | in the Airflow Variables backend). 110 | """ 111 | load_to_redshift = S3ToRedshiftOperator( 112 | task_id="load_to_redshift", 113 | s3_bucket="{{ var.json.aws_configs.s3_bucket }}", 114 | s3_key="{{ var.json.aws_configs.s3_key_prefix }}" + f"/{CSV_FILE_PATH}", 115 | schema="PUBLIC", 116 | table="{{ var.json.aws_configs.redshift_table }}", 117 | copy_options=["csv"], 118 | ) 119 | 120 | """ 121 | #### Redshift row validation task 122 | Ensure that data was copied to Redshift from S3 correctly. A SQLCheckOperator is 123 | used here to check for any files in the stl_load_errors table. 124 | """ 125 | validate_redshift = SQLCheckOperator( 126 | task_id="validate_redshift", 127 | conn_id="redshift_default", 128 | sql="validate_redshift_forestfire_load.sql", 129 | params={"filename": CSV_FILE_NAME}, 130 | ) 131 | 132 | """ 133 | #### Drop Redshift table 134 | Drops the Redshift table if it exists already. This is to make sure that the 135 | data in the success and failure cases do not interfere with each other during 136 | the data quality check. 
137 | """ 138 | drop_redshift_table = PostgresOperator( 139 | task_id="drop_table", 140 | sql="drop_redshift_forestfire_table.sql", 141 | postgres_conn_id="redshift_default", 142 | ) 143 | 144 | begin = DummyOperator(task_id="begin") 145 | end = DummyOperator(task_id="end") 146 | 147 | chain( 148 | begin, 149 | upload_file, 150 | validate_file, 151 | create_redshift_table, 152 | load_to_redshift, 153 | validate_redshift, 154 | drop_redshift_table, 155 | end, 156 | ) 157 | -------------------------------------------------------------------------------- /dags/redshift_examples/simple_redshift_3.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Data Integrity Checks with Multiple Files 3 | 4 | 5 | This is the third in a series of DAGs showing an EL pipeline with data integrity 6 | and data quality checking. A single file is uploaded to S3, then its ETag is 7 | verified against the MD5 hash of the local file. The two should match, which 8 | will allow the DAG to continue to the next task. 9 | 10 | A second data load from S3 to Redshift is triggered, which is followed by another data integrity check. 11 | If the check fails, an Airflow Exception is raised. Otherwise, a final data 12 | quality check is performed on the Redshift table per row for a subset of rows, 13 | immitating a row-based data quality spot check where the specific ground truth 14 | is known. 15 | 16 | Before running the DAG, set the following in an Airflow or Environment Variable: 17 | - key: aws_configs 18 | - value: { "s3_bucket": [bucket_name], "s3_key_prefix": [key_prefix], "redshift_table": [table_name]} 19 | Fully replacing [bucket_name], [key_prefix], and [table_name]. 20 | 21 | What makes this a simple data quality case is: 22 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 23 | 2. No transformations or business logic. 24 | 3. Exact values of data to quality check are known. 25 | 26 | This demo solves the issue Extract and Load Pipeline with Data Integrity Check left open: quality checking the data 27 | in the uploaded file. This DAG is a good starting point for a data integrity 28 | and data quality check. 29 | """ 30 | 31 | import hashlib 32 | import json 33 | 34 | from airflow import DAG, AirflowException 35 | from airflow.decorators import task 36 | from airflow.models import Variable 37 | from airflow.models.baseoperator import chain 38 | from airflow.operators.dummy_operator import DummyOperator 39 | from airflow.operators.sql import SQLCheckOperator 40 | from airflow.providers.amazon.aws.hooks.s3 import S3Hook 41 | from airflow.providers.amazon.aws.transfers.local_to_s3 import \ 42 | LocalFilesystemToS3Operator 43 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import \ 44 | S3ToRedshiftOperator 45 | from airflow.providers.postgres.operators.postgres import PostgresOperator 46 | from airflow.utils.dates import datetime 47 | from airflow.utils.task_group import TaskGroup 48 | 49 | # The file(s) to upload shouldn't be hardcoded in a production setting, this is just for demo purposes. 50 | CSV_FILE_NAME = "forestfires.csv" 51 | CSV_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_FILE_NAME}" 52 | # CSV_CORRUPT_FILE_NAME = "forestfires_corrupt.csv" 53 | # CSV_CORRUPT_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_CORRUPT_FILE_NAME}" 54 | 55 | # Uncomment the below two constants to see the "sad path" (and comment out the paths above). 
56 | # CSV_FILE_NAME = "forestfires_invalid.csv" 57 | # CSV_FILE_PATH = f"include/sample_data/forestfire_data/{CSV_FILE_NAME}" 58 | 59 | with DAG( 60 | "simple_redshift_3", 61 | start_date=datetime(2021, 7, 7), 62 | description="A sample Airflow DAG to load data from csv files to S3 and then Redshift, with data integrity and quality checks.", 63 | doc_md=__doc__, 64 | schedule_interval=None, 65 | template_searchpath="/usr/local/airflow/include/sql/redshift_examples/", 66 | catchup=False, 67 | ) as dag: 68 | 69 | upload_file = LocalFilesystemToS3Operator( 70 | task_id="upload_to_s3", 71 | filename=CSV_FILE_PATH, 72 | dest_key="{{ var.json.aws_configs.s3_key_prefix }}/" + CSV_FILE_PATH, 73 | dest_bucket="{{ var.json.aws_configs.s3_bucket }}", 74 | aws_conn_id="aws_default", 75 | replace=True, 76 | ) 77 | 78 | @task 79 | def validate_etag(): 80 | """ 81 | #### Validation task 82 | Check the destination ETag against the local MD5 hash to ensure the file 83 | was uploaded without errors. 84 | """ 85 | s3 = S3Hook() 86 | aws_configs = Variable.get("aws_configs", deserialize_json=True) 87 | obj = s3.get_key( 88 | key=f"{aws_configs.get('s3_key_prefix')}/{CSV_FILE_PATH}", 89 | bucket_name=aws_configs.get("s3_bucket"), 90 | ) 91 | obj_etag = obj.e_tag.strip('"') 92 | # Change `CSV_FILE_PATH` to `CSV_CORRUPT_FILE_PATH` for the "sad path". 93 | file_hash = hashlib.md5(open(CSV_FILE_PATH).read().encode("utf-8")).hexdigest() 94 | if obj_etag != file_hash: 95 | raise AirflowException( 96 | f"Upload Error: Object ETag in S3 did not match hash of local file." 97 | ) 98 | 99 | validate_file = validate_etag() 100 | 101 | """ 102 | #### Create Redshift Table 103 | For demo purposes, create a Redshift table to store the forest fire data to. 104 | The database is not automatically destroyed at the end of the example; ensure 105 | this is done manually to avoid unnecessary costs. Additionally, set-up may 106 | need to be done in Airflow connections to allow access to Redshift. 107 | """ 108 | create_redshift_table = PostgresOperator( 109 | task_id="create_table", 110 | sql="create_redshift_forestfire_table.sql", 111 | postgres_conn_id="redshift_default", 112 | ) 113 | 114 | """ 115 | #### Second load task 116 | Loads the S3 data from the previous load to a Redshift table (specified 117 | in the Airflow Variables backend). 118 | """ 119 | load_to_redshift = S3ToRedshiftOperator( 120 | task_id="load_to_redshift", 121 | s3_bucket="{{ var.json.aws_configs.s3_bucket }}", 122 | s3_key="{{ var.json.aws_configs.s3_key_prefix }}" + f"/{CSV_FILE_PATH}", 123 | schema="PUBLIC", 124 | table="{{ var.json.aws_configs.redshift_table }}", 125 | copy_options=["csv"], 126 | ) 127 | 128 | """ 129 | #### Redshift row validation task 130 | Ensure that data was copied to Redshift from S3 correctly. A SQLCheckOperator is 131 | used here to check for any files in the stl_load_errors table. 132 | """ 133 | validate_redshift = SQLCheckOperator( 134 | task_id="validate_redshift", 135 | conn_id="redshift_default", 136 | sql="validate_redshift_forestfire_load.sql", 137 | params={"filename": CSV_FILE_NAME}, 138 | ) 139 | 140 | """ 141 | #### Row-level data quality check 142 | Run a data quality check on a few rows, ensuring that the data in Redshift 143 | matches the ground truth in the correspoding JSON file. 
144 | """ 145 | with open("include/validation/forestfire_validation.json") as ffv: 146 | with TaskGroup(group_id="row_quality_checks") as quality_check_group: 147 | ffv_json = json.load(ffv) 148 | for id, values in ffv_json.items(): 149 | values["id"] = id 150 | SQLCheckOperator( 151 | task_id=f"forestfire_row_quality_check_{id}", 152 | conn_id="redshift_default", 153 | sql="row_quality_redshift_forestfire_check.sql", 154 | params=values, 155 | ) 156 | 157 | """ 158 | #### Drop Redshift table 159 | Drops the Redshift table if it exists already. This is to make sure that the 160 | data in the success and failure cases do not interfere with each other during 161 | the data quality check. 162 | """ 163 | drop_redshift_table = PostgresOperator( 164 | task_id="drop_table", 165 | sql="drop_redshift_forestfire_table.sql", 166 | postgres_conn_id="redshift_default", 167 | ) 168 | 169 | begin = DummyOperator(task_id="begin") 170 | end = DummyOperator(task_id="end") 171 | 172 | chain( 173 | begin, 174 | upload_file, 175 | validate_file, 176 | create_redshift_table, 177 | load_to_redshift, 178 | validate_redshift, 179 | quality_check_group, 180 | drop_redshift_table, 181 | end, 182 | ) 183 | -------------------------------------------------------------------------------- /dags/snowflake_examples/complex_snowflake_transform.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Snowflake ELT Pipeline with Multiple Datasets and Data Qality Checks 3 | 4 | Run data quality checks, in SQL, on multiple Snowflake tables. 5 | 6 | This DAG uses the forestfires public dataset on ForestFires to run data quality checks on multiple tables in Snowflake. 7 | In the event of a failure, a Slack notification will be fired off. In this example, data quality checks are 8 | run as taskgroups after the data is uploaded. 9 | 10 | Note that this DAG deletes all data it uploaded after the DQ checks run. 11 | 12 | Ensure a Snowflake Warehouse, Database, Schema, and Role exist for the Snowflake 13 | connection provided to the operator under the connection ID `snowflake_default`. 14 | """ 15 | 16 | from airflow import DAG 17 | from airflow.models.baseoperator import chain 18 | from airflow.operators.empty import EmptyOperator 19 | from airflow.providers.common.sql.operators.sql import (SQLColumnCheckOperator, 20 | SQLTableCheckOperator) 21 | from airflow.providers.slack.operators.slack_webhook import \ 22 | SlackWebhookOperator 23 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 24 | from airflow.utils.dates import datetime 25 | from airflow.utils.task_group import TaskGroup 26 | 27 | SNOWFLAKE_FORESTFIRE_TABLE = "forestfires" 28 | SNOWFLAKE_COST_TABLE = "costs" 29 | SNOWFLAKE_FORESTFIRE_COST_TABLE = "forestfire_costs" 30 | 31 | SNOWFLAKE_CONN_ID = "snowflake_default" 32 | 33 | ROW_COUNT_CHECK = "COUNT(*) = 9" 34 | 35 | 36 | def slack_failure_notification(context): 37 | task_id = context.get("task_instance").task_id 38 | dag_id = context.get("task_instance").dag_id 39 | exec_date = context.get("execution_date") 40 | log_url = context.get("task_instance").log_url 41 | slack_msg = f""" 42 | :red_circle: Task Failed. 
43 | *Task*: {task_id} 44 | *Dag*: {dag_id} 45 | *Execution Time*: {exec_date} 46 | *Log Url*: {log_url} 47 | """ 48 | failed_alert = SlackWebhookOperator( 49 | task_id="slack_notification", 50 | http_conn_id="slack_webhook", 51 | message=slack_msg, 52 | channel="data_engineering", 53 | username="failbot", 54 | ) 55 | return failed_alert.execute(context=context) 56 | 57 | 58 | with DAG( 59 | "complex_snowflake_transform", 60 | description="Example DAG showcasing loading, transforming, and data quality checking with multiple datasets in Snowflake.", 61 | doc_md=__doc__, 62 | start_date=datetime(2021, 1, 1), 63 | schedule_interval=None, 64 | template_searchpath="/usr/local/airflow/include/sql/snowflake_examples/", 65 | catchup=False, 66 | ) as dag: 67 | """ 68 | #### Snowflake table creation 69 | Create the tables to store sample data. 70 | """ 71 | create_forestfire_table = SnowflakeOperator( 72 | task_id="create_forestfire_table", 73 | sql="create_forestfire_table.sql", 74 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 75 | ) 76 | 77 | create_cost_table = SnowflakeOperator( 78 | task_id="create_cost_table", 79 | sql="create_cost_table.sql", 80 | params={"table_name": SNOWFLAKE_COST_TABLE}, 81 | ) 82 | 83 | create_forestfire_cost_table = SnowflakeOperator( 84 | task_id="create_forestfire_cost_table", 85 | sql="create_forestfire_cost_table.sql", 86 | params={"table_name": SNOWFLAKE_FORESTFIRE_COST_TABLE}, 87 | ) 88 | 89 | """ 90 | #### Insert data 91 | Insert data into the Snowflake tables using existing SQL queries 92 | stored in the include/sql/snowflake_examples/ directory. 93 | """ 94 | load_forestfire_data = SnowflakeOperator( 95 | task_id="load_forestfire_data", 96 | sql="load_snowflake_forestfire_data.sql", 97 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 98 | ) 99 | 100 | load_cost_data = SnowflakeOperator( 101 | task_id="load_cost_data", 102 | sql="load_cost_data.sql", 103 | params={"table_name": SNOWFLAKE_COST_TABLE}, 104 | ) 105 | 106 | load_forestfire_cost_data = SnowflakeOperator( 107 | task_id="load_forestfire_cost_data", 108 | sql="load_forestfire_cost_data.sql", 109 | params={"table_name": SNOWFLAKE_FORESTFIRE_COST_TABLE}, 110 | ) 111 | 112 | """ 113 | #### Transform 114 | Transform the forestfire_costs table to perform 115 | sample logic. 116 | """ 117 | transform_forestfire_cost_table = SnowflakeOperator( 118 | task_id="transform_forestfire_cost_table", 119 | sql="transform_forestfire_cost_table.sql", 120 | params={"table_name": SNOWFLAKE_FORESTFIRE_COST_TABLE}, 121 | ) 122 | 123 | """ 124 | #### Quality checks 125 | Perform data quality checks on the various tables. 
126 | """ 127 | with TaskGroup( 128 | group_id="quality_check_group_forestfire", 129 | default_args={ 130 | "conn_id": SNOWFLAKE_CONN_ID, 131 | "on_failure_callback": slack_failure_notification, 132 | }, 133 | ) as quality_check_group_forestfire: 134 | """ 135 | #### Column-level data quality check 136 | Run data quality checks on columns of the forestfire table 137 | """ 138 | forestfire_column_checks = SQLColumnCheckOperator( 139 | task_id="forestfire_column_checks", 140 | table=SNOWFLAKE_FORESTFIRE_TABLE, 141 | column_mapping={ 142 | "ID": {"null_check": {"equal_to": 0}}, 143 | "RH": {"max": {"leq_to": 100}}, 144 | }, 145 | ) 146 | 147 | """ 148 | #### Table-level data quality check 149 | Run data quality checks on the forestfire table 150 | """ 151 | forestfire_table_checks = SQLTableCheckOperator( 152 | task_id="forestfire_table_checks", 153 | table=SNOWFLAKE_FORESTFIRE_TABLE, 154 | checks={"row_count_check": {"check_statement": ROW_COUNT_CHECK}}, 155 | ) 156 | 157 | with TaskGroup( 158 | group_id="quality_check_group_cost", 159 | default_args={ 160 | "conn_id": SNOWFLAKE_CONN_ID, 161 | "on_failure_callback": slack_failure_notification, 162 | }, 163 | ) as quality_check_group_cost: 164 | """ 165 | #### Column-level data quality check 166 | Run data quality checks on columns of the forestfire table 167 | """ 168 | cost_column_checks = SQLColumnCheckOperator( 169 | task_id="cost_column_checks", 170 | table=SNOWFLAKE_COST_TABLE, 171 | column_mapping={ 172 | "ID": {"null_check": {"equal_to": 0}}, 173 | "LAND_DAMAGE_COST": {"min": {"geq_to": 0}}, 174 | "PROPERTY_DAMAGE_COST": {"min": {"geq_to": 0}}, 175 | "LOST_PROFITS_COST": {"min": {"geq_to": 0}}, 176 | }, 177 | ) 178 | 179 | """ 180 | #### Table-level data quality check 181 | Run data quality checks on the forestfire table 182 | """ 183 | cost_table_checks = SQLTableCheckOperator( 184 | task_id="cost_table_checks", 185 | table=SNOWFLAKE_COST_TABLE, 186 | checks={"row_count_check": {"check_statement": ROW_COUNT_CHECK}}, 187 | ) 188 | 189 | with TaskGroup( 190 | group_id="quality_check_group_forestfire_costs", 191 | default_args={ 192 | "conn_id": SNOWFLAKE_CONN_ID, 193 | "on_failure_callback": slack_failure_notification, 194 | }, 195 | ) as quality_check_group_forestfire_costs: 196 | """ 197 | #### Column-level data quality check 198 | Run data quality checks on columns of the forestfire table 199 | """ 200 | forestfire_costs_column_checks = SQLColumnCheckOperator( 201 | task_id="forestfire_costs_column_checks", 202 | table=SNOWFLAKE_FORESTFIRE_COST_TABLE, 203 | column_mapping={"AREA": {"min": {"geq_to": 0}}}, 204 | ) 205 | 206 | """ 207 | #### Table-level data quality check 208 | Run data quality checks on the forestfire table 209 | """ 210 | forestfire_costs_table_checks = SQLTableCheckOperator( 211 | task_id="forestfire_costs_table_checks", 212 | table=SNOWFLAKE_FORESTFIRE_COST_TABLE, 213 | checks={ 214 | "row_count_check": {"check_statement": ROW_COUNT_CHECK}, 215 | "total_cost_check": { 216 | "check_statement": "land_damage_cost + property_damage_cost + lost_profits_cost = total_cost" 217 | }, 218 | }, 219 | ) 220 | 221 | """ 222 | #### Delete tables 223 | Clean up the tables created for the example. 
224 | """ 225 | delete_forestfire_table = SnowflakeOperator( 226 | task_id="delete_forestfire_table", 227 | sql="delete_snowflake_table.sql", 228 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 229 | ) 230 | 231 | delete_cost_table = SnowflakeOperator( 232 | task_id="delete_costs_table", 233 | sql="delete_snowflake_table.sql", 234 | params={"table_name": SNOWFLAKE_COST_TABLE}, 235 | ) 236 | 237 | delete_forestfire_cost_table = SnowflakeOperator( 238 | task_id="delete_forestfire_cost_table", 239 | sql="delete_snowflake_table.sql", 240 | params={"table_name": SNOWFLAKE_FORESTFIRE_COST_TABLE}, 241 | ) 242 | 243 | begin = EmptyOperator(task_id="begin") 244 | create_done = EmptyOperator(task_id="create_done") 245 | load_done = EmptyOperator(task_id="load_done") 246 | end = EmptyOperator(task_id="end") 247 | 248 | chain( 249 | begin, 250 | [create_forestfire_table, create_cost_table, create_forestfire_cost_table], 251 | create_done, 252 | [load_forestfire_data, load_cost_data], 253 | load_done, 254 | [quality_check_group_forestfire, quality_check_group_cost], 255 | load_forestfire_cost_data, 256 | quality_check_group_forestfire_costs, 257 | transform_forestfire_cost_table, 258 | [delete_forestfire_table, delete_cost_table, delete_forestfire_cost_table], 259 | end, 260 | ) 261 | -------------------------------------------------------------------------------- /dags/snowflake_examples/simple_snowflake.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Simple EL Pipeline with Data Quality Checks Using Snowflake 3 | 4 | Runs a data quality check, in SQL, on the forest fires dataset 5 | 6 | Note that this DAG will clean up after itself and delete all data it uploads. 7 | 8 | Ensure a Snowflake Warehouse, Database, Schema, and Role exist for the Snowflake 9 | connection provided to the Operator. The names of these data should replace the 10 | dummy values at the top of the file. 11 | 12 | A Snowflake Connection is also needed, named `snowflake_default`. 13 | 14 | What makes this a simple data quality case is: 15 | 1. Absolute ground truth: the local CSV file is considered perfect and immutable. 16 | 2. No transformations or business logic. 17 | 3. Exact values of data to quality check are known. 18 | """ 19 | 20 | from airflow import DAG 21 | from airflow.models.baseoperator import chain 22 | from airflow.operators.empty import EmptyOperator 23 | from airflow.providers.common.sql.operators.sql import (SQLColumnCheckOperator, 24 | SQLTableCheckOperator) 25 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 26 | from airflow.utils.dates import datetime 27 | from airflow.utils.task_group import TaskGroup 28 | 29 | SNOWFLAKE_FORESTFIRE_TABLE = "forestfires" 30 | SNOWFLAKE_CONN_ID = "snowflake_default" 31 | 32 | 33 | with DAG( 34 | "simple_snowflake", 35 | description="Example DAG showcasing loading and data quality checking with Snowflake.", 36 | doc_md=__doc__, 37 | start_date=datetime(2021, 1, 1), 38 | schedule_interval=None, 39 | template_searchpath="/usr/local/airflow/include/sql/snowflake_examples/", 40 | catchup=False, 41 | ) as dag: 42 | 43 | """ 44 | #### Snowflake table creation 45 | Create the table to store sample forest fire data. 
46 | """ 47 | create_table = SnowflakeOperator( 48 | task_id="create_table", 49 | sql="{% include 'create_forestfire_table.sql' %}", 50 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 51 | ) 52 | 53 | """ 54 | #### Insert data 55 | Insert data into the Snowflake table using an existing SQL query (stored in 56 | the include/sql/snowflake_examples/ directory). 57 | """ 58 | load_data = SnowflakeOperator( 59 | task_id="insert_query", 60 | sql="{% include 'load_snowflake_forestfire_data.sql' %}", 61 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 62 | ) 63 | 64 | with TaskGroup( 65 | group_id="quality_checks", default_args={"conn_id": SNOWFLAKE_CONN_ID} 66 | ) as quality_check_group: 67 | """ 68 | #### Column-level data quality check 69 | Run data quality checks on columns of the audit table 70 | """ 71 | column_checks = SQLColumnCheckOperator( 72 | task_id="column_checks", 73 | table=SNOWFLAKE_FORESTFIRE_TABLE, 74 | column_mapping={"ID": {"null_check": {"equal_to": 0}}}, 75 | ) 76 | 77 | """ 78 | #### Table-level data quality check 79 | Run data quality checks on the audit table 80 | """ 81 | table_checks = SQLTableCheckOperator( 82 | task_id="table_checks", 83 | table=SNOWFLAKE_FORESTFIRE_TABLE, 84 | checks={"row_count_check": {"check_statement": "COUNT(*) = 9"}}, 85 | ) 86 | 87 | """ 88 | #### Delete table 89 | Clean up the table created for the example. 90 | """ 91 | delete_table = SnowflakeOperator( 92 | task_id="delete_table", 93 | sql="{% include 'delete_snowflake_table.sql' %}", 94 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 95 | ) 96 | 97 | begin = EmptyOperator(task_id="begin") 98 | end = EmptyOperator(task_id="end") 99 | 100 | chain(begin, create_table, load_data, quality_check_group, delete_table, end) 101 | -------------------------------------------------------------------------------- /dags/snowflake_examples/snowflake_dynamic_write_audit_publish.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### Data Quality Checks Using Snowflake and Dynamic Task Mapping. 3 | 4 | Map over a set of columns and perform data quality checks. 5 | 6 | This DAG shows how to use Airflow's dynamic task mapping to create tasks based off of a supplied list of columns to perform a data quality check. 7 | All DQ checks in this DAg are performed in SQL and are expressed in a task group. 8 | 9 | Note this DAG will clean up after itself once it's done running. 
10 | """ 11 | 12 | import json 13 | from pathlib import Path 14 | 15 | from airflow import DAG 16 | from airflow.models.baseoperator import chain 17 | from airflow.operators.empty import EmptyOperator 18 | from airflow.providers.common.sql.operators.sql import (SQLColumnCheckOperator, 19 | SQLTableCheckOperator) 20 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 21 | from airflow.utils.dates import datetime 22 | from airflow.utils.task_group import TaskGroup 23 | 24 | from include.forestfire_checks.checks import COL_CHECKS, TABLE_CHECKS 25 | from include.libs.schema_reg.base_schema_transforms import \ 26 | snowflake_load_column_string 27 | 28 | SNOWFLAKE_FORESTFIRE_TABLE = "forestfires" 29 | SNOWFLAKE_AUDIT_TABLE = f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT" 30 | 31 | base_path = Path(__file__).parents[2] 32 | table_schema_path = f"{base_path}/include/sql/snowflake_examples/table_schemas/" 33 | 34 | with DAG( 35 | "snowflake_dynamic_write_audit_publish", 36 | doc_md=__doc__, 37 | start_date=datetime(2021, 1, 1), 38 | schedule_interval=None, 39 | template_searchpath="/usr/local/airflow/include/sql/snowflake_examples/", 40 | default_args={"conn_id": "snowflake_default"}, 41 | catchup=False, 42 | ) as dag: 43 | """ 44 | #### Snowflake audit table creation 45 | Creates the tables to store sample data for testing 46 | """ 47 | create_forestfire_audit_table = SnowflakeOperator( 48 | task_id="create_forestfire_audit_table", 49 | sql="create_forestfire_table.sql", 50 | params={"table_name": SNOWFLAKE_AUDIT_TABLE}, 51 | ) 52 | 53 | """ 54 | #### Snowflake table creation 55 | Create the table to store verified sample data. 56 | """ 57 | create_forestfire_production_table = SnowflakeOperator( 58 | task_id="create_forestfire_production_table", 59 | sql="create_forestfire_table.sql", 60 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 61 | ) 62 | 63 | """ 64 | #### Insert data 65 | Insert data into the Snowflake audit table using an existing SQL query (stored in 66 | the include/sql/snowflake_examples/ directory). 
67 | """ 68 | load_data = SnowflakeOperator( 69 | task_id="insert_query", 70 | sql="load_snowflake_forestfire_data.sql", 71 | params={"table_name": SNOWFLAKE_AUDIT_TABLE}, 72 | ) 73 | 74 | with TaskGroup(group_id="quality_checks") as quality_check_group: 75 | """ 76 | #### Column-level data quality check 77 | Run data quality checks on columns of the audit table 78 | """ 79 | column_checks = SQLColumnCheckOperator.partial( 80 | task_id="column_checks", 81 | table=SNOWFLAKE_AUDIT_TABLE, 82 | ).expand(column_mapping=COL_CHECKS) 83 | 84 | """ 85 | #### Table-level data quality check 86 | Run data quality checks on the audit table 87 | """ 88 | table_checks = SQLTableCheckOperator.partial( 89 | task_id="table_checks", 90 | table=SNOWFLAKE_AUDIT_TABLE, 91 | ).expand(checks=TABLE_CHECKS) 92 | 93 | with open( 94 | f"{table_schema_path}/forestfire_schema.json", 95 | "r", 96 | ) as f: 97 | table_schema = json.load(f).get("forestfire") 98 | table_props = table_schema.get("properties") 99 | table_dimensions = table_schema.get("dimensions") 100 | table_metrics = table_schema.get("metrics") 101 | 102 | col_string = snowflake_load_column_string(table_props) 103 | 104 | """ 105 | #### Snowflake audit to production task 106 | Loads the data from the audit table to the production table 107 | """ 108 | copy_snowflake_audit_to_production_table = SnowflakeOperator( 109 | task_id="copy_snowflake_audit_to_production_table", 110 | sql="copy_forestfire_snowflake_audit.sql", 111 | params={ 112 | "table_name": SNOWFLAKE_FORESTFIRE_TABLE, 113 | "audit_table_name": f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT", 114 | "table_schema": table_props, 115 | "col_string": col_string, 116 | }, 117 | trigger_rule="all_success", 118 | ) 119 | 120 | """ 121 | #### Delete audit table 122 | Clean up the table created for the example. 123 | """ 124 | delete_audit_table = SnowflakeOperator( 125 | task_id="delete_audit_table", 126 | sql="delete_forestfire_table.sql", 127 | params={"table_name": f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT"}, 128 | trigger_rule="all_success", 129 | ) 130 | 131 | begin = EmptyOperator(task_id="begin") 132 | end = EmptyOperator(task_id="end") 133 | 134 | chain( 135 | begin, 136 | [create_forestfire_production_table, create_forestfire_audit_table], 137 | load_data, 138 | quality_check_group, 139 | copy_snowflake_audit_to_production_table, 140 | delete_audit_table, 141 | end, 142 | ) 143 | -------------------------------------------------------------------------------- /dags/snowflake_examples/snowflake_write_audit_publish.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### ELT Pipeline with Data Quality Checks Using Snowflake 3 | 4 | Example DAG showcasing loading and data quality checking with Snowflake with a Write, Audit, Publish pattern. 
5 | """ 6 | 7 | import json 8 | from pathlib import Path 9 | 10 | from airflow import DAG 11 | from airflow.models.baseoperator import chain 12 | from airflow.operators.empty import EmptyOperator 13 | from airflow.providers.common.sql.operators.sql import (SQLColumnCheckOperator, 14 | SQLTableCheckOperator) 15 | from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator 16 | from airflow.utils.dates import datetime 17 | from airflow.utils.task_group import TaskGroup 18 | 19 | from include.libs.schema_reg.base_schema_transforms import \ 20 | snowflake_load_column_string 21 | 22 | SNOWFLAKE_FORESTFIRE_TABLE = "forestfires" 23 | SNOWFLAKE_AUDIT_TABLE = f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT" 24 | SNOWFLAKE_CONN_ID = "snowflake_default" 25 | 26 | base_path = Path(__file__).parents[2] 27 | table_schema_path = f"{base_path}/include/sql/snowflake_examples/table_schemas/" 28 | 29 | with DAG( 30 | "snowflake_write_audit_publish", 31 | doc_md=__doc__, 32 | start_date=datetime(2021, 1, 1), 33 | schedule_interval=None, 34 | template_searchpath="/usr/local/airflow/include/sql/snowflake_examples/", 35 | catchup=False, 36 | ) as dag: 37 | 38 | """ 39 | #### Snowflake audit table creation 40 | Creates the tables to store sample data for testing 41 | """ 42 | create_forestfire_audit_table = SnowflakeOperator( 43 | task_id="create_forestfire_audit_table", 44 | sql="create_forestfire_table.sql", 45 | params={"table_name": SNOWFLAKE_AUDIT_TABLE}, 46 | ) 47 | 48 | """ 49 | #### Snowflake table creation 50 | Create the table to store verified sample data. 51 | """ 52 | create_forestfire_production_table = SnowflakeOperator( 53 | task_id="create_forestfire_production_table", 54 | sql="create_forestfire_table.sql", 55 | params={"table_name": SNOWFLAKE_FORESTFIRE_TABLE}, 56 | ) 57 | 58 | """ 59 | #### Insert data 60 | Insert data into the Snowflake audit table using an existing SQL query (stored in 61 | the include/sql/snowflake_examples/ directory). 
62 | """ 63 | load_data = SnowflakeOperator( 64 | task_id="insert_query", 65 | sql="load_snowflake_forestfire_data.sql", 66 | params={"table_name": SNOWFLAKE_AUDIT_TABLE}, 67 | ) 68 | 69 | with TaskGroup( 70 | group_id="quality_checks", default_args={"conn_id": SNOWFLAKE_CONN_ID} 71 | ) as quality_check_group: 72 | """ 73 | #### Column-level data quality check 74 | Run data quality checks on columns of the audit table 75 | """ 76 | column_checks = SQLColumnCheckOperator( 77 | task_id="column_checks", 78 | table=SNOWFLAKE_AUDIT_TABLE, 79 | column_mapping={"ID": {"null_check": {"equal_to": 0}}}, 80 | ) 81 | 82 | """ 83 | #### Table-level data quality check 84 | Run data quality checks on the audit table 85 | """ 86 | table_checks = SQLTableCheckOperator( 87 | task_id="table_checks", 88 | table=SNOWFLAKE_AUDIT_TABLE, 89 | checks={"row_count_check": {"check_statement": "COUNT(*) = 9"}}, 90 | ) 91 | 92 | with open( 93 | f"{table_schema_path}/forestfire_schema.json", 94 | "r", 95 | ) as f: 96 | table_schema = json.load(f).get("forestfire") 97 | table_props = table_schema.get("properties") 98 | table_dimensions = table_schema.get("dimensions") 99 | table_metrics = table_schema.get("metrics") 100 | 101 | col_string = snowflake_load_column_string(table_props) 102 | 103 | """ 104 | #### Snowflake audit to production task 105 | Loads the data from the audit table to the production table 106 | """ 107 | copy_snowflake_audit_to_production_table = SnowflakeOperator( 108 | task_id="copy_snowflake_audit_to_production_table", 109 | sql="copy_forestfire_snowflake_audit.sql", 110 | params={ 111 | "table_name": SNOWFLAKE_FORESTFIRE_TABLE, 112 | "audit_table_name": f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT", 113 | "table_schema": table_props, 114 | "col_string": col_string, 115 | }, 116 | trigger_rule="all_success", 117 | ) 118 | 119 | """ 120 | #### Delete audit table 121 | Clean up the table created for the example. 122 | """ 123 | delete_audit_table = SnowflakeOperator( 124 | task_id="delete_audit_table", 125 | sql="delete_forestfire_table.sql", 126 | params={"table_name": f"{SNOWFLAKE_FORESTFIRE_TABLE}_AUDIT"}, 127 | trigger_rule="all_success", 128 | ) 129 | 130 | begin = EmptyOperator(task_id="begin") 131 | end = EmptyOperator(task_id="end") 132 | 133 | chain( 134 | begin, 135 | [create_forestfire_production_table, create_forestfire_audit_table], 136 | load_data, 137 | quality_check_group, 138 | copy_snowflake_audit_to_production_table, 139 | delete_audit_table, 140 | end, 141 | ) 142 | -------------------------------------------------------------------------------- /dags/sql_examples/sql_check.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### SQL Check Operators Data Quality Example 3 | 4 | "A sample Airflow DAG to perform data quality checks using SQL Operators. 5 | 6 | Before running the DAG, ensure you have an active and reachable SQL database 7 | running, with a connection to that database in an Airflow Connection, and 8 | the data loaded. This DAG **will not** run successfully as-is. For an 9 | out-of-the-box working demo, see the sql_data_quality_redshift_etl DAG. 10 | 11 | Note: The data files for this example do **not** include an `upload_date` 12 | column. This column is needed for the interval check, and is added as a 13 | Task in sql_check_redshift_etl.py. 
14 | """ 15 | 16 | from airflow import DAG 17 | from airflow.models.baseoperator import chain 18 | from airflow.operators.dummy_operator import DummyOperator 19 | from airflow.operators.sql import (SQLCheckOperator, SQLIntervalCheckOperator, 20 | SQLThresholdCheckOperator, 21 | SQLValueCheckOperator) 22 | from airflow.utils.dates import datetime 23 | from airflow.utils.task_group import TaskGroup 24 | 25 | # This table variable is a placeholder, in a live environment, it is better 26 | # to pull the table info from a Variable in a template 27 | TABLE = "yellow_tripdata" 28 | DATES = ["2019-01", "2019-02"] 29 | 30 | # By putting conn_id as a default_arg, the arg is passed to every task, 31 | # reducing boilerplate 32 | with DAG( 33 | "sql_data_quality", 34 | start_date=datetime(2021, 7, 7), 35 | doc_md=__doc__, 36 | schedule_interval=None, 37 | default_args={"conn_id": "postgres_default"}, 38 | template_searchpath="/usr/local/airflow/include/sql/sql_examples/", 39 | catchup=False, 40 | ) as dag: 41 | 42 | begin = DummyOperator(task_id="begin") 43 | end = DummyOperator(task_id="end") 44 | 45 | """ 46 | #### Run Table-Level Quality Check 47 | Ensure that the correct number of rows are present in the table. 48 | """ 49 | value_check = SQLValueCheckOperator( 50 | task_id="check_row_count", 51 | sql=f"SELECT COUNT(*) FROM {TABLE};", 52 | pass_value=20000, 53 | ) 54 | 55 | """ 56 | #### Run Interval Check 57 | Check that the average trip distance today is within a desirable threshold 58 | compared to the average trip distance yesterday. 59 | """ 60 | interval_check = SQLIntervalCheckOperator( 61 | task_id="check_interval_data", 62 | table=TABLE, 63 | days_back=-1, 64 | date_filter_column="upload_date", 65 | metrics_thresholds={"AVG(trip_distance)": 1.5}, 66 | ) 67 | 68 | """ 69 | #### Threshold Check 70 | Similar to the threshold cases in the Row-Level Check above, ensures that 71 | certain row(s) values meet the desired threshold(s). 72 | """ 73 | threshold_check = SQLThresholdCheckOperator( 74 | task_id="check_threshold", 75 | sql=f"SELECT MAX(passenger_count) FROM {TABLE};", 76 | min_threshold=1, 77 | max_threshold=8, 78 | ) 79 | 80 | """ 81 | #### Run Row-Level Quality Checks 82 | For each date of data, run checks on 10 rows to ensure basic data quality 83 | cases (found in the .sql file) pass. 84 | """ 85 | with TaskGroup(group_id="row_quality_checks") as quality_check_group: 86 | # Create 10 tasks, to spot-check 10 random rows 87 | for i in range(0, 10): 88 | """ 89 | #### Run Row-Level Quality Checks 90 | Runs a series of checks on different columns of data for a single, 91 | randomly chosen row. This acts as a spot-check on data. 92 | """ 93 | SQLCheckOperator( 94 | task_id=f"yellow_tripdata_row_quality_check_{i}", 95 | sql="row_quality_yellow_tripdata_check.sql", 96 | ) 97 | 98 | chain( 99 | begin, 100 | [quality_check_group, value_check, interval_check, threshold_check], 101 | end, 102 | ) 103 | -------------------------------------------------------------------------------- /dags/sql_examples/sql_check_redshift_etl.py: -------------------------------------------------------------------------------- 1 | """ 2 | ### SQL Check Operators Data Quality ETL Example 3 | 4 | Use the SQLCheckOperators to perform data quality checks in ETL use cases. 
5 | 6 | Before running the DAG, set the following in an Airflow or Environment Variable: 7 | - key: aws_configs 8 | - value: { "s3_bucket": [bucket_name], "s3_key_prefix": [key_prefix], "redshift_table": [table_name]} 9 | Fully replacing [bucket_name], [key_prefix], and [table_name]. 10 | 11 | See the README for information on how to set up your Redshift connection. 12 | This DAG can be used with other databases as long as the Redshift (and possibly 13 | transfer operators) are changed. 14 | """ 15 | 16 | import pandas as pd 17 | from airflow import DAG 18 | from airflow.decorators import task 19 | from airflow.models.baseoperator import chain 20 | from airflow.operators.dummy_operator import DummyOperator 21 | from airflow.operators.sql import (SQLCheckOperator, SQLIntervalCheckOperator, 22 | SQLThresholdCheckOperator, 23 | SQLValueCheckOperator) 24 | from airflow.providers.amazon.aws.transfers.local_to_s3 import \ 25 | LocalFilesystemToS3Operator 26 | from airflow.providers.amazon.aws.transfers.s3_to_redshift import \ 27 | S3ToRedshiftOperator 28 | from airflow.providers.postgres.operators.postgres import PostgresOperator 29 | from airflow.utils.dates import datetime 30 | from airflow.utils.task_group import TaskGroup 31 | 32 | DATES = ["2019-01", "2019-02"] 33 | TASK_DICT = {} 34 | 35 | with DAG( 36 | "sql_data_quality_redshift_etl", 37 | start_date=datetime(2021, 7, 7), 38 | doc_md=__doc__, 39 | schedule_interval=None, 40 | default_args={"conn_id": "redshift_default"}, 41 | template_searchpath="/usr/local/airflow/include/sql/sql_examples/", 42 | catchup=False, 43 | ) as dag: 44 | 45 | """ 46 | #### Dummy operators 47 | Help label start and end of dag. Converges exist because lists of tasks 48 | cannot set another list as downstream. 49 | """ 50 | begin = DummyOperator(task_id="begin") 51 | end = DummyOperator(task_id="end") 52 | converge_1 = DummyOperator(task_id="converge_1") 53 | converge_2 = DummyOperator(task_id="converge_2") 54 | 55 | """ 56 | #### Create Redshift Table 57 | For demo purposes, create a Redshift table to store the forest fire data to. 58 | The database is not automatically destroyed at the end of the example; ensure 59 | this is done manually to avoid unnecessary costs. Additionally, set-up may 60 | need to be done in Airflow connections to allow access to Redshift. 61 | """ 62 | create_redshift_table = PostgresOperator( 63 | task_id="create_table", 64 | sql="create_redshift_yellow_tripdata_table.sql", 65 | postgres_conn_id="redshift_default", 66 | ) 67 | 68 | with TaskGroup(group_id="row_quality_checks") as quality_check_group: 69 | # Create 10 tasks, to spot-check 10 random rows 70 | for i in range(0, 10): 71 | """ 72 | #### Run Row-Level Quality Checks 73 | Runs a series of checks on different columns of data for a single, 74 | randomly chosen row. This acts as a spot-check on data. Note: When 75 | using the sample data, row level checks may fail. Which column(s) of 76 | the row that failed may be checked in the logs. To further diagnose 77 | the issue, run a modified query directly in Redshift's query editor 78 | to check individual values against calculations and expectations. 79 | """ 80 | SQLCheckOperator( 81 | task_id=f"yellow_tripdata_row_quality_check_{i}", 82 | sql="row_quality_yellow_tripdata_check.sql", 83 | ) 84 | 85 | """ 86 | #### Run Table-Level Quality Check 87 | Ensure that the correct number of rows are present in the table. 
88 | """ 89 | value_check = SQLValueCheckOperator( 90 | task_id="check_row_count", 91 | sql="SELECT COUNT(*) FROM {{ var.json.aws_configs.redshift_table }};", 92 | pass_value=20000, 93 | ) 94 | 95 | """ 96 | #### Run Interval Check 97 | Check that the average trip distance today is within a desirable threshold 98 | compared to the average trip distance yesterday. 99 | """ 100 | interval_check = SQLIntervalCheckOperator( 101 | task_id="check_interval_data", 102 | table="{{ var.json.aws_configs.redshift_table }}", 103 | days_back=-1, 104 | date_filter_column="upload_date", 105 | metrics_thresholds={"AVG(trip_distance)": 1.5}, 106 | ) 107 | 108 | """ 109 | #### Threshold Check 110 | Similar to the threshold cases in the Row-Level Check above, ensures that 111 | certain row(s) values meet the desired threshold(s). 112 | """ 113 | threshold_check = SQLThresholdCheckOperator( 114 | task_id="check_threshold", 115 | sql="SELECT MAX(passenger_count) FROM {{ var.json.aws_configs.redshift_table }};", 116 | min_threshold=1, 117 | max_threshold=8, 118 | ) 119 | 120 | """ 121 | #### Drop Redshift table 122 | Drops the Redshift table if it exists already. This is to make sure that the 123 | data in the success and failure cases do not interfere with each other during 124 | the data quality check. 125 | """ 126 | drop_redshift_table = PostgresOperator( 127 | task_id="drop_table", 128 | sql="drop_redshift_yellow_tripdata_table.sql", 129 | postgres_conn_id="redshift_default", 130 | ) 131 | 132 | @task 133 | def add_upload_date(file_path, upload_date): 134 | """ 135 | #### Transform Task 136 | In general, it is not recommended to perform transform operations in 137 | Airflow Tasks, as Airflow is designed to be an orchestrator, not a 138 | computation engine. However, the transform is done here as it is a 139 | relatively small operation, simply adding an upload_date column to the 140 | dataframe for use in the SQL data quality checks later. Doing the 141 | transform here also makes this example more easily extensible to the 142 | use of other backend datastores. 143 | """ 144 | trip_dict = pd.read_csv( 145 | file_path, 146 | header=0, 147 | parse_dates=["pickup_datetime"], 148 | infer_datetime_format=True, 149 | ) 150 | trip_dict["upload_date"] = upload_date 151 | trip_dict.to_csv(file_path, header=True, index=False) 152 | 153 | @task 154 | def delete_upload_date(file_path): 155 | """ 156 | #### Drop added column 157 | Drops the upload_date column used for this example, as this data is used 158 | by other example DAGs in this repository, so it should not interfere 159 | with those. 160 | """ 161 | trip_dict = pd.read_csv( 162 | file_path, 163 | header=0, 164 | parse_dates=["pickup_datetime"], 165 | infer_datetime_format=True, 166 | ) 167 | trip_dict.drop(columns="upload_date", inplace=True) 168 | trip_dict.to_csv(file_path, header=True, index=False) 169 | 170 | for i, date in enumerate(DATES): 171 | file_path = f"/usr/local/airflow/include/sample_data/yellow_trip_data/yellow_tripdata_sample_{date}.csv" 172 | 173 | TASK_DICT[f"add_upload_date_{date}"] = add_upload_date( 174 | file_path, "{{ macros.ds_add(ds, " + str(-i) + ") }}" 175 | ) 176 | 177 | """ 178 | #### Upload task 179 | Simply loads the file to a specified location in S3. 
180 | """ 181 | TASK_DICT[f"upload_to_s3_{date}"] = LocalFilesystemToS3Operator( 182 | task_id=f"upload_to_s3_{date}", 183 | filename=file_path, 184 | dest_key="{{ var.json.aws_configs.s3_key_prefix }}/" + file_path, 185 | dest_bucket="{{ var.json.aws_configs.s3_bucket }}", 186 | aws_conn_id="aws_default", 187 | replace=True, 188 | ) 189 | 190 | """ 191 | #### Redshift load task 192 | Loads the S3 data from the previous load to a Redshift table (specified 193 | in the Airflow Variables backend). 194 | """ 195 | TASK_DICT[f"load_to_redshift_{date}"] = S3ToRedshiftOperator( 196 | task_id=f"load_to_redshift_{date}", 197 | s3_bucket="{{ var.json.aws_configs.s3_bucket }}", 198 | s3_key="{{ var.json.aws_configs.s3_key_prefix }}/" + file_path, 199 | schema="PUBLIC", 200 | table="{{ var.json.aws_configs.redshift_table }}", 201 | copy_options=[ 202 | "csv", 203 | "ignoreheader 1", 204 | "TIMEFORMAT AS 'YYYY-MM-DD HH24:MI:SS'", 205 | ], 206 | ) 207 | 208 | TASK_DICT[f"delete_upload_date_{date}"] = delete_upload_date(file_path) 209 | 210 | chain( 211 | begin, 212 | [TASK_DICT[f"add_upload_date_{date}"]], 213 | converge_1, 214 | [TASK_DICT[f"upload_to_s3_{date}"]], 215 | create_redshift_table, 216 | [TASK_DICT[f"load_to_redshift_{date}"]], 217 | converge_2, 218 | [quality_check_group, value_check, interval_check, threshold_check], 219 | drop_redshift_table, 220 | [TASK_DICT[f"delete_upload_date_{date}"]], 221 | end, 222 | ) 223 | -------------------------------------------------------------------------------- /include/forestfire_checks/checks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Table Level Checks 3 | """ 4 | TABLE_CHECKS = [ 5 | {"row_count_check": {"check_statement": "COUNT(*) = 9"}}, 6 | {"dmc_less_than_twice_dc_check": {"check_statement": "2 * dmc < dc"}} 7 | # could be cool to check the table schema against known columns, as well 8 | ] 9 | 10 | """ 11 | Column Level Checks 12 | """ 13 | COL_CHECKS = [ 14 | {"id": { 15 | "null_check": {"equal_to": 0}, 16 | "distinct_check": {"equal_to": 9} 17 | }}, 18 | {"ffmc": { 19 | "min": {"geq_to": 50}, 20 | "max": {"less_than": 100} 21 | }}, 22 | ] 23 | -------------------------------------------------------------------------------- /include/gcs_xcom_backend.py: -------------------------------------------------------------------------------- 1 | ## Source https://medium.com/apache-airflow/airflow-2-0-dag-authoring-redesigned-651edc397178 2 | 3 | from typing import Any 4 | from airflow.models.xcom import BaseXCom 5 | from airflow.providers.google.cloud.hooks.gcs import GCSHook 6 | 7 | import pandas as pd 8 | import uuid 9 | 10 | 11 | class GCSXComBackend(BaseXCom): 12 | PREFIX = "xcom_gcs://" 13 | BUCKET_NAME = "xcom_gcs" 14 | 15 | @staticmethod 16 | def serialize_value(value: Any): 17 | if isinstance(value, pd.DataFrame): 18 | hook = GCSHook() 19 | object_name = "data_" + str(uuid.uuid4()) 20 | with hook.provide_file_and_upload( 21 | bucket_name=GCSXComBackend.BUCKET_NAME, 22 | object_name=object_name, 23 | ) as f: 24 | value.to_csv(f.name, index=False) 25 | # Append prefix to persist information that the file 26 | # has to be downloaded from GCS 27 | value = GCSXComBackend.PREFIX + object_name 28 | return BaseXCom.serialize_value(value) 29 | 30 | @staticmethod 31 | def deserialize_value(result) -> Any: 32 | result = BaseXCom.deserialize_value(result) 33 | if isinstance(result, str) and result.startswith(GCSXComBackend.PREFIX): 34 | object_name = result.replace(GCSXComBackend.PREFIX, "") 35 | with 
GCSHook().provide_file( 36 | bucket_name=GCSXComBackend.BUCKET_NAME, 37 | object_name=object_name, 38 | ) as f: 39 | f.flush() 40 | result = pd.read_csv(f.name) 41 | return result 42 | -------------------------------------------------------------------------------- /include/great_expectations/.gitignore: -------------------------------------------------------------------------------- 1 | uncommitted/ -------------------------------------------------------------------------------- /include/great_expectations/checkpoints/mlflow/feature_chk.yml: -------------------------------------------------------------------------------- 1 | name: mlflow.feature_chk 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: mlflow.census_adult_income_features 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: my_mlflow_datasource 25 | data_connector_name: default_inferred_data_connector_name 26 | data_asset_name: mlflow_dataframe 27 | data_connector_query: 28 | profilers: [] 29 | ge_cloud_id: 30 | expectation_suite_ge_cloud_id: 31 | -------------------------------------------------------------------------------- /include/great_expectations/checkpoints/mlflow/preprocess_chk.yml: -------------------------------------------------------------------------------- 1 | name: mlflow.preprocess_chk 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: mlflow.census_adult_income_preprocess 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: my_mlflow_datasource 25 | data_connector_name: default_inferred_data_connector_name 26 | data_asset_name: mlflow_dataframe 27 | data_connector_query: 28 | profilers: [] 29 | ge_cloud_id: 30 | expectation_suite_ge_cloud_id: 31 | -------------------------------------------------------------------------------- /include/great_expectations/checkpoints/taxi/fail/chk.yml: -------------------------------------------------------------------------------- 1 | name: taxi.fail.chk 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: taxi.demo 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | 
site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: my_datasource 25 | data_connector_name: default_inferred_data_connector_name 26 | data_asset_name: yellow_tripdata_sample_2019-02.csv 27 | data_connector_query: 28 | index: -1 29 | profilers: [] 30 | ge_cloud_id: 31 | expectation_suite_ge_cloud_id: 32 | -------------------------------------------------------------------------------- /include/great_expectations/checkpoints/taxi/pass/chk.yml: -------------------------------------------------------------------------------- 1 | name: taxi.pass.chk 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: taxi.demo 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: my_datasource 25 | data_connector_name: default_inferred_data_connector_name 26 | data_asset_name: yellow_tripdata_sample_2019-01.csv 27 | data_connector_query: 28 | index: -1 29 | profilers: [] 30 | ge_cloud_id: 31 | expectation_suite_ge_cloud_id: 32 | -------------------------------------------------------------------------------- /include/great_expectations/configs/bigquery_configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pathlib import Path 4 | from great_expectations.core.batch import BatchRequest 5 | from great_expectations.data_context.types.base import ( 6 | DataContextConfig, 7 | CheckpointConfig 8 | ) 9 | 10 | base_path = Path(__file__).parents[3] 11 | data_dir = os.path.join(base_path, "include", "data") 12 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 13 | 14 | bigquery_data_context_config = DataContextConfig( 15 | **{ 16 | "config_version": 3.0, 17 | "datasources": { 18 | "my_bigquery_datasource": { 19 | "data_connectors": { 20 | "default_inferred_data_connector_name": { 21 | "default_regex": { 22 | "group_names": ["data_asset_name"], 23 | "pattern": "(.*)", 24 | }, 25 | "base_directory": data_dir, 26 | "class_name": "InferredAssetFilesystemDataConnector", 27 | }, 28 | "default_runtime_data_connector_name": { 29 | "batch_identifiers": ["default_identifier_name"], 30 | "class_name": "RuntimeDataConnector", 31 | }, 32 | }, 33 | "execution_engine": { 34 | "class_name": "PandasExecutionEngine", 35 | }, 36 | "class_name": "Datasource", 37 | } 38 | }, 39 | "config_variables_file_path": os.path.join( 40 | ge_root_dir, "uncommitted", "config_variables.yml" 41 | ), 42 | "stores": { 43 | "expectations_store": { 44 | "class_name": "ExpectationsStore", 45 | "store_backend": { 46 | "class_name": "TupleFilesystemStoreBackend", 47 | "base_directory": os.path.join(ge_root_dir, "expectations"), 48 | }, 49 | }, 50 | "validations_store": { 51 | "class_name": "ValidationsStore", 52 | "store_backend": { 53 | "class_name": "TupleFilesystemStoreBackend", 54 | "base_directory": os.path.join( 55 | ge_root_dir, "uncommitted", "validations" 56 | ), 57 | }, 58 | }, 59 | "evaluation_parameter_store": {"class_name": 
"EvaluationParameterStore"}, 60 | "checkpoint_store": { 61 | "class_name": "CheckpointStore", 62 | "store_backend": { 63 | "class_name": "TupleFilesystemStoreBackend", 64 | "suppress_store_backend_id": True, 65 | "base_directory": os.path.join(ge_root_dir, "checkpoints"), 66 | }, 67 | }, 68 | }, 69 | "expectations_store_name": "expectations_store", 70 | "validations_store_name": "validations_store", 71 | "evaluation_parameter_store_name": "evaluation_parameter_store", 72 | "checkpoint_store_name": "checkpoint_store", 73 | "data_docs_sites": { 74 | "local_site": { 75 | "class_name": "SiteBuilder", 76 | "show_how_to_buttons": True, 77 | "store_backend": { 78 | "class_name": "TupleFilesystemStoreBackend", 79 | "base_directory": os.path.join( 80 | ge_root_dir, "uncommitted", "data_docs", "local_site" 81 | ), 82 | }, 83 | "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"}, 84 | } 85 | }, 86 | "anonymous_usage_statistics": { 87 | "data_context_id": "abcdabcd-1111-2222-3333-abcdabcdabcd", 88 | "enabled": False, 89 | }, 90 | "notebooks": None, 91 | "concurrency": {"enabled": False}, 92 | } 93 | ) 94 | 95 | bigquery_checkpoint_config = CheckpointConfig( 96 | **{ 97 | "name": "taxi.pass.chk", 98 | "config_version": 1.0, 99 | "template_name": None, 100 | "class_name": "Checkpoint", 101 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 102 | "expectation_suite_name": "taxi.demo", 103 | "batch_request": None, 104 | "action_list": [ 105 | { 106 | "name": "store_validation_result", 107 | "action": {"class_name": "StoreValidationResultAction"}, 108 | }, 109 | { 110 | "name": "store_evaluation_params", 111 | "action": {"class_name": "StoreEvaluationParametersAction"}, 112 | }, 113 | { 114 | "name": "update_data_docs", 115 | "action": {"class_name": "UpdateDataDocsAction", "site_names": []}, 116 | }, 117 | ], 118 | "evaluation_parameters": {}, 119 | "runtime_configuration": {}, 120 | "validations": [ 121 | { 122 | "batch_request": { 123 | "datasource_name": "my_bigquery_datasource", 124 | "data_connector_name": "default_inferred_data_connector_name", 125 | "data_asset_name": "taxi", 126 | "batch_spec_passthrough": { 127 | "bigquery_temp_table": "taxi_temp" 128 | }, 129 | }, 130 | } 131 | ], 132 | "profilers": [], 133 | "ge_cloud_id": None, 134 | "expectation_suite_ge_cloud_id": None, 135 | } 136 | ) 137 | 138 | bigquery_batch_request = BatchRequest( 139 | **{ 140 | "datasource_name": "my_bigquery_datasource", 141 | "data_connector_name": "default_inferred_data_connector_name", 142 | "data_asset_name": "great_expectations_bigquery_example.taxi", 143 | "data_connector_query": {"index": -1}, 144 | } 145 | ) 146 | -------------------------------------------------------------------------------- /include/great_expectations/configs/mlflow_checkpoint_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pathlib import Path 4 | 5 | from great_expectations.data_context.types.base import ( 6 | CheckpointConfig, 7 | ) 8 | 9 | base_path = Path(__file__).parents[3] 10 | data_dir = os.path.join(base_path, "include", "data") 11 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 12 | 13 | mlflow_preprocess_checkpoint_config = CheckpointConfig( 14 | **{ 15 | "name": "mlflow.preprocess_chk", 16 | "config_version": 1.0, 17 | "template_name": None, 18 | "class_name": "Checkpoint", 19 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 20 | "expectation_suite_name": "mlflow.census_adult_income_preprocess", 21 | 
"batch_request": None, 22 | "action_list": [ 23 | { 24 | "name": "store_validation_result", 25 | "action": {"class_name": "StoreValidationResultAction"}, 26 | }, 27 | { 28 | "name": "store_evaluation_params", 29 | "action": {"class_name": "StoreEvaluationParametersAction"}, 30 | }, 31 | { 32 | "name": "update_data_docs", 33 | "action": {"class_name": "UpdateDataDocsAction"}, 34 | }, 35 | ], 36 | } 37 | ) 38 | 39 | mlflow_feature_checkpoint_config = CheckpointConfig( 40 | **{ 41 | "name": "mlflow.feature_chk", 42 | "config_version": 1.0, 43 | "template_name": None, 44 | "class_name": "Checkpoint", 45 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 46 | "expectation_suite_name": "mlflow.census_adult_income_features", 47 | "batch_request": None, 48 | "action_list": [ 49 | { 50 | "name": "store_validation_result", 51 | "action": {"class_name": "StoreValidationResultAction"}, 52 | }, 53 | { 54 | "name": "store_evaluation_params", 55 | "action": {"class_name": "StoreEvaluationParametersAction"}, 56 | }, 57 | { 58 | "name": "update_data_docs", 59 | "action": {"class_name": "UpdateDataDocsAction"}, 60 | }, 61 | ], 62 | } 63 | ) 64 | -------------------------------------------------------------------------------- /include/great_expectations/configs/redshift_configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pathlib import Path 4 | from great_expectations.core.batch import BatchRequest 5 | from great_expectations.data_context.types.base import ( 6 | DataContextConfig, 7 | CheckpointConfig 8 | ) 9 | 10 | base_path = Path(__file__).parents[3] 11 | data_dir = os.path.join(base_path, "include", "data") 12 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 13 | 14 | redshift_data_context_config = DataContextConfig( 15 | **{ 16 | "config_version": 3.0, 17 | "datasources": { 18 | "my_redshift_datasource": { 19 | "module_name": "great_expectations.datasource", 20 | "data_connectors": { 21 | "default_inferred_data_connector_name": { 22 | "default_regex": { 23 | "group_names": ["data_asset_name"], 24 | "pattern": "(.*)", 25 | }, 26 | "base_directory": data_dir, 27 | "module_name": "great_expectations.datasource.data_connector", 28 | "class_name": "InferredAssetFilesystemDataConnector", 29 | }, 30 | "default_runtime_data_connector_name": { 31 | "batch_identifiers": ["default_identifier_name"], 32 | "module_name": "great_expectations.datasource.data_connector", 33 | "class_name": "RuntimeDataConnector", 34 | }, 35 | }, 36 | "execution_engine": { 37 | "module_name": "great_expectations.execution_engine", 38 | "class_name": "PandasExecutionEngine", 39 | }, 40 | "class_name": "Datasource", 41 | } 42 | }, 43 | "config_variables_file_path": os.path.join( 44 | ge_root_dir, "uncommitted", "config_variables.yml" 45 | ), 46 | "stores": { 47 | "expectations_store": { 48 | "class_name": "ExpectationsStore", 49 | "store_backend": { 50 | "class_name": "TupleFilesystemStoreBackend", 51 | "base_directory": os.path.join(ge_root_dir, "expectations"), 52 | }, 53 | }, 54 | "validations_store": { 55 | "class_name": "ValidationsStore", 56 | "store_backend": { 57 | "class_name": "TupleFilesystemStoreBackend", 58 | "base_directory": os.path.join( 59 | ge_root_dir, "uncommitted", "validations" 60 | ), 61 | }, 62 | }, 63 | "evaluation_parameter_store": {"class_name": "EvaluationParameterStore"}, 64 | "checkpoint_store": { 65 | "class_name": "CheckpointStore", 66 | "store_backend": { 67 | "class_name": "TupleFilesystemStoreBackend", 
68 | "suppress_store_backend_id": True, 69 | "base_directory": os.path.join(ge_root_dir, "checkpoints"), 70 | }, 71 | }, 72 | }, 73 | "expectations_store_name": "expectations_store", 74 | "validations_store_name": "validations_store", 75 | "evaluation_parameter_store_name": "evaluation_parameter_store", 76 | "checkpoint_store_name": "checkpoint_store", 77 | "data_docs_sites": { 78 | "local_site": { 79 | "class_name": "SiteBuilder", 80 | "show_how_to_buttons": True, 81 | "store_backend": { 82 | "class_name": "TupleFilesystemStoreBackend", 83 | "base_directory": os.path.join( 84 | ge_root_dir, "uncommitted", "data_docs", "local_site" 85 | ), 86 | }, 87 | "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"}, 88 | } 89 | }, 90 | "anonymous_usage_statistics": { 91 | "data_context_id": "abcdabcd-1111-2222-3333-abcdabcdabcd", 92 | "enabled": False, 93 | }, 94 | "notebooks": None, 95 | "concurrency": {"enabled": False}, 96 | } 97 | ) 98 | 99 | redshift_checkpoint_config = CheckpointConfig( 100 | **{ 101 | "name": "taxi.pass.chk", 102 | "config_version": 1.0, 103 | "template_name": None, 104 | "class_name": "Checkpoint", 105 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 106 | "expectation_suite_name": "taxi.demo", 107 | "batch_request": None, 108 | "action_list": [ 109 | { 110 | "name": "store_validation_result", 111 | "action": {"class_name": "StoreValidationResultAction"}, 112 | }, 113 | { 114 | "name": "store_evaluation_params", 115 | "action": {"class_name": "StoreEvaluationParametersAction"}, 116 | }, 117 | { 118 | "name": "update_data_docs", 119 | "action": {"class_name": "UpdateDataDocsAction", "site_names": []}, 120 | }, 121 | ], 122 | "evaluation_parameters": {}, 123 | "runtime_configuration": {}, 124 | "validations": [ 125 | { 126 | "batch_request": { 127 | "datasource_name": "my_redshift_datasource", 128 | "data_connector_name": "default_inferred_data_connector_name", 129 | "data_asset_name": "public.yellow_tripdata", 130 | }, 131 | } 132 | ], 133 | "profilers": [], 134 | "ge_cloud_id": None, 135 | "expectation_suite_ge_cloud_id": None, 136 | } 137 | ) 138 | 139 | redshift_batch_request = BatchRequest( 140 | **{ 141 | "datasource_name": "my_redshift_db", 142 | "data_connector_name": "default_inferred_data_connector_name", 143 | "data_asset_name": "yellow_tripdata_sample_2019-01.csv", 144 | "data_connector_query": {"index": -1}, 145 | } 146 | ) 147 | -------------------------------------------------------------------------------- /include/great_expectations/configs/s3_configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pathlib import Path 4 | from great_expectations.core.batch import BatchRequest 5 | from great_expectations.data_context.types.base import ( 6 | DataContextConfig, 7 | CheckpointConfig 8 | ) 9 | 10 | base_path = Path(__file__).parents[3] 11 | data_dir = os.path.join(base_path, "include", "data") 12 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 13 | 14 | s3_data_context_config = DataContextConfig( 15 | **{ 16 | "config_version": 3.0, 17 | "datasources": { 18 | "my_s3_config": { 19 | "module_name": "great_expectations.datasource", 20 | "data_connectors": { 21 | "default_inferred_data_connector_name": { 22 | "default_regex": { 23 | "group_names": ["yellow_tripdata", "date"], 24 | "pattern": "(yellow_tripdata_sample)_(\d{4}-\d{2})\.csv", 25 | }, 26 | "base_directory": "benji-dq-test/test/tripdata/", 27 | "module_name": 
"great_expectations.datasource.data_connector", 28 | "class_name": "InferredAssetS3DataConnector", 29 | }, 30 | "default_runtime_data_connector_name": { 31 | "batch_identifiers": ["default_identifier_name"], 32 | "module_name": "great_expectations.datasource.data_connector", 33 | "class_name": "RuntimeDataConnector", 34 | }, 35 | }, 36 | "execution_engine": { 37 | "module_name": "great_expectations.execution_engine", 38 | "class_name": "SqlAlchemyExecutionEngine" 39 | }, 40 | "class_name": "Datasource", 41 | } 42 | }, 43 | "config_variables_file_path": os.path.join( 44 | ge_root_dir, "uncommitted", "config_variables.yml" 45 | ), 46 | "stores": { 47 | "expectations_store": { 48 | "class_name": "ExpectationsStore", 49 | "store_backend": { 50 | "class_name": "TupleFilesystemStoreBackend", 51 | "base_directory": os.path.join(ge_root_dir, "expectations"), 52 | }, 53 | }, 54 | "validations_store": { 55 | "class_name": "ValidationsStore", 56 | "store_backend": { 57 | "class_name": "TupleFilesystemStoreBackend", 58 | "base_directory": os.path.join( 59 | ge_root_dir, "uncommitted", "validations" 60 | ), 61 | }, 62 | }, 63 | "evaluation_parameter_store": {"class_name": "EvaluationParameterStore"}, 64 | "checkpoint_store": { 65 | "class_name": "CheckpointStore", 66 | "store_backend": { 67 | "class_name": "TupleFilesystemStoreBackend", 68 | "suppress_store_backend_id": True, 69 | "base_directory": os.path.join(ge_root_dir, "checkpoints"), 70 | }, 71 | }, 72 | }, 73 | "expectations_store_name": "expectations_store", 74 | "validations_store_name": "validations_store", 75 | "evaluation_parameter_store_name": "evaluation_parameter_store", 76 | "checkpoint_store_name": "checkpoint_store", 77 | "data_docs_sites": { 78 | "local_site": { 79 | "class_name": "SiteBuilder", 80 | "show_how_to_buttons": True, 81 | "store_backend": { 82 | "class_name": "TupleFilesystemStoreBackend", 83 | "base_directory": os.path.join( 84 | ge_root_dir, "uncommitted", "data_docs", "local_site" 85 | ), 86 | }, 87 | "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"}, 88 | } 89 | }, 90 | "anonymous_usage_statistics": { 91 | "data_context_id": "abcdabcd-1111-2222-3333-abcdabcdabcd", 92 | "enabled": False, 93 | }, 94 | "notebooks": None, 95 | "concurrency": {"enabled": False}, 96 | } 97 | ) 98 | 99 | snowflake_checkpoint_config = CheckpointConfig( 100 | **{ 101 | "name": "taxi.pass.chk", 102 | "config_version": 1.0, 103 | "template_name": None, 104 | "module_name": "great_expectations.checkpoint", 105 | "class_name": "Checkpoint", 106 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 107 | "expectation_suite_name": "taxi.demo", 108 | "batch_request": None, 109 | "action_list": [ 110 | { 111 | "name": "store_validation_result", 112 | "action": {"class_name": "StoreValidationResultAction"}, 113 | }, 114 | { 115 | "name": "store_evaluation_params", 116 | "action": {"class_name": "StoreEvaluationParametersAction"}, 117 | }, 118 | { 119 | "name": "update_data_docs", 120 | "action": {"class_name": "UpdateDataDocsAction", "site_names": []}, 121 | }, 122 | ], 123 | "evaluation_parameters": {}, 124 | "runtime_configuration": {}, 125 | "validations": [ 126 | { 127 | "batch_request": { 128 | "datasource_name": "my_snowflake_db", 129 | "data_connector_name": "default_inferred_data_connector_name", 130 | "data_asset_name": "YELLOW_TRIPDATA", 131 | "data_connector_query": {"index": -1}, 132 | }, 133 | } 134 | ], 135 | "profilers": [], 136 | "ge_cloud_id": None, 137 | "expectation_suite_ge_cloud_id": None, 138 | } 139 | ) 140 | 
141 | snowflake_batch_request = BatchRequest( 142 | **{ 143 | "datasource_name": "my_snowflake_db", 144 | "data_connector_name": "default_inferred_data_connector_name", 145 | "data_asset_name": "yellow_tripdata_sample_2019-01.csv", 146 | "data_connector_query": {"index": -1}, 147 | } 148 | ) 149 | -------------------------------------------------------------------------------- /include/great_expectations/configs/snowflake_configs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | from pathlib import Path 5 | from great_expectations.core.batch import BatchRequest 6 | from great_expectations.data_context.types.base import ( 7 | DataContextConfig, 8 | CheckpointConfig 9 | ) 10 | 11 | base_path = Path(__file__).parents[3] 12 | data_dir = os.path.join(base_path, "include", "data") 13 | ge_root_dir = os.path.join(base_path, "include", "great_expectations") 14 | connection_string = "" 15 | 16 | # Note: The user must first configure a `config_variable.yml` file for this to work 17 | # The file is not included with this repo. 18 | with open( 19 | f"{ge_root_dir}/uncommitted/config_variables.yml", 20 | "r", 21 | ) as f: 22 | connection_string = yaml.safe_load(f).get("my_snowflake_db") 23 | 24 | snowflake_data_context_config = DataContextConfig( 25 | **{ 26 | "config_version": 3.0, 27 | "datasources": {}, 28 | "config_variables_file_path": os.path.join( 29 | ge_root_dir, "uncommitted", "config_variables.yml" 30 | ), 31 | "stores": { 32 | "expectations_store": { 33 | "class_name": "ExpectationsStore", 34 | "store_backend": { 35 | "class_name": "TupleFilesystemStoreBackend", 36 | "base_directory": os.path.join(ge_root_dir, "expectations"), 37 | }, 38 | }, 39 | "validations_store": { 40 | "class_name": "ValidationsStore", 41 | "store_backend": { 42 | "class_name": "TupleFilesystemStoreBackend", 43 | "base_directory": os.path.join( 44 | ge_root_dir, "uncommitted", "validations" 45 | ), 46 | }, 47 | }, 48 | "evaluation_parameter_store": {"class_name": "EvaluationParameterStore"}, 49 | "checkpoint_store": { 50 | "class_name": "CheckpointStore", 51 | "store_backend": { 52 | "class_name": "TupleFilesystemStoreBackend", 53 | "suppress_store_backend_id": True, 54 | "base_directory": os.path.join(ge_root_dir, "checkpoints"), 55 | }, 56 | }, 57 | }, 58 | "expectations_store_name": "expectations_store", 59 | "validations_store_name": "validations_store", 60 | "evaluation_parameter_store_name": "evaluation_parameter_store", 61 | "checkpoint_store_name": "checkpoint_store", 62 | "data_docs_sites": { 63 | "local_site": { 64 | "class_name": "SiteBuilder", 65 | "show_how_to_buttons": True, 66 | "store_backend": { 67 | "class_name": "TupleFilesystemStoreBackend", 68 | "base_directory": os.path.join( 69 | ge_root_dir, "uncommitted", "data_docs", "local_site" 70 | ), 71 | }, 72 | "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"}, 73 | } 74 | }, 75 | "anonymous_usage_statistics": { 76 | "data_context_id": "abcdabcd-1111-2222-3333-abcdabcdabcd", 77 | "enabled": False, 78 | }, 79 | "notebooks": None, 80 | "concurrency": {"enabled": False}, 81 | } 82 | ) 83 | 84 | snowflake_checkpoint_config = CheckpointConfig( 85 | **{ 86 | "name": "taxi.pass.chk", 87 | "config_version": 1.0, 88 | "template_name": None, 89 | "module_name": "great_expectations.checkpoint", 90 | "class_name": "Checkpoint", 91 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 92 | "expectation_suite_name": "taxi.demo", 93 | "batch_request": None, 94 | 
"action_list": [ 95 | { 96 | "name": "store_validation_result", 97 | "action": {"class_name": "StoreValidationResultAction"}, 98 | }, 99 | { 100 | "name": "store_evaluation_params", 101 | "action": {"class_name": "StoreEvaluationParametersAction"}, 102 | }, 103 | { 104 | "name": "update_data_docs", 105 | "action": {"class_name": "UpdateDataDocsAction", "site_names": []}, 106 | }, 107 | ], 108 | "evaluation_parameters": {}, 109 | "runtime_configuration": {}, 110 | "validations": [ 111 | { 112 | "batch_request": { 113 | "datasource_name": "my_snowflake_datasource", 114 | "data_connector_name": "default_inferred_data_connector_name", 115 | "data_asset_name": "yellow_tripdata", 116 | }, 117 | } 118 | ], 119 | "profilers": [], 120 | "ge_cloud_id": None, 121 | "expectation_suite_ge_cloud_id": None, 122 | } 123 | ) 124 | 125 | snowflake_audit_checkpoint_config = CheckpointConfig( 126 | **{ 127 | "name": "taxi.pass.chk", 128 | "config_version": 1.0, 129 | "template_name": None, 130 | "class_name": "Checkpoint", 131 | "run_name_template": "%Y%m%d-%H%M%S-my-run-name-template", 132 | "expectation_suite_name": "taxi.demo", 133 | "batch_request": None, 134 | "action_list": [ 135 | { 136 | "name": "store_validation_result", 137 | "action": {"class_name": "StoreValidationResultAction"}, 138 | }, 139 | { 140 | "name": "store_evaluation_params", 141 | "action": {"class_name": "StoreEvaluationParametersAction"}, 142 | }, 143 | { 144 | "name": "update_data_docs", 145 | "action": {"class_name": "UpdateDataDocsAction", "site_names": []}, 146 | }, 147 | ], 148 | "evaluation_parameters": {}, 149 | "runtime_configuration": {}, 150 | "validations": [ 151 | { 152 | "batch_request": { 153 | "datasource_name": "my_snowflake_datasource", 154 | "data_connector_name": "default_inferred_data_connector_name", 155 | "data_asset_name": "yellow_tripdata_audit", 156 | }, 157 | } 158 | ], 159 | "profilers": [], 160 | "ge_cloud_id": None, 161 | "expectation_suite_ge_cloud_id": None, 162 | } 163 | ) 164 | 165 | snowflake_batch_request = BatchRequest( 166 | **{ 167 | "datasource_name": "my_snowflake_db", 168 | "data_connector_name": "default_inferred_data_connector_name", 169 | "data_asset_name": "yellow_tripdata_sample_2019-01.csv", 170 | "data_connector_query": {"index": -1}, 171 | } 172 | ) 173 | -------------------------------------------------------------------------------- /include/great_expectations/expectations/.ge_store_backend_id: -------------------------------------------------------------------------------- 1 | store_backend_id = abcdabcd-1111-2222-3333-abcdabcdabcd -------------------------------------------------------------------------------- /include/great_expectations/expectations/mlflow/census_adult_income_features.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "census_adult_income", 4 | "expectations": [ 5 | { 6 | "expectation_type": "expect_table_columns_to_match_ordered_list", 7 | "kwargs": { 8 | "column_list": [ 9 | "capital_gain", 10 | "capital_loss", 11 | "hours_per_week", 12 | "workclass_Federal-gov", 13 | "workclass_Local-gov", 14 | "workclass_Never-worked", 15 | "workclass_Private", 16 | "workclass_Self-emp-inc", 17 | "workclass_Self-emp-not-inc", 18 | "workclass_State-gov", 19 | "workclass_Unknown", 20 | "workclass_Without-pay", 21 | "education_10th", 22 | "education_11th", 23 | "education_12th", 24 | "education_1st-4th", 25 | "education_5th-6th", 26 | "education_7th-8th", 27 | "education_9th", 28 
| "education_Assoc-acdm", 29 | "education_Assoc-voc", 30 | "education_Bachelors", 31 | "education_Doctorate", 32 | "education_HS-grad", 33 | "education_Masters", 34 | "education_Preschool", 35 | "education_Prof-school", 36 | "education_Some-college", 37 | "occupation_Adm-clerical", 38 | "occupation_Armed-Forces", 39 | "occupation_Craft-repair", 40 | "occupation_Exec-managerial", 41 | "occupation_Farming-fishing", 42 | "occupation_Handlers-cleaners", 43 | "occupation_Machine-op-inspct", 44 | "occupation_Other-service", 45 | "occupation_Priv-house-serv", 46 | "occupation_Prof-specialty", 47 | "occupation_Protective-serv", 48 | "occupation_Sales", 49 | "occupation_Tech-support", 50 | "occupation_Transport-moving", 51 | "occupation_Unknown", 52 | "race_Amer-Indian-Eskimo", 53 | "race_Asian-Pac-Islander", 54 | "race_Black", 55 | "race_Other", 56 | "race_White", 57 | "sex_Female", 58 | "sex_Male", 59 | "income_bracket_>50K", 60 | "native_country_Cambodia", 61 | "native_country_Canada", 62 | "native_country_China", 63 | "native_country_Columbia", 64 | "native_country_Cuba", 65 | "native_country_Dominican-Republic", 66 | "native_country_Ecuador", 67 | "native_country_El-Salvador", 68 | "native_country_England", 69 | "native_country_France", 70 | "native_country_Germany", 71 | "native_country_Greece", 72 | "native_country_Guatemala", 73 | "native_country_Haiti", 74 | "native_country_Holand-Netherlands", 75 | "native_country_Honduras", 76 | "native_country_Hong", 77 | "native_country_Hungary", 78 | "native_country_India", 79 | "native_country_Iran", 80 | "native_country_Ireland", 81 | "native_country_Italy", 82 | "native_country_Jamaica", 83 | "native_country_Japan", 84 | "native_country_Laos", 85 | "native_country_Mexico", 86 | "native_country_Nicaragua", 87 | "native_country_Outlying-US(Guam-USVI-etc)", 88 | "native_country_Peru", 89 | "native_country_Philippines", 90 | "native_country_Poland", 91 | "native_country_Portugal", 92 | "native_country_Puerto-Rico", 93 | "native_country_Scotland", 94 | "native_country_South", 95 | "native_country_Taiwan", 96 | "native_country_Thailand", 97 | "native_country_Trinadad&Tobago", 98 | "native_country_United-States", 99 | "native_country_Unknown", 100 | "native_country_Vietnam", 101 | "native_country_Yugoslavia", 102 | "age_bins", 103 | "never_married" 104 | ] 105 | }, 106 | "meta": {} 107 | }, 108 | { 109 | "expectation_type": "expect_column_values_to_be_of_type", 110 | "kwargs": { 111 | "column": "sex_Male", 112 | "type_": "int" 113 | }, 114 | "meta": { 115 | "notes": { 116 | "content": "", 117 | "format": "markdown" 118 | } 119 | } 120 | }, 121 | { 122 | "expectation_type": "expect_column_values_to_be_of_type", 123 | "kwargs": { 124 | "column": "sex_Female", 125 | "type_": "int" 126 | }, 127 | "meta": { 128 | "notes": { 129 | "content": "", 130 | "format": "markdown" 131 | } 132 | } 133 | }, 134 | { 135 | "expectation_type": "expect_column_values_to_be_in_set", 136 | "kwargs": { 137 | "column": "sex_Male", 138 | "value_set": [ 139 | 0, 140 | 1 141 | ] 142 | }, 143 | "meta": { 144 | "notes": { 145 | "content": "", 146 | "format": "markdown" 147 | } 148 | } 149 | }, 150 | { 151 | "expectation_type": "expect_column_values_to_be_in_set", 152 | "kwargs": { 153 | "column": "sex_Female", 154 | "value_set": [ 155 | 0, 156 | 1 157 | ] 158 | }, 159 | "meta": { 160 | "notes": { 161 | "content": "", 162 | "format": "markdown" 163 | } 164 | } 165 | } 166 | ], 167 | "ge_cloud_id": null, 168 | "meta": { 169 | "great_expectations_version": "0.13.49" 170 | } 171 | } 172 | 
-------------------------------------------------------------------------------- /include/great_expectations/expectations/mlflow/census_adult_income_preprocess.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "census_adult_income_preprocess", 4 | "expectations": [ 5 | { 6 | "expectation_type": "expect_table_columns_to_match_ordered_list", 7 | "kwargs": { 8 | "column_list": [ 9 | "age", 10 | "workclass", 11 | "education", 12 | "marital_status", 13 | "occupation", 14 | "race", 15 | "sex", 16 | "capital_gain", 17 | "capital_loss", 18 | "hours_per_week", 19 | "native_country", 20 | "income_bracket" 21 | ] 22 | }, 23 | "meta": {} 24 | }, 25 | { 26 | "expectation_type": "expect_column_values_to_not_be_null", 27 | "kwargs": { 28 | "column": "age", 29 | "mostly": 1.0 30 | }, 31 | "meta": { 32 | "notes": { 33 | "content": "Ensure age is not null", 34 | "format": "markdown" 35 | } 36 | } 37 | }, 38 | { 39 | "expectation_type": "expect_column_values_to_not_be_null", 40 | "kwargs": { 41 | "column": "workclass", 42 | "mostly": 1.0 43 | }, 44 | "meta": { 45 | "notes": { 46 | "content": "Ensure workclass is not null", 47 | "format": "markdown" 48 | } 49 | } 50 | }, 51 | { 52 | "expectation_type": "expect_column_values_to_be_of_type", 53 | "kwargs": { 54 | "column": "hours_per_week", 55 | "type_": "int" 56 | }, 57 | "meta": { 58 | "notes": { 59 | "content": "", 60 | "format": "markdown" 61 | } 62 | } 63 | }, 64 | { 65 | "expectation_type": "expect_column_values_to_be_in_set", 66 | "kwargs": { 67 | "column": "sex", 68 | "value_set": [ 69 | "Male", 70 | "Female" 71 | ] 72 | }, 73 | "meta": { 74 | "notes": { 75 | "content": "", 76 | "format": "markdown" 77 | } 78 | } 79 | } 80 | ], 81 | "ge_cloud_id": null, 82 | "meta": { 83 | "great_expectations_version": "0.13.49" 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /include/great_expectations/expectations/taxi/demo.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "taxi.demo", 4 | "expectations": [ 5 | { 6 | "expectation_context": { 7 | "description": null 8 | }, 9 | "expectation_type": "expect_table_row_count_to_be_between", 10 | "ge_cloud_id": null, 11 | "kwargs": { 12 | "max_value": 11000, 13 | "min_value": 1000 14 | }, 15 | "meta": {} 16 | }, 17 | { 18 | "expectation_context": { 19 | "description": null 20 | }, 21 | "expectation_type": "expect_column_values_to_not_be_null", 22 | "ge_cloud_id": null, 23 | "kwargs": { 24 | "column": "vendor_id" 25 | }, 26 | "meta": {} 27 | }, 28 | { 29 | "expectation_context": { 30 | "description": null 31 | }, 32 | "expectation_type": "expect_column_distinct_values_to_be_in_set", 33 | "ge_cloud_id": null, 34 | "kwargs": { 35 | "column": "vendor_id", 36 | "value_set": [ 37 | 1, 38 | 2, 39 | 4 40 | ] 41 | }, 42 | "meta": {} 43 | }, 44 | { 45 | "expectation_context": { 46 | "description": null 47 | }, 48 | "expectation_type": "expect_column_values_to_be_between", 49 | "ge_cloud_id": null, 50 | "kwargs": { 51 | "column": "passenger_count", 52 | "max_value": 6, 53 | "min_value": 1 54 | }, 55 | "meta": {} 56 | } 57 | ], 58 | "ge_cloud_id": null, 59 | "meta": { 60 | "great_expectations_version": "0.13.49" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /include/great_expectations/expectations/taxi/demo_fail.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "taxi.demo", 4 | "expectations": [ 5 | { 6 | "expectation_context": { 7 | "description": null 8 | }, 9 | "expectation_type": "expect_table_row_count_to_be_between", 10 | "ge_cloud_id": null, 11 | "kwargs": { 12 | "max_value": 11000, 13 | "min_value": 9000 14 | }, 15 | "meta": {} 16 | }, 17 | { 18 | "expectation_context": { 19 | "description": null 20 | }, 21 | "expectation_type": "expect_column_values_to_not_be_null", 22 | "ge_cloud_id": null, 23 | "kwargs": { 24 | "column": "vendor_id" 25 | }, 26 | "meta": {} 27 | }, 28 | { 29 | "expectation_context": { 30 | "description": null 31 | }, 32 | "expectation_type": "expect_column_distinct_values_to_be_in_set", 33 | "ge_cloud_id": null, 34 | "kwargs": { 35 | "column": "vendor_id", 36 | "value_set": [ 37 | 1, 38 | 2 39 | ] 40 | }, 41 | "meta": {} 42 | }, 43 | { 44 | "expectation_context": { 45 | "description": null 46 | }, 47 | "expectation_type": "expect_column_values_to_be_between", 48 | "ge_cloud_id": null, 49 | "kwargs": { 50 | "column": "passenger_count", 51 | "max_value": 6, 52 | "min_value": 1 53 | }, 54 | "meta": {} 55 | } 56 | ], 57 | "ge_cloud_id": null, 58 | "meta": { 59 | "citations": [ 60 | { 61 | "batch_request": { 62 | "data_asset_name": "yellow_tripdata_sample_2019-01.csv", 63 | "data_connector_name": "default_inferred_data_connector_name", 64 | "datasource_name": "my_datasource", 65 | "limit": 1000 66 | }, 67 | "citation_date": "2021-12-06T16:11:55.452248Z", 68 | "comment": "Created suite added via CLI" 69 | } 70 | ], 71 | "great_expectations_version": "0.13.45" 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /include/great_expectations/expectations/test_suite.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "test_suite", 4 | "expectations": [], 5 | "ge_cloud_id": null, 6 | "meta": { 7 | "great_expectations_version": "0.13.49" 8 | } 9 | } -------------------------------------------------------------------------------- /include/great_expectations/great_expectations.yml: -------------------------------------------------------------------------------- 1 | # Welcome to Great Expectations! Always know what to expect from your data. 2 | # 3 | # Here you can define datasources, batch kwargs generators, integrations and 4 | # more. This file is intended to be committed to your repo. For help with 5 | # configuration please: 6 | # - Read our docs: https://docs.greatexpectations.io/en/latest/reference/spare_parts/data_context_reference.html#configuration 7 | # - Join our slack channel: http://greatexpectations.io/slack 8 | 9 | # config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility 10 | # It is auto-generated and usually does not need to be changed. 11 | config_version: 3.0 12 | 13 | # Datasources tell Great Expectations where your data lives and how to get it. 14 | # You can use the CLI command `great_expectations datasource new` to help you 15 | # add a new datasource. 
Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/datasource.html 16 | 17 | # NOTE: Datasources in this repository are built on-the-fly by the GreatExpectationsOperator 18 | 19 | # The plugins_directory will be added to your python path for custom modules 20 | # used to override and extend Great Expectations. 21 | plugins_directory: plugins/ 22 | 23 | stores: 24 | # Stores are configurable places to store things like Expectations, Validations 25 | # Data Docs, and more. These are for advanced users only - most users can simply 26 | # leave this section alone. 27 | # 28 | # Three stores are required: expectations, validations, and 29 | # evaluation_parameters, and must exist with a valid store entry. Additional 30 | # stores can be configured for uses such as data_docs, etc. 31 | expectations_store: 32 | class_name: ExpectationsStore 33 | store_backend: 34 | class_name: TupleFilesystemStoreBackend 35 | base_directory: expectations/ 36 | 37 | validations_store: 38 | class_name: ValidationsStore 39 | store_backend: 40 | class_name: TupleFilesystemStoreBackend 41 | base_directory: uncommitted/validations/ 42 | 43 | evaluation_parameter_store: 44 | class_name: EvaluationParameterStore 45 | checkpoint_store: 46 | class_name: CheckpointStore 47 | store_backend: 48 | class_name: TupleFilesystemStoreBackend 49 | suppress_store_backend_id: true 50 | base_directory: checkpoints/ 51 | 52 | expectations_store_name: expectations_store 53 | validations_store_name: validations_store 54 | evaluation_parameter_store_name: evaluation_parameter_store 55 | checkpoint_store_name: checkpoint_store 56 | 57 | data_docs_sites: 58 | # Data Docs make it simple to visualize data quality in your project. These 59 | # include Expectations, Validations & Profiles. The are built for all 60 | # Datasources from JSON artifacts in the local repo including validations & 61 | # profiles from the uncommitted directory. 
Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/data_docs.html 62 | local_site: 63 | class_name: SiteBuilder 64 | show_how_to_buttons: true 65 | store_backend: 66 | class_name: TupleFilesystemStoreBackend 67 | base_directory: uncommitted/data_docs/local_site/ 68 | site_index_builder: 69 | class_name: DefaultSiteIndexBuilder 70 | 71 | anonymous_usage_statistics: 72 | data_context_id: abcdabcd-1111-2222-3333-abcdabcdabcd 73 | enabled: false 74 | notebooks: 75 | concurrency: 76 | enabled: false 77 | include_rendered_content: 78 | globally: false 79 | expectation_validation_result: false 80 | expectation_suite: false 81 | -------------------------------------------------------------------------------- /include/great_expectations/notebooks/pandas/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type PandasDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you already loaded the data into a Pandas Data Frame:\n", 113 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "\n", 116 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 117 | "batch.head()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## 4. Validate the batch with Validation Operators\n", 125 | "\n", 126 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 127 | "multiple expectation suites and the actions that should be taken after validation.\n", 128 | "\n", 129 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 130 | "\n", 131 | "* validating a group of batches that are logically related\n", 132 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 133 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 134 | "\n", 135 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 145 | "\n", 146 | "\"\"\"\n", 147 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 148 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 149 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 150 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 151 | "be None and run_time will default to the current UTC datetime.\n", 152 | "\"\"\"\n", 153 | "\n", 154 | "run_id = {\n", 155 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 156 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 157 | "}\n", 158 | "\n", 159 | "results = context.run_validation_operator(\n", 160 | " \"action_list_operator\",\n", 161 | " assets_to_validate=[batch],\n", 162 | " run_id=run_id)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## 5. View the Validation Results in Data Docs\n", 170 | "\n", 171 | "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 172 | "\n", 173 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "context.open_data_docs()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## Congratulations! You ran Validations!\n", 190 | "\n", 191 | "## Next steps:\n", 192 | "\n", 193 | "### 1. Read about the typical workflow with Great Expectations:\n", 194 | "\n", 195 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 196 | "\n", 197 | "### 2. Explore the documentation & community\n", 198 | "\n", 199 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.7.0" 227 | }, 228 | "pycharm": { 229 | "stem_cell": { 230 | "cell_type": "raw", 231 | "source": [], 232 | "metadata": { 233 | "collapsed": false 234 | } 235 | } 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 4 240 | } 241 | -------------------------------------------------------------------------------- /include/great_expectations/notebooks/spark/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SparkDFDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SparkDFDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true')\n", 112 | "\n", 113 | "# If you already loaded the data into a PySpark Data Frame:\n", 114 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 115 | "\n", 116 | "\n", 117 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 118 | "batch.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## 4. Validate the batch with Validation Operators\n", 126 | "\n", 127 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 128 | "multiple expectation suites and the actions that should be taken after validation.\n", 129 | "\n", 130 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 131 | "\n", 132 | "* validating a group of batches that are logically related\n", 133 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 134 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 135 | "\n", 136 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 146 | "\n", 147 | "\"\"\"\n", 148 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 149 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 150 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 151 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 152 | "be None and run_time will default to the current UTC datetime.\n", 153 | "\"\"\"\n", 154 | "\n", 155 | "run_id = {\n", 156 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 157 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 158 | "}\n", 159 | "\n", 160 | "results = context.run_validation_operator(\n", 161 | " \"action_list_operator\",\n", 162 | " assets_to_validate=[batch],\n", 163 | " run_id=run_id)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "## 5. View the Validation Results in Data Docs\n", 171 | "\n", 172 | "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 173 | "\n", 174 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "context.open_data_docs()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Congratulations! You ran Validations!\n", 191 | "\n", 192 | "## Next steps:\n", 193 | "\n", 194 | "### 1. Read about the typical workflow with Great Expectations:\n", 195 | "\n", 196 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 197 | "\n", 198 | "### 2. Explore the documentation & community\n", 199 | "\n", 200 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "Python 3", 214 | "language": "python", 215 | "name": "python3" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": { 219 | "name": "ipython", 220 | "version": 3 221 | }, 222 | "file_extension": ".py", 223 | "mimetype": "text/x-python", 224 | "name": "python", 225 | "nbconvert_exporter": "python", 226 | "pygments_lexer": "ipython3", 227 | "version": "3.7.0" 228 | }, 229 | "pycharm": { 230 | "stem_cell": { 231 | "cell_type": "raw", 232 | "source": [], 233 | "metadata": { 234 | "collapsed": false 235 | } 236 | } 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /include/great_expectations/notebooks/sql/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SqlAlchemyDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SqlAlchemyDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate an entire table or view in your database's default schema:\n", 110 | "batch_kwargs = {'table': \"YOUR_TABLE\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you would like to validate an entire table or view from a non-default schema in your database:\n", 113 | "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "# If you would like to validate the result set of a query:\n", 116 | "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 121 | "batch.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## 4. Validate the batch with Validation Operators\n", 129 | "\n", 130 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 131 | "multiple expectation suites and the actions that should be taken after validation.\n", 132 | "\n", 133 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 134 | "\n", 135 | "* validating a group of batches that are logically related\n", 136 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 137 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 138 | "\n", 139 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 149 | "\n", 150 | "\"\"\"\n", 151 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 152 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 153 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 154 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 155 | "be None and run_time will default to the current UTC datetime.\n", 156 | "\"\"\"\n", 157 | "\n", 158 | "run_id = {\n", 159 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 160 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 161 | "}\n", 162 | "\n", 163 | "results = context.run_validation_operator(\n", 164 | " \"action_list_operator\",\n", 165 | " assets_to_validate=[batch],\n", 166 | " run_id=run_id)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## 5. View the Validation Results in Data Docs\n", 174 | "\n", 175 | "Let's now build and look at your Data Docs. These will now include an **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 176 | "\n", 177 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "context.open_data_docs()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## Congratulations! You ran Validations!\n", 194 | "\n", 195 | "## Next steps:\n", 196 | "\n", 197 | "### 1. Read about the typical workflow with Great Expectations:\n", 198 | "\n", 199 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 200 | "\n", 201 | "### 2. Explore the documentation & community\n", 202 | "\n", 203 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [] 212 | } 213 | ], 214 | "metadata": { 215 | "kernelspec": { 216 | "display_name": "Python 3", 217 | "language": "python", 218 | "name": "python3" 219 | }, 220 | "language_info": { 221 | "codemirror_mode": { 222 | "name": "ipython", 223 | "version": 3 224 | }, 225 | "file_extension": ".py", 226 | "mimetype": "text/x-python", 227 | "name": "python", 228 | "nbconvert_exporter": "python", 229 | "pygments_lexer": "ipython3", 230 | "version": "3.7.0" 231 | }, 232 | "pycharm": { 233 | "stem_cell": { 234 | "cell_type": "raw", 235 | "source": [], 236 | "metadata": { 237 | "collapsed": false 238 | } 239 | } 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 4 244 | } 245 | -------------------------------------------------------------------------------- /include/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /include/grid_configs.py: -------------------------------------------------------------------------------- 1 | from numpy.random.mtrand import seed 2 | from sklearn.linear_model import LogisticRegression 3 | import lightgbm as lgb 4 | 5 | 6 | 7 | models = { 8 | 'lgbm': lgb.LGBMClassifier(objective='binary', metric=['auc', 'binary_logloss'], seed=55, boosting_type='gbdt'), 9 | 'log_reg': LogisticRegression(max_iter=500) 10 | } 11 | 12 | params = { 13 | 'lgbm':{ 14 | 'learning_rate': [0.01, .05, .1], 15 | 'n_estimators': [50, 100, 150], 16 | 'num_leaves': [31, 40, 80], 17 | 'max_depth': [16, 24, 31, 40] 18 | }, 19 | 'log_reg':{ 20 | 'penalty': ['l1','l2','elasticnet'], 21 | 'C': [0.001, 0.01, 0.1, 1, 10, 100], 22 | 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'] 23 | } 24 | } -------------------------------------------------------------------------------- /include/libs/schema_reg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/airflow-data-quality-demo/8847b1c9e749966a762ed5b9fa8d2075d4772352/include/libs/schema_reg/__init__.py -------------------------------------------------------------------------------- /include/libs/schema_reg/base_schema_transforms.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parses schema definitions in include/libs/table_schemas/snowflake//.json 3 | """ 4 | import json 5 | import logging 6 | import pandas as pd 7 | import numpy as np 8 | 9 | ### RELIES ON TEMPORARY SCHEMA DESIGN FOR 
ORDERING ##### 10 | # "id": { 11 | # "type": "varchar(25)", 12 | # "description": "source_order=1" 13 | # } 14 | def get_schema_source_col(key, prop): 15 | """ 16 | Temporary solution to map source columns to 17 | destination columns. 18 | Added due to some unexpected column renaming in houston 19 | and needing fixing immediately. 20 | """ 21 | if prop.get('default_source', None): 22 | return prop.get('default_source').split('=')[1] 23 | else: 24 | return key 25 | 26 | def get_schema_order(prop: dict,) -> str: 27 | """ 28 | Get 'description' field from schema definition file 29 | """ 30 | 31 | if prop.get('description', None): 32 | return int(prop.get('description').split('=')[1]) 33 | else: 34 | return -1 35 | 36 | def get_schema_type(prop: dict,) -> str: 37 | """ 38 | Get 'type' field from schema definition file 39 | """ 40 | 41 | return prop.get('type', None) 42 | 43 | 44 | def get_schema_raw_columns(table_props: dict) -> list: 45 | """ 46 | This can be used to add headers to a CSV or resolve 47 | columns that should only be included source data cleaning 48 | """ 49 | return [key for key, val in table_props.items() 50 | if val.get('description') is not None] 51 | 52 | 53 | def get_table_def_schema(TABLE_SCHEMA_DIR, transform_db, transform_schema): 54 | """ 55 | Fow now, we'll access local json files. This is meant to evolve 56 | """ 57 | with open(f'{TABLE_SCHEMA_DIR}/{transform_db}/{transform_schema}.json', 'r') as f: 58 | table_schema = json.load(f) 59 | table_def = table_schema.get('definitions') 60 | return table_def 61 | 62 | 63 | def snowflake_load_column_string(table_props: dict) -> str: 64 | """ 65 | Use the json table definition to build string necessary for 66 | selecting fields of interest for loading (i.e. omit passwords) 67 | Scrub it 68 | :param table_props: python dictionary of table properties dictionary 69 | from houston.json schema def 70 | :type table_props: dict 71 | :return col_string: string encoded for select on COPY ($1,$3,$6,et...) 
72 | """ 73 | # This only works if 'description' is not None 74 | try: 75 | vals = [f"${val.get('description','').split('=')[1]}" 76 | for key, val in table_props.items() 77 | if key not in ['insert_timestamp','hash_diff'] 78 | ] 79 | except Exception as e: 80 | logging.error('Bad Table Def Schema %s' % e) 81 | raise 82 | col_string = ','.join(vals) 83 | return col_string 84 | 85 | 86 | def resolve_schemas(df: pd.DataFrame, table_props: dict) -> pd.DataFrame: 87 | """ 88 | Take a dataframe with raw data and remove or rename columns to match the 89 | table schema, and if any schema columns aren't in the dataframe, 90 | set null values for those columns 91 | """ 92 | df.columns = map(str.lower, df.columns) 93 | # get returned columns, then map returned column names to 94 | # new names (old: new) 95 | col_mapping = {get_schema_source_col(k, v): k.lower() 96 | for k,v in table_props.items()} 97 | 98 | df.rename(columns=col_mapping, inplace=True) 99 | current_cols = df.columns.tolist() 100 | 101 | schema_orders = {k.lower(): get_schema_order(v) 102 | for k,v in table_props.items() 103 | if get_schema_order(v) != -1} 104 | 105 | schema_cols = list(schema_orders.keys()) 106 | 107 | # Take the values we have defined in the schema and set the order 108 | schema_inters_cols = list(set(current_cols).intersection(schema_cols)) 109 | schema_inters_cols.sort(key=schema_orders.__getitem__) 110 | 111 | if schema_inters_cols: 112 | df = df.loc[:,schema_inters_cols] 113 | else: 114 | raise ValueError(f"Bad Schema Design in Sorting Columns {df.columns.tolist()}") 115 | 116 | # Check if any columns exist in full schema cols 117 | # that aren't in schema intersection cols 118 | # If so, add them with null values 119 | remaining = list(set(schema_cols) - set(schema_inters_cols)) 120 | if remaining: 121 | for col in remaining: 122 | prop = table_props.get(col) 123 | if prop.get('default', None): 124 | df.loc[:,col] = prop.get('default') 125 | else: 126 | dtype = get_schema_type(prop).lower() 127 | if (('varchar' in dtype) or ('text' in dtype) or 128 | ('string' in dtype)): 129 | df.loc[:,col] = None 130 | elif (('number' in dtype) or ('timestamp' in dtype) or 131 | ('float' in dtype) or ('int' in dtype)): 132 | df.loc[:,col] = np.nan 133 | elif 'bool' in dtype: 134 | df.loc[:,col] = False 135 | else: 136 | df.loc[:,col] = '' 137 | 138 | # make sure we rearrange columns into schema order 139 | schema_cols.sort(key=schema_orders.__getitem__) 140 | return df.loc[:,schema_cols] 141 | -------------------------------------------------------------------------------- /include/metrics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import matplotlib.pyplot as plt 3 | import mlflow 4 | import numpy as np 5 | from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve, roc_auc_score 6 | import pandas as pd 7 | 8 | 9 | 10 | def log_roc_curve(y_test: list, y_pred: list): 11 | fpr, tpr, thresholds = roc_curve(y_test, y_pred) 12 | plt.plot(fpr,tpr) 13 | plt.xlabel('False Positive Rate') 14 | plt.ylabel('True Positive Rate') 15 | plt.title('ROC Curve') 16 | plt.savefig("roc_curve.png") 17 | mlflow.log_artifact("roc_curve.png") 18 | plt.close() 19 | 20 | 21 | def log_confusion_matrix(y_test: list, y_pred: list): 22 | cm = confusion_matrix(y_test, y_pred) 23 | t_n, f_p, f_n, t_p = cm.ravel() 24 | mlflow.log_metrics({'True Positive': t_p, 'True Negative': t_n, 'False Positive': f_p, 'False Negative': f_n}) 25 | 26 | 
ConfusionMatrixDisplay.from_predictions(y_test, y_pred) 27 | plt.savefig("confusion_matrix.png") 28 | mlflow.log_artifact("confusion_matrix.png") 29 | plt.close() 30 | 31 | 32 | def log_classification_report(y_test: list, y_pred: list): 33 | cr = classification_report(y_test, y_pred, output_dict=True) 34 | logging.info(cr) 35 | cr_metrics = pd.json_normalize(cr, sep='_').to_dict(orient='records')[0] 36 | mlflow.log_metrics(cr_metrics) 37 | 38 | 39 | def log_all_eval_metrics(y_test: list, y_pred: list): 40 | 41 | # Classification Report 42 | log_classification_report(y_test, y_pred) 43 | 44 | # Confusion Matrix 45 | log_confusion_matrix(y_test, y_pred) 46 | 47 | # ROC Curve 48 | log_roc_curve(y_test, y_pred) 49 | 50 | # AUC Score 51 | mlflow.log_metric('test_auc_score', roc_auc_score(y_test, y_pred)) 52 | 53 | 54 | def test(clf, test_set): 55 | logging.info('Gathering Validation set results') 56 | y_pred = clf.predict(test_set) 57 | 58 | return np.where(y_pred > 0.5, 1, 0) -------------------------------------------------------------------------------- /include/sample_data/cost_data/cost_data.csv: -------------------------------------------------------------------------------- 1 | 1,150000,32000,10000 2 | 2,200000,50000,50000 3 | 3,90000,120000,300000 4 | 4,230000,14000,7000 5 | 5,98000,27000,48000 6 | 6,72000,800000,0 7 | 7,50000,2500000,0 8 | 8,8000000,33000000,0 9 | 9,6325000,450000,76000 10 | -------------------------------------------------------------------------------- /include/sample_data/forestfire_data/forestfires.csv: -------------------------------------------------------------------------------- 1 | 1,2,aug,fri,91,166.9,752.6,7.1,25.9,41,3.6,0,100 2 | 2,2,feb,mon,84,9.3,34,2.1,13.9,40,5.4,0,57.8 3 | 3,4,mar,sat,69,2.4,15.5,0.7,17.4,24,5.4,0,92.9 4 | 4,4,mar,mon,87.2,23.9,64.7,4.1,11.8,35,1.8,0,1300 5 | 5,5,mar,sat,91.7,35.8,80.8,7.8,15.1,27,5.4,0,4857 6 | 6,5,sep,wed,92.9,133.3,699.6,9.2,26.4,21,4.5,0,9800 7 | 7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,14 8 | 8,6,mar,fri,91.7,33.3,77.5,9,8.3,97,4,0.2,74.5 9 | 9,9,feb,thu,84.2,6.8,26.6,7.7,6.7,79,3.1,0,8880.7 10 | -------------------------------------------------------------------------------- /include/sample_data/forestfire_data/forestfires_corrupt.csv: -------------------------------------------------------------------------------- 1 | one,2,aug,fri,91,166.9,752.6,7.1,25.9,41,3.6,0,0 2 | two,2,feb,mon,84,9.3,34,2.1,13.9,40,5.4,0,0 3 | three,4,mar,satoorday,69,2.4,15.5,0.7,17.4,24,5.4,0,0 4 | four,4,mar,mon,87.2,23.9,64.7,4.1,11.8,35,1.8,0,0 5 | five,5,mar,sat,91.7,35.8,80.8,7.8,15.1,27,5.4,0,0 6 | six,5,sep,wed,92.9,133.3,abcd,9.2,26.4,21,4.5,0,0 7 | seven,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0 8 | eight,6,mar,fri,91.7,33.3,77.5,9,8.3,97,4,0.2,0 9 | nine,9,fb,thu,84.2,6.8,26.6,7.7,6.7,79,3.1,0,0 10 | -------------------------------------------------------------------------------- /include/sample_data/forestfire_data/forestfires_invalid.csv: -------------------------------------------------------------------------------- 1 | 1,2,ag,fri,91,166.9,752.6,7.1,25.9,41,3.6,0,0 2 | 2,2,feb,mon,84,9001,34,2.1,13.9,40,5.4,0,0 3 | 3,4,mar,satoorday,69,2.4,15.5,0.7,17.4,24,5.4,0,0 4 | 4,4,mar,mon,87.2,23.9,64.7,4.1,11.8,35,1.8,0,0 5 | 5,5,mar,sat,91.7,35.8,80.8,7.8,15.1,27,5.4,0,0 6 | 6,5,sep,wed,92.9,133.3,5,9.2,26.4,21,4.5,0,0 7 | 7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0,0 8 | 8,6,mar,fri,91.7,33.3,77.5,9,8.3,97,4,0.2,0 9 | 9,9,fb,thu,84.2,6.8,26.6,7.7,6.7,79,3.1,0,0 10 | 
-------------------------------------------------------------------------------- /include/sql/bigquery_examples/load_bigquery_forestfire_data.sql: -------------------------------------------------------------------------------- 1 | INSERT simple_bigquery_example_dag.forestfires VALUES 2 | (1,2,'aug','fri',91,166.9,752.6,7.1,25.9,41,3.6,0,0), 3 | (2,2,'feb','mon',84,9.3,34,2.1,13.9,40,5.4,0,0), 4 | (3,4,'mar','sat',69,2.4,15.5,0.7,17.4,24,5.4,0,0), 5 | (4,4,'mar','mon',87.2,23.9,64.7,4.1,11.8,35,1.8,0,0), 6 | (5,5,'mar','sat',91.7,35.8,80.8,7.8,15.1,27,5.4,0,0), 7 | (6,5,'sep','wed',92.9,133.3,699.6,9.2,26.4,21,4.5,0,0), 8 | (7,5,'mar','fri',86.2,26.2,94.3,5.1,8.2,51,6.7,0,0), 9 | (8,6,'mar','fri',91.7,33.3,77.5,9,8.3,97,4,0.2,0), 10 | (9,9,'feb','thu',84.2,6.8,26.6,7.7,6.7,79,3.1,0,0); 11 | -------------------------------------------------------------------------------- /include/sql/bigquery_examples/row_quality_bigquery_forestfire_check.sql: -------------------------------------------------------------------------------- 1 | -- Query to check if row items match particular parameters passed in by Operator. 2 | SELECT ID, 3 | CASE y WHEN {{ params.y }} THEN 1 ELSE 0 END AS y_check, 4 | CASE month WHEN '{{ params.month }}' THEN 1 ELSE 0 END AS month_check, 5 | CASE day WHEN '{{ params.day }}' THEN 1 ELSE 0 END AS day_check, 6 | CASE ffmc WHEN {{ params.ffmc }} THEN 1 ELSE 0 END AS ffmc_check, 7 | CASE dmc WHEN {{ params.dmc }} THEN 1 ELSE 0 END AS dmc_check, 8 | CASE dc WHEN {{ params.dc }} THEN 1 ELSE 0 END AS dc_check, 9 | CASE isi WHEN {{ params.isi }} THEN 1 ELSE 0 END AS isi_check, 10 | CASE temp WHEN {{ params.temp }} THEN 1 ELSE 0 END AS temp_check, 11 | CASE rh WHEN {{ params.rh }} THEN 1 ELSE 0 END AS rh_check, 12 | CASE wind WHEN {{ params.wind }} THEN 1 ELSE 0 END AS wind_check, 13 | CASE rain WHEN {{ params.rain }} THEN 1 ELSE 0 END AS rain_check, 14 | CASE area WHEN {{ params.area }} THEN 1 ELSE 0 END AS area_check 15 | FROM {{ params.dataset }}.{{ params.table }} 16 | WHERE ID = {{ params.id }} 17 | -------------------------------------------------------------------------------- /include/sql/dbt_examples/copy_store_failures.sql: -------------------------------------------------------------------------------- 1 | -- Load store_failures dbt data from the default, overwritten table to a permanent table 2 | INSERT INTO {{ params.destination_table }} ({{ params.columns }}) 3 | SELECT {{ params.columns }} 4 | FROM {{ params.source_table }}; 5 | -------------------------------------------------------------------------------- /include/sql/firebolt_examples/create_table.sql: -------------------------------------------------------------------------------- 1 | CREATE FACT TABLE IF NOT EXISTS {{ conn.firebolt_default.schema }}.{{ params.table }} 2 | ( 3 | id INT, 4 | y INT, 5 | month VARCHAR(25), 6 | day VARCHAR(25), 7 | ffmc FLOAT, 8 | dmc FLOAT, 9 | dc FLOAT, 10 | isi FLOAT, 11 | temp FLOAT, 12 | rh FLOAT, 13 | wind FLOAT, 14 | rain FLOAT, 15 | area FLOAT 16 | ) PRIMARY INDEX id; 17 | -------------------------------------------------------------------------------- /include/sql/firebolt_examples/drop_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ params.table }}; 2 | -------------------------------------------------------------------------------- /include/sql/firebolt_examples/load_forestfire_data.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {{ params.table }} 
VALUES 2 | (1,2,'aug','fri',91,166.9,752.6,7.1,25.9,41,3.6,0,0), 3 | (2,2,'feb','mon',84,9.3,34,2.1,13.9,40,5.4,0,0), 4 | (3,4,'mar','sat',69,2.4,15.5,0.7,17.4,24,5.4,0,0), 5 | (4,4,'mar','mon',87.2,23.9,64.7,4.1,11.8,35,1.8,0,0), 6 | (5,5,'mar','sat',91.7,35.8,80.8,7.8,15.1,27,5.4,0,0), 7 | (6,5,'sep','wed',92.9,133.3,699.6,9.2,26.4,21,4.5,0,0), 8 | (7,5,'mar','fri',86.2,26.2,94.3,5.1,8.2,51,6.7,0,0), 9 | (8,6,'mar','fri',91.7,33.3,77.5,9,8.3,97,4,0.2,0), 10 | (9,9,'feb','thu',84.2,6.8,26.6,7.7,6.7,79,3.1,0,0); 11 | -------------------------------------------------------------------------------- /include/sql/firebolt_examples/quality_check_template.sql: -------------------------------------------------------------------------------- 1 | SELECT MIN({{ params.col }}) 2 | FROM( 3 | SELECT 4 | CASE WHEN {{ params.check_statement }} THEN 1 ELSE 0 END AS {{ params.col }} 5 | FROM {{ params.table }} 6 | ) 7 | -------------------------------------------------------------------------------- /include/sql/great_expectations_examples/copy_yellow_tripdata_snowflake_staging.sql: -------------------------------------------------------------------------------- 1 | {% set table_schema = params.table_schema %} 2 | MERGE INTO {{ conn.snowflake_default.schema }}.{{ params.table_name }} as dest 3 | USING ( 4 | SELECT * 5 | FROM 6 | {{ conn.snowflake_default.schema }}.{{ params.audit_table_name }} 7 | ) as stg 8 | ON dest.PICKUP_DATETIME = stg.PICKUP_DATETIME 9 | AND dest.DROPOFF_DATETIME = stg.DROPOFF_DATETIME 10 | WHEN NOT MATCHED THEN 11 | INSERT ( 12 | {%- for name, col_dict in table_schema.items() -%} 13 | {%- if loop.first %} 14 | {{ name }} 15 | {%- else %} 16 | ,{{ name }} 17 | {%- endif %} 18 | {%- endfor %} 19 | ) 20 | VALUES 21 | ( 22 | {% for name, col_dict in table_schema.items() %} 23 | {%- if not loop.first %} 24 | ,{%- endif -%} 25 | {%- if 'default' in col_dict.keys() -%} 26 | COALESCE(stg.{{ name }}, '{{col_dict.get('default', 'missing_value')}}') 27 | {%- else -%} 28 | stg.{{ name }} 29 | {%- endif -%} 30 | {%- endfor %} 31 | ) 32 | -------------------------------------------------------------------------------- /include/sql/great_expectations_examples/create_snowflake_yellow_tripdata_stage.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE STAGE {{ params.stage_name }} url=s3://{{ var.json.aws_configs.s3_bucket }} 2 | credentials=(aws_key_id='{{ conn.aws_default.login }}' aws_secret_key='{{ conn.aws_default.password }}') 3 | file_format=(type = 'CSV', skip_header = 1, time_format = 'YYYY-MM-DD HH24:MI:SS'); 4 | -------------------------------------------------------------------------------- /include/sql/great_expectations_examples/create_yellow_tripdata_redshift_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {{ params.table_name }} 2 | (vendor_id int, 3 | pickup_datetime timestamp, 4 | dropoff_datetime timestamp, 5 | passenger_count int, 6 | trip_distance float, 7 | rate_code_id int, 8 | store_and_fwd_flag varchar, 9 | pickup_location_id int, 10 | dropoff_location_id int, 11 | payment_type int, 12 | fare_amount float, 13 | extra float, 14 | mta_tax float, 15 | tip_amount float, 16 | tolls_amount float, 17 | improvement_surcharge float, 18 | total_amount float, 19 | congestion_surcharge float); 20 | -------------------------------------------------------------------------------- 
/include/sql/great_expectations_examples/create_yellow_tripdata_snowflake_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {{ conn.snowflake_default.schema }}.{{ params.table_name }} 2 | (vendor_id int, 3 | pickup_datetime timestamp, 4 | dropoff_datetime timestamp, 5 | passenger_count int, 6 | trip_distance float, 7 | rate_code_id int, 8 | store_and_fwd_flag varchar, 9 | pickup_location_id int, 10 | dropoff_location_id int, 11 | payment_type int, 12 | fare_amount float, 13 | extra float, 14 | mta_tax float, 15 | tip_amount float, 16 | tolls_amount float, 17 | improvement_surcharge float, 18 | total_amount float, 19 | congestion_surcharge float); 20 | -------------------------------------------------------------------------------- /include/sql/great_expectations_examples/delete_yellow_tripdata_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ params.table_name }}; 2 | -------------------------------------------------------------------------------- /include/sql/great_expectations_examples/table_schemas/tripdata_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "yellow_tripdata_example", 3 | "yellow_tripdata": { 4 | "properties": { 5 | "VENDOR_ID": {"type": "int", "description": "source_order=1"}, 6 | "PICKUP_DATETIME": {"type": "timestamp", "description": "source_order=2"}, 7 | "DROPOFF_DATETIME": {"type": "timestamp", "description": "source_order=3"}, 8 | "PASSENGER_COUNT": {"type": "int", "description": "source_order=4"}, 9 | "TRIP_DISTANCE": {"type": "float", "description": "source_order=5"}, 10 | "RATE_CODE_ID": {"type": "int", "description": "source_order=6"}, 11 | "STORE_AND_FWD_FLAG": {"type": "varchar(64)", "description": "source_order=7"}, 12 | "PICKUP_LOCATION_ID": {"type": "int", "description": "source_order=8"}, 13 | "DROPOFF_LOCATION_ID": {"type": "int", "description": "source_order=9"}, 14 | "PAYMENT_TYPE": {"type": "int", "description": "source_order=10"}, 15 | "FARE_AMOUNT": {"type": "float", "description": "source_order=11"}, 16 | "EXTRA": {"type": "float", "description": "source_order=12"}, 17 | "MTA_TAX": {"type": "float", "description": "source_order=13"}, 18 | "TIP_AMOUNT": {"type": "float", "description": "source_order=14"}, 19 | "TOLLS_AMOUNT": {"type": "float", "description": "source_order=15"}, 20 | "IMPROVEMENT_SURCHARGE": {"type": "float", "description": "source_order=16"}, 21 | "TOTAL_AMOUNT": {"type": "float", "description": "source_order=17"}, 22 | "CONGESTION_SURCHARGE": {"type": "float", "description": "source_order=18"} 23 | }, 24 | "dimensions": ["vendor_id", 25 | "pickup_datetime", 26 | "dropoff_datetime"], 27 | "metrics":["passenger_count", 28 | "trip_distance", 29 | "rate_code_id", 30 | "store_and_fwd_flag", 31 | "pickup_location_id", 32 | "dropoff_location_id", 33 | "payment_type", 34 | "fare_amount", 35 | "extra", 36 | "mta_tax", 37 | "tip_amount", 38 | "tolls_amount", 39 | "improvement_surcharge", 40 | "total_amount", 41 | "congestion_surcharge"], 42 | "cluster_keys": { 43 | "columns":["VENDOR_ID","PICKUP_DATETIME","DROPOFF_DATETIME"], 44 | "description": null 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /include/sql/redshift_examples/create_redshift_forestfire_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {{ 
var.json.aws_configs.redshift_table }} 2 | (ID int,Y int,month varchar,day varchar,FFMC float,DMC float,DC float,ISI float,temp float,RH float,wind float,rain float,area float); 3 | -------------------------------------------------------------------------------- /include/sql/redshift_examples/drop_redshift_forestfire_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ var.json.aws_configs.redshift_table }}; 2 | -------------------------------------------------------------------------------- /include/sql/redshift_examples/row_quality_redshift_forestfire_check.sql: -------------------------------------------------------------------------------- 1 | -- Query to check if row items match particular parameters passed in by Operator. 2 | SELECT ID, 3 | CASE y WHEN {{ params.y }} THEN 1 ELSE 0 END AS y_check, 4 | CASE month WHEN '{{ params.month }}' THEN 1 ELSE 0 END AS month_check, 5 | CASE day WHEN '{{ params.day }}' THEN 1 ELSE 0 END AS day_check, 6 | CASE ffmc WHEN {{ params.ffmc }} THEN 1 ELSE 0 END AS ffmc_check, 7 | CASE dmc WHEN {{ params.dmc }} THEN 1 ELSE 0 END AS dmc_check, 8 | CASE dc WHEN {{ params.dc }} THEN 1 ELSE 0 END AS dc_check, 9 | CASE isi WHEN {{ params.isi }} THEN 1 ELSE 0 END AS isi_check, 10 | CASE temp WHEN {{ params.temp }} THEN 1 ELSE 0 END AS temp_check, 11 | CASE rh WHEN {{ params.rh }} THEN 1 ELSE 0 END AS rh_check, 12 | CASE wind WHEN {{ params.wind }} THEN 1 ELSE 0 END AS wind_check, 13 | CASE rain WHEN {{ params.rain }} THEN 1 ELSE 0 END AS rain_check, 14 | CASE area WHEN {{ params.area }} THEN 1 ELSE 0 END AS area_check 15 | FROM {{ var.json.aws_configs.redshift_table }} 16 | WHERE ID = {{ params.id }} 17 | -------------------------------------------------------------------------------- /include/sql/redshift_examples/validate_redshift_forestfire_load.sql: -------------------------------------------------------------------------------- 1 | -- SQL query to validate the upload of forestfires.csv 2 | SELECT CASE 0 WHEN COUNT(trim(filename)) THEN 1 ELSE 0 END as filename_check 3 | FROM stl_load_errors 4 | WHERE filename LIKE '%{{ params.filename }}%' 5 | LIMIT 1; 6 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/copy_forestfire_snowflake_audit.sql: -------------------------------------------------------------------------------- 1 | {% set table_schema = params.table_schema %} 2 | MERGE INTO {{ conn.snowflake_default.schema }}.{{ params.table_name }} as dest 3 | USING ( 4 | SELECT * 5 | FROM 6 | {{ conn.snowflake_default.schema }}.{{ params.audit_table_name }} 7 | ) as stg 8 | ON dest.ID = stg.ID 9 | WHEN NOT MATCHED THEN 10 | INSERT ( 11 | {%- for name, col_dict in table_schema.items() -%} 12 | {%- if loop.first %} 13 | {{ name }} 14 | {%- else %} 15 | ,{{ name }} 16 | {%- endif %} 17 | {%- endfor %} 18 | ) 19 | VALUES 20 | ( 21 | {% for name, col_dict in table_schema.items() %} 22 | {%- if not loop.first %} 23 | ,{%- endif -%} 24 | {%- if 'default' in col_dict.keys() -%} 25 | COALESCE(stg.{{ name }}, '{{col_dict.get('default', 'missing_value')}}') 26 | {%- else -%} 27 | stg.{{ name }} 28 | {%- endif -%} 29 | {%- endfor %} 30 | ) 31 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/create_cost_table.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TRANSIENT TABLE {{ params.table_name }} 2 | (id INT, 3 | 
land_damage_cost INT, 4 | property_damage_cost INT, 5 | lost_profits_cost INT); 6 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/create_forestfire_cost_table.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TRANSIENT TABLE {{ params.table_name }} 2 | (id INT, 3 | land_damage_cost INT, 4 | property_damage_cost INT, 5 | lost_profits_cost INT, 6 | total_cost INT, 7 | y INT, 8 | month VARCHAR(25), 9 | day VARCHAR(25), 10 | area FLOAT); 11 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/create_forestfire_table.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE TRANSIENT TABLE {{ params.table_name }} 2 | (id INT, 3 | y INT, 4 | month VARCHAR(25), 5 | day VARCHAR(25), 6 | ffmc FLOAT, 7 | dmc FLOAT, 8 | dc FLOAT, 9 | isi FLOAT, 10 | temp FLOAT, 11 | rh FLOAT, 12 | wind FLOAT, 13 | rain FLOAT, 14 | area FLOAT); 15 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/create_snowflake_yellow_tripdata_stage.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE STAGE {{ params.stage_name }} url=s3://{{ var.json.aws_configs.s3_bucket }} 2 | credentials=(aws_key_id='{{ conn.aws_default.login }}' aws_secret_key='{{ conn.aws_default.password }}') 3 | file_format=(type = 'CSV', skip_header = 1, time_format = 'YYYY-MM-DD HH24:MI:SS'); 4 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/create_snowflake_yellow_tripdata_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {{ conn.snowflake_default.schema }}.{{ params.table_name }} 2 | ( 3 | vendor_id int, 4 | pickup_datetime timestamp, 5 | dropoff_datetime timestamp, 6 | passenger_count int, 7 | trip_distance float, 8 | rate_code_id int, 9 | store_and_fwd_flag varchar, 10 | pickup_location_id int, 11 | dropoff_location_id int, 12 | payment_type int, 13 | fare_amount float, 14 | extra float, 15 | mta_tax float, 16 | tip_amount float, 17 | tolls_amount float, 18 | improvement_surcharge float, 19 | total_amount float, 20 | congestion_surcharge float 21 | ); 22 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/delete_forestfire_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ params.table_name }}; 2 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/delete_snowflake_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ params.table_name }}; 2 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/load_cost_data.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {{ params.table_name }} VALUES 2 | (1,150000,32000,10000), 3 | (2,200000,50000,50000), 4 | (3,90000,120000,300000), 5 | (4,230000,14000,7000), 6 | (5,98000,27000,48000), 7 | (6,72000,800000,0), 8 | (7,50000,2500000,0), 9 | (8,8000000,33000000,0), 10 | (9,6325000,450000,76000); 11 | 
-------------------------------------------------------------------------------- /include/sql/snowflake_examples/load_forestfire_cost_data.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO forestfire_costs (id, land_damage_cost, property_damage_cost, lost_profits_cost, total_cost, y, month, day, area) 2 | SELECT 3 | c.id, 4 | c.land_damage_cost, 5 | c.property_damage_cost, 6 | c.lost_profits_cost, 7 | c.land_damage_cost + c.property_damage_cost + c.lost_profits_cost, 8 | ff.y, 9 | ff.month, 10 | ff.day, 11 | ff.area 12 | FROM costs c 13 | LEFT JOIN forestfires ff 14 | ON c.id = ff.id 15 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/load_snowflake_forestfire_data.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO {{ params.table_name }} VALUES 2 | (1,2,'aug','fri',91,166.9,752.6,7.1,25.9,41,3.6,0,100), 3 | (2,2,'feb','mon',84,9.3,34,2.1,13.9,40,5.4,0,57.8), 4 | (3,4,'mar','sat',69,2.4,15.5,0.7,17.4,24,5.4,0,92.9), 5 | (4,4,'mar','mon',87.2,23.9,64.7,4.1,11.8,35,1.8,0,1300), 6 | (5,5,'mar','sat',91.7,35.8,80.8,7.8,15.1,27,5.4,0,4857), 7 | (6,5,'sep','wed',92.9,133.3,699.6,9.2,26.4,21,4.5,0,9800), 8 | (7,5,'mar','fri',86.2,26.2,94.3,5.1,8.2,51,6.7,0,14), 9 | (8,6,'mar','fri',91.7,33.3,77.5,9,8.3,97,4,0.2,74.5), 10 | (9,9,'feb','thu',84.2,6.8,26.6,7.7,6.7,79,3.1,0,8880.7); 11 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/row_quality_snowflake_forestfire_check.sql: -------------------------------------------------------------------------------- 1 | -- Query to check if row items match particular parameters passed in by Operator. 
2 | SELECT ID, 3 | CASE y WHEN {{ params.y }} THEN 1 ELSE 0 END AS y_check, 4 | CASE month WHEN '{{ params.month }}' THEN 1 ELSE 0 END AS month_check, 5 | CASE day WHEN '{{ params.day }}' THEN 1 ELSE 0 END AS day_check, 6 | CASE ffmc WHEN {{ params.ffmc }} THEN 1 ELSE 0 END AS ffmc_check, 7 | CASE dmc WHEN {{ params.dmc }} THEN 1 ELSE 0 END AS dmc_check, 8 | CASE dc WHEN {{ params.dc }} THEN 1 ELSE 0 END AS dc_check, 9 | CASE isi WHEN {{ params.isi }} THEN 1 ELSE 0 END AS isi_check, 10 | CASE temp WHEN {{ params.temp }} THEN 1 ELSE 0 END AS temp_check, 11 | CASE rh WHEN {{ params.rh }} THEN 1 ELSE 0 END AS rh_check, 12 | CASE wind WHEN {{ params.wind }} THEN 1 ELSE 0 END AS wind_check, 13 | CASE rain WHEN {{ params.rain }} THEN 1 ELSE 0 END AS rain_check, 14 | CASE area WHEN {{ params.area }} THEN 1 ELSE 0 END AS area_check 15 | FROM {{ params.table_name }} 16 | WHERE ID = {{ params.id }} 17 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/row_quality_yellow_tripdata_check.sql: -------------------------------------------------------------------------------- 1 | -- Query to check row items 2 | SELECT vendor_id, pickup_datetime, 3 | CASE WHEN dropoff_datetime > pickup_datetime THEN 1 ELSE 0 END AS date_check, 4 | CASE WHEN passenger_count >= 0 THEN 1 ELSE 0 END AS passenger_count_check, 5 | CASE WHEN trip_distance >= 0 AND trip_distance <= 100 THEN 1 ELSE 0 END AS trip_distance_check, 6 | CASE WHEN ROUND((fare_amount + extra + mta_tax + tip_amount + improvement_surcharge + COALESCE(congestion_surcharge, 0)), 1) = ROUND(total_amount, 1) THEN 1 7 | WHEN ROUND(fare_amount + extra + mta_tax + tip_amount + improvement_surcharge, 1) = ROUND(total_amount, 1) THEN 1 ELSE 0 END AS fare_check 8 | FROM {{ params.table }} 9 | WHERE pickup_datetime IN (SELECT pickup_datetime FROM {{ params.table }} ORDER BY RANDOM() LIMIT 1) 10 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/row_quality_yellow_tripdata_template.sql: -------------------------------------------------------------------------------- 1 | -- Template to check various columns in the yellow tripdata data set. 
2 | SELECT MIN({{ params.check_name }}) 3 | FROM( 4 | SELECT 5 | CASE WHEN {{ params.check_statement }} THEN 1 ELSE 0 END AS {{ params.check_name }} 6 | FROM {{ params.table }} 7 | ) 8 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/table_schemas/forestfire_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "forestfire_example", 3 | "forestfire": { 4 | "properties": { 5 | "ID": {"type": "int", "description": "source_order=1"}, 6 | "Y": {"type": "int", "description": "source_order=2"}, 7 | "MONTH": {"type": "varchar(25)", "description": "source_order=3"}, 8 | "DAY": {"type": "varchar(25)", "description": "source_order=4"}, 9 | "FFMC": {"type": "float", "description": "source_order=5"}, 10 | "DMC": {"type": "float", "description": "source_order=6"}, 11 | "DC": {"type": "float", "description": "source_order=7"}, 12 | "ISI": {"type": "float", "description": "source_order=8"}, 13 | "TEMP": {"type": "float", "description": "source_order=9"}, 14 | "RH": {"type": "float", "description": "source_order=10"}, 15 | "WIND": {"type": "float", "description": "source_order=11"}, 16 | "RAIN": {"type": "float", "description": "source_order=12"}, 17 | "AREA": {"type": "float", "description": "source_order=13"} 18 | }, 19 | "dimensions": ["id"], 20 | "metrics":[ 21 | "y", 22 | "month", 23 | "day", 24 | "ffmc", 25 | "dmc", 26 | "dc", 27 | "isi", 28 | "temp", 29 | "rh", 30 | "wind", 31 | "rain", 32 | "area" 33 | ], 34 | "cluster_keys": { 35 | "columns":["ID"], 36 | "description": null 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /include/sql/snowflake_examples/transform_forestfire_cost_table.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | id, 3 | month, 4 | day, 5 | total_cost, 6 | area, 7 | total_cost / area as cost_per_area 8 | FROM {{ params.table_name }} 9 | -------------------------------------------------------------------------------- /include/sql/sql_examples/create_redshift_yellow_tripdata_table.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {{ var.json.aws_configs.redshift_table }} 2 | (vendor_id int, 3 | pickup_datetime timestamp, 4 | dropoff_datetime timestamp, 5 | passenger_count int, 6 | trip_distance float, 7 | rate_code_id int, 8 | store_and_fwd_flag varchar, 9 | pickup_location_id int, 10 | dropoff_location_id int, 11 | payment_type int, 12 | fare_amount float, 13 | extra float, 14 | mta_tax float, 15 | tip_amount float, 16 | tolls_amount float, 17 | improvement_surcharge float, 18 | total_amount float, 19 | congestion_surcharge float, 20 | upload_date timestamp); 21 | -------------------------------------------------------------------------------- /include/sql/sql_examples/drop_redshift_yellow_tripdata_table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS {{ var.json.aws_configs.redshift_table }}; 2 | -------------------------------------------------------------------------------- /include/sql/sql_examples/row_quality_yellow_tripdata_check.sql: -------------------------------------------------------------------------------- 1 | -- Query to check row items 2 | SELECT vendor_id, pickup_datetime, 3 | CASE WHEN dropoff_datetime > pickup_datetime THEN 1 ELSE 0 END AS date_check, 4 | CASE WHEN passenger_count >= 0 THEN 1 ELSE 0 
END AS passenger_count_check, 5 | CASE WHEN trip_distance >= 0 AND trip_distance <= 100 THEN 1 ELSE 0 END AS trip_distance_check, 6 | CASE WHEN ROUND((fare_amount + extra + mta_tax + tip_amount + improvement_surcharge + COALESCE(congestion_surcharge, 0)), 1) = ROUND(total_amount, 1) THEN 1 7 | WHEN ROUND(fare_amount + extra + mta_tax + tip_amount + improvement_surcharge, 1) = ROUND(total_amount, 1) THEN 1 ELSE 0 END AS fare_check 8 | FROM {{ var.json.aws_configs.redshift_table }} 9 | WHERE pickup_datetime IN (SELECT pickup_datetime FROM {{ var.json.aws_configs.redshift_table }} ORDER BY RANDOM() LIMIT 1) 10 | -------------------------------------------------------------------------------- /include/validation/forestfire_validation.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": { 3 | "y": "2", 4 | "month": "aug", 5 | "day": "fri", 6 | "ffmc": "91", 7 | "dmc": "166.9", 8 | "dc": "752.6", 9 | "isi": "7.1", 10 | "temp": "25.9", 11 | "rh": "41", 12 | "wind": "3.6", 13 | "rain": "0", 14 | "area": "0" 15 | }, 16 | "2": { 17 | "y": "2", 18 | "month": "feb", 19 | "day": "mon", 20 | "ffmc": "84", 21 | "dmc": "9.3", 22 | "dc": "34", 23 | "isi": "2.1", 24 | "temp": "13.9", 25 | "rh": "40", 26 | "wind": "5.4", 27 | "rain": "0", 28 | "area": "0" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/astronomer/airflow-data-quality-demo/8847b1c9e749966a762ed5b9fa8d2075d4772352/packages.txt -------------------------------------------------------------------------------- /plugins/firebolt_operator_test.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, 13 | # software distributed under the License is distributed on an 14 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | # KIND, either express or implied. See the License for the 16 | # specific language governing permissions and limitations 17 | # under the License. 18 | from typing import Any, Dict, List, Optional, Union 19 | 20 | from airflow.models import BaseOperator, BaseOperatorLink 21 | # from firebolt_provider.hooks.firebolt import FireboltHook 22 | from airflow.operators.sql import BaseSQLOperator 23 | 24 | """ 25 | def get_db_hook(self) -> SqlHook: 26 | 27 | Create and return FireboltHook. 28 | 29 | :return: a FireboltHook instance. 
30 | :rtype: FireboltHook 31 | 32 | return FireboltHook( 33 | firebolt_conn_id=self.firebolt_conn_id, 34 | database=self.database, 35 | engine_name=self.engine_name, 36 | ) 37 | """ 38 | 39 | 40 | class RegistryLink(BaseOperatorLink): 41 | """Link to Registry""" 42 | 43 | name = "Astronomer Registry" 44 | 45 | def get_link(self, operator, dttm): 46 | """Get link to registry page.""" 47 | 48 | registry_link = ( 49 | "https://registry.astronomer.io/providers/{provider}/modules/{operator}" 50 | ) 51 | return registry_link.format(provider="firebolt", operator="fireboltoperator") 52 | 53 | 54 | class FireboltOperator(BaseSQLOperator): 55 | """ 56 | Executes SQL code in a Firebolt database 57 | 58 | :param firebolt_conn_id: Firebolt connection id 59 | :type firebolt_conn_id: str 60 | :param sql: the sql code to be executed. (templated) 61 | :type sql: Can receive a str representing a sql statement, 62 | a list of str (sql statements), or reference to a template file. 63 | Template reference are recognized by str ending in '.sql' 64 | :param autocommit: if True, each command is automatically committed. 65 | Currently firebolt doesn't support autocommit feature. 66 | (default value: False) 67 | :type autocommit: bool 68 | :param parameters: (optional) the parameters to render the SQL query with. 69 | :type parameters: dict or iterable 70 | :param database: name of database (will overwrite database defined 71 | in connection) 72 | :type database: str 73 | :param engine_name: name of engine (will overwrite engine_name defined in 74 | connection) 75 | :type engine_name: str 76 | """ 77 | 78 | template_fields = ('sql',) 79 | template_ext = ('.sql',) 80 | ui_color = '#b4e0ff' 81 | 82 | def __init__( 83 | self, 84 | *, 85 | sql: Union[str, List[str]], 86 | conn_id: str = 'firebolt_default', 87 | parameters: Optional[dict] = None, 88 | database: Optional[str] = None, 89 | engine_name: Optional[str] = None, 90 | autocommit: bool = False, 91 | hook_params: Optional[Dict] = None, 92 | ** kwargs: Any, 93 | ) -> None: 94 | super().__init__(**kwargs) 95 | #self.firebolt_conn_id = conn_id 96 | self.sql = sql 97 | self.database = database 98 | self.engine_name = engine_name 99 | self.parameters = parameters 100 | self.autocommit = autocommit 101 | self.hook_params = hook_params 102 | 103 | def execute(self, context: Dict[Any, Any]) -> None: 104 | """Run query on firebolt""" 105 | self.log.info('Executing: %s', self.sql) 106 | hook = self.get_db_hook() 107 | hook.run(sql=self.sql, autocommit=self.autocommit, 108 | parameters=self.parameters) 109 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | apache-airflow-providers-slack==7.2.0 2 | airflow-provider-great-expectations==0.2.5 3 | airflow-provider-firebolt==0.1.3 4 | apache-airflow-providers-trino==4.3.1 5 | great-expectations==0.15.50 6 | lightgbm==3.2.1 7 | matplotlib==3.5.1 8 | mlflow==1.23.0 9 | openlineage-airflow==0.19.2 10 | pandas==1.3.4 11 | scikit-learn==1.0.1 12 | sqlalchemy-bigquery==1.3.0 13 | --------------------------------------------------------------------------------