├── .gitignore ├── README.md ├── data ├── yellow_tripdata_sample_2019-01.csv └── yellow_tripdata_sample_2019-02.csv ├── ge_dbt_airflow_tutorial ├── Dockerfile ├── README.md ├── airflow │ ├── ge_tutorials_dag_with_great_expectations.py │ └── ge_tutorials_dag_without_great_expectations.py ├── data │ ├── .gitkeep │ ├── npi_small.csv │ └── state_abbreviations.csv ├── dbt │ ├── dbt_project.yml │ └── models │ │ ├── count_providers_by_state.sql │ │ ├── npi_with_state.sql │ │ ├── sources.yml │ │ └── staging │ │ ├── stg_npi.sql │ │ └── stg_state_abbreviations.sql ├── deploy │ ├── config │ │ └── airflow.cfg │ └── script │ │ └── entrypoint.sh ├── docker-compose.yml ├── example_dbt_profile.yml ├── great_expectations_projects │ └── final │ │ └── great_expectations │ │ ├── .gitignore │ │ ├── config_variables.yml │ │ ├── expectations │ │ ├── count_providers_by_state │ │ │ └── critical.json │ │ ├── npi_small_db_table │ │ │ └── critical.json │ │ ├── npi_small_file │ │ │ └── critical.json │ │ └── state_abbreviations_file │ │ │ └── critical.json │ │ ├── great_expectations.yml │ │ ├── notebooks │ │ ├── pandas │ │ │ └── validation_playground.ipynb │ │ ├── spark │ │ │ └── validation_playground.ipynb │ │ └── sql │ │ │ └── validation_playground.ipynb │ │ └── plugins │ │ └── custom_data_docs │ │ └── styles │ │ └── data_docs_custom_styles.css ├── images │ ├── dbt_dag.png │ ├── enable_dag.gif │ ├── pipeline_airflow_dag_with_ge.png │ └── pipeline_airflow_dag_without_ge.png └── requirements.txt ├── getting_started_tutorial_final_v2_api ├── README.md └── great_expectations │ ├── .gitignore │ ├── checkpoints │ ├── .ge_store_backend_id │ └── my_chk.yml │ ├── expectations │ ├── .ge_store_backend_id │ └── taxi │ │ └── demo.json │ ├── great_expectations.yml │ ├── notebooks │ ├── pandas │ │ └── validation_playground.ipynb │ ├── spark │ │ └── validation_playground.ipynb │ └── sql │ │ └── validation_playground.ipynb │ └── plugins │ └── custom_data_docs │ └── styles │ └── data_docs_custom_styles.css └── getting_started_tutorial_final_v3_api ├── README.md ├── great_expectations ├── .gitignore ├── checkpoints │ ├── my_checkpoint.yml │ └── my_checkpoint_with_custom_expectation.yml ├── expectations │ ├── .ge_store_backend_id │ └── taxi │ │ ├── demo.json │ │ └── demo_with_custom_expectation.json ├── great_expectations.yml ├── notebooks │ ├── pandas │ │ └── validation_playground.ipynb │ ├── spark │ │ └── validation_playground.ipynb │ └── sql │ │ └── validation_playground.ipynb └── plugins │ ├── column_custom_max_expectation.py │ └── custom_data_docs │ └── styles │ └── data_docs_custom_styles.css └── run_checkpoint_with_custom_expectation.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | ge_dbt_airflow_tutorial/target/ 3 | ge_dbt_airflow_tutorial/dbt_modules/ 4 | logs/ 5 | .venv/ 6 | __pycache__/ 7 | .ipynb_checkpoints 8 | */.ipynb_checkpoints/* 9 | **/.DS_Store 10 | .idea/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *This repository is no longer maintained. Please see our current guides at https://docs.greatexpectations.io/docs/guides/setup/get_started_lp to get started.* 2 | 3 | # Great Expectations tutorials 4 | 5 | This repository contains the material for a number of Great Expectations tutorials. They all contain instructions in the respective README files. 
6 | 7 | **We invite community contributions for these tutorials!** 8 | 9 | 10 | ## getting_started_tutorial_final_v3_api [TBD] 11 | This example contains the final state of the "Getting started with Great Expectations" tutorial for the Great 12 | Expectations API v3 (Batch Request API), which is included in Great Expectations version 0.13 and above. 13 | It also acts as a starting point to explore and demo Great Expectations. See the README in the directory for details. 14 | 15 | ## getting_started_tutorial_final_v2_api 16 | This example contains the final state of the "Getting started with Great Expectations" tutorial for the Great Expectations 17 | API v2 (Batch Kwargs API), which applies to Great Expectations version 0.12.x and below. It also acts as a starting point 18 | to explore and demo Great Expectations. See the README in the directory for details. 19 | 20 | ## ge_dbt_airflow_tutorial 21 | This example demonstrates the use of Great Expectations in a data pipeline with dbt and Apache Airflow. 22 | See the README in the directory for details. **Note:** This tutorial currently requires an update to work with the 23 | new-style Checkpoints that were introduced in version 0.13.8. 24 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/Dockerfile: -------------------------------------------------------------------------------- 1 | # VERSION 1.10.9 2 | # AUTHOR: Matthieu "Puckel_" Roisil 3 | # DESCRIPTION: Basic Airflow container 4 | # BUILD: docker build --rm -t puckel/docker-airflow . 5 | # SOURCE: https://github.com/puckel/docker-airflow 6 | 7 | FROM python:3.7-slim-buster 8 | LABEL maintainer="Puckel_" 9 | 10 | # Never prompt the user for choices on installation/configuration of packages 11 | ENV DEBIAN_FRONTEND noninteractive 12 | ENV TERM linux 13 | 14 | # Airflow 15 | ARG AIRFLOW_VERSION=1.10.9 16 | ARG AIRFLOW_USER_HOME=/usr/local/airflow 17 | ARG AIRFLOW_DEPS="" 18 | ARG PYTHON_DEPS="" 19 | ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} 20 | 21 | # Define en_US.
22 | ENV LANGUAGE en_US.UTF-8 23 | ENV LANG en_US.UTF-8 24 | ENV LC_ALL en_US.UTF-8 25 | ENV LC_CTYPE en_US.UTF-8 26 | ENV LC_MESSAGES en_US.UTF-8 27 | 28 | # Disable noisy "Handling signal" log messages: 29 | # ENV GUNICORN_CMD_ARGS --log-level WARNING 30 | 31 | RUN set -ex \ 32 | && buildDeps=' \ 33 | freetds-dev \ 34 | libkrb5-dev \ 35 | libsasl2-dev \ 36 | libssl-dev \ 37 | libffi-dev \ 38 | libpq-dev \ 39 | git \ 40 | ' \ 41 | && apt-get update -yqq \ 42 | && apt-get upgrade -yqq \ 43 | && apt-get install -yqq --no-install-recommends \ 44 | $buildDeps \ 45 | freetds-bin \ 46 | build-essential \ 47 | default-libmysqlclient-dev \ 48 | apt-utils \ 49 | curl \ 50 | rsync \ 51 | netcat \ 52 | locales \ 53 | && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \ 54 | && locale-gen \ 55 | && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \ 56 | && useradd -ms /bin/bash -d ${AIRFLOW_USER_HOME} airflow \ 57 | && pip install -U pip setuptools wheel \ 58 | && pip install pyOpenSSL \ 59 | && pip install ndg-httpsclient \ 60 | && pip install pyasn1 \ 61 | && pip install apache-airflow[crypto,celery,postgres,hive,jdbc,mysql,ssh${AIRFLOW_DEPS:+,}${AIRFLOW_DEPS}]==${AIRFLOW_VERSION} \ 62 | && pip install 'redis==3.2' \ 63 | && if [ -n "${PYTHON_DEPS}" ]; then pip install ${PYTHON_DEPS}; fi \ 64 | && apt-get purge --auto-remove -yqq $buildDeps \ 65 | && apt-get autoremove -yqq --purge \ 66 | && apt-get clean \ 67 | && rm -rf \ 68 | /var/lib/apt/lists/* \ 69 | /tmp/* \ 70 | /var/tmp/* \ 71 | /usr/share/man \ 72 | /usr/share/doc \ 73 | /usr/share/doc-base 74 | 75 | COPY deploy/script/entrypoint.sh /entrypoint.sh 76 | 77 | COPY deploy/config/airflow.cfg ${AIRFLOW_USER_HOME}/airflow.cfg 78 | 79 | RUN chown -R airflow: ${AIRFLOW_USER_HOME} 80 | 81 | EXPOSE 8080 5555 8793 82 | 83 | RUN set -ex \ 84 | && pip install scipy \ 85 | && pip install great_expectations \ 86 | && pip install dbt \ 87 | && pip uninstall -y SQLAlchemy \ 88 | && pip install SQLAlchemy==1.3.15 89 | 90 | USER airflow 91 | WORKDIR ${AIRFLOW_USER_HOME} 92 | ENTRYPOINT ["/entrypoint.sh"] 93 | CMD ["webserver"] 94 | 95 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/README.md: -------------------------------------------------------------------------------- 1 | # Great Expectations dbt + Airflow Pipeline Tutorial 2 | 3 | The purpose of this example is to show how [Great Expectations](https://greatexpectations.io) can protect a data pipeline from bad data and code bugs. 4 | 5 | **Please note:** This tutorial is a work in progress. Feel free to provide *feedback via our [Slack channel](https://greatexpectations.io/slack), a GitHub issue, or just fork it and show us your own implementation*; we'll be happy to answer questions and iterate on the content to make it more useful for the community! 6 | 7 | ## Pipeline overview 8 | 9 | The pipeline will look familiar to lots of data teams working with the ELT pattern. 10 | It loads data from files into a database and then transforms it. 11 | 12 | Airflow is used to orchestrate the pipeline. dbt is used for the "T" (transform) step of ELT. 13 | 14 | The purpose of this tutorial is to show how the individual components work together. Therefore, the Airflow setup and the dbt DAG are kept fairly trivial, but hopefully realistic.
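Condensed to its core, the "validate, then proceed" gate that the DAGs below implement looks roughly like this (a minimal sketch using this project's datasource and suite names; the DAG wiring and run_id bookkeeping are omitted):

```python
import great_expectations as ge
from airflow import AirflowException

def validate_source_data(**kwargs):
    # Load the project configured in great_expectations.yml
    context = ge.data_context.DataContext()
    # Fetch the CSV as a batch tied to an expectation suite
    batch = context.get_batch(
        {"path": "data/npi_small.csv", "datasource": "input_files"},
        "npi_small_file.critical",
    )
    # Validate and run the configured actions (store results, update Data Docs)
    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[batch]
    )
    # Failing the task here stops all downstream loading and transformation
    if not results["success"]:
        raise AirflowException("Source data failed validation")
```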
15 | 16 | This tutorial directory contains two Airflow DAGs of this data pipeline: 17 | * before Great Expectations was added - in `airflow/ge_tutorials_dag_without_great_expectations.py` 18 | * after Great Expectations was added - in `airflow/ge_tutorials_dag_with_great_expectations.py` 19 | 20 | ### Without Great Expectations 21 | 22 | ![The airflow DAG](images/pipeline_airflow_dag_without_ge.png) 23 | 24 | 1. Load the source files to a postgres database using SQLAlchemy 25 | 2. Run the dbt DAG to create a simple analytical table; see the dbt DAG snapshot below: 26 | ![The dbt DAG](images/dbt_dag.png) 27 | 28 | 29 | ### With Great Expectations 30 | 31 | ![The airflow DAG](images/pipeline_airflow_dag_with_ge.png) 32 | 33 | 1. Use GE to validate the input CSV files. Stop if they do not meet our expectations. 34 | 2. Load the source files to a postgres database using SQLAlchemy 35 | 3. Use GE to validate that the data was loaded into the database successfully 36 | 4. Run the dbt DAG to create a simple analytical table (see the dbt DAG snapshot above) 37 | 5. Use GE to validate the analytical result. 38 | 6. If the analytical result is valid, publish (promote) the analytical table to a "prod" table by renaming it 39 | 40 | ## Setup 41 | 42 | We assume that you will run the "after" version of the pipeline, with Great Expectations integrated. 43 | 44 | Instructions are provided below to set up this tutorial either with or without using Docker. 45 | 46 | ### Setup with Docker 47 | 48 | If you want to quickly get started, use Docker. If you already know Docker, then we have a shortcut for you to run the project: 49 | 50 | ``` 51 | git clone https://github.com/superconductive/ge_tutorials.git 52 | cd ge_tutorials/ge_dbt_airflow_tutorial 53 | # you can run this command every time you need to start the stack now: 54 | docker-compose up 55 | docker exec ge_dbt_airflow_tutorial_webserver_1 airflow upgradedb 56 | ``` 57 | 58 | Once these steps are completed, you can access Airflow at http://localhost:8080/admin/. 59 | 60 | To run the DAG, you first need to turn it on, then manually trigger it. You can do so through the UI: 61 | 62 | ![Screen Recording](images/enable_dag.gif) 63 | (https://share.getcloudapp.com/7Ku0oygJ) 64 | 65 | Once the DAG has run successfully, you'll be able to access the Great Expectations Data Docs at the following URL: http://localhost:8081 66 | 67 | From there, the containers will reload on modifications made to the dbt models, the final Great Expectations suites, and the Airflow DAGs. Don't forget to reload the page to pick up frontend changes, though. 68 | 69 | ---- 70 | 71 | ### Setup without Docker 72 | 73 | In order to run this project, you will need to go through some basic setup steps. 74 | 75 | #### Database setup 76 | For the purpose of this demo, we assume you have a relational database available that can be accessed using a SQLAlchemy connection URL. We developed the tutorial using a postgres database. Of course, this can be replaced by any other DBMS when working on a real pipeline. 77 | Create an empty database named `tutorials_db`. 78 | 79 | #### Great Expectations 80 | 81 | * Install Great Expectations 82 | 83 | ``` 84 | pip install great_expectations 85 | ``` 86 | 87 | #### dbt 88 | 89 | * Make sure that you have dbt installed and set up 90 | * Add your database credentials to your dbt profile (see `example_dbt_profile.yml` in this project) 91 | 92 | #### Airflow 93 | 94 | * Make sure you have Airflow installed and set up.
95 | * Point the dags_folder in airflow.cfg to the root directory of this project 96 | 97 | #### Environment variables 98 | 99 | The pipeline's configuration variables are passed using environment variables. Set the following variables: 100 | ``` 101 | export GE_TUTORIAL_DB_URL=postgresql://your_user:your_password@your_db_host:5432/your_db_name 102 | export GE_TUTORIAL_ROOT_PATH=your_project_path 103 | ``` 104 | 105 | ## Running the pipeline 106 | 107 | You can run each individual task in the Airflow DAG with `airflow test ge_tutorials_dag_with_ge <task_id> <execution_date>`. 108 | In order to run the entire DAG, use `airflow backfill ge_tutorials_dag_with_ge -s <start_date> -e <end_date>`. 109 | 110 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/airflow/ge_tutorials_dag_with_great_expectations.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import airflow 3 | from airflow import AirflowException 4 | from airflow.operators.bash_operator import BashOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | from airflow import DAG 7 | import os 8 | import pandas as pd 9 | from sqlalchemy import create_engine 10 | import great_expectations as ge 11 | 12 | 13 | # Global variables that are set using environment variables 14 | GE_TUTORIAL_DB_URL = os.getenv('GE_TUTORIAL_DB_URL') 15 | GE_TUTORIAL_ROOT_PATH = os.getenv('GE_TUTORIAL_ROOT_PATH') 16 | 17 | great_expectations_context_path = os.getenv('GE_TUTORIAL_GE_CONTEXT_PATH') or os.path.join(GE_TUTORIAL_ROOT_PATH, "great_expectations_projects", "final", "great_expectations") 18 | 19 | 20 | default_args = { 21 | "owner": "Airflow", 22 | "start_date": airflow.utils.dates.days_ago(1) 23 | } 24 | 25 | 26 | # The DAG definition 27 | dag = DAG( 28 | dag_id='ge_tutorials_dag_with_ge', 29 | default_args=default_args, 30 | schedule_interval=None, 31 | ) 32 | 33 | 34 | def load_files_into_db(ds, **kwargs): 35 | """ 36 | A method to simply load CSV files into a database using SQLAlchemy.
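    Drops any existing npi_small and state_abbreviations tables, lowercases
    the npi_small column names so they match the database expectation suite,
    and loads both CSV files with pandas' to_sql.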
37 | """ 38 | 39 | engine = create_engine(GE_TUTORIAL_DB_URL) 40 | 41 | with engine.connect() as conn: 42 | conn.execute("drop table if exists npi_small cascade ") 43 | conn.execute("drop table if exists state_abbreviations cascade ") 44 | 45 | df_npi_small = pd.read_csv(os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "npi_small.csv")) 46 | column_rename_dict = {old_column_name: old_column_name.lower() for old_column_name in df_npi_small.columns} 47 | df_npi_small.rename(columns=column_rename_dict, inplace=True) 48 | df_npi_small.to_sql("npi_small", engine, 49 | schema=None, 50 | if_exists='replace', 51 | index=False, 52 | index_label=None, 53 | chunksize=None, 54 | dtype=None) 55 | 56 | df_state_abbreviations = pd.read_csv(os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "state_abbreviations.csv")) 57 | df_state_abbreviations.to_sql("state_abbreviations", engine, 58 | schema=None, 59 | if_exists='replace', 60 | index=False, 61 | index_label=None, 62 | chunksize=None, 63 | dtype=None) 64 | 65 | return 'Loaded files into the database' 66 | 67 | 68 | def validate_source_data(ds, **kwargs): 69 | 70 | context = ge.data_context.DataContext(great_expectations_context_path) 71 | 72 | batch_kwargs_file = {"path": os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "npi_small.csv"), 73 | 'datasource': 'input_files'} 74 | 75 | batch_file = context.get_batch(batch_kwargs_file, 'npi_small_file.critical') 76 | 77 | 78 | results = context.run_validation_operator( 79 | "action_list_operator", 80 | assets_to_validate=[batch_file], 81 | run_id="airflow:" + kwargs['dag_run'].run_id + ":" + str(kwargs['dag_run'].start_date)) 82 | 83 | if not results["success"]: 84 | raise AirflowException("Validation of the source data is not successful ") 85 | 86 | 87 | def validate_source_data_load(ds, **kwargs): 88 | 89 | # Data Context is a GE object that represents your project. 90 | # Your project's great_expectations.yml contains all the config 91 | # options for the project's GE Data Context. 92 | context = ge.data_context.DataContext(great_expectations_context_path) 93 | 94 | datasource_name_file = "input_files" 95 | expectation_suite_name_file = "npi_small_file.critical" 96 | batch_kwargs_file = {"path": os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "npi_small.csv"), 97 | 'datasource': 'input_files'} 98 | batch_file = context.get_batch(batch_kwargs_file, expectation_suite_name_file) 99 | 100 | expectation_suite_name_db = "npi_small_db_table.critical" 101 | datasource_name_file_db = "datawarehouse" 102 | 103 | # If you would like to validate an entire table or view in your database's default schema: 104 | batch_kwargs_db = {'table': "npi_small", 'datasource': datasource_name_file_db} 105 | 106 | # # If you would like to validate an entire table or view from a non-default schema in your database: 107 | # batch_kwargs = {'table': "YOUR_TABLE", "schema": "YOUR_SCHEMA", 'datasource': datasource_name} 108 | 109 | # If you would like to validate the result set of a query: 110 | # batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name} 111 | 112 | batch_db = context.get_batch(batch_kwargs_db, expectation_suite_name_db) 113 | 114 | # Call a validation operator to validate the batch. 115 | # The operator will evaluate the data against the expectations 116 | # and perform a list of actions, such as saving the validation 117 | # result, updating Data Docs, and firing a notification (e.g., Slack). 
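    # Build a run_id that ties these validation results back to this Airflow
    # DAG run; the commented-out line below is a timestamp-based alternative.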
118 | # run_id = datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")  # alternative: a timestamp-based run id 119 | run_id = "airflow:" + kwargs['dag_run'].run_id + ":" + str(kwargs['dag_run'].start_date) 120 | results = context.run_validation_operator( 121 | "action_list_operator", 122 | assets_to_validate=[batch_file, batch_db], 123 | run_id=run_id) # e.g., Airflow run id or some run identifier that your pipeline uses. 124 | 125 | if not results["success"]: 126 | raise AirflowException("Validation of the source data load was not successful") 127 | 128 | 129 | 130 | def validate_analytical_output(ds, **kwargs): 131 | 132 | # Data Context is a GE object that represents your project. 133 | # Your project's great_expectations.yml contains all the config 134 | # options for the project's GE Data Context. 135 | context = ge.data_context.DataContext(great_expectations_context_path) 136 | 137 | datasource_name = "datawarehouse" # a datasource configured in your great_expectations.yml 138 | 139 | # Tell GE how to fetch the batch of data that should be validated... 140 | 141 | # ... from the result set of a SQL query: 142 | # batch_kwargs = {"query": "your SQL query", "datasource": datasource_name} 143 | 144 | # ... or from a database table: 145 | batch_kwargs = {"table": "count_providers_by_state", "datasource": datasource_name} 146 | 147 | # ... or from a file: 148 | # batch_kwargs = {"path": "path to your data file", "datasource": datasource_name} 149 | 150 | # ... or from a Pandas or PySpark DataFrame 151 | # batch_kwargs = {"dataset": "your Pandas or PySpark DataFrame", "datasource": datasource_name} 152 | 153 | # Get the batch of data you want to validate. 154 | # Specify the name of the expectation suite that holds the expectations. 155 | expectation_suite_name = "count_providers_by_state.critical" # this is an example of 156 | # a suite that you created 157 | batch = context.get_batch(batch_kwargs, expectation_suite_name) 158 | 159 | # Call a validation operator to validate the batch. 160 | # The operator will evaluate the data against the expectations 161 | # and perform a list of actions, such as saving the validation 162 | # result, updating Data Docs, and firing a notification (e.g., Slack). 163 | # run_id = datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")  # alternative: a timestamp-based run id 164 | run_id = "airflow:" + kwargs['dag_run'].run_id + ":" + str(kwargs['dag_run'].start_date) 165 | results = context.run_validation_operator( 166 | "action_list_operator", 167 | assets_to_validate=[batch], 168 | run_id=run_id) # e.g., Airflow run id or some run identifier that your pipeline uses. 169 | 170 | if not results["success"]: 171 | raise AirflowException("The analytical output does not meet the expectations in the suite: {0:s}".format(expectation_suite_name)) 172 | 173 | 174 | def publish_to_prod(): 175 | """ 176 | A method to simply "promote" a table in a database by renaming it using SQLAlchemy.
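    Runs only after all validations have passed: drops any previous
    prod_count_providers_by_state table and renames the freshly built
    count_providers_by_state table into its place.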
177 | """ 178 | engine = create_engine(GE_TUTORIAL_DB_URL) 179 | 180 | with engine.connect() as conn: 181 | conn.execute("drop table if exists prod_count_providers_by_state") 182 | conn.execute("alter table count_providers_by_state rename to prod_count_providers_by_state") 183 | 184 | 185 | task_validate_source_data = PythonOperator( 186 | task_id='task_validate_source_data', 187 | python_callable=validate_source_data, 188 | provide_context=True, 189 | dag=dag) 190 | 191 | task_load_files_into_db = PythonOperator( 192 | task_id='task_load_files_into_db', 193 | provide_context=True, 194 | python_callable=load_files_into_db, 195 | dag=dag, 196 | ) 197 | 198 | task_validate_source_data_load = PythonOperator( 199 | task_id='task_validate_source_data_load', 200 | python_callable=validate_source_data_load, 201 | provide_context=True, 202 | dag=dag) 203 | 204 | task_transform_data_in_db = BashOperator( 205 | task_id='task_transform_data_in_db', 206 | bash_command='dbt run --project-dir {}'.format(os.path.join(GE_TUTORIAL_ROOT_PATH, 'dbt')), 207 | dag=dag) 208 | 209 | 210 | task_validate_analytical_output = PythonOperator( 211 | task_id='task_validate_analytical_output', 212 | python_callable=validate_analytical_output, 213 | provide_context=True, 214 | dag=dag) 215 | 216 | 217 | task_publish = PythonOperator( 218 | task_id='task_publish', 219 | python_callable=publish_to_prod, 220 | dag=dag) 221 | 222 | 223 | # DAG dependencies 224 | task_validate_source_data >> task_load_files_into_db >> task_validate_source_data_load >> task_transform_data_in_db >> task_validate_analytical_output >> task_publish 225 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/airflow/ge_tutorials_dag_without_great_expectations.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import airflow 3 | from airflow import AirflowException 4 | from airflow.operators.bash_operator import BashOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | from airflow import DAG 7 | import os 8 | import pandas as pd 9 | from sqlalchemy import create_engine 10 | 11 | 12 | # Global variables that are set using environment variables 13 | GE_TUTORIAL_DB_URL = os.getenv('GE_TUTORIAL_DB_URL') 14 | GE_TUTORIAL_ROOT_PATH = os.getenv('GE_TUTORIAL_ROOT_PATH') 15 | 16 | 17 | default_args = { 18 | "owner": "Airflow", 19 | "start_date": airflow.utils.dates.days_ago(1) 20 | } 21 | 22 | 23 | # The DAG definition 24 | dag = DAG( 25 | dag_id='ge_tutorials_dag_no_ge', 26 | default_args=default_args, 27 | schedule_interval=None, 28 | ) 29 | 30 | 31 | def load_files_into_db(ds, **kwargs): 32 | """ 33 | A method to simply load CSV files into a database using SQLAlchemy.
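    Identical to the loader in the DAG with Great Expectations; this
    version of the pipeline simply runs it without any validation gates.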
34 | """ 35 | 36 | engine = create_engine(GE_TUTORIAL_DB_URL) 37 | 38 | with engine.connect() as conn: 39 | conn.execute("drop table if exists npi_small cascade ") 40 | conn.execute("drop table if exists state_abbreviations cascade ") 41 | 42 | df_npi_small = pd.read_csv(os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "npi_small.csv")) 43 | column_rename_dict = {old_column_name: old_column_name.lower() for old_column_name in df_npi_small.columns} 44 | df_npi_small.rename(columns=column_rename_dict, inplace=True) 45 | df_npi_small.to_sql("npi_small", engine, 46 | schema=None, 47 | if_exists='replace', 48 | index=False, 49 | index_label=None, 50 | chunksize=None, 51 | dtype=None) 52 | 53 | df_state_abbreviations = pd.read_csv(os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "state_abbreviations.csv")) 54 | df_state_abbreviations.to_sql("state_abbreviations", engine, 55 | schema=None, 56 | if_exists='replace', 57 | index=False, 58 | index_label=None, 59 | chunksize=None, 60 | dtype=None) 61 | 62 | return 'Loaded files into the database' 63 | 64 | 65 | task_load_files_into_db = PythonOperator( 66 | task_id='task_load_files_into_db', 67 | provide_context=True, 68 | python_callable=load_files_into_db, 69 | dag=dag, 70 | ) 71 | 72 | 73 | task_transform_data_in_db = BashOperator( 74 | task_id='task_transform_data_in_db', 75 | bash_command='dbt run --project-dir {}'.format(os.path.join(GE_TUTORIAL_ROOT_PATH, 'dbt')), 76 | dag=dag) 77 | 78 | 79 | # DAG dependencies 80 | task_load_files_into_db >> task_transform_data_in_db 81 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greatexpectationslabs/ge_tutorials/1b04332e4f9b4d8621a95e7aa2837c371f41e682/ge_dbt_airflow_tutorial/data/.gitkeep -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/data/state_abbreviations.csv: -------------------------------------------------------------------------------- 1 | "name","abbreviation" 2 | "Alabama","AL" 3 | "Alaska","AK" 4 | "American Samoa","AS" 5 | "Arizona","AZ" 6 | "Arkansas","AR" 7 | "California","CA" 8 | "Colorado","CO" 9 | "Connecticut","CT" 10 | "Delaware","DE" 11 | "District Of Columbia","DC" 12 | "Federated States Of Micronesia","FM" 13 | "Florida","FL" 14 | "Georgia","GA" 15 | "Guam","GU" 16 | "Hawaii","HI" 17 | "Idaho","ID" 18 | "Illinois","IL" 19 | "Indiana","IN" 20 | "Iowa","IA" 21 | "Kansas","KS" 22 | "Kentucky","KY" 23 | "Louisiana","LA" 24 | "Maine","ME" 25 | "Marshall Islands","MH" 26 | "Maryland","MD" 27 | "Massachusetts","MA" 28 | "Michigan","MI" 29 | "Minnesota","MN" 30 | "Mississippi","MS" 31 | "Missouri","MO" 32 | "Montana","MT" 33 | "Nebraska","NE" 34 | "Nevada","NV" 35 | "New Hampshire","NH" 36 | "New Jersey","NJ" 37 | "New Mexico","NM" 38 | "New York","NY" 39 | "North Carolina","NC" 40 | "North Dakota","ND" 41 | "Northern Mariana Islands","MP" 42 | "Ohio","OH" 43 | "Oklahoma","OK" 44 | "Oregon","OR" 45 | "Palau","PW" 46 | "Pennsylvania","PA" 47 | "Puerto Rico","PR" 48 | "Rhode Island","RI" 49 | "South Carolina","SC" 50 | "South Dakota","SD" 51 | "Tennessee","TN" 52 | "Texas","TX" 53 | "Utah","UT" 54 | "Vermont","VT" 55 | "Virgin Islands","VI" 56 | "Virginia","VA" 57 | "Washington","WA" 58 | "West Virginia","WV" 59 | "Wisconsin","WI" 60 | "Wyoming","WY" 61 | -------------------------------------------------------------------------------- 
/ge_dbt_airflow_tutorial/dbt/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models. 5 | name: 'ge_tutorials' 6 | version: '1.0.0' 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'ge_tutorials' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `source-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | source-paths: ["models"] 15 | data-paths: ["data"] 16 | 17 | target-path: "target" # directory which will store compiled SQL files 18 | clean-targets: # directories to be removed by `dbt clean` 19 | - "target" 20 | - "dbt_modules" 21 | 22 | # Configuring models 23 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 24 | 25 | # In this example config, we tell dbt to build all models in this project 26 | # as views. These settings can be overridden in the individual model files 27 | # using the `{{ config(...) }}` macro. 28 | models: 29 | ge_tutorials: 30 | materialized: view 31 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/dbt/models/count_providers_by_state.sql: -------------------------------------------------------------------------------- 1 | select 2 | state_name, 3 | count(distinct npi) as count_providers 4 | from {{ ref('npi_with_state') }} n 5 | group by state_name 6 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/dbt/models/npi_with_state.sql: -------------------------------------------------------------------------------- 1 | select 2 | n.npi, 3 | n.entity_type_code, 4 | n.organization_name, 5 | n.last_name, 6 | n.first_name, 7 | n.taxonomy_code, 8 | n.state_abbreviation, 9 | s.state_name 10 | from {{ ref('stg_npi') }} n 11 | -- due to the nature of the data, some state abbreviations are not valid, 12 | -- which would leave state names null if we used a left join - 13 | -- hence the inner join here 14 | inner join {{ ref('stg_state_abbreviations') }} s 15 | on n.state_abbreviation = s.state_abbreviation 16 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/dbt/models/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: source 5 | schema: public 6 | tables: 7 | - name: npi_small 8 | - name: state_abbreviations 9 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/dbt/models/staging/stg_npi.sql: -------------------------------------------------------------------------------- 1 | select 2 | npi as npi, 3 | entity_type_code as entity_type_code, 4 | organization_name as organization_name, 5 | last_name as last_name, 6 | first_name as first_name, 7 | state as state_abbreviation, 8 | taxonomy_code as taxonomy_code 9 | from {{ source('source', 'npi_small') }} 10 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/dbt/models/staging/stg_state_abbreviations.sql: -------------------------------------------------------------------------------- 1 | select 2 | name
as state_name, 3 | abbreviation as state_abbreviation 4 | from {{ source('source', 'state_abbreviations') }} 5 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/deploy/script/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # User-provided configuration must always be respected. 4 | # 5 | # Therefore, this script must only derive Airflow AIRFLOW__ variables from other variables 6 | # when the user did not provide their own configuration. 7 | 8 | TRY_LOOP="20" 9 | 10 | # Global defaults and back-compat 11 | : "${AIRFLOW_HOME:="/usr/local/airflow"}" 12 | : "${AIRFLOW__CORE__FERNET_KEY:=${FERNET_KEY:=$(python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)")}}" 13 | : "${AIRFLOW__CORE__EXECUTOR:=${EXECUTOR:-Sequential}Executor}" 14 | 15 | # Load DAGs examples (default: Yes) 16 | if [[ -z "$AIRFLOW__CORE__LOAD_EXAMPLES" && "${LOAD_EX:=n}" == n ]]; then 17 | AIRFLOW__CORE__LOAD_EXAMPLES=False 18 | fi 19 | 20 | export \ 21 | AIRFLOW_HOME \ 22 | AIRFLOW__CORE__EXECUTOR \ 23 | AIRFLOW__CORE__FERNET_KEY \ 24 | AIRFLOW__CORE__LOAD_EXAMPLES \ 25 | 26 | # Install custom python package if requirements.txt is present 27 | if [ -e "/requirements.txt" ]; then 28 | $(command -v pip) install --user -r /requirements.txt 29 | fi 30 | 31 | wait_for_port() { 32 | local name="$1" host="$2" port="$3" 33 | local j=0 34 | while ! nc -z "$host" "$port" >/dev/null 2>&1 < /dev/null; do 35 | j=$((j+1)) 36 | if [ $j -ge $TRY_LOOP ]; then 37 | echo >&2 "$(date) - $host:$port still not reachable, giving up" 38 | exit 1 39 | fi 40 | echo "$(date) - waiting for $name... $j/$TRY_LOOP" 41 | sleep 5 42 | done 43 | } 44 | 45 | # Executors other than SequentialExecutor need a SQL database; here PostgreSQL is used 46 | if [ "$AIRFLOW__CORE__EXECUTOR" != "SequentialExecutor" ]; then 47 | # Check if the user has provided explicit Airflow configuration concerning the database 48 | if [ -z "$AIRFLOW__CORE__SQL_ALCHEMY_CONN" ]; then 49 | # Default values corresponding to the default compose files 50 | : "${POSTGRES_HOST:="postgres"}" 51 | : "${POSTGRES_PORT:="5432"}" 52 | : "${POSTGRES_USER:="airflow"}" 53 | : "${POSTGRES_PASSWORD:="airflow"}" 54 | : "${POSTGRES_DB:="airflow"}" 55 | : "${POSTGRES_EXTRAS:-""}" 56 | 57 | AIRFLOW__CORE__SQL_ALCHEMY_CONN="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}" 58 | export AIRFLOW__CORE__SQL_ALCHEMY_CONN 59 | 60 | # Check if the user has provided explicit Airflow configuration for the broker's connection to the database 61 | if [ "$AIRFLOW__CORE__EXECUTOR" = "CeleryExecutor" ]; then 62 | AIRFLOW__CELERY__RESULT_BACKEND="db+postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}" 63 | export AIRFLOW__CELERY__RESULT_BACKEND 64 | fi 65 | else 66 | if [[ "$AIRFLOW__CORE__EXECUTOR" == "CeleryExecutor" && -z "$AIRFLOW__CELERY__RESULT_BACKEND" ]]; then 67 | >&2 printf '%s\n' "FATAL: if you set AIRFLOW__CORE__SQL_ALCHEMY_CONN manually with CeleryExecutor you must also set AIRFLOW__CELERY__RESULT_BACKEND" 68 | exit 1 69 | fi 70 | 71 | # Derive useful variables from the AIRFLOW__ variables provided explicitly by the user 72 | POSTGRES_ENDPOINT=$(echo -n "$AIRFLOW__CORE__SQL_ALCHEMY_CONN" | cut -d '/' -f3 | sed -e 's,.*@,,') 73 | POSTGRES_HOST=$(echo -n
"$POSTGRES_ENDPOINT" | cut -d ':' -f1) 74 | POSTGRES_PORT=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f2) 75 | fi 76 | 77 | wait_for_port "Postgres" "$POSTGRES_HOST" "$POSTGRES_PORT" 78 | fi 79 | 80 | # CeleryExecutor drives the need for a Celery broker, here Redis is used 81 | if [ "$AIRFLOW__CORE__EXECUTOR" = "CeleryExecutor" ]; then 82 | # Check if the user has provided explicit Airflow configuration concerning the broker 83 | if [ -z "$AIRFLOW__CELERY__BROKER_URL" ]; then 84 | # Default values corresponding to the default compose files 85 | : "${REDIS_PROTO:="redis://"}" 86 | : "${REDIS_HOST:="redis"}" 87 | : "${REDIS_PORT:="6379"}" 88 | : "${REDIS_PASSWORD:=""}" 89 | : "${REDIS_DBNUM:="1"}" 90 | 91 | # When Redis is secured by basic auth, it does not handle the username part of basic auth, only a token 92 | if [ -n "$REDIS_PASSWORD" ]; then 93 | REDIS_PREFIX=":${REDIS_PASSWORD}@" 94 | else 95 | REDIS_PREFIX= 96 | fi 97 | 98 | AIRFLOW__CELERY__BROKER_URL="${REDIS_PROTO}${REDIS_PREFIX}${REDIS_HOST}:${REDIS_PORT}/${REDIS_DBNUM}" 99 | export AIRFLOW__CELERY__BROKER_URL 100 | else 101 | # Derive useful variables from the AIRFLOW__ variables provided explicitly by the user 102 | REDIS_ENDPOINT=$(echo -n "$AIRFLOW__CELERY__BROKER_URL" | cut -d '/' -f3 | sed -e 's,.*@,,') 103 | REDIS_HOST=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f1) 104 | REDIS_PORT=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f2) 105 | fi 106 | 107 | wait_for_port "Redis" "$REDIS_HOST" "$REDIS_PORT" 108 | fi 109 | 110 | case "$1" in 111 | webserver) 112 | airflow initdb 113 | if [ "$AIRFLOW__CORE__EXECUTOR" = "LocalExecutor" ] || [ "$AIRFLOW__CORE__EXECUTOR" = "SequentialExecutor" ]; then 114 | # With the "Local" and "Sequential" executors it should all run in one container. 115 | airflow scheduler & 116 | fi 117 | exec airflow webserver 118 | ;; 119 | worker|scheduler) 120 | # Give the webserver time to run initdb. 121 | sleep 10 122 | exec airflow "$@" 123 | ;; 124 | flower) 125 | sleep 10 126 | exec airflow "$@" 127 | ;; 128 | version) 129 | exec airflow "$@" 130 | ;; 131 | *) 132 | # The command is something like bash, not an airflow subcommand. Just run it in the right environment. 133 | exec "$@" 134 | ;; 135 | esac 136 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | postgres: 4 | image: postgres:9.6 5 | environment: 6 | - POSTGRES_USER=airflow 7 | - POSTGRES_PASSWORD=airflow 8 | - POSTGRES_DB=airflow 9 | ports: 10 | - "5432:5432" 11 | 12 | logging: 13 | options: 14 | max-size: 10m 15 | max-file: "3" 16 | 17 | webserver: 18 | build: . 
19 | restart: always 20 | depends_on: 21 | - postgres 22 | environment: 23 | - LOAD_EX=n 24 | - EXECUTOR=Local 25 | # TODO: create a separate schema for real data 26 | - GE_TUTORIAL_DB_URL=postgres://airflow:airflow@postgres:5432/airflow 27 | - GE_TUTORIAL_ROOT_PATH=/usr/local/airflow/ 28 | - GE_TUTORIAL_PROJECT_PATH=/usr/local/airflow/ 29 | logging: 30 | options: 31 | max-size: 10m 32 | max-file: "3" 33 | volumes: 34 | # TODO: Might be better to mount everything at once 35 | - ./airflow:/usr/local/airflow/dags 36 | - ./great_expectations_projects/final/great_expectations:/usr/local/airflow/great_expectations_projects/final/great_expectations 37 | - ./dbt:/usr/local/airflow/dbt 38 | - ./data:/usr/local/airflow/data 39 | - ./requirements.txt:/requirements.txt 40 | - ./example_dbt_profile.yml:/usr/local/airflow/.dbt/profiles.yml 41 | ports: 42 | - "8080:8080" 43 | - "8888:8888" 44 | command: webserver 45 | healthcheck: 46 | test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"] 47 | interval: 30s 48 | timeout: 30s 49 | retries: 3 50 | ge_data_docs: 51 | image: flashspys/nginx-static 52 | container_name: ge_data_docs 53 | ports: 54 | - 8081:80 55 | volumes: 56 | - ./great_expectations_projects/final/great_expectations/uncommitted/data_docs/local_site:/static -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/example_dbt_profile.yml: -------------------------------------------------------------------------------- 1 | # This is an example dbt_profile.yml file that should live in your .dbt 2 | # directory that's set up during `dbt init`. It is only included in this repo as an example. 3 | 4 | # For more information on how to configure this file, please see: 5 | # https://docs.getdbt.com/docs/profile 6 | 7 | ge_tutorials: 8 | outputs: 9 | dev: 10 | type: postgres 11 | threads: 1 12 | host: postgres 13 | port: 5432 14 | user: airflow 15 | pass: airflow 16 | dbname: airflow 17 | schema: public 18 | prod: 19 | type: postgres 20 | threads: 1 21 | host: postgres 22 | port: 5432 23 | user: airflow 24 | pass: airflow 25 | dbname: airflow 26 | schema: public 27 | target: dev 28 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/.gitignore: -------------------------------------------------------------------------------- 1 | uncommitted/ -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/config_variables.yml: -------------------------------------------------------------------------------- 1 | ge_comment_preservation_key: 1 2 | # This config file supports variable substitution which enables: 1) keeping 3 | # secrets out of source control & 2) environment-based configuration changes 4 | # such as staging vs prod. 5 | # 6 | # When GE encounters substitution syntax (like `my_key: ${my_value}` or 7 | # `my_key: $my_value`) in the config file, it will attempt to replace the value 8 | # of `my_key` with the value from an environment variable `my_value` or a 9 | # corresponding key read from the file specified using 10 | # `config_variables_file_path`. Environment variables take precedence. 11 | # 12 | # If the substitution value comes from the config variables file, it can be a 13 | # simple (non-nested) value or a nested value such as a dictionary. If it comes 14 | # from an environment variable, it must be a simple value. 
Read more at: 15 | # 16 | datawarehouse: 17 | drivername: postgres 18 | username: airflow 19 | password: airflow 20 | host: postgres 21 | database: airflow 22 | port: '5432' 23 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/expectations/count_providers_by_state/critical.json: -------------------------------------------------------------------------------- 1 | {"expectation_suite_name": "count_providers_by_state.critical", "data_asset_type": "Dataset", "expectations": [{"expectation_type": "expect_column_values_to_be_of_type", "kwargs": {"column": "count_providers", "type_": "BIGINT"}, "meta": {}}, {"expectation_type": "expect_column_values_to_not_be_null", "kwargs": {"column": "state_name"}, "meta": {}}, {"expectation_type": "expect_column_values_to_be_unique", "kwargs": {"column": "state_name"}, "meta": {}}], "meta": {"great_expectations.__version__": "0.9.2"}} -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/expectations/npi_small_db_table/critical.json: -------------------------------------------------------------------------------- 1 | {"data_asset_type": "Dataset", "expectation_suite_name": "npi_small_db_table.critical", "meta": {"great_expectations.__version__": "0.9.2"}, "expectations": [{"expectation_type": "expect_table_row_count_to_equal", "kwargs": {"value": {"$PARAMETER": "urn:great_expectations:validations:npi_small_file.critical:expect_table_row_count_to_be_between.result.observed_value"}}, "meta": {}}, {"expectation_type": "expect_table_column_count_to_equal", "kwargs": {"value": 7}, "meta": {}}, {"expectation_type": "expect_table_columns_to_match_ordered_list", "kwargs": {"column_list": ["npi", "entity_type_code", "organization_name", "last_name", "first_name", "state", "taxonomy_code"]}, "meta": {}}]} -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/expectations/npi_small_file/critical.json: -------------------------------------------------------------------------------- 1 | {"expectations": [{"meta": {}, "kwargs": {"min_value": 1, "max_value": 1000000}, "expectation_type": "expect_table_row_count_to_be_between"}, {"meta": {}, "kwargs": {"value": 7}, "expectation_type": "expect_table_column_count_to_equal"}, {"meta": {}, "kwargs": {"column_list": ["NPI", "Entity_Type_Code", "Organization_Name", "Last_Name", "First_Name", "State", "Taxonomy_Code"]}, "expectation_type": "expect_table_columns_to_match_ordered_list"}, {"meta": {}, "kwargs": {"column": "State", "mostly": 0.05}, "expectation_type": "expect_column_values_to_not_be_null"}, {"meta": {}, "kwargs": {"column": "State", "value_set": ["AE", "AK", "AL", "AP", "AR", "AS", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "GU", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MP", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "PR", "PUERTO RICO", "PW", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VI", "VT", "WA", "WI", "WV", "WY"], "mostly": 0.05}, "expectation_type": "expect_column_values_to_be_in_set"}, {"meta": {}, "kwargs": {"column": "NPI"}, "expectation_type": "expect_column_values_to_not_be_null"}], "meta": {"great_expectations.__version__": "0.9.2"}, "data_asset_type": "Dataset", "expectation_suite_name": 
"npi_small_file.critical"} -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/expectations/state_abbreviations_file/critical.json: -------------------------------------------------------------------------------- 1 | {"expectations": [{"kwargs": {"min_value": 49, "max_value": 69}, "expectation_type": "expect_table_row_count_to_be_between", "meta": {"SampleExpectationsDatasetProfiler": {"confidence": "very low"}}}, {"kwargs": {"value": 2}, "expectation_type": "expect_table_column_count_to_equal", "meta": {"SampleExpectationsDatasetProfiler": {"confidence": "very low"}}}, {"kwargs": {"column_list": ["name", "abbreviation"]}, "expectation_type": "expect_table_columns_to_match_ordered_list", "meta": {"SampleExpectationsDatasetProfiler": {"confidence": "very low"}}}, {"kwargs": {"column": "name"}, "expectation_type": "expect_column_values_to_not_be_null", "meta": {"SampleExpectationsDatasetProfiler": {"confidence": "very low"}}}, {"kwargs": {"column": "name", "min_value": 1}, "expectation_type": "expect_column_value_lengths_to_be_between", "meta": {"SampleExpectationsDatasetProfiler": {"confidence": "very low"}}}], "meta": {"great_expectations.__version__": "0.9.2", "columns": {"name": {"description": ""}, "abbreviation": {"description": ""}}, "notes": {"format": "markdown", "content": ["#### This is an _example_ suite\n\n- This suite was made by quickly glancing at 1000 rows of your data.\n- This is **not a production suite**. It is meant to show examples of expectations.\n- Because this suite was auto-generated using a very basic profiler that does not know your data like you do, many of the expectations may not be meaningful.\n"]}, "SampleExpectationsDatasetProfiler": {"created_by": "SampleExpectationsDatasetProfiler", "created_at": 1582843165.39407, "batch_kwargs": {"path": "/Users/eugenemandel/projects/ge_tutorials/great_expectations/../data/state_abbreviations.csv", "datasource": "input_files"}}}, "expectation_suite_name": "state_abbreviations_file.critical", "data_asset_type": "Dataset"} -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/great_expectations.yml: -------------------------------------------------------------------------------- 1 | # Welcome to Great Expectations! Always know what to expect from your data. 2 | # 3 | # Here you can define datasources, batch kwarg generators, integrations and 4 | # more. This file is intended to be committed to your repo. For help with 5 | # configuration please: 6 | # - Read our docs: https://docs.greatexpectations.io/en/latest/reference/data_context_reference.html#configuration 7 | # - Join our slack channel: http://greatexpectations.io/slack 8 | 9 | config_version: 2 10 | 11 | # Datasources tell Great Expectations where your data lives and how to get it. 12 | # You can use the CLI command `great_expectations datasource new` to help you 13 | # add a new datasource. 
Read more at https://docs.greatexpectations.io/en/latest/features/datasource.html 14 | datasources: 15 | input_files: 16 | data_asset_type: 17 | class_name: PandasDataset 18 | class_name: PandasDatasource 19 | module_name: great_expectations.datasource 20 | datawarehouse: 21 | credentials: ${datawarehouse} 22 | data_asset_type: 23 | class_name: SqlAlchemyDataset 24 | class_name: SqlAlchemyDatasource 25 | module_name: great_expectations.datasource 26 | config_variables_file_path: config_variables.yml 27 | 28 | # The plugins_directory will be added to your python path for custom modules 29 | # used to override and extend Great Expectations. 30 | plugins_directory: plugins/ 31 | 32 | # Validation Operators are customizable workflows that bundle the validation of 33 | # one or more expectation suites and subsequent actions. The example below 34 | # stores validation results and updates Data Docs. To read more about 35 | # customizing and extending these, read: https://docs.greatexpectations.io/en/latest/features/validation_operators_and_actions.html 36 | validation_operators: 37 | action_list_operator: 38 | class_name: ActionListValidationOperator 39 | action_list: 40 | - name: store_validation_result 41 | action: 42 | class_name: StoreValidationResultAction 43 | - name: store_evaluation_params 44 | action: 45 | class_name: StoreEvaluationParametersAction 46 | - name: update_data_docs 47 | action: 48 | class_name: UpdateDataDocsAction 49 | stores: 50 | # Stores are configurable places to store things like Expectations, Validations 51 | # Data Docs, and more. These are for advanced users only - most users can simply 52 | # leave this section alone. 53 | # 54 | # Three stores are required: expectations, validations, and 55 | # evaluation_parameters, and must exist with a valid store entry. Additional 56 | # stores can be configured for uses such as data_docs, validation_operators, etc. 57 | expectations_store: 58 | class_name: ExpectationsStore 59 | store_backend: 60 | class_name: TupleFilesystemStoreBackend 61 | base_directory: expectations/ 62 | 63 | validations_store: 64 | class_name: ValidationsStore 65 | store_backend: 66 | class_name: TupleFilesystemStoreBackend 67 | base_directory: uncommitted/validations/ 68 | 69 | evaluation_parameter_store: 70 | # Evaluation Parameters enable dynamic expectations. Read more here: 71 | # https://docs.greatexpectations.io/en/latest/reference/evaluation_parameters.html 72 | class_name: EvaluationParameterStore 73 | 74 | expectations_store_name: expectations_store 75 | validations_store_name: validations_store 76 | evaluation_parameter_store_name: evaluation_parameter_store 77 | 78 | data_docs_sites: 79 | # Data Docs make it simple to visualize data quality in your project. These 80 | # include Expectations, Validations & Profiles. They are built for all 81 | # Datasources from JSON artifacts in the local repo including validations & 82 | # profiles from the uncommitted directory.
Read more at https://docs.greatexpectations.io/en/latest/features/data_docs.html 83 | local_site: 84 | class_name: SiteBuilder 85 | # set to false to hide how-to buttons in Data Docs 86 | show_how_to_buttons: true 87 | store_backend: 88 | class_name: TupleFilesystemStoreBackend 89 | base_directory: uncommitted/data_docs/local_site/ 90 | site_index_builder: 91 | class_name: DefaultSiteIndexBuilder 92 | anonymous_usage_statistics: 93 | data_context_id: a3454240-fa69-4c9a-904b-726fac29c60b 94 | enabled: false 95 | notebooks: 96 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/notebooks/pandas/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "name": "stderr", 25 | "output_type": "stream", 26 | "text": [ 27 | "Old pybigquery driver version detected. Consider upgrading to 0.4.14 or later.\n" 28 | ] 29 | }, 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "2020-03-27T12:13:30-0700 - INFO - Great Expectations logging enabled at INFO level by JupyterUX module.\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "import json\n", 40 | "import great_expectations as ge\n", 41 | "import great_expectations.jupyter_ux\n", 42 | "from great_expectations.datasource.types import BatchKwargs\n", 43 | "from datetime import datetime" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## 1. Get a DataContext\n", 51 | "This represents your **project** that you just created using `great_expectations init`." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "context = ge.data_context.DataContext()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## 2. 
Choose an Expectation Suite\n", 68 | "\n", 69 | "List expectation suites that you created in your project" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "['npi_small_file.critical']" 81 | ] 82 | }, 83 | "execution_count": 3, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "context.list_expectation_suite_names()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "expectation_suite_name = 'npi_small_file.critical'# TODO: set to a name from the list above" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "## 3. Load a batch of data you want to validate\n", 106 | "\n", 107 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "['files_datasource']" 119 | ] 120 | }, 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "# list datasources of the type PandasDatasource in your project\n", 128 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 6, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "datasource_name = 'files_datasource'# TODO: set to a datasource name from above" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/html": [ 148 | "
\n", 149 | "\n", 162 | "\n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | "
NPIEntity_Type_CodeOrganization_NameLast_NameFirst_NameStateTaxonomy_Code
014579008392.0TEXAS CLINIC OF CHIROPRACTICNaNNaNTX111N00000X
112555190471.0NaNBRYANT-JONESMARIAFL261QH0700X
213660917461.0NaNJONESEBONYDC3747P1801X
312751826511.0NaNORNELASLUPECA101YA0400X
411943713441.0NaNWINTERSSTACYMD363L00000X
\n", 228 | "
" 229 | ], 230 | "text/plain": [ 231 | " NPI Entity_Type_Code Organization_Name Last_Name \\\n", 232 | "0 1457900839 2.0 TEXAS CLINIC OF CHIROPRACTIC NaN \n", 233 | "1 1255519047 1.0 NaN BRYANT-JONES \n", 234 | "2 1366091746 1.0 NaN JONES \n", 235 | "3 1275182651 1.0 NaN ORNELAS \n", 236 | "4 1194371344 1.0 NaN WINTERS \n", 237 | "\n", 238 | " First_Name State Taxonomy_Code \n", 239 | "0 NaN TX 111N00000X \n", 240 | "1 MARIA FL 261QH0700X \n", 241 | "2 EBONY DC 3747P1801X \n", 242 | "3 LUPE CA 101YA0400X \n", 243 | "4 STACY MD 363L00000X " 244 | ] 245 | }, 246 | "execution_count": 7, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "# If you would like to validate a file on a filesystem:\n", 253 | "batch_kwargs = {'path': \"/Users/eugenemandel/projects/ge_tutorials/data/npi_small.csv\", 'datasource': datasource_name}\n", 254 | "\n", 255 | "# # If you already loaded the data into a Pandas Data Frame:\n", 256 | "# batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 257 | "\n", 258 | "\n", 259 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 260 | "batch.head()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 8, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "2020-03-27T11:18:30-0700 - INFO - \t14 expectation(s) included in expectation_suite.\n" 273 | ] 274 | }, 275 | { 276 | "data": { 277 | "text/plain": [ 278 | "{\n", 279 | " \"evaluation_parameters\": {},\n", 280 | " \"statistics\": {\n", 281 | " \"evaluated_expectations\": 14,\n", 282 | " \"successful_expectations\": 14,\n", 283 | " \"unsuccessful_expectations\": 0,\n", 284 | " \"success_percent\": 100.0\n", 285 | " },\n", 286 | " \"results\": [\n", 287 | " {\n", 288 | " \"result\": {\n", 289 | " \"observed_value\": 18649\n", 290 | " },\n", 291 | " \"expectation_config\": {\n", 292 | " \"kwargs\": {\n", 293 | " \"min_value\": 18639,\n", 294 | " \"max_value\": 18659\n", 295 | " },\n", 296 | " \"expectation_type\": \"expect_table_row_count_to_be_between\",\n", 297 | " \"meta\": {\n", 298 | " \"SampleExpectationsDatasetProfiler\": {\n", 299 | " \"confidence\": \"very low\"\n", 300 | " }\n", 301 | " }\n", 302 | " },\n", 303 | " \"success\": true,\n", 304 | " \"exception_info\": {\n", 305 | " \"raised_exception\": false,\n", 306 | " \"exception_message\": null,\n", 307 | " \"exception_traceback\": null\n", 308 | " },\n", 309 | " \"meta\": {}\n", 310 | " },\n", 311 | " {\n", 312 | " \"result\": {\n", 313 | " \"observed_value\": 7\n", 314 | " },\n", 315 | " \"expectation_config\": {\n", 316 | " \"kwargs\": {\n", 317 | " \"value\": 7\n", 318 | " },\n", 319 | " \"expectation_type\": \"expect_table_column_count_to_equal\",\n", 320 | " \"meta\": {\n", 321 | " \"SampleExpectationsDatasetProfiler\": {\n", 322 | " \"confidence\": \"very low\"\n", 323 | " }\n", 324 | " }\n", 325 | " },\n", 326 | " \"success\": true,\n", 327 | " \"exception_info\": {\n", 328 | " \"raised_exception\": false,\n", 329 | " \"exception_message\": null,\n", 330 | " \"exception_traceback\": null\n", 331 | " },\n", 332 | " \"meta\": {}\n", 333 | " },\n", 334 | " {\n", 335 | " \"result\": {\n", 336 | " \"observed_value\": [\n", 337 | " \"NPI\",\n", 338 | " \"Entity_Type_Code\",\n", 339 | " \"Organization_Name\",\n", 340 | " \"Last_Name\",\n", 341 | " \"First_Name\",\n", 342 | " \"State\",\n", 343 | " \"Taxonomy_Code\"\n", 344 | " ]\n", 345 | " },\n", 346 | " 
\"expectation_config\": {\n", 347 | " \"kwargs\": {\n", 348 | " \"column_list\": [\n", 349 | " \"NPI\",\n", 350 | " \"Entity_Type_Code\",\n", 351 | " \"Organization_Name\",\n", 352 | " \"Last_Name\",\n", 353 | " \"First_Name\",\n", 354 | " \"State\",\n", 355 | " \"Taxonomy_Code\"\n", 356 | " ]\n", 357 | " },\n", 358 | " \"expectation_type\": \"expect_table_columns_to_match_ordered_list\",\n", 359 | " \"meta\": {\n", 360 | " \"SampleExpectationsDatasetProfiler\": {\n", 361 | " \"confidence\": \"very low\"\n", 362 | " }\n", 363 | " }\n", 364 | " },\n", 365 | " \"success\": true,\n", 366 | " \"exception_info\": {\n", 367 | " \"raised_exception\": false,\n", 368 | " \"exception_message\": null,\n", 369 | " \"exception_traceback\": null\n", 370 | " },\n", 371 | " \"meta\": {}\n", 372 | " },\n", 373 | " {\n", 374 | " \"result\": {\n", 375 | " \"element_count\": 18649,\n", 376 | " \"unexpected_count\": 491,\n", 377 | " \"unexpected_percent\": 2.6328489463241995,\n", 378 | " \"partial_unexpected_list\": []\n", 379 | " },\n", 380 | " \"expectation_config\": {\n", 381 | " \"kwargs\": {\n", 382 | " \"column\": \"Entity_Type_Code\",\n", 383 | " \"mostly\": 0.873671510536758\n", 384 | " },\n", 385 | " \"expectation_type\": \"expect_column_values_to_not_be_null\",\n", 386 | " \"meta\": {\n", 387 | " \"SampleExpectationsDatasetProfiler\": {\n", 388 | " \"confidence\": \"very low\"\n", 389 | " }\n", 390 | " }\n", 391 | " },\n", 392 | " \"success\": true,\n", 393 | " \"exception_info\": {\n", 394 | " \"raised_exception\": false,\n", 395 | " \"exception_message\": null,\n", 396 | " \"exception_traceback\": null\n", 397 | " },\n", 398 | " \"meta\": {}\n", 399 | " },\n", 400 | " {\n", 401 | " \"result\": {\n", 402 | " \"observed_value\": [\n", 403 | " 1.0,\n", 404 | " 2.0\n", 405 | " ],\n", 406 | " \"element_count\": 18649,\n", 407 | " \"missing_count\": 491,\n", 408 | " \"missing_percent\": 2.6328489463241995\n", 409 | " },\n", 410 | " \"expectation_config\": {\n", 411 | " \"kwargs\": {\n", 412 | " \"column\": \"Entity_Type_Code\",\n", 413 | " \"value_set\": [\n", 414 | " 1.0,\n", 415 | " 2.0\n", 416 | " ]\n", 417 | " },\n", 418 | " \"expectation_type\": \"expect_column_distinct_values_to_be_in_set\",\n", 419 | " \"meta\": {\n", 420 | " \"SampleExpectationsDatasetProfiler\": {\n", 421 | " \"confidence\": \"very low\"\n", 422 | " }\n", 423 | " }\n", 424 | " },\n", 425 | " \"success\": true,\n", 426 | " \"exception_info\": {\n", 427 | " \"raised_exception\": false,\n", 428 | " \"exception_message\": null,\n", 429 | " \"exception_traceback\": null\n", 430 | " },\n", 431 | " \"meta\": {}\n", 432 | " },\n", 433 | " {\n", 434 | " \"result\": {\n", 435 | " \"observed_value\": 0.0,\n", 436 | " \"element_count\": 18649,\n", 437 | " \"missing_count\": 491,\n", 438 | " \"missing_percent\": 2.6328489463241995\n", 439 | " },\n", 440 | " \"expectation_config\": {\n", 441 | " \"kwargs\": {\n", 442 | " \"column\": \"Entity_Type_Code\",\n", 443 | " \"partition_object\": {\n", 444 | " \"values\": [\n", 445 | " 1.0,\n", 446 | " 2.0\n", 447 | " ],\n", 448 | " \"weights\": [\n", 449 | " 0.812314131512281,\n", 450 | " 0.1876858684877189\n", 451 | " ]\n", 452 | " },\n", 453 | " \"threshold\": 0.6\n", 454 | " },\n", 455 | " \"expectation_type\": \"expect_column_kl_divergence_to_be_less_than\",\n", 456 | " \"meta\": {\n", 457 | " \"SampleExpectationsDatasetProfiler\": {\n", 458 | " \"confidence\": \"very low\"\n", 459 | " }\n", 460 | " }\n", 461 | " },\n", 462 | " \"success\": true,\n", 463 | " \"exception_info\": {\n", 464 | " 
\"raised_exception\": false,\n", 465 | " \"exception_message\": null,\n", 466 | " \"exception_traceback\": null\n", 467 | " },\n", 468 | " \"meta\": {}\n", 469 | " },\n", 470 | " {\n", 471 | " \"result\": {\n", 472 | " \"element_count\": 18649,\n", 473 | " \"unexpected_count\": 0,\n", 474 | " \"unexpected_percent\": 0.0,\n", 475 | " \"partial_unexpected_list\": []\n", 476 | " },\n", 477 | " \"expectation_config\": {\n", 478 | " \"kwargs\": {\n", 479 | " \"column\": \"NPI\"\n", 480 | " },\n", 481 | " \"expectation_type\": \"expect_column_values_to_not_be_null\",\n", 482 | " \"meta\": {\n", 483 | " \"SampleExpectationsDatasetProfiler\": {\n", 484 | " \"confidence\": \"very low\"\n", 485 | " }\n", 486 | " }\n", 487 | " },\n", 488 | " \"success\": true,\n", 489 | " \"exception_info\": {\n", 490 | " \"raised_exception\": false,\n", 491 | " \"exception_message\": null,\n", 492 | " \"exception_traceback\": null\n", 493 | " },\n", 494 | " \"meta\": {}\n", 495 | " },\n", 496 | " {\n", 497 | " \"result\": {\n", 498 | " \"observed_value\": 1003007766,\n", 499 | " \"element_count\": 18649,\n", 500 | " \"missing_count\": 0,\n", 501 | " \"missing_percent\": 0.0\n", 502 | " },\n", 503 | " \"expectation_config\": {\n", 504 | " \"kwargs\": {\n", 505 | " \"column\": \"NPI\",\n", 506 | " \"min_value\": 1003007765,\n", 507 | " \"max_value\": 1003007767\n", 508 | " },\n", 509 | " \"expectation_type\": \"expect_column_min_to_be_between\",\n", 510 | " \"meta\": {\n", 511 | " \"SampleExpectationsDatasetProfiler\": {\n", 512 | " \"confidence\": \"very low\"\n", 513 | " }\n", 514 | " }\n", 515 | " },\n", 516 | " \"success\": true,\n", 517 | " \"exception_info\": {\n", 518 | " \"raised_exception\": false,\n", 519 | " \"exception_message\": null,\n", 520 | " \"exception_traceback\": null\n", 521 | " },\n", 522 | " \"meta\": {}\n", 523 | " },\n", 524 | " {\n", 525 | " \"result\": {\n", 526 | " \"observed_value\": 1992999676,\n", 527 | " \"element_count\": 18649,\n", 528 | " \"missing_count\": 0,\n", 529 | " \"missing_percent\": 0.0\n", 530 | " },\n", 531 | " \"expectation_config\": {\n", 532 | " \"kwargs\": {\n", 533 | " \"column\": \"NPI\",\n", 534 | " \"min_value\": 1992999675,\n", 535 | " \"max_value\": 1992999677\n", 536 | " },\n", 537 | " \"expectation_type\": \"expect_column_max_to_be_between\",\n", 538 | " \"meta\": {\n", 539 | " \"SampleExpectationsDatasetProfiler\": {\n", 540 | " \"confidence\": \"very low\"\n", 541 | " }\n", 542 | " }\n", 543 | " },\n", 544 | " \"success\": true,\n", 545 | " \"exception_info\": {\n", 546 | " \"raised_exception\": false,\n", 547 | " \"exception_message\": null,\n", 548 | " \"exception_traceback\": null\n", 549 | " },\n", 550 | " \"meta\": {}\n", 551 | " },\n", 552 | " {\n", 553 | " \"result\": {\n", 554 | " \"observed_value\": 1500841664.0457933,\n", 555 | " \"element_count\": 18649,\n", 556 | " \"missing_count\": 0,\n", 557 | " \"missing_percent\": 0.0\n", 558 | " },\n", 559 | " \"expectation_config\": {\n", 560 | " \"kwargs\": {\n", 561 | " \"column\": \"NPI\",\n", 562 | " \"min_value\": 1500841663.0457933,\n", 563 | " \"max_value\": 1500841665.0457933\n", 564 | " },\n", 565 | " \"expectation_type\": \"expect_column_mean_to_be_between\",\n", 566 | " \"meta\": {\n", 567 | " \"SampleExpectationsDatasetProfiler\": {\n", 568 | " \"confidence\": \"very low\"\n", 569 | " }\n", 570 | " }\n", 571 | " },\n", 572 | " \"success\": true,\n", 573 | " \"exception_info\": {\n", 574 | " \"raised_exception\": false,\n", 575 | " \"exception_message\": null,\n", 576 | " 
\"exception_traceback\": null\n", 577 | " },\n", 578 | " \"meta\": {}\n", 579 | " },\n", 580 | " {\n", 581 | " \"result\": {\n", 582 | " \"observed_value\": 1508307745.0,\n", 583 | " \"element_count\": 18649,\n", 584 | " \"missing_count\": 0,\n", 585 | " \"missing_percent\": 0.0\n", 586 | " },\n", 587 | " \"expectation_config\": {\n", 588 | " \"kwargs\": {\n", 589 | " \"column\": \"NPI\",\n", 590 | " \"min_value\": 1508307744.0,\n", 591 | " \"max_value\": 1508307746.0\n", 592 | " },\n", 593 | " \"expectation_type\": \"expect_column_median_to_be_between\",\n", 594 | " \"meta\": {\n", 595 | " \"SampleExpectationsDatasetProfiler\": {\n", 596 | " \"confidence\": \"very low\"\n", 597 | " }\n", 598 | " }\n", 599 | " },\n", 600 | " \"success\": true,\n", 601 | " \"exception_info\": {\n", 602 | " \"raised_exception\": false,\n", 603 | " \"exception_message\": null,\n", 604 | " \"exception_traceback\": null\n", 605 | " },\n", 606 | " \"meta\": {}\n", 607 | " },\n", 608 | " {\n", 609 | " \"result\": {\n", 610 | " \"observed_value\": {\n", 611 | " \"quantiles\": [\n", 612 | " 0.05,\n", 613 | " 0.25,\n", 614 | " 0.5,\n", 615 | " 0.75,\n", 616 | " 0.95\n", 617 | " ],\n", 618 | " \"values\": [\n", 619 | " 1053339952,\n", 620 | " 1245889518,\n", 621 | " 1508307745,\n", 622 | " 1750668489,\n", 623 | " 1952551186\n", 624 | " ]\n", 625 | " },\n", 626 | " \"element_count\": 18649,\n", 627 | " \"missing_count\": 0,\n", 628 | " \"missing_percent\": 0.0\n", 629 | " },\n", 630 | " \"expectation_config\": {\n", 631 | " \"kwargs\": {\n", 632 | " \"column\": \"NPI\",\n", 633 | " \"quantile_ranges\": {\n", 634 | " \"quantiles\": [\n", 635 | " 0.05,\n", 636 | " 0.25,\n", 637 | " 0.5,\n", 638 | " 0.75,\n", 639 | " 0.95\n", 640 | " ],\n", 641 | " \"value_ranges\": [\n", 642 | " [\n", 643 | " 1053339951,\n", 644 | " 1053339953\n", 645 | " ],\n", 646 | " [\n", 647 | " 1245889517,\n", 648 | " 1245889519\n", 649 | " ],\n", 650 | " [\n", 651 | " 1508307744,\n", 652 | " 1508307746\n", 653 | " ],\n", 654 | " [\n", 655 | " 1750668488,\n", 656 | " 1750668490\n", 657 | " ],\n", 658 | " [\n", 659 | " 1952551185,\n", 660 | " 1952551187\n", 661 | " ]\n", 662 | " ]\n", 663 | " }\n", 664 | " },\n", 665 | " \"expectation_type\": \"expect_column_quantile_values_to_be_between\",\n", 666 | " \"meta\": {\n", 667 | " \"SampleExpectationsDatasetProfiler\": {\n", 668 | " \"confidence\": \"very low\"\n", 669 | " }\n", 670 | " }\n", 671 | " },\n", 672 | " \"success\": true,\n", 673 | " \"exception_info\": {\n", 674 | " \"raised_exception\": false,\n", 675 | " \"exception_message\": null,\n", 676 | " \"exception_traceback\": null\n", 677 | " },\n", 678 | " \"meta\": {}\n", 679 | " },\n", 680 | " {\n", 681 | " \"result\": {\n", 682 | " \"element_count\": 18649,\n", 683 | " \"unexpected_count\": 15241,\n", 684 | " \"unexpected_percent\": 81.72556169231594,\n", 685 | " \"partial_unexpected_list\": []\n", 686 | " },\n", 687 | " \"expectation_config\": {\n", 688 | " \"kwargs\": {\n", 689 | " \"column\": \"Organization_Name\",\n", 690 | " \"mostly\": 0.08274438307684065\n", 691 | " },\n", 692 | " \"expectation_type\": \"expect_column_values_to_not_be_null\",\n", 693 | " \"meta\": {\n", 694 | " \"SampleExpectationsDatasetProfiler\": {\n", 695 | " \"confidence\": \"very low\"\n", 696 | " }\n", 697 | " }\n", 698 | " },\n", 699 | " \"success\": true,\n", 700 | " \"exception_info\": {\n", 701 | " \"raised_exception\": false,\n", 702 | " \"exception_message\": null,\n", 703 | " \"exception_traceback\": null\n", 704 | " },\n", 705 | " \"meta\": {}\n", 706 | 
" },\n", 707 | " {\n", 708 | " \"result\": {\n", 709 | " \"element_count\": 18649,\n", 710 | " \"missing_count\": 15241,\n", 711 | " \"missing_percent\": 81.72556169231594,\n", 712 | " \"unexpected_count\": 0,\n", 713 | " \"unexpected_percent\": 0.0,\n", 714 | " \"unexpected_percent_nonmissing\": 0.0,\n", 715 | " \"partial_unexpected_list\": []\n", 716 | " },\n", 717 | " \"expectation_config\": {\n", 718 | " \"kwargs\": {\n", 719 | " \"column\": \"Organization_Name\",\n", 720 | " \"min_value\": 1\n", 721 | " },\n", 722 | " \"expectation_type\": \"expect_column_value_lengths_to_be_between\",\n", 723 | " \"meta\": {\n", 724 | " \"SampleExpectationsDatasetProfiler\": {\n", 725 | " \"confidence\": \"very low\"\n", 726 | " }\n", 727 | " }\n", 728 | " },\n", 729 | " \"success\": true,\n", 730 | " \"exception_info\": {\n", 731 | " \"raised_exception\": false,\n", 732 | " \"exception_message\": null,\n", 733 | " \"exception_traceback\": null\n", 734 | " },\n", 735 | " \"meta\": {}\n", 736 | " }\n", 737 | " ],\n", 738 | " \"success\": true,\n", 739 | " \"meta\": {\n", 740 | " \"great_expectations.__version__\": \"0.9.7+228.g7b410a57\",\n", 741 | " \"expectation_suite_name\": \"npi_small_file.critical\",\n", 742 | " \"run_id\": \"20200327T181830.633221Z\",\n", 743 | " \"batch_kwargs\": {\n", 744 | " \"path\": \"/Users/eugenemandel/projects/ge_tutorials/data/npi_small.csv\",\n", 745 | " \"datasource\": \"files_datasource\"\n", 746 | " },\n", 747 | " \"batch_markers\": {\n", 748 | " \"ge_load_time\": \"20200327T181801.219912Z\",\n", 749 | " \"pandas_data_fingerprint\": \"a5ebd04919bde23bcf25afadb9e661fb\"\n", 750 | " },\n", 751 | " \"batch_parameters\": null\n", 752 | " }\n", 753 | "}" 754 | ] 755 | }, 756 | "execution_count": 8, 757 | "metadata": {}, 758 | "output_type": "execute_result" 759 | } 760 | ], 761 | "source": [ 762 | "batch.validate()" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": 9, 768 | "metadata": {}, 769 | "outputs": [ 770 | { 771 | "data": { 772 | "text/plain": [ 773 | "great_expectations.dataset.pandas_dataset.PandasDataset" 774 | ] 775 | }, 776 | "execution_count": 9, 777 | "metadata": {}, 778 | "output_type": "execute_result" 779 | } 780 | ], 781 | "source": [ 782 | "type(batch)" 783 | ] 784 | }, 785 | { 786 | "cell_type": "markdown", 787 | "metadata": {}, 788 | "source": [ 789 | "## 4. 
Validate the batch with Validation Operators\n", 790 | "\n", 791 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 792 | "multiple expectation suites and the actions that should be taken after validation.\n", 793 | "\n", 794 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 795 | "\n", 796 | "* validating a group of batches that are logically related\n", 797 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 798 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 799 | "\n", 800 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": 8, 806 | "metadata": {}, 807 | "outputs": [ 808 | { 809 | "name": "stdout", 810 | "output_type": "stream", 811 | "text": [ 812 | "2020-03-27T12:13:41-0700 - INFO - \t14 expectation(s) included in expectation_suite.\n" 813 | ] 814 | } 815 | ], 816 | "source": [ 817 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 818 | "\n", 819 | "#Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", 820 | "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner (e.g., Airflow run id).\n", 821 | "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", 822 | "\n", 823 | "results = context.run_validation_operator(\n", 824 | "    \"action_list_operator\", \n", 825 | "    assets_to_validate=[batch], \n", 826 | "    run_id=run_id)" 827 | ] 828 | }, 829 | { 830 | "cell_type": "markdown", 831 | "metadata": {}, 832 | "source": [ 833 | "## 5. View the Validation Results in Data Docs\n", 834 | "\n", 835 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 836 | "\n", 837 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "context.open_data_docs()" 847 | ] 848 | }, 849 | { 850 | "cell_type": "markdown", 851 | "metadata": {}, 852 | "source": [ 853 | "## Congratulations! You ran Validations!\n", 854 | "\n", 855 | "## Next steps:\n", 856 | "\n", 857 | "### 1. Read about the typical workflow with Great Expectations:\n", 858 | "\n", 859 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 860 | "\n", 861 | "### 2. Explore the documentation & community\n", 862 | "\n", 863 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. 
Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 864 | ] 865 | }, 866 | { 867 | "cell_type": "code", 868 | "execution_count": null, 869 | "metadata": {}, 870 | "outputs": [], 871 | "source": [] 872 | } 873 | ], 874 | "metadata": { 875 | "kernelspec": { 876 | "display_name": "Python 3", 877 | "language": "python", 878 | "name": "python3" 879 | }, 880 | "language_info": { 881 | "codemirror_mode": { 882 | "name": "ipython", 883 | "version": 3 884 | }, 885 | "file_extension": ".py", 886 | "mimetype": "text/x-python", 887 | "name": "python", 888 | "nbconvert_exporter": "python", 889 | "pygments_lexer": "ipython3", 890 | "version": "3.7.0" 891 | }, 892 | "pycharm": { 893 | "stem_cell": { 894 | "cell_type": "raw", 895 | "metadata": { 896 | "collapsed": false 897 | }, 898 | "source": [] 899 | } 900 | } 901 | }, 902 | "nbformat": 4, 903 | "nbformat_minor": 4 904 | } 905 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/notebooks/spark/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "from datetime import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SparkDFDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SparkDFDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true') \n", 112 | "\n", 113 | "# If you already loaded the data into a PySpark Data Frame:\n", 114 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 115 | "\n", 116 | "\n", 117 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 118 | "batch.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## 4. Validate the batch with Validation Operators\n", 126 | "\n", 127 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 128 | "multiple expectation suites and the actions that should be taken after validation.\n", 129 | "\n", 130 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 131 | "\n", 132 | "* validating a group of batches that are logically related\n", 133 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 134 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 135 | "\n", 136 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 146 | "\n", 147 | "#Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", 148 | "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner (e.g., Airflow run id).\n", 149 | "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", 150 | "\n", 151 | "results = context.run_validation_operator(\n", 152 | " \"action_list_operator\", \n", 153 | " assets_to_validate=[batch], \n", 154 | " run_id=run_id)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "## 5. 
View the Validation Results in Data Docs\n", 162 | "\n", 163 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 164 | "\n", 165 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "context.open_data_docs()" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## Congratulations! You ran Validations!\n", 182 | "\n", 183 | "## Next steps:\n", 184 | "\n", 185 | "### 1. Read about the typical workflow with Great Expectations:\n", 186 | "\n", 187 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 188 | "\n", 189 | "### 2. Explore the documentation & community\n", 190 | "\n", 191 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [] 200 | } 201 | ], 202 | "metadata": { 203 | "kernelspec": { 204 | "display_name": "Python 3", 205 | "language": "python", 206 | "name": "python3" 207 | }, 208 | "language_info": { 209 | "codemirror_mode": { 210 | "name": "ipython", 211 | "version": 3 212 | }, 213 | "file_extension": ".py", 214 | "mimetype": "text/x-python", 215 | "name": "python", 216 | "nbconvert_exporter": "python", 217 | "pygments_lexer": "ipython3", 218 | "version": "3.7.0" 219 | }, 220 | "pycharm": { 221 | "stem_cell": { 222 | "cell_type": "raw", 223 | "source": [], 224 | "metadata": { 225 | "collapsed": false 226 | } 227 | } 228 | } 229 | }, 230 | "nbformat": 4, 231 | "nbformat_minor": 4 232 | } -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/notebooks/sql/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": 
[], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "from datetime import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SqlAlchemyDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SqlAlchemyDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate an entire table or view in your database's default schema:\n", 110 | "batch_kwargs = {'table': \"YOUR_TABLE\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you would like to validate an entire table or view from a non-default schema in your database:\n", 113 | "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "# If you would like to validate the result set of a query:\n", 116 | "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 121 | "batch.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## 4. 
Validate the batch with Validation Operators\n", 129 | "\n", 130 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 131 | "multiple expectation suites and the actions that should be taken after validation.\n", 132 | "\n", 133 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 134 | "\n", 135 | "* validating a group of batches that are logically related\n", 136 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 137 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 138 | "\n", 139 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 149 | "\n", 150 | "#Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", 151 | "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner (e.g., Airflow run id).\n", 152 | "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", 153 | "\n", 154 | "results = context.run_validation_operator(\n", 155 | "    \"action_list_operator\", \n", 156 | "    assets_to_validate=[batch], \n", 157 | "    run_id=run_id)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## 5. View the Validation Results in Data Docs\n", 165 | "\n", 166 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 167 | "\n", 168 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "context.open_data_docs()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Congratulations! You ran Validations!\n", 185 | "\n", 186 | "## Next steps:\n", 187 | "\n", 188 | "### 1. Read about the typical workflow with Great Expectations:\n", 189 | "\n", 190 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 191 | "\n", 192 | "### 2. Explore the documentation & community\n", 193 | "\n", 194 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.7.0" 222 | }, 223 | "pycharm": { 224 | "stem_cell": { 225 | "cell_type": "raw", 226 | "source": [], 227 | "metadata": { 228 | "collapsed": false 229 | } 230 | } 231 | } 232 | }, 233 | "nbformat": 4, 234 | "nbformat_minor": 4 235 | } -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/images/dbt_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greatexpectationslabs/ge_tutorials/1b04332e4f9b4d8621a95e7aa2837c371f41e682/ge_dbt_airflow_tutorial/images/dbt_dag.png -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/images/enable_dag.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greatexpectationslabs/ge_tutorials/1b04332e4f9b4d8621a95e7aa2837c371f41e682/ge_dbt_airflow_tutorial/images/enable_dag.gif -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/images/pipeline_airflow_dag_with_ge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greatexpectationslabs/ge_tutorials/1b04332e4f9b4d8621a95e7aa2837c371f41e682/ge_dbt_airflow_tutorial/images/pipeline_airflow_dag_with_ge.png -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/images/pipeline_airflow_dag_without_ge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greatexpectationslabs/ge_tutorials/1b04332e4f9b4d8621a95e7aa2837c371f41e682/ge_dbt_airflow_tutorial/images/pipeline_airflow_dag_without_ge.png -------------------------------------------------------------------------------- 
/ge_dbt_airflow_tutorial/requirements.txt: -------------------------------------------------------------------------------- 1 | dbt 2 | great_expectations -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/README.md: -------------------------------------------------------------------------------- 1 | # Getting started with Great Expectations tutorial - v2 (Batch Kwargs) API 2 | 3 | This repository contains the final version of the "Getting started with Great Expectations" tutorial in the Great 4 | Expectations docs. This repo can be used as a demo and to explore a complete Great Expectations deployment. 5 | 6 | **THIS VERSION WAS CREATED WITH THE V2 (BATCH KWARGS) GREAT EXPECTATIONS API**, i.e. Great Expectations version 0.12.x and below. 7 | 8 | ## 1. How to run through the tutorial 9 | [Please follow the tutorial in our docs for instructions!](https://docs.greatexpectations.io/en/latest/guides/tutorials/getting_started.html) 10 | 11 | ## 2. How to use this repo to explore and demo Great Expectations 12 | 13 | ### The `data` directory 14 | 15 | The CSV files in the data directory are yellow taxi trip data that have been downloaded from the NYC taxi data website: 16 | * [TLC trip record data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) 17 | * [Data dictionary](https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf) 18 | 19 | We created 10,000-row samples (using the Pandas ``sample`` function) from the original CSV files for convenience and manually added some breaking changes (0s in the passenger_count column) to demonstrate potential data issues. 20 | 21 | In a future version of this tutorial, we might use "naturally occurring" data bugs :) 22 | 23 | ### The `great_expectations` directory 24 | Currently, this demo contains the following: 25 | * A `great_expectations.yml` file that's configured to use the top-level `data` directory as a Datasource. You will not need to set up anything to get it to work. 
26 | * A single Expectation Suite, `taxi.demo`, containing a handful of simple Expectations 27 | * A Checkpoint `my_chk` that is set up to run the suite against the January data set 28 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/.gitignore: -------------------------------------------------------------------------------- 1 | uncommitted/ -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/checkpoints/.ge_store_backend_id: -------------------------------------------------------------------------------- 1 | store_backend_id = 2d1df717-ce13-4988-a121-459c863a5072 -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/checkpoints/my_chk.yml: -------------------------------------------------------------------------------- 1 | name: my_chk 2 | config_version: 3 | module_name: great_expectations.checkpoint 4 | class_name: LegacyCheckpoint 5 | validation_operator_name: action_list_operator 6 | batches: 7 | - batch_kwargs: 8 | path: /Users/sam/code/ge_tutorials/getting_started_tutorial_final_v2_api/great_expectations/../../data/yellow_tripdata_sample_2019-01.csv 9 | datasource: data__dir 10 | data_asset_name: yellow_tripdata_sample_2019-01 11 | expectation_suite_names: 12 | - taxi.demo 13 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/expectations/.ge_store_backend_id: -------------------------------------------------------------------------------- 1 | store_backend_id = 73999387-e31a-4e53-a3b4-9bfb5acc285e -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/expectations/taxi/demo.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": "Dataset", 3 | "expectation_suite_name": "taxi.demo", 4 | "expectations": [ 5 | { 6 | "expectation_type": "expect_table_row_count_to_be_between", 7 | "kwargs": { 8 | "max_value": 11000, 9 | "min_value": 9000 10 | }, 11 | "meta": { 12 | "BasicSuiteBuilderProfiler": { 13 | "confidence": "very low" 14 | } 15 | } 16 | }, 17 | { 18 | "expectation_type": "expect_table_column_count_to_equal", 19 | "kwargs": { 20 | "value": 18 21 | }, 22 | "meta": { 23 | "BasicSuiteBuilderProfiler": { 24 | "confidence": "very low" 25 | } 26 | } 27 | }, 28 | { 29 | "expectation_type": "expect_table_columns_to_match_ordered_list", 30 | "kwargs": { 31 | "column_list": [ 32 | "vendor_id", 33 | "pickup_datetime", 34 | "dropoff_datetime", 35 | "passenger_count", 36 | "trip_distance", 37 | "rate_code_id", 38 | "store_and_fwd_flag", 39 | "pickup_location_id", 40 | "dropoff_location_id", 41 | "payment_type", 42 | "fare_amount", 43 | "extra", 44 | "mta_tax", 45 | "tip_amount", 46 | "tolls_amount", 47 | "improvement_surcharge", 48 | "total_amount", 49 | "congestion_surcharge" 50 | ] 51 | }, 52 | "meta": { 53 | "BasicSuiteBuilderProfiler": { 54 | "confidence": "very low" 55 | } 56 | } 57 | }, 58 | { 59 | "expectation_type": "expect_column_values_to_not_be_null", 60 | "kwargs": { 61 | "column": "passenger_count" 62 | }, 63 | "meta": { 64 | "BasicSuiteBuilderProfiler": { 65 | "confidence": "very low" 66 | } 67 | } 68 | }, 69 | { 70 | "expectation_type": "expect_column_distinct_values_to_be_in_set", 
71 | "kwargs": { 72 | "column": "passenger_count", 73 | "value_set": [ 74 | 1, 75 | 2, 76 | 3, 77 | 4, 78 | 5, 79 | 6 80 | ] 81 | }, 82 | "meta": { 83 | "BasicSuiteBuilderProfiler": { 84 | "confidence": "very low" 85 | } 86 | } 87 | }, 88 | { 89 | "expectation_type": "expect_column_kl_divergence_to_be_less_than", 90 | "kwargs": { 91 | "column": "passenger_count", 92 | "partition_object": { 93 | "values": [ 94 | 1, 95 | 2, 96 | 3, 97 | 4, 98 | 5, 99 | 6 100 | ], 101 | "weights": [ 102 | 0.7299, 103 | 0.1458, 104 | 0.039, 105 | 0.0186, 106 | 0.0415, 107 | 0.0252 108 | ] 109 | }, 110 | "threshold": 0.6 111 | }, 112 | "meta": { 113 | "BasicSuiteBuilderProfiler": { 114 | "confidence": "very low" 115 | } 116 | } 117 | } 118 | ], 119 | "meta": { 120 | "BasicSuiteBuilderProfiler": { 121 | "batch_kwargs": { 122 | "data_asset_name": "yellow_tripdata_sample_2019-01", 123 | "datasource": "data__dir", 124 | "path": "/Users/sam/code/ge_tutorials/getting_started_tutorial_stable_api_final/great_expectations/../data/yellow_tripdata_sample_2019-01.csv" 125 | }, 126 | "created_at": 1612842185.286588, 127 | "created_by": "BasicSuiteBuilderProfiler" 128 | }, 129 | "citations": [ 130 | { 131 | "batch_kwargs": { 132 | "data_asset_name": "yellow_tripdata_sample_2019-01", 133 | "datasource": "data__dir", 134 | "path": "/Users/sam/code/ge_tutorials/getting_started_tutorial_stable_api_final/great_expectations/../data/yellow_tripdata_sample_2019-01.csv" 135 | }, 136 | "batch_markers": { 137 | "ge_load_time": "20210209T034305.243453Z", 138 | "pandas_data_fingerprint": "c4f929e6d4fab001fedc9e075bf4b612" 139 | }, 140 | "batch_parameters": null, 141 | "citation_date": "20210209T034305.301394Z", 142 | "comment": "BasicSuiteBuilderProfiler added a citation based on the current batch." 143 | } 144 | ], 145 | "columns": { 146 | "congestion_surcharge": { 147 | "description": "" 148 | }, 149 | "dropoff_datetime": { 150 | "description": "" 151 | }, 152 | "dropoff_location_id": { 153 | "description": "" 154 | }, 155 | "extra": { 156 | "description": "" 157 | }, 158 | "fare_amount": { 159 | "description": "" 160 | }, 161 | "improvement_surcharge": { 162 | "description": "" 163 | }, 164 | "mta_tax": { 165 | "description": "" 166 | }, 167 | "passenger_count": { 168 | "description": "" 169 | }, 170 | "payment_type": { 171 | "description": "" 172 | }, 173 | "pickup_datetime": { 174 | "description": "" 175 | }, 176 | "pickup_location_id": { 177 | "description": "" 178 | }, 179 | "rate_code_id": { 180 | "description": "" 181 | }, 182 | "store_and_fwd_flag": { 183 | "description": "" 184 | }, 185 | "tip_amount": { 186 | "description": "" 187 | }, 188 | "tolls_amount": { 189 | "description": "" 190 | }, 191 | "total_amount": { 192 | "description": "" 193 | }, 194 | "trip_distance": { 195 | "description": "" 196 | }, 197 | "vendor_id": { 198 | "description": "" 199 | } 200 | }, 201 | "great_expectations_version": "0.13.9+1.g62265bff3.dirty", 202 | "notes": { 203 | "content": [ 204 | "_To add additional notes, edit the meta.notes.content field in the appropriate Expectation json file._" 205 | ], 206 | "format": "markdown" 207 | } 208 | } 209 | } -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/great_expectations.yml: -------------------------------------------------------------------------------- 1 | # Welcome to Great Expectations! Always know what to expect from your data. 
2 | # 3 | # Here you can define datasources, batch kwargs generators, integrations and 4 | # more. This file is intended to be committed to your repo. For help with 5 | # configuration please: 6 | # - Read our docs: https://docs.greatexpectations.io/en/latest/how_to_guides/spare_parts/data_context_reference.html#configuration 7 | # - Join our slack channel: http://greatexpectations.io/slack 8 | 9 | # config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility 10 | # It is auto-generated and usually does not need to be changed. 11 | config_version: 3.0 12 | 13 | # Datasources tell Great Expectations where your data lives and how to get it. 14 | # You can use the CLI command `great_expectations datasource new` to help you 15 | # add a new datasource. Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/datasource_reference.html 16 | datasources: 17 | data__dir: 18 | class_name: PandasDatasource 19 | batch_kwargs_generators: 20 | subdir_reader: 21 | class_name: SubdirReaderBatchKwargsGenerator 22 | base_directory: ../../data 23 | module_name: great_expectations.datasource 24 | data_asset_type: 25 | class_name: PandasDataset 26 | module_name: great_expectations.dataset 27 | 28 | # This config file supports variable substitution which enables: 1) keeping 29 | # secrets out of source control & 2) environment-based configuration changes 30 | # such as staging vs prod. 31 | # 32 | # When GE encounters substitution syntax (like `my_key: ${my_value}` or 33 | # `my_key: $my_value`) in the great_expectations.yml file, it will attempt 34 | # to replace the value of `my_key` with the value from an environment 35 | # variable `my_value` or a corresponding key read from this config file, 36 | # which is defined through the `config_variables_file_path`. 37 | # Environment variables take precedence over variables defined here. 38 | # 39 | # Substitution values defined here can be a simple (non-nested) value, 40 | # nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR}) 41 | # 42 | # 43 | # https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_data_contexts/how_to_use_a_yaml_file_or_environment_variables_to_populate_credentials.html 44 | 45 | 46 | config_variables_file_path: uncommitted/config_variables.yml 47 | 48 | # The plugins_directory will be added to your python path for custom modules 49 | # used to override and extend Great Expectations. 50 | plugins_directory: plugins/ 51 | 52 | stores: 53 | # Stores are configurable places to store things like Expectations, Validations 54 | # Data Docs, and more. These are for advanced users only - most users can simply 55 | # leave this section alone. 56 | # 57 | # Three stores are required: expectations, validations, and 58 | # evaluation_parameters, and must exist with a valid store entry. Additional 59 | # stores can be configured for uses such as data_docs, etc. 60 | expectations_store: 61 | class_name: ExpectationsStore 62 | store_backend: 63 | class_name: TupleFilesystemStoreBackend 64 | base_directory: expectations/ 65 | 66 | validations_store: 67 | class_name: ValidationsStore 68 | store_backend: 69 | class_name: TupleFilesystemStoreBackend 70 | base_directory: uncommitted/validations/ 71 | 72 | evaluation_parameter_store: 73 | # Evaluation Parameters enable dynamic expectations. 
Read more here: 74 | # https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html 75 | class_name: EvaluationParameterStore 76 | 77 | checkpoint_store: 78 | class_name: CheckpointStore 79 | store_backend: 80 | class_name: TupleFilesystemStoreBackend 81 | base_directory: checkpoints/ 82 | 83 | expectations_store_name: expectations_store 84 | validations_store_name: validations_store 85 | evaluation_parameter_store_name: evaluation_parameter_store 86 | checkpoint_store_name: checkpoint_store 87 | 88 | data_docs_sites: 89 | # Data Docs make it simple to visualize data quality in your project. These 90 | # include Expectations, Validations & Profiles. They are built for all 91 | # Datasources from JSON artifacts in the local repo including validations & 92 | # profiles from the uncommitted directory. Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/data_docs.html 93 | local_site: 94 | class_name: SiteBuilder 95 | # set to false to hide how-to buttons in Data Docs 96 | show_how_to_buttons: true 97 | store_backend: 98 | class_name: TupleFilesystemStoreBackend 99 | base_directory: uncommitted/data_docs/local_site/ 100 | site_index_builder: 101 | class_name: DefaultSiteIndexBuilder 102 | 103 | anonymous_usage_statistics: 104 | data_context_id: 73999387-e31a-4e53-a3b4-9bfb5acc285e 105 | enabled: true 106 | notebooks: 107 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/notebooks/pandas/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. 
Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type PandasDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you already loaded the data into a Pandas Data Frame:\n", 113 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "\n", 116 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 117 | "batch.head()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## 4. Validate the batch with Validation Operators\n", 125 | "\n", 126 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 127 | "multiple expectation suites and the actions that should be taken after validation.\n", 128 | "\n", 129 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 130 | "\n", 131 | "* validating a group of batches that are logically related\n", 132 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 133 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 134 | "\n", 135 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 145 | "\n", 146 | "\"\"\"\n", 147 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 148 | "arguments (or a dictionary with these keys). 
The run_name can be any string (this could come from your pipeline\n", 149 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 150 | "Note - any provided datetime will be assumed to be a UTC time. If no instantiation arguments are given, run_name will\n", 151 | "be None and run_time will default to the current UTC datetime.\n", 152 | "\"\"\"\n", 153 | "\n", 154 | "run_id = {\n", 155 | "    \"run_name\": \"some_string_that_uniquely_identifies_this_run\",  # insert your own run_name here\n", 156 | "    \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 157 | "}\n", 158 | "\n", 159 | "results = context.run_validation_operator(\n", 160 | "    \"action_list_operator\",\n", 161 | "    assets_to_validate=[batch],\n", 162 | "    run_id=run_id)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## 5. View the Validation Results in Data Docs\n", 170 | "\n", 171 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 172 | "\n", 173 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "context.open_data_docs()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## Congratulations! You ran Validations!\n", 190 | "\n", 191 | "## Next steps:\n", 192 | "\n", 193 | "### 1. Read about the typical workflow with Great Expectations:\n", 194 | "\n", 195 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 196 | "\n", 197 | "### 2. Explore the documentation & community\n", 198 | "\n", 199 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.7.0" 227 | }, 228 | "pycharm": { 229 | "stem_cell": { 230 | "cell_type": "raw", 231 | "source": [], 232 | "metadata": { 233 | "collapsed": false 234 | } 235 | } 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 4 240 | } 241 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/notebooks/spark/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SparkDFDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SparkDFDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true')\n", 112 | "\n", 113 | "# If you already loaded the data into a PySpark Data Frame:\n", 114 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 115 | "\n", 116 | "\n", 117 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 118 | "batch.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## 4. Validate the batch with Validation Operators\n", 126 | "\n", 127 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 128 | "multiple expectation suites and the actions that should be taken after validation.\n", 129 | "\n", 130 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 131 | "\n", 132 | "* validating a group of batches that are logically related\n", 133 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 134 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 135 | "\n", 136 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 146 | "\n", 147 | "\"\"\"\n", 148 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 149 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 150 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 151 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 152 | "be None and run_time will default to the current UTC datetime.\n", 153 | "\"\"\"\n", 154 | "\n", 155 | "run_id = {\n", 156 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 157 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 158 | "}\n", 159 | "\n", 160 | "results = context.run_validation_operator(\n", 161 | " \"action_list_operator\",\n", 162 | " assets_to_validate=[batch],\n", 163 | " run_id=run_id)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "## 5. View the Validation Results in Data Docs\n", 171 | "\n", 172 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 173 | "\n", 174 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "context.open_data_docs()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Congratulations! You ran Validations!\n", 191 | "\n", 192 | "## Next steps:\n", 193 | "\n", 194 | "### 1. Read about the typical workflow with Great Expectations:\n", 195 | "\n", 196 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 197 | "\n", 198 | "### 2. Explore the documentation & community\n", 199 | "\n", 200 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers."
201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "Python 3", 214 | "language": "python", 215 | "name": "python3" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": { 219 | "name": "ipython", 220 | "version": 3 221 | }, 222 | "file_extension": ".py", 223 | "mimetype": "text/x-python", 224 | "name": "python", 225 | "nbconvert_exporter": "python", 226 | "pygments_lexer": "ipython3", 227 | "version": "3.7.0" 228 | }, 229 | "pycharm": { 230 | "stem_cell": { 231 | "cell_type": "raw", 232 | "source": [], 233 | "metadata": { 234 | "collapsed": false 235 | } 236 | } 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/notebooks/sql/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SqlAlchemyDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SqlAlchemyDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate an entire table or view in your database's default schema:\n", 110 | "batch_kwargs = {'table': \"YOUR_TABLE\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you would like to validate an entire table or view from a non-default schema in your database:\n", 113 | "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "# If you would like to validate the result set of a query:\n", 116 | "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 121 | "batch.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## 4. Validate the batch with Validation Operators\n", 129 | "\n", 130 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 131 | "multiple expectation suites and the actions that should be taken after validation.\n", 132 | "\n", 133 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 134 | "\n", 135 | "* validating a group of batches that are logically related\n", 136 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 137 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 138 | "\n", 139 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 149 | "\n", 150 | "\"\"\"\n", 151 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 152 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 153 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 154 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 155 | "be None and run_time will default to the current UTC datetime.\n", 156 | "\"\"\"\n", 157 | "\n", 158 | "run_id = {\n", 159 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 160 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 161 | "}\n", 162 | "\n", 163 | "results = context.run_validation_operator(\n", 164 | " \"action_list_operator\",\n", 165 | " assets_to_validate=[batch],\n", 166 | " run_id=run_id)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## 5. View the Validation Results in Data Docs\n", 174 | "\n", 175 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 176 | "\n", 177 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "context.open_data_docs()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## Congratulations! You ran Validations!\n", 194 | "\n", 195 | "## Next steps:\n", 196 | "\n", 197 | "### 1. Read about the typical workflow with Great Expectations:\n", 198 | "\n", 199 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 200 | "\n", 201 | "### 2. Explore the documentation & community\n", 202 | "\n", 203 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers."
204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [] 212 | } 213 | ], 214 | "metadata": { 215 | "kernelspec": { 216 | "display_name": "Python 3", 217 | "language": "python", 218 | "name": "python3" 219 | }, 220 | "language_info": { 221 | "codemirror_mode": { 222 | "name": "ipython", 223 | "version": 3 224 | }, 225 | "file_extension": ".py", 226 | "mimetype": "text/x-python", 227 | "name": "python", 228 | "nbconvert_exporter": "python", 229 | "pygments_lexer": "ipython3", 230 | "version": "3.7.0" 231 | }, 232 | "pycharm": { 233 | "stem_cell": { 234 | "cell_type": "raw", 235 | "source": [], 236 | "metadata": { 237 | "collapsed": false 238 | } 239 | } 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 4 244 | } 245 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/README.md: -------------------------------------------------------------------------------- 1 | # Getting started with Great Expectations tutorial - v3 (Batch Request) API 2 | 3 | This repository contains the final version of the "Getting started with Great Expectations" tutorial in the Great 4 | Expectations docs. This repo can be used as a demo and to explore a complete Great Expectations deployment. 5 | 6 | **THIS VERSION WAS CREATED WITH THE V3 (BATCH REQUEST) GREAT EXPECTATIONS API**, which is available in Great Expectations 7 | version 0.13.x and above. 8 | 9 | ## 1. How to run through the tutorial 10 | [Please follow the tutorial in our docs for instructions!](https://docs.greatexpectations.io/en/latest/guides/tutorials/getting_started_v3_api.html) 11 | 12 | ## 2. How to use this repo to explore and demo Great Expectations 13 | 14 | ### The `data` directory 15 | 16 | The CSV files in the data directory are yellow taxi trip data that have been downloaded from the NYC taxi data website: 17 | * [TLC trip record data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) 18 | * [Data dictionary](https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf) 19 | 20 | We created 10,000-row samples (using the Pandas ``sample`` function) from the original CSV files for convenience and manually added some breaking changes (0s in the passenger_count column) to demonstrate potential data issues.
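For reference, a sample like this can be produced with a few lines of Pandas. The sketch below is illustrative only: the input file name, random seed, and the exact fraction of corrupted rows are assumptions, not a record of the script that was actually used.

```python
import pandas as pd

# Load one month of the original TLC yellow taxi data
# (the full file downloaded from the NYC TLC site; name assumed).
df = pd.read_csv("yellow_tripdata_2019-01.csv")

# Draw a reproducible 10,000-row sample.
sample = df.sample(n=10_000, random_state=42)

# Manually inject a "breaking change": zero out passenger_count for a
# small fraction of rows so the Expectation Suite has something to catch.
bad_rows = sample.sample(frac=0.01, random_state=42).index
sample.loc[bad_rows, "passenger_count"] = 0

sample.to_csv("data/yellow_tripdata_sample_2019-01.csv", index=False)
```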
21 | 22 | In a future version of this tutorial, we might use "naturally occurring" data bugs :) 23 | 24 | ### The `great_expectations` directory 25 | Currently, this demo contains the following: 26 | * A `great_expectations.yml` file that's configured to use the top-level `data` directory as a Datasource. You will not need to set up anything to get it to work. 27 | * Two Expectation Suites, `taxi.demo` and `taxi.demo_with_custom_expectation`, each containing a handful of Expectations (the latter also includes a custom Expectation) 28 | * Two Checkpoints, `my_checkpoint` and `my_checkpoint_with_custom_expectation`, that are set up to run these suites against the February data set 29 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/.gitignore: -------------------------------------------------------------------------------- 1 | uncommitted/ -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/checkpoints/my_checkpoint.yml: -------------------------------------------------------------------------------- 1 | name: my_checkpoint 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: data__dir 25 | data_connector_name: data__dir_example_data_connector 26 | data_asset_name: yellow_tripdata_sample_2019-02.csv 27 | data_connector_query: 28 | index: -1 29 | expectation_suite_name: taxi.demo 30 | profilers: [] 31 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/checkpoints/my_checkpoint_with_custom_expectation.yml: -------------------------------------------------------------------------------- 1 | name: my_checkpoint_with_custom_expectation 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: data__dir 25 | data_connector_name: data__dir_example_data_connector 26 | data_asset_name: yellow_tripdata_sample_2019-02.csv 27 | data_connector_query: 28 | index: -1 29 | expectation_suite_name: taxi.demo_with_custom_expectation 30 | profilers: [] 31 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/expectations/.ge_store_backend_id: -------------------------------------------------------------------------------- 1 | store_backend_id = 2c8220c3-63db-42c7-be97-b7909cc59f8b
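As a quick sketch of how a Checkpoint like `my_checkpoint` above can be run programmatically (assuming Great Expectations 0.13.8+ with new-style Checkpoints; the exact `run_checkpoint` signature and result shape may vary slightly across 0.13.x releases):

```python
import great_expectations as ge

# Instantiate the project's Data Context (run from inside
# getting_started_tutorial_final_v3_api/).
context = ge.data_context.DataContext()

# Execute the Checkpoint defined in checkpoints/my_checkpoint.yml, which
# validates the February sample against the taxi.demo suite.
result = context.run_checkpoint(checkpoint_name="my_checkpoint")
print("Validation succeeded:", result.success)
```

The same Checkpoint can also be run from the CLI, e.g. `great_expectations --v3-api checkpoint run my_checkpoint` (in the 0.13.x CLI the `--v3-api` flag was needed for v3-API commands).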
-------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/expectations/taxi/demo.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "taxi.demo", 4 | "expectations": [ 5 | { 6 | "expectation_type": "expect_table_columns_to_match_ordered_list", 7 | "kwargs": { 8 | "column_list": [ 9 | "vendor_id", 10 | "pickup_datetime", 11 | "dropoff_datetime", 12 | "passenger_count", 13 | "trip_distance", 14 | "rate_code_id", 15 | "store_and_fwd_flag", 16 | "pickup_location_id", 17 | "dropoff_location_id", 18 | "payment_type", 19 | "fare_amount", 20 | "extra", 21 | "mta_tax", 22 | "tip_amount", 23 | "tolls_amount", 24 | "improvement_surcharge", 25 | "total_amount", 26 | "congestion_surcharge" 27 | ] 28 | }, 29 | "meta": {} 30 | }, 31 | { 32 | "expectation_type": "expect_table_row_count_to_be_between", 33 | "kwargs": { 34 | "max_value": 10000, 35 | "min_value": 10000 36 | }, 37 | "meta": {} 38 | }, 39 | { 40 | "expectation_type": "expect_column_min_to_be_between", 41 | "kwargs": { 42 | "column": "passenger_count", 43 | "max_value": 1, 44 | "min_value": 1 45 | }, 46 | "meta": {} 47 | }, 48 | { 49 | "expectation_type": "expect_column_max_to_be_between", 50 | "kwargs": { 51 | "column": "passenger_count", 52 | "max_value": 6, 53 | "min_value": 6 54 | }, 55 | "meta": {} 56 | }, 57 | { 58 | "expectation_type": "expect_column_mean_to_be_between", 59 | "kwargs": { 60 | "column": "passenger_count", 61 | "max_value": 1.5716, 62 | "min_value": 1.5716 63 | }, 64 | "meta": {} 65 | }, 66 | { 67 | "expectation_type": "expect_column_median_to_be_between", 68 | "kwargs": { 69 | "column": "passenger_count", 70 | "max_value": 1.0, 71 | "min_value": 1.0 72 | }, 73 | "meta": {} 74 | }, 75 | { 76 | "expectation_type": "expect_column_quantile_values_to_be_between", 77 | "kwargs": { 78 | "allow_relative_error": "lower", 79 | "column": "passenger_count", 80 | "quantile_ranges": { 81 | "quantiles": [ 82 | 0.05, 83 | 0.25, 84 | 0.5, 85 | 0.75, 86 | 0.95 87 | ], 88 | "value_ranges": [ 89 | [ 90 | 1, 91 | 1 92 | ], 93 | [ 94 | 1, 95 | 1 96 | ], 97 | [ 98 | 1, 99 | 1 100 | ], 101 | [ 102 | 2, 103 | 2 104 | ], 105 | [ 106 | 5, 107 | 5 108 | ] 109 | ] 110 | } 111 | }, 112 | "meta": {} 113 | }, 114 | { 115 | "expectation_type": "expect_column_values_to_be_in_set", 116 | "kwargs": { 117 | "column": "passenger_count", 118 | "value_set": [ 119 | 1, 120 | 2, 121 | 3, 122 | 4, 123 | 5, 124 | 6 125 | ] 126 | }, 127 | "meta": {} 128 | }, 129 | { 130 | "expectation_type": "expect_column_values_to_not_be_null", 131 | "kwargs": { 132 | "column": "passenger_count" 133 | }, 134 | "meta": {} 135 | }, 136 | { 137 | "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", 138 | "kwargs": { 139 | "column": "passenger_count", 140 | "max_value": 0.0006, 141 | "min_value": 0.0006 142 | }, 143 | "meta": {} 144 | }, 145 | { 146 | "expectation_type": "expect_column_values_to_be_in_type_list", 147 | "kwargs": { 148 | "column": "passenger_count", 149 | "type_list": [ 150 | "INTEGER", 151 | "integer", 152 | "int", 153 | "int_", 154 | "int8", 155 | "int16", 156 | "int32", 157 | "int64", 158 | "uint8", 159 | "uint16", 160 | "uint32", 161 | "uint64", 162 | "INT", 163 | "TINYINT", 164 | "BYTEINT", 165 | "SMALLINT", 166 | "BIGINT", 167 | "IntegerType", 168 | "LongType", 169 | "DECIMAL" 170 | ] 171 | }, 172 | "meta": {} 173 | } 174 | ], 175 | "meta": { 176 | 
"citations": [ 177 | { 178 | "batch_definition": null, 179 | "batch_kwargs": null, 180 | "batch_markers": null, 181 | "batch_parameters": null, 182 | "batch_request": { 183 | "data_asset_name": "yellow_tripdata_sample_2019-01.csv", 184 | "data_connector_name": "data__dir_example_data_connector", 185 | "datasource_name": "data__dir", 186 | "limit": 1000 187 | }, 188 | "batch_spec": null, 189 | "citation_date": "2021-04-25T22:43:50.694402Z", 190 | "comment": "Created suite added via CLI" 191 | } 192 | ], 193 | "great_expectations_version": "0.13.19" 194 | } 195 | } -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/expectations/taxi/demo_with_custom_expectation.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "taxi.demo_with_custom_expectation", 4 | "expectations": [ 5 | { 6 | "expectation_type": "expect_column_values_to_be_in_set", 7 | "kwargs": { 8 | "column": "passenger_count", 9 | "value_set": [ 10 | 1, 11 | 2, 12 | 3, 13 | 4, 14 | 5, 15 | 6 16 | ] 17 | }, 18 | "meta": {} 19 | }, 20 | { 21 | "expectation_type": "expect_column_max_to_be_between_custom", 22 | "kwargs": { 23 | "column": "passenger_count", 24 | "max_value": 6, 25 | "min_value": 1 26 | }, 27 | "meta": {} 28 | } 29 | ] 30 | } 31 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/great_expectations.yml: -------------------------------------------------------------------------------- 1 | # Welcome to Great Expectations! Always know what to expect from your data. 2 | # 3 | # Here you can define datasources, batch kwargs generators, integrations and 4 | # more. This file is intended to be committed to your repo. For help with 5 | # configuration please: 6 | # - Read our docs: https://docs.greatexpectations.io/en/latest/reference/spare_parts/data_context_reference.html#configuration 7 | # - Join our slack channel: http://greatexpectations.io/slack 8 | 9 | # config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility 10 | # It is auto-generated and usually does not need to be changed. 11 | config_version: 3.0 12 | 13 | # Datasources tell Great Expectations where your data lives and how to get it. 14 | # You can use the CLI command `great_expectations datasource new` to help you 15 | # add a new datasource. Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/datasource.html 16 | datasources: 17 | data__dir: 18 | execution_engine: 19 | module_name: great_expectations.execution_engine 20 | class_name: PandasExecutionEngine 21 | module_name: great_expectations.datasource 22 | class_name: Datasource 23 | data_connectors: 24 | data__dir_example_data_connector: 25 | base_directory: ../../data 26 | default_regex: 27 | group_names: 28 | - data_asset_name 29 | pattern: (.*) 30 | module_name: great_expectations.datasource.data_connector 31 | class_name: InferredAssetFilesystemDataConnector 32 | 33 | # This config file supports variable substitution which enables: 1) keeping 34 | # secrets out of source control & 2) environment-based configuration changes 35 | # such as staging vs prod. 
36 | # 37 | # When GE encounters substitution syntax (like `my_key: ${my_value}` or 38 | # `my_key: $my_value`) in the great_expectations.yml file, it will attempt 39 | # to replace the value of `my_key` with the value from an environment 40 | # variable `my_value` or a corresponding key read from this config file, 41 | # which is defined through the `config_variables_file_path`. 42 | # Environment variables take precedence over variables defined here. 43 | # 44 | # Substitution values defined here can be a simple (non-nested) value, 45 | # nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR}) 46 | # 47 | # 48 | # https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_data_contexts/how_to_use_a_yaml_file_or_environment_variables_to_populate_credentials.html 49 | 50 | 51 | config_variables_file_path: uncommitted/config_variables.yml 52 | 53 | # The plugins_directory will be added to your Python path for custom modules 54 | # used to override and extend Great Expectations. 55 | plugins_directory: plugins/ 56 | 57 | stores: 58 | # Stores are configurable places to store things like Expectations, Validations, 59 | # Data Docs, and more. These are for advanced users only - most users can simply 60 | # leave this section alone. 61 | # 62 | # Three stores are required: expectations, validations, and 63 | # evaluation_parameters, and each must exist with a valid store entry. Additional 64 | # stores can be configured for uses such as data_docs, etc. 65 | expectations_store: 66 | class_name: ExpectationsStore 67 | store_backend: 68 | class_name: TupleFilesystemStoreBackend 69 | base_directory: expectations/ 70 | 71 | validations_store: 72 | class_name: ValidationsStore 73 | store_backend: 74 | class_name: TupleFilesystemStoreBackend 75 | base_directory: uncommitted/validations/ 76 | 77 | evaluation_parameter_store: 78 | # Evaluation Parameters enable dynamic expectations. Read more here: 79 | # https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html 80 | class_name: EvaluationParameterStore 81 | 82 | checkpoint_store: 83 | class_name: CheckpointStore 84 | store_backend: 85 | class_name: TupleFilesystemStoreBackend 86 | suppress_store_backend_id: true 87 | base_directory: checkpoints/ 88 | 89 | expectations_store_name: expectations_store 90 | validations_store_name: validations_store 91 | evaluation_parameter_store_name: evaluation_parameter_store 92 | checkpoint_store_name: checkpoint_store 93 | 94 | data_docs_sites: 95 | # Data Docs make it simple to visualize data quality in your project. These 96 | # include Expectations, Validations & Profiles. They are built for all 97 | # Datasources from JSON artifacts in the local repo including validations & 98 | # profiles from the uncommitted directory.
Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/data_docs.html 99 | local_site: 100 | class_name: SiteBuilder 101 | # set to false to hide how-to buttons in Data Docs 102 | show_how_to_buttons: true 103 | store_backend: 104 | class_name: TupleFilesystemStoreBackend 105 | base_directory: uncommitted/data_docs/local_site/ 106 | site_index_builder: 107 | class_name: DefaultSiteIndexBuilder 108 | 109 | anonymous_usage_statistics: 110 | enabled: true 111 | data_context_id: 2c8220c3-63db-42c7-be97-b7909cc59f8b 112 | notebooks: 113 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/notebooks/pandas/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type PandasDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you already loaded the data into a Pandas Data Frame:\n", 113 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "\n", 116 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 117 | "batch.head()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## 4. Validate the batch with Validation Operators\n", 125 | "\n", 126 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 127 | "multiple expectation suites and the actions that should be taken after validation.\n", 128 | "\n", 129 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 130 | "\n", 131 | "* validating a group of batches that are logically related\n", 132 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 133 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 134 | "\n", 135 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 145 | "\n", 146 | "\"\"\"\n", 147 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 148 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 149 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 150 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 151 | "be None and run_time will default to the current UTC datetime.\n", 152 | "\"\"\"\n", 153 | "\n", 154 | "run_id = {\n", 155 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 156 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 157 | "}\n", 158 | "\n", 159 | "results = context.run_validation_operator(\n", 160 | " \"action_list_operator\",\n", 161 | " assets_to_validate=[batch],\n", 162 | " run_id=run_id)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## 5. View the Validation Results in Data Docs\n", 170 | "\n", 171 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 172 | "\n", 173 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "context.open_data_docs()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## Congratulations! You ran Validations!\n", 190 | "\n", 191 | "## Next steps:\n", 192 | "\n", 193 | "### 1. Read about the typical workflow with Great Expectations:\n", 194 | "\n", 195 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 196 | "\n", 197 | "### 2. Explore the documentation & community\n", 198 | "\n", 199 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers."
200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.7.0" 227 | }, 228 | "pycharm": { 229 | "stem_cell": { 230 | "cell_type": "raw", 231 | "source": [], 232 | "metadata": { 233 | "collapsed": false 234 | } 235 | } 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 4 240 | } 241 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/notebooks/spark/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SparkDFDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SparkDFDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true')\n", 112 | "\n", 113 | "# If you already loaded the data into a PySpark Data Frame:\n", 114 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 115 | "\n", 116 | "\n", 117 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 118 | "batch.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## 4. Validate the batch with Validation Operators\n", 126 | "\n", 127 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 128 | "multiple expectation suites and the actions that should be taken after validation.\n", 129 | "\n", 130 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 131 | "\n", 132 | "* validating a group of batches that are logically related\n", 133 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 134 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 135 | "\n", 136 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 146 | "\n", 147 | "\"\"\"\n", 148 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 149 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 150 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 151 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 152 | "be None and run_time will default to the current UTC datetime.\n", 153 | "\"\"\"\n", 154 | "\n", 155 | "run_id = {\n", 156 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 157 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 158 | "}\n", 159 | "\n", 160 | "results = context.run_validation_operator(\n", 161 | " \"action_list_operator\",\n", 162 | " assets_to_validate=[batch],\n", 163 | " run_id=run_id)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "## 5. View the Validation Results in Data Docs\n", 171 | "\n", 172 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 173 | "\n", 174 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "context.open_data_docs()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Congratulations! You ran Validations!\n", 191 | "\n", 192 | "## Next steps:\n", 193 | "\n", 194 | "### 1. Read about the typical workflow with Great Expectations:\n", 195 | "\n", 196 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 197 | "\n", 198 | "### 2. Explore the documentation & community\n", 199 | "\n", 200 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers."
201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "Python 3", 214 | "language": "python", 215 | "name": "python3" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": { 219 | "name": "ipython", 220 | "version": 3 221 | }, 222 | "file_extension": ".py", 223 | "mimetype": "text/x-python", 224 | "name": "python", 225 | "nbconvert_exporter": "python", 226 | "pygments_lexer": "ipython3", 227 | "version": "3.7.0" 228 | }, 229 | "pycharm": { 230 | "stem_cell": { 231 | "cell_type": "raw", 232 | "source": [], 233 | "metadata": { 234 | "collapsed": false 235 | } 236 | } 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/notebooks/sql/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SqlAlchemyDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SqlAlchemyDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate an entire table or view in your database's default schema:\n", 110 | "batch_kwargs = {'table': \"YOUR_TABLE\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you would like to validate an entire table or view from a non-default schema in your database:\n", 113 | "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "# If you would like to validate the result set of a query:\n", 116 | "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 121 | "batch.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## 4. Validate the batch with Validation Operators\n", 129 | "\n", 130 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 131 | "multiple expectation suites and the actions that should be taken after validation.\n", 132 | "\n", 133 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 134 | "\n", 135 | "* validating a group of batches that are logically related\n", 136 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 137 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 138 | "\n", 139 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 149 | "\n", 150 | "\"\"\"\n", 151 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 152 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 153 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 154 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n",
 155 | "be None and run_time will default to the current UTC datetime.\n",
 156 | "\"\"\"\n",
 157 | "\n",
 158 | "run_id = {\n",
 159 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n",
 160 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n",
 161 | "}\n",
 162 | "\n",
 163 | "results = context.run_validation_operator(\n",
 164 | " \"action_list_operator\",\n",
 165 | " assets_to_validate=[batch],\n",
 166 | " run_id=run_id)"
 167 | ]
 168 | },
 169 | {
 170 | "cell_type": "markdown",
 171 | "metadata": {},
 172 | "source": [
 173 | "## 5. View the Validation Results in Data Docs\n",
 174 | "\n",
 175 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created, which helps you communicate about your data with both machines and humans.\n",
 176 | "\n",
 177 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)"
 178 | ]
 179 | },
 180 | {
 181 | "cell_type": "code",
 182 | "execution_count": null,
 183 | "metadata": {},
 184 | "outputs": [],
 185 | "source": [
 186 | "context.open_data_docs()"
 187 | ]
 188 | },
 189 | {
 190 | "cell_type": "markdown",
 191 | "metadata": {},
 192 | "source": [
 193 | "## Congratulations! You ran Validations!\n",
 194 | "\n",
 195 | "## Next steps:\n",
 196 | "\n",
 197 | "### 1. Read about the typical workflow with Great Expectations:\n",
 198 | "\n",
 199 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n",
 200 | "\n",
 201 | "### 2. Explore the documentation & community\n",
 202 | "\n",
 203 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers."
 204 | ]
 205 | },
 206 | {
 207 | "cell_type": "code",
 208 | "execution_count": null,
 209 | "metadata": {},
 210 | "outputs": [],
 211 | "source": []
 212 | }
 213 | ],
 214 | "metadata": {
 215 | "kernelspec": {
 216 | "display_name": "Python 3",
 217 | "language": "python",
 218 | "name": "python3"
 219 | },
 220 | "language_info": {
 221 | "codemirror_mode": {
 222 | "name": "ipython",
 223 | "version": 3
 224 | },
 225 | "file_extension": ".py",
 226 | "mimetype": "text/x-python",
 227 | "name": "python",
 228 | "nbconvert_exporter": "python",
 229 | "pygments_lexer": "ipython3",
 230 | "version": "3.7.0"
 231 | },
 232 | "pycharm": {
 233 | "stem_cell": {
 234 | "cell_type": "raw",
 235 | "source": [],
 236 | "metadata": {
 237 | "collapsed": false
 238 | }
 239 | }
 240 | }
 241 | },
 242 | "nbformat": 4,
 243 | "nbformat_minor": 4
 244 | }
 245 | 
--------------------------------------------------------------------------------
/getting_started_tutorial_final_v3_api/great_expectations/plugins/column_custom_max_expectation.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Example of a custom expectation with a renderer.
 3 | 
 4 | This custom expectation can be run as part of a checkpoint with the script run_checkpoint_with_custom_expectation.py
 5 | in the getting_started_tutorial_final_v3_api directory, e.g. 
 6 | 
 7 | getting_started_tutorial_final_v3_api$ python run_checkpoint_with_custom_expectation.py
 8 | 
 9 | See corresponding documentation:
 10 | * https://docs.greatexpectations.io/en/latest/guides/how_to_guides/creating_and_editing_expectations/how_to_create_custom_expectations.html
 11 | * https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_data_docs/how_to_create_renderers_for_custom_expectations.html
 12 | """
 13 | 
 14 | from great_expectations.core import ExpectationConfiguration, ExpectationValidationResult
 15 | from great_expectations.execution_engine import (
 16 |     ExecutionEngine,
 17 |     PandasExecutionEngine,
 18 |     SparkDFExecutionEngine,
 19 |     SqlAlchemyExecutionEngine,
 20 | )
 21 | from great_expectations.expectations.expectation import ColumnExpectation
 22 | from great_expectations.expectations.metrics import (
 23 |     ColumnMetricProvider,
 24 |     column_aggregate_value, column_aggregate_partial,
 25 | )
 26 | from great_expectations.expectations.metrics.import_manager import F, sa
 27 | from great_expectations.expectations.util import render_evaluation_parameter_string
 28 | from great_expectations.render.renderer.renderer import renderer
 29 | from great_expectations.render.types import RenderedStringTemplateContent, RenderedTableContent, RenderedBulletListContent, RenderedGraphContent
 30 | from great_expectations.render.util import num_to_str, parse_row_condition_string_pandas_engine, substitute_none_for_missing
 31 | from great_expectations.exceptions import InvalidExpectationConfigurationError
 32 | from typing import Any, Dict, List, Optional, Union
 33 | 
 34 | 
 35 | class ColumnCustomMax(ColumnMetricProvider):
 36 |     """MetricProvider class for a custom aggregate max metric."""
 37 | 
 38 |     metric_name = "column.aggregate.custom.max"
 39 | 
 40 |     @column_aggregate_value(engine=PandasExecutionEngine)
 41 |     def _pandas(cls, column, **kwargs):
 42 |         """Pandas Max Implementation"""
 43 |         return column.max()
 44 | 
 45 |     @column_aggregate_partial(engine=SqlAlchemyExecutionEngine)
 46 |     def _sqlalchemy(cls, column, **kwargs):
 47 |         """SqlAlchemy Max Implementation"""
 48 |         return sa.func.max(column)
 49 | 
 50 |     @column_aggregate_partial(engine=SparkDFExecutionEngine)
 51 |     def _spark(cls, column, _table, _column_name, **kwargs):
 52 |         """Spark Max Implementation"""
 53 |         types = dict(_table.dtypes)  # column types are available for type-aware implementations (unused in this simple example)
 54 |         return F.max(column)
 55 | 
 56 | 
 57 | class ExpectColumnMaxToBeBetweenCustom(ColumnExpectation):
 58 |     # Setting necessary computation metric dependencies and defining kwargs, as well as assigning kwargs default values
 59 |     metric_dependencies = ("column.aggregate.custom.max",)
 60 |     success_keys = ("min_value", "strict_min", "max_value", "strict_max")
 61 | 
 62 |     # Default values
 63 |     default_kwarg_values = {
 64 |         "row_condition": None,
 65 |         "condition_parser": None,
 66 |         "min_value": None,
 67 |         "max_value": None,
 68 |         "strict_min": None,
 69 |         "strict_max": None,
 70 |         "mostly": 1
 71 |     }
 72 | 
 73 |     def _validate(
 74 |         self,
 75 |         configuration: ExpectationConfiguration,
 76 |         metrics: Dict,
 77 |         runtime_configuration: dict = None,
 78 |         execution_engine: ExecutionEngine = None,
 79 |     ):
 80 |         """Validates the given data against the set minimum and maximum value thresholds for the column max"""
 81 |         column_max = metrics["column.aggregate.custom.max"]
 82 | 
 83 |         # Obtaining components needed for validation
 84 |         min_value = self.get_success_kwargs(configuration).get("min_value")
 85 |         strict_min = self.get_success_kwargs(configuration).get("strict_min")
 86 |         max_value = self.get_success_kwargs(configuration).get("max_value")
 87 |         strict_max = self.get_success_kwargs(configuration).get("strict_max")
 88 | 
 89 |         # Checking if the observed max lies between the thresholds
 90 |         if min_value is not None:
 91 |             if strict_min:
 92 |                 above_min = column_max > min_value
 93 |             else:
 94 |                 above_min = column_max >= min_value
 95 |         else:
 96 |             above_min = True
 97 | 
 98 |         if max_value is not None:
 99 |             if strict_max:
 100 |                 below_max = column_max < max_value
 101 |             else:
 102 |                 below_max = column_max <= max_value
 103 |         else:
 104 |             below_max = True
 105 | 
 106 |         success = above_min and below_max
 107 | 
 108 |         return {"success": success, "result": {"observed_value": column_max}}
 109 | 
 110 |     def validate_configuration(self, configuration: Optional[ExpectationConfiguration]):
 111 |         """
 112 |         Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
 113 |         necessary configuration arguments have been provided for the validation of the expectation.
 114 | 
 115 |         Args:
 116 |             configuration (Optional[ExpectationConfiguration]): \
 117 |                 An optional Expectation Configuration entry that will be used to configure the expectation
 118 |         Returns:
 119 |             True if the configuration has been validated successfully. Otherwise, raises an exception
 120 |         """
 121 |         min_val = None
 122 |         max_val = None
 123 | 
 124 |         # Setting up a configuration
 125 |         super().validate_configuration(configuration)
 126 |         if configuration is None:
 127 |             configuration = self.configuration
 128 | 
 129 |         # Ensuring basic configuration parameters are properly set
 130 |         try:
 131 |             assert (
 132 |                 "column" in configuration.kwargs
 133 |             ), "'column' parameter is required for column expectations"
 134 |         except AssertionError as e:
 135 |             raise InvalidExpectationConfigurationError(str(e))
 136 | 
 137 |         # Validating that minimum and maximum values are of the proper format and type
 138 |         if "min_value" in configuration.kwargs:
 139 |             min_val = configuration.kwargs["min_value"]
 140 | 
 141 |         if "max_value" in configuration.kwargs:
 142 |             max_val = configuration.kwargs["max_value"]
 143 | 
 144 |         try:
 145 |             # Ensuring a proper interval has been provided
 146 |             assert (
 147 |                 min_val is not None or max_val is not None
 148 |             ), "min_value and max_value cannot both be None"
 149 |             assert min_val is None or isinstance(
 150 |                 min_val, (float, int)
 151 |             ), "Provided min threshold must be a number"
 152 |             assert max_val is None or isinstance(
 153 |                 max_val, (float, int)
 154 |             ), "Provided max threshold must be a number"
 155 |         except AssertionError as e:
 156 |             raise InvalidExpectationConfigurationError(str(e))
 157 | 
 158 |     @classmethod
 159 |     @renderer(renderer_type="renderer.prescriptive")
 160 |     @render_evaluation_parameter_string
 161 |     def _prescriptive_renderer(
 162 |         cls,
 163 |         configuration: ExpectationConfiguration = None,
 164 |         result: ExpectationValidationResult = None,
 165 |         language: str = None,
 166 |         runtime_configuration: dict = None,
 167 |         **kwargs,
 168 |     ) -> List[Union[dict, str, RenderedStringTemplateContent, RenderedTableContent, RenderedBulletListContent,
 169 |                     RenderedGraphContent, Any]]:
 170 |         runtime_configuration = runtime_configuration or {}
 171 |         include_column_name = runtime_configuration.get("include_column_name", True)
 172 |         include_column_name = (
 173 |             include_column_name if include_column_name is not None else True
 174 |         )
 175 |         styling = runtime_configuration.get("styling")
 176 |         # get params dict with all expected kwargs
 177 |         params = substitute_none_for_missing(
 178 |             configuration.kwargs,
 179 |             [
 180 |                 "column",
 181 |                 "min_value",
 182 |                 "max_value",
 183 |                 "mostly",
 184 |                 "row_condition",
 185 |                 "condition_parser",
 186 |                 "strict_min",
 187 |                 "strict_max",
 188 |             ],
 189 |         )
 190 | 
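 191 |         # (Editor's note, not part of the original tutorial:) the prescriptive renderer
 192 |         # turns the expectation's kwargs into the human-readable sentence shown in Data
 193 |         # Docs, e.g. "passenger_count maximum value must always be less than or equal to 6."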
 194 |         # Build the string template for this expectation
 195 |         if (params["min_value"] is None) and (params["max_value"] is None):
 196 |             template_str = "maximum value may take any numerical value."
 197 |         else:
 198 |             at_least_str = (
 199 |                 "greater than"
 200 |                 if params.get("strict_min") is True
 201 |                 else "greater than or equal to"
 202 |             )
 203 |             at_most_str = (
 204 |                 "less than" if params.get("strict_max") is True else "less than or equal to"
 205 |             )
 206 | 
 207 |             if params["mostly"] is not None:
 208 |                 params["mostly_pct"] = num_to_str(
 209 |                     params["mostly"] * 100, precision=15, no_scientific=True
 210 |                 )
 211 | 
 212 |                 if params["min_value"] is not None and params["max_value"] is not None:
 213 |                     template_str = f"maximum value must be {at_least_str} $min_value and {at_most_str} $max_value, at least $mostly_pct % of the time."
 214 | 
 215 |                 elif params["min_value"] is None:
 216 |                     template_str = f"maximum value must be {at_most_str} $max_value, at least $mostly_pct % of the time."
 217 | 
 218 |                 elif params["max_value"] is None:
 219 |                     template_str = f"maximum value must be {at_least_str} $min_value, at least $mostly_pct % of the time."
 220 |             else:
 221 |                 if params["min_value"] is not None and params["max_value"] is not None:
 222 |                     template_str = f"maximum value must always be {at_least_str} $min_value and {at_most_str} $max_value."
 223 | 
 224 |                 elif params["min_value"] is None:
 225 |                     template_str = f"maximum value must always be {at_most_str} $max_value."
 226 | 
 227 |                 elif params["max_value"] is None:
 228 |                     template_str = f"maximum value must always be {at_least_str} $min_value."
 229 | 
 230 |         if include_column_name:
 231 |             template_str = "$column " + template_str
 232 | 
 233 |         if params["row_condition"] is not None:
 234 |             (
 235 |                 conditional_template_str,
 236 |                 conditional_params,
 237 |             ) = parse_row_condition_string_pandas_engine(params["row_condition"])
 238 |             template_str = conditional_template_str + ", then " + template_str
 239 |             params.update(conditional_params)
 240 | 
 241 |         # return simple string
 242 |         return [
 243 |             RenderedStringTemplateContent(
 244 |                 **{
 245 |                     "content_block_type": "string_template",
 246 |                     "string_template": {
 247 |                         "template": template_str,
 248 |                         "params": params,
 249 |                         "styling": styling,
 250 |                     },
 251 |                 }
 252 |             )
 253 |         ]
 254 | 
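 255 | # --- Usage sketch (editor's addition, not part of the original tutorial) ---
 256 | # Defining ExpectColumnMaxToBeBetweenCustom above registers it with Great
 257 | # Expectations, so once this module is imported the expectation can be invoked
 258 | # on a Validator by its snake_case name, e.g. (assuming a validator for the
 259 | # taxi demo data, with hypothetical thresholds):
 260 | #
 261 | #     validator.expect_column_max_to_be_between_custom(
 262 | #         column="fare_amount", min_value=0, max_value=1000
 263 | #     )
 264 | #
 265 | # run_checkpoint_with_custom_expectation.py relies on the same
 266 | # registration-by-import behavior.
--------------------------------------------------------------------------------
/getting_started_tutorial_final_v3_api/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css:
--------------------------------------------------------------------------------
 1 | /*index page*/
 2 | .ge-index-page-site-name-title {}
 3 | .ge-index-page-table-container {}
 4 | .ge-index-page-table {}
 5 | .ge-index-page-table-profiling-links-header {}
 6 | .ge-index-page-table-expectations-links-header {}
 7 | .ge-index-page-table-validations-links-header {}
 8 | .ge-index-page-table-profiling-links-list {}
 9 | .ge-index-page-table-profiling-links-item {}
 10 | .ge-index-page-table-expectation-suite-link {}
 11 | .ge-index-page-table-validation-links-list {}
 12 | .ge-index-page-table-validation-links-item {}
 13 | 
 14 | /*breadcrumbs*/
 15 | .ge-breadcrumbs {}
 16 | .ge-breadcrumbs-item {}
 17 | 
 18 | /*navigation sidebar*/
 19 | .ge-navigation-sidebar-container {}
 20 | .ge-navigation-sidebar-content {}
 21 | .ge-navigation-sidebar-title {}
 22 | .ge-navigation-sidebar-link {}
 23 | 
--------------------------------------------------------------------------------
/getting_started_tutorial_final_v3_api/run_checkpoint_with_custom_expectation.py: 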
--------------------------------------------------------------------------------
 1 | import great_expectations as ge
 2 | 
 3 | # Add the great_expectations directory to sys.path so the plugins package is importable
 4 | import sys, os
 5 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'great_expectations'))
 6 | 
 7 | from plugins import column_custom_max_expectation  # noqa: F401 -- importing the module registers the custom expectation
 8 | 
 9 | context = ge.get_context()
 10 | context.run_checkpoint(checkpoint_name="my_checkpoint_with_custom_expectation")
 11 | context.open_data_docs()
 12 | 
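 13 | # --- Optional sketch (editor's addition, not part of the original tutorial) ---
 14 | # To gate a pipeline on the validation outcome, capture the CheckpointResult
 15 | # returned by run_checkpoint; it exposes an overall `success` flag:
 16 | #
 17 | #     result = context.run_checkpoint(checkpoint_name="my_checkpoint_with_custom_expectation")
 18 | #     if not result.success:
 19 | #         raise SystemExit("Checkpoint validation failed")
--------------------------------------------------------------------------------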