├── .gitignore ├── README.md ├── data ├── yellow_tripdata_sample_2019-01.csv └── yellow_tripdata_sample_2019-02.csv ├── ge_dbt_airflow_tutorial ├── Dockerfile ├── README.md ├── airflow │ ├── ge_tutorials_dag_with_great_expectations.py │ └── ge_tutorials_dag_without_great_expectations.py ├── data │ ├── .gitkeep │ ├── npi_small.csv │ └── state_abbreviations.csv ├── dbt │ ├── dbt_project.yml │ └── models │ │ ├── count_providers_by_state.sql │ │ ├── npi_with_state.sql │ │ ├── sources.yml │ │ └── staging │ │ ├── stg_npi.sql │ │ └── stg_state_abbreviations.sql ├── deploy │ ├── config │ │ └── airflow.cfg │ └── script │ │ └── entrypoint.sh ├── docker-compose.yml ├── example_dbt_profile.yml ├── great_expectations_projects │ └── final │ │ └── great_expectations │ │ ├── .gitignore │ │ ├── config_variables.yml │ │ ├── expectations │ │ ├── count_providers_by_state │ │ │ └── critical.json │ │ ├── npi_small_db_table │ │ │ └── critical.json │ │ ├── npi_small_file │ │ │ └── critical.json │ │ └── state_abbreviations_file │ │ │ └── critical.json │ │ ├── great_expectations.yml │ │ ├── notebooks │ │ ├── pandas │ │ │ └── validation_playground.ipynb │ │ ├── spark │ │ │ └── validation_playground.ipynb │ │ └── sql │ │ │ └── validation_playground.ipynb │ │ └── plugins │ │ └── custom_data_docs │ │ └── styles │ │ └── data_docs_custom_styles.css ├── images │ ├── dbt_dag.png │ ├── enable_dag.gif │ ├── pipeline_airflow_dag_with_ge.png │ └── pipeline_airflow_dag_without_ge.png └── requirements.txt ├── getting_started_tutorial_final_v2_api ├── README.md └── great_expectations │ ├── .gitignore │ ├── checkpoints │ ├── .ge_store_backend_id │ └── my_chk.yml │ ├── expectations │ ├── .ge_store_backend_id │ └── taxi │ │ └── demo.json │ ├── great_expectations.yml │ ├── notebooks │ ├── pandas │ │ └── validation_playground.ipynb │ ├── spark │ │ └── validation_playground.ipynb │ └── sql │ │ └── validation_playground.ipynb │ └── plugins │ └── custom_data_docs │ └── styles │ └── data_docs_custom_styles.css └── getting_started_tutorial_final_v3_api ├── README.md ├── great_expectations ├── .gitignore ├── checkpoints │ ├── my_checkpoint.yml │ └── my_checkpoint_with_custom_expectation.yml ├── expectations │ ├── .ge_store_backend_id │ └── taxi │ │ ├── demo.json │ │ └── demo_with_custom_expectation.json ├── great_expectations.yml ├── notebooks │ ├── pandas │ │ └── validation_playground.ipynb │ ├── spark │ │ └── validation_playground.ipynb │ └── sql │ │ └── validation_playground.ipynb └── plugins │ ├── column_custom_max_expectation.py │ └── custom_data_docs │ └── styles │ └── data_docs_custom_styles.css └── run_checkpoint_with_custom_expectation.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv/ 2 | ge_dbt_airflow_tutorial/target/ 3 | ge_dbt_airflow_tutorial/dbt_modules/ 4 | logs/ 5 | .venv/ 6 | __pycache__/ 7 | .ipynb_checkpoints 8 | */.ipynb_checkpoints/* 9 | **/.DS_Store 10 | .idea/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | *This repository is no longer maintained. Please see our current guides at https://docs.greatexpectations.io/docs/guides/setup/get_started_lp to get started.* 2 | 3 | # Great Expectations tutorials 4 | 5 | This repository contains the material for a number of Great Expectations tutorials. They all contain instructions in the respective README files. 
6 | 7 | **We invite community contributions for these tutorials!** 8 | 9 | 10 | ## getting_started_tutorial_final_v3_api [TBD] 11 | This example contains the final state of the "Getting started with Great Expectations" tutorial for the Great 12 | Expectations API v3 (Batch Request API), which is included in Great Expectations version 0.13 and above. 13 | It also acts as a starting point to explore and demo Great Expectations. See the README in the directory for details. 14 | 15 | ## getting_started_tutorial_final_v2_api 16 | This example contains the final state of the "Getting started with Great Expectations" tutorial for the Great Expectations 17 | API v2 (Batch Kwargs API), which applies to Great Expectations version 0.12.x and below. It also acts as a starting point 18 | to explore and demo Great Expectations. See the README in the directory for details. 19 | 20 | ## ge_dbt_airflow_tutorial 21 | This example demonstrates the use of Great Expectations in a data pipeline with dbt and Apache Airflow. 22 | See the README in the directory for details. **Note:** This tutorial currently requires an update to work with the 23 | new-style Checkpoints that were introduced in version 0.13.8. 24 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/Dockerfile: -------------------------------------------------------------------------------- 1 | # VERSION 1.10.9 2 | # AUTHOR: Matthieu "Puckel_" Roisil 3 | # DESCRIPTION: Basic Airflow container 4 | # BUILD: docker build --rm -t puckel/docker-airflow . 5 | # SOURCE: https://github.com/puckel/docker-airflow 6 | 7 | FROM python:3.7-slim-buster 8 | LABEL maintainer="Puckel_" 9 | 10 | # Never prompt the user for choices on installation/configuration of packages 11 | ENV DEBIAN_FRONTEND noninteractive 12 | ENV TERM linux 13 | 14 | # Airflow 15 | ARG AIRFLOW_VERSION=1.10.9 16 | ARG AIRFLOW_USER_HOME=/usr/local/airflow 17 | ARG AIRFLOW_DEPS="" 18 | ARG PYTHON_DEPS="" 19 | ENV AIRFLOW_HOME=${AIRFLOW_USER_HOME} 20 | 21 | # Define en_US.
22 | ENV LANGUAGE en_US.UTF-8 23 | ENV LANG en_US.UTF-8 24 | ENV LC_ALL en_US.UTF-8 25 | ENV LC_CTYPE en_US.UTF-8 26 | ENV LC_MESSAGES en_US.UTF-8 27 | 28 | # Disable noisy "Handling signal" log messages: 29 | # ENV GUNICORN_CMD_ARGS --log-level WARNING 30 | 31 | RUN set -ex \ 32 | && buildDeps=' \ 33 | freetds-dev \ 34 | libkrb5-dev \ 35 | libsasl2-dev \ 36 | libssl-dev \ 37 | libffi-dev \ 38 | libpq-dev \ 39 | git \ 40 | ' \ 41 | && apt-get update -yqq \ 42 | && apt-get upgrade -yqq \ 43 | && apt-get install -yqq --no-install-recommends \ 44 | $buildDeps \ 45 | freetds-bin \ 46 | build-essential \ 47 | default-libmysqlclient-dev \ 48 | apt-utils \ 49 | curl \ 50 | rsync \ 51 | netcat \ 52 | locales \ 53 | && sed -i 's/^# en_US.UTF-8 UTF-8$/en_US.UTF-8 UTF-8/g' /etc/locale.gen \ 54 | && locale-gen \ 55 | && update-locale LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8 \ 56 | && useradd -ms /bin/bash -d ${AIRFLOW_USER_HOME} airflow \ 57 | && pip install -U pip setuptools wheel \ 58 | && pip install pyOpenSSL \ 59 | && pip install ndg-httpsclient \ 60 | && pip install pyasn1 \ 61 | && pip install apache-airflow[crypto,celery,postgres,hive,jdbc,mysql,ssh${AIRFLOW_DEPS:+,}${AIRFLOW_DEPS}]==${AIRFLOW_VERSION} \ 62 | && pip install 'redis==3.2' \ 63 | && if [ -n "${PYTHON_DEPS}" ]; then pip install ${PYTHON_DEPS}; fi \ 64 | && apt-get purge --auto-remove -yqq $buildDeps \ 65 | && apt-get autoremove -yqq --purge \ 66 | && apt-get clean \ 67 | && rm -rf \ 68 | /var/lib/apt/lists/* \ 69 | /tmp/* \ 70 | /var/tmp/* \ 71 | /usr/share/man \ 72 | /usr/share/doc \ 73 | /usr/share/doc-base 74 | 75 | COPY deploy/script/entrypoint.sh /entrypoint.sh 76 | 77 | COPY deploy/config/airflow.cfg ${AIRFLOW_USER_HOME}/airflow.cfg 78 | 79 | RUN chown -R airflow: ${AIRFLOW_USER_HOME} 80 | 81 | EXPOSE 8080 5555 8793 82 | 83 | RUN set -ex \ 84 | && pip install scipy \ 85 | && pip install great_expectations \ 86 | && pip install dbt \ 87 | && pip uninstall -y SQLAlchemy \ 88 | && pip install SQLAlchemy==1.3.15 89 | 90 | USER airflow 91 | WORKDIR ${AIRFLOW_USER_HOME} 92 | ENTRYPOINT ["/entrypoint.sh"] 93 | CMD ["webserver"] 94 | 95 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/README.md: -------------------------------------------------------------------------------- 1 | # Great Expectations dbt + Airflow Pipeline Tutorial 2 | 3 | The purpose of this example is to show how [Great Expectations](https://greatexpectations.io) can protect a data pipeline from bad data and code bugs. 4 | 5 | **Please note:** This tutorial is a work in progress. Feel free to provide *feedback via our [Slack channel](https://greatexpectations.io/slack), a GitHub issue, or just fork it and show us your own implementation*; we'll be happy to answer questions and iterate on the content to make it more useful for the community! 6 | 7 | ## Pipeline overview 8 | 9 | The pipeline will look familiar to lots of data teams working with the ELT pattern. 10 | It loads data from files into a database and then transforms it. 11 | 12 | Airflow is used to orchestrate the pipeline. dbt is used for the "T" (transform) step of ELT. 13 | 14 | The purpose of this tutorial is to show how the individual components work together. Therefore, the Airflow setup and the dbt DAG are kept fairly trivial, but hopefully realistic.
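Condensed to its core, the "validate, then proceed" gate that the DAGs below implement looks roughly like this (a minimal sketch using this project's datasource and suite names; the DAG wiring and run_id bookkeeping are omitted):

```python
import great_expectations as ge
from airflow import AirflowException

def validate_source_data(**kwargs):
    # Load the project configured in great_expectations.yml
    context = ge.data_context.DataContext()
    # Fetch the CSV as a batch tied to an expectation suite
    batch = context.get_batch(
        {"path": "data/npi_small.csv", "datasource": "input_files"},
        "npi_small_file.critical",
    )
    # Validate and run the configured actions (store results, update Data Docs)
    results = context.run_validation_operator(
        "action_list_operator", assets_to_validate=[batch]
    )
    # Failing the task here stops all downstream loading and transformation
    if not results["success"]:
        raise AirflowException("Source data failed validation")
```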
15 | 16 | This tutorial directory contains two Airflow DAGs of this data pipeline: 17 | * before Great Expectations was added - in `airflow/ge_tutorials_dag_without_great_expectations.py` 18 | * after Great Expectations was added - in `airflow/ge_tutorials_dag_with_great_expectations.py` 19 | 20 | ### Without Great Expectations 21 | 22 | ![The airflow DAG](images/pipeline_airflow_dag_without_ge.png) 23 | 24 | 1. Load the source files to a postgres database using SQLAlchemy 25 | 2. Run the dbt DAG to create a simple analytical table; see the dbt DAG snapshot below: 26 | ![The dbt DAG](images/dbt_dag.png) 27 | 28 | 29 | ### With Great Expectations 30 | 31 | ![The airflow DAG](images/pipeline_airflow_dag_with_ge.png) 32 | 33 | 1. Use GE to validate the input CSV files. Stop if they do not meet our expectations. 34 | 2. Load the source files to a postgres database using SQLAlchemy 35 | 3. Use GE to validate that the data was loaded into the database successfully 36 | 4. Run the dbt DAG to create a simple analytical table (see the dbt DAG snapshot above) 37 | 5. Use GE to validate the analytical result. 38 | 6. If the analytical result is valid, publish (promote) the analytical table to a "prod" table by renaming it 39 | 40 | ## Setup 41 | 42 | We assume that you will run the "after" version of the pipeline, with Great Expectations integrated. 43 | 44 | Instructions are provided below to set up this tutorial either with or without using Docker. 45 | 46 | ### Setup with Docker 47 | 48 | If you want to quickly get started, use Docker. If you already know Docker, then we have a shortcut for you to run the project: 49 | 50 | ``` 51 | git clone https://github.com/superconductive/ge_tutorials.git 52 | cd ge_tutorials/ge_dbt_airflow_tutorial 53 | # you can run this command every time you need to start the stack now: 54 | docker-compose up 55 | docker exec ge_dbt_airflow_tutorial_webserver_1 airflow upgradedb 56 | ``` 57 | 58 | Once these steps are completed, you can access Airflow at http://localhost:8080/admin/. 59 | 60 | To run the DAG, you first need to turn it on, then manually trigger it. You can do so through the UI: 61 | 62 | ![Screen Recording](images/enable_dag.gif) 63 | (https://share.getcloudapp.com/7Ku0oygJ) 64 | 65 | Once the DAG has run successfully, you'll be able to access the Great Expectations Data Docs at the following URL: http://localhost:8081 66 | 67 | From there, the containers will reload on modifications made to the dbt models, the final Great Expectations suites, and the Airflow DAGs. Don't forget to reload the page to pick up frontend changes, though. 68 | 69 | ---- 70 | 71 | ### Setup without Docker 72 | 73 | In order to run this project, you will need to go through some basic setup steps. 74 | 75 | #### Database setup 76 | For the purpose of this demo, we assume you have a relational database available that can be accessed using a SQLAlchemy connection URL. We developed the tutorial using a postgres database. Of course, this can be replaced by any other DBMS when working on a real pipeline. 77 | Create an empty database named `tutorials_db`. 78 | 79 | #### Great Expectations 80 | 81 | * Install Great Expectations 82 | 83 | ``` 84 | pip install great_expectations 85 | ``` 86 | 87 | #### dbt 88 | 89 | * Make sure that you have dbt installed and set up 90 | * Add your database credentials to your dbt profile (see `example_dbt_profile.yml` in this project) 91 | 92 | #### Airflow 93 | 94 | * Make sure you have Airflow installed and set up.
95 | * Point the dags_folder in airflow.cfg to the root directory of this project 96 | 97 | #### Environment variables 98 | 99 | The pipeline's configuration variables are passed using environment variables. Set the following variables: 100 | ``` 101 | export GE_TUTORIAL_DB_URL=postgresql://your_user:your_password@your_db_host:5432/your_db_name 102 | export GE_TUTORIAL_ROOT_PATH=your_project_path 103 | ``` 104 | 105 | ## Running the pipeline 106 | 107 | You can run each individual task in the Airflow DAG with `airflow test ge_tutorials_dag_with_ge <task_id> <execution_date>`. 108 | In order to run the entire DAG, use `airflow backfill ge_tutorials_dag_with_ge -s <start_date> -e <end_date>`. 109 | 110 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/airflow/ge_tutorials_dag_with_great_expectations.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import airflow 3 | from airflow import AirflowException 4 | from airflow.operators.bash_operator import BashOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | from airflow import DAG 7 | import os 8 | import pandas as pd 9 | from sqlalchemy import create_engine 10 | import great_expectations as ge 11 | 12 | 13 | # Global variables that are set using environment variables 14 | GE_TUTORIAL_DB_URL = os.getenv('GE_TUTORIAL_DB_URL') 15 | GE_TUTORIAL_ROOT_PATH = os.getenv('GE_TUTORIAL_ROOT_PATH') 16 | 17 | great_expectations_context_path = os.getenv('GE_TUTORIAL_GE_CONTEXT_PATH') or os.path.join(GE_TUTORIAL_ROOT_PATH, "great_expectations_projects", "final", "great_expectations") 18 | 19 | 20 | default_args = { 21 | "owner": "Airflow", 22 | "start_date": airflow.utils.dates.days_ago(1) 23 | } 24 | 25 | 26 | # The DAG definition 27 | dag = DAG( 28 | dag_id='ge_tutorials_dag_with_ge', 29 | default_args=default_args, 30 | schedule_interval=None, 31 | ) 32 | 33 | 34 | def load_files_into_db(ds, **kwargs): 35 | """ 36 | A method to simply load CSV files into a database using SQLAlchemy.
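    Drops any existing npi_small and state_abbreviations tables, lowercases
    the npi_small column names so they match the database expectation suite,
    and loads both CSV files with pandas' to_sql.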
37 | """ 38 | 39 | engine = create_engine(GE_TUTORIAL_DB_URL) 40 | 41 | with engine.connect() as conn: 42 | conn.execute("drop table if exists npi_small cascade ") 43 | conn.execute("drop table if exists state_abbreviations cascade ") 44 | 45 | df_npi_small = pd.read_csv(os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "npi_small.csv")) 46 | column_rename_dict = {old_column_name: old_column_name.lower() for old_column_name in df_npi_small.columns} 47 | df_npi_small.rename(columns=column_rename_dict, inplace=True) 48 | df_npi_small.to_sql("npi_small", engine, 49 | schema=None, 50 | if_exists='replace', 51 | index=False, 52 | index_label=None, 53 | chunksize=None, 54 | dtype=None) 55 | 56 | df_state_abbreviations = pd.read_csv(os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "state_abbreviations.csv")) 57 | df_state_abbreviations.to_sql("state_abbreviations", engine, 58 | schema=None, 59 | if_exists='replace', 60 | index=False, 61 | index_label=None, 62 | chunksize=None, 63 | dtype=None) 64 | 65 | return 'Loaded files into the database' 66 | 67 | 68 | def validate_source_data(ds, **kwargs): 69 | 70 | context = ge.data_context.DataContext(great_expectations_context_path) 71 | 72 | batch_kwargs_file = {"path": os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "npi_small.csv"), 73 | 'datasource': 'input_files'} 74 | 75 | batch_file = context.get_batch(batch_kwargs_file, 'npi_small_file.critical') 76 | 77 | 78 | results = context.run_validation_operator( 79 | "action_list_operator", 80 | assets_to_validate=[batch_file], 81 | run_id="airflow:" + kwargs['dag_run'].run_id + ":" + str(kwargs['dag_run'].start_date)) 82 | 83 | if not results["success"]: 84 | raise AirflowException("Validation of the source data is not successful ") 85 | 86 | 87 | def validate_source_data_load(ds, **kwargs): 88 | 89 | # Data Context is a GE object that represents your project. 90 | # Your project's great_expectations.yml contains all the config 91 | # options for the project's GE Data Context. 92 | context = ge.data_context.DataContext(great_expectations_context_path) 93 | 94 | datasource_name_file = "input_files" 95 | expectation_suite_name_file = "npi_small_file.critical" 96 | batch_kwargs_file = {"path": os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "npi_small.csv"), 97 | 'datasource': 'input_files'} 98 | batch_file = context.get_batch(batch_kwargs_file, expectation_suite_name_file) 99 | 100 | expectation_suite_name_db = "npi_small_db_table.critical" 101 | datasource_name_file_db = "datawarehouse" 102 | 103 | # If you would like to validate an entire table or view in your database's default schema: 104 | batch_kwargs_db = {'table': "npi_small", 'datasource': datasource_name_file_db} 105 | 106 | # # If you would like to validate an entire table or view from a non-default schema in your database: 107 | # batch_kwargs = {'table': "YOUR_TABLE", "schema": "YOUR_SCHEMA", 'datasource': datasource_name} 108 | 109 | # If you would like to validate the result set of a query: 110 | # batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name} 111 | 112 | batch_db = context.get_batch(batch_kwargs_db, expectation_suite_name_db) 113 | 114 | # Call a validation operator to validate the batch. 115 | # The operator will evaluate the data against the expectations 116 | # and perform a list of actions, such as saving the validation 117 | # result, updating Data Docs, and firing a notification (e.g., Slack). 
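    # Build a run_id that ties these validation results back to this Airflow
    # DAG run; the commented-out line below is a timestamp-based alternative.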
118 | # run_id = datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")  # alternative: a timestamp-based run id 119 | run_id = "airflow:" + kwargs['dag_run'].run_id + ":" + str(kwargs['dag_run'].start_date) 120 | results = context.run_validation_operator( 121 | "action_list_operator", 122 | assets_to_validate=[batch_file, batch_db], 123 | run_id=run_id) # e.g., Airflow run id or some run identifier that your pipeline uses. 124 | 125 | if not results["success"]: 126 | raise AirflowException("Validation of the source data load was not successful") 127 | 128 | 129 | 130 | def validate_analytical_output(ds, **kwargs): 131 | 132 | # Data Context is a GE object that represents your project. 133 | # Your project's great_expectations.yml contains all the config 134 | # options for the project's GE Data Context. 135 | context = ge.data_context.DataContext(great_expectations_context_path) 136 | 137 | datasource_name = "datawarehouse" # a datasource configured in your great_expectations.yml 138 | 139 | # Tell GE how to fetch the batch of data that should be validated... 140 | 141 | # ... from the result set of a SQL query: 142 | # batch_kwargs = {"query": "your SQL query", "datasource": datasource_name} 143 | 144 | # ... or from a database table: 145 | batch_kwargs = {"table": "count_providers_by_state", "datasource": datasource_name} 146 | 147 | # ... or from a file: 148 | # batch_kwargs = {"path": "path to your data file", "datasource": datasource_name} 149 | 150 | # ... or from a Pandas or PySpark DataFrame 151 | # batch_kwargs = {"dataset": "your Pandas or PySpark DataFrame", "datasource": datasource_name} 152 | 153 | # Get the batch of data you want to validate. 154 | # Specify the name of the expectation suite that holds the expectations. 155 | expectation_suite_name = "count_providers_by_state.critical" # this is an example of 156 | # a suite that you created 157 | batch = context.get_batch(batch_kwargs, expectation_suite_name) 158 | 159 | # Call a validation operator to validate the batch. 160 | # The operator will evaluate the data against the expectations 161 | # and perform a list of actions, such as saving the validation 162 | # result, updating Data Docs, and firing a notification (e.g., Slack). 163 | # run_id = datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")  # alternative: a timestamp-based run id 164 | run_id = "airflow:" + kwargs['dag_run'].run_id + ":" + str(kwargs['dag_run'].start_date) 165 | results = context.run_validation_operator( 166 | "action_list_operator", 167 | assets_to_validate=[batch], 168 | run_id=run_id) # e.g., Airflow run id or some run identifier that your pipeline uses. 169 | 170 | if not results["success"]: 171 | raise AirflowException("The analytical output does not meet the expectations in the suite: {0:s}".format(expectation_suite_name)) 172 | 173 | 174 | def publish_to_prod(): 175 | """ 176 | A method to simply "promote" a table in a database by renaming it using SQLAlchemy.
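    Runs only after all validations have passed: drops any previous
    prod_count_providers_by_state table and renames the freshly built
    count_providers_by_state table into its place.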
177 | """ 178 | engine = create_engine(GE_TUTORIAL_DB_URL) 179 | 180 | with engine.connect() as conn: 181 | conn.execute("drop table if exists prod_count_providers_by_state") 182 | conn.execute("alter table count_providers_by_state rename to prod_count_providers_by_state") 183 | 184 | 185 | task_validate_source_data = PythonOperator( 186 | task_id='task_validate_source_data', 187 | python_callable=validate_source_data, 188 | provide_context=True, 189 | dag=dag) 190 | 191 | task_load_files_into_db = PythonOperator( 192 | task_id='task_load_files_into_db', 193 | provide_context=True, 194 | python_callable=load_files_into_db, 195 | dag=dag, 196 | ) 197 | 198 | task_validate_source_data_load = PythonOperator( 199 | task_id='task_validate_source_data_load', 200 | python_callable=validate_source_data_load, 201 | provide_context=True, 202 | dag=dag) 203 | 204 | task_transform_data_in_db = BashOperator( 205 | task_id='task_transform_data_in_db', 206 | bash_command='dbt run --project-dir {}'.format(os.path.join(GE_TUTORIAL_ROOT_PATH, 'dbt')), 207 | dag=dag) 208 | 209 | 210 | task_validate_analytical_output = PythonOperator( 211 | task_id='task_validate_analytical_output', 212 | python_callable=validate_analytical_output, 213 | provide_context=True, 214 | dag=dag) 215 | 216 | 217 | task_publish = PythonOperator( 218 | task_id='task_publish', 219 | python_callable=publish_to_prod, 220 | dag=dag) 221 | 222 | 223 | # DAG dependencies 224 | task_validate_source_data >> task_load_files_into_db >> task_validate_source_data_load >> task_transform_data_in_db >> task_validate_analytical_output >> task_publish 225 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/airflow/ge_tutorials_dag_without_great_expectations.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import airflow 3 | from airflow import AirflowException 4 | from airflow.operators.bash_operator import BashOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | from airflow import DAG 7 | import os 8 | import pandas as pd 9 | from sqlalchemy import create_engine 10 | 11 | 12 | # Global variables that are set using environment variables 13 | GE_TUTORIAL_DB_URL = os.getenv('GE_TUTORIAL_DB_URL') 14 | GE_TUTORIAL_ROOT_PATH = os.getenv('GE_TUTORIAL_ROOT_PATH') 15 | 16 | 17 | default_args = { 18 | "owner": "Airflow", 19 | "start_date": airflow.utils.dates.days_ago(1) 20 | } 21 | 22 | 23 | # The DAG definition 24 | dag = DAG( 25 | dag_id='ge_tutorials_dag_no_ge', 26 | default_args=default_args, 27 | schedule_interval=None, 28 | ) 29 | 30 | 31 | def load_files_into_db(ds, **kwargs): 32 | """ 33 | A method to simply load CSV files into a database using SQLAlchemy.
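    Identical to the loader in the DAG with Great Expectations; this
    version of the pipeline simply runs it without any validation gates.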
34 | """ 35 | 36 | engine = create_engine(GE_TUTORIAL_DB_URL) 37 | 38 | with engine.connect() as conn: 39 | conn.execute("drop table if exists npi_small cascade ") 40 | conn.execute("drop table if exists state_abbreviations cascade ") 41 | 42 | df_npi_small = pd.read_csv(os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "npi_small.csv")) 43 | column_rename_dict = {old_column_name: old_column_name.lower() for old_column_name in df_npi_small.columns} 44 | df_npi_small.rename(columns=column_rename_dict, inplace=True) 45 | df_npi_small.to_sql("npi_small", engine, 46 | schema=None, 47 | if_exists='replace', 48 | index=False, 49 | index_label=None, 50 | chunksize=None, 51 | dtype=None) 52 | 53 | df_state_abbreviations = pd.read_csv(os.path.join(GE_TUTORIAL_ROOT_PATH, "data", "state_abbreviations.csv")) 54 | df_state_abbreviations.to_sql("state_abbreviations", engine, 55 | schema=None, 56 | if_exists='replace', 57 | index=False, 58 | index_label=None, 59 | chunksize=None, 60 | dtype=None) 61 | 62 | return 'Loaded files into the database' 63 | 64 | 65 | task_load_files_into_db = PythonOperator( 66 | task_id='task_load_files_into_db', 67 | provide_context=True, 68 | python_callable=load_files_into_db, 69 | dag=dag, 70 | ) 71 | 72 | 73 | task_transform_data_in_db = BashOperator( 74 | task_id='task_transform_data_in_db', 75 | bash_command='dbt run --project-dir {}'.format(os.path.join(GE_TUTORIAL_ROOT_PATH, 'dbt')), 76 | dag=dag) 77 | 78 | 79 | # DAG dependencies 80 | task_load_files_into_db >> task_transform_data_in_db 81 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greatexpectationslabs/ge_tutorials/1b04332e4f9b4d8621a95e7aa2837c371f41e682/ge_dbt_airflow_tutorial/data/.gitkeep -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/data/state_abbreviations.csv: -------------------------------------------------------------------------------- 1 | "name","abbreviation" 2 | "Alabama","AL" 3 | "Alaska","AK" 4 | "American Samoa","AS" 5 | "Arizona","AZ" 6 | "Arkansas","AR" 7 | "California","CA" 8 | "Colorado","CO" 9 | "Connecticut","CT" 10 | "Delaware","DE" 11 | "District Of Columbia","DC" 12 | "Federated States Of Micronesia","FM" 13 | "Florida","FL" 14 | "Georgia","GA" 15 | "Guam","GU" 16 | "Hawaii","HI" 17 | "Idaho","ID" 18 | "Illinois","IL" 19 | "Indiana","IN" 20 | "Iowa","IA" 21 | "Kansas","KS" 22 | "Kentucky","KY" 23 | "Louisiana","LA" 24 | "Maine","ME" 25 | "Marshall Islands","MH" 26 | "Maryland","MD" 27 | "Massachusetts","MA" 28 | "Michigan","MI" 29 | "Minnesota","MN" 30 | "Mississippi","MS" 31 | "Missouri","MO" 32 | "Montana","MT" 33 | "Nebraska","NE" 34 | "Nevada","NV" 35 | "New Hampshire","NH" 36 | "New Jersey","NJ" 37 | "New Mexico","NM" 38 | "New York","NY" 39 | "North Carolina","NC" 40 | "North Dakota","ND" 41 | "Northern Mariana Islands","MP" 42 | "Ohio","OH" 43 | "Oklahoma","OK" 44 | "Oregon","OR" 45 | "Palau","PW" 46 | "Pennsylvania","PA" 47 | "Puerto Rico","PR" 48 | "Rhode Island","RI" 49 | "South Carolina","SC" 50 | "South Dakota","SD" 51 | "Tennessee","TN" 52 | "Texas","TX" 53 | "Utah","UT" 54 | "Vermont","VT" 55 | "Virgin Islands","VI" 56 | "Virginia","VA" 57 | "Washington","WA" 58 | "West Virginia","WV" 59 | "Wisconsin","WI" 60 | "Wyoming","WY" 61 | -------------------------------------------------------------------------------- 
/ge_dbt_airflow_tutorial/dbt/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models. 5 | name: 'ge_tutorials' 6 | version: '1.0.0' 7 | 8 | # This setting configures which "profile" dbt uses for this project. 9 | profile: 'ge_tutorials' 10 | 11 | # These configurations specify where dbt should look for different types of files. 12 | # The `source-paths` config, for example, states that models in this project can be 13 | # found in the "models/" directory. You probably won't need to change these! 14 | source-paths: ["models"] 15 | data-paths: ["data"] 16 | 17 | target-path: "target" # directory which will store compiled SQL files 18 | clean-targets: # directories to be removed by `dbt clean` 19 | - "target" 20 | - "dbt_modules" 21 | 22 | # Configuring models 23 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 24 | 25 | # In this example config, we tell dbt to build all models in this project 26 | # as views. These settings can be overridden in the individual model files 27 | # using the `{{ config(...) }}` macro. 28 | models: 29 | ge_tutorials: 30 | materialized: view 31 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/dbt/models/count_providers_by_state.sql: -------------------------------------------------------------------------------- 1 | select 2 | state_name, 3 | count(distinct npi) as count_providers 4 | from {{ ref('npi_with_state') }} n 5 | group by state_name 6 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/dbt/models/npi_with_state.sql: -------------------------------------------------------------------------------- 1 | select 2 | n.npi, 3 | n.entity_type_code, 4 | n.organization_name, 5 | n.last_name, 6 | n.first_name, 7 | n.taxonomy_code, 8 | n.state_abbreviation, 9 | s.state_name 10 | from {{ ref('stg_npi') }} n 11 | -- due to the nature of the data, some state abbreviations are not valid, 12 | -- which would leave state names null if we used a left join - 13 | -- hence the inner join here 14 | inner join {{ ref('stg_state_abbreviations') }} s 15 | on n.state_abbreviation = s.state_abbreviation 16 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/dbt/models/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: source 5 | schema: public 6 | tables: 7 | - name: npi_small 8 | - name: state_abbreviations 9 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/dbt/models/staging/stg_npi.sql: -------------------------------------------------------------------------------- 1 | select 2 | npi as npi, 3 | entity_type_code as entity_type_code, 4 | organization_name as organization_name, 5 | last_name as last_name, 6 | first_name as first_name, 7 | state as state_abbreviation, 8 | taxonomy_code as taxonomy_code 9 | from {{ source('source', 'npi_small') }} 10 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/dbt/models/staging/stg_state_abbreviations.sql: -------------------------------------------------------------------------------- 1 | select 2 | name
as state_name, 3 | abbreviation as state_abbreviation 4 | from {{ source('source', 'state_abbreviations') }} 5 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/deploy/script/entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # User-provided configuration must always be respected. 4 | # 5 | # Therefore, this script must only derive Airflow AIRFLOW__ variables from other variables 6 | # when the user did not provide their own configuration. 7 | 8 | TRY_LOOP="20" 9 | 10 | # Global defaults and back-compat 11 | : "${AIRFLOW_HOME:="/usr/local/airflow"}" 12 | : "${AIRFLOW__CORE__FERNET_KEY:=${FERNET_KEY:=$(python -c "from cryptography.fernet import Fernet; FERNET_KEY = Fernet.generate_key().decode(); print(FERNET_KEY)")}}" 13 | : "${AIRFLOW__CORE__EXECUTOR:=${EXECUTOR:-Sequential}Executor}" 14 | 15 | # Load DAGs examples (default: Yes) 16 | if [[ -z "$AIRFLOW__CORE__LOAD_EXAMPLES" && "${LOAD_EX:=n}" == n ]]; then 17 | AIRFLOW__CORE__LOAD_EXAMPLES=False 18 | fi 19 | 20 | export \ 21 | AIRFLOW_HOME \ 22 | AIRFLOW__CORE__EXECUTOR \ 23 | AIRFLOW__CORE__FERNET_KEY \ 24 | AIRFLOW__CORE__LOAD_EXAMPLES \ 25 | 26 | # Install custom python package if requirements.txt is present 27 | if [ -e "/requirements.txt" ]; then 28 | $(command -v pip) install --user -r /requirements.txt 29 | fi 30 | 31 | wait_for_port() { 32 | local name="$1" host="$2" port="$3" 33 | local j=0 34 | while ! nc -z "$host" "$port" >/dev/null 2>&1 < /dev/null; do 35 | j=$((j+1)) 36 | if [ $j -ge $TRY_LOOP ]; then 37 | echo >&2 "$(date) - $host:$port still not reachable, giving up" 38 | exit 1 39 | fi 40 | echo "$(date) - waiting for $name... $j/$TRY_LOOP" 41 | sleep 5 42 | done 43 | } 44 | 45 | # Executors other than SequentialExecutor need a SQL database; here PostgreSQL is used 46 | if [ "$AIRFLOW__CORE__EXECUTOR" != "SequentialExecutor" ]; then 47 | # Check if the user has provided explicit Airflow configuration concerning the database 48 | if [ -z "$AIRFLOW__CORE__SQL_ALCHEMY_CONN" ]; then 49 | # Default values corresponding to the default compose files 50 | : "${POSTGRES_HOST:="postgres"}" 51 | : "${POSTGRES_PORT:="5432"}" 52 | : "${POSTGRES_USER:="airflow"}" 53 | : "${POSTGRES_PASSWORD:="airflow"}" 54 | : "${POSTGRES_DB:="airflow"}" 55 | : "${POSTGRES_EXTRAS:-""}" 56 | 57 | AIRFLOW__CORE__SQL_ALCHEMY_CONN="postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}" 58 | export AIRFLOW__CORE__SQL_ALCHEMY_CONN 59 | 60 | # Check if the user has provided explicit Airflow configuration for the broker's connection to the database 61 | if [ "$AIRFLOW__CORE__EXECUTOR" = "CeleryExecutor" ]; then 62 | AIRFLOW__CELERY__RESULT_BACKEND="db+postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}${POSTGRES_EXTRAS}" 63 | export AIRFLOW__CELERY__RESULT_BACKEND 64 | fi 65 | else 66 | if [[ "$AIRFLOW__CORE__EXECUTOR" == "CeleryExecutor" && -z "$AIRFLOW__CELERY__RESULT_BACKEND" ]]; then 67 | >&2 printf '%s\n' "FATAL: if you set AIRFLOW__CORE__SQL_ALCHEMY_CONN manually with CeleryExecutor you must also set AIRFLOW__CELERY__RESULT_BACKEND" 68 | exit 1 69 | fi 70 | 71 | # Derive useful variables from the AIRFLOW__ variables provided explicitly by the user 72 | POSTGRES_ENDPOINT=$(echo -n "$AIRFLOW__CORE__SQL_ALCHEMY_CONN" | cut -d '/' -f3 | sed -e 's,.*@,,') 73 | POSTGRES_HOST=$(echo -n
"$POSTGRES_ENDPOINT" | cut -d ':' -f1) 74 | POSTGRES_PORT=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f2) 75 | fi 76 | 77 | wait_for_port "Postgres" "$POSTGRES_HOST" "$POSTGRES_PORT" 78 | fi 79 | 80 | # CeleryExecutor drives the need for a Celery broker, here Redis is used 81 | if [ "$AIRFLOW__CORE__EXECUTOR" = "CeleryExecutor" ]; then 82 | # Check if the user has provided explicit Airflow configuration concerning the broker 83 | if [ -z "$AIRFLOW__CELERY__BROKER_URL" ]; then 84 | # Default values corresponding to the default compose files 85 | : "${REDIS_PROTO:="redis://"}" 86 | : "${REDIS_HOST:="redis"}" 87 | : "${REDIS_PORT:="6379"}" 88 | : "${REDIS_PASSWORD:=""}" 89 | : "${REDIS_DBNUM:="1"}" 90 | 91 | # When Redis is secured by basic auth, it does not handle the username part of basic auth, only a token 92 | if [ -n "$REDIS_PASSWORD" ]; then 93 | REDIS_PREFIX=":${REDIS_PASSWORD}@" 94 | else 95 | REDIS_PREFIX= 96 | fi 97 | 98 | AIRFLOW__CELERY__BROKER_URL="${REDIS_PROTO}${REDIS_PREFIX}${REDIS_HOST}:${REDIS_PORT}/${REDIS_DBNUM}" 99 | export AIRFLOW__CELERY__BROKER_URL 100 | else 101 | # Derive useful variables from the AIRFLOW__ variables provided explicitly by the user 102 | REDIS_ENDPOINT=$(echo -n "$AIRFLOW__CELERY__BROKER_URL" | cut -d '/' -f3 | sed -e 's,.*@,,') 103 | REDIS_HOST=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f1) 104 | REDIS_PORT=$(echo -n "$POSTGRES_ENDPOINT" | cut -d ':' -f2) 105 | fi 106 | 107 | wait_for_port "Redis" "$REDIS_HOST" "$REDIS_PORT" 108 | fi 109 | 110 | case "$1" in 111 | webserver) 112 | airflow initdb 113 | if [ "$AIRFLOW__CORE__EXECUTOR" = "LocalExecutor" ] || [ "$AIRFLOW__CORE__EXECUTOR" = "SequentialExecutor" ]; then 114 | # With the "Local" and "Sequential" executors it should all run in one container. 115 | airflow scheduler & 116 | fi 117 | exec airflow webserver 118 | ;; 119 | worker|scheduler) 120 | # Give the webserver time to run initdb. 121 | sleep 10 122 | exec airflow "$@" 123 | ;; 124 | flower) 125 | sleep 10 126 | exec airflow "$@" 127 | ;; 128 | version) 129 | exec airflow "$@" 130 | ;; 131 | *) 132 | # The command is something like bash, not an airflow subcommand. Just run it in the right environment. 133 | exec "$@" 134 | ;; 135 | esac 136 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | postgres: 4 | image: postgres:9.6 5 | environment: 6 | - POSTGRES_USER=airflow 7 | - POSTGRES_PASSWORD=airflow 8 | - POSTGRES_DB=airflow 9 | ports: 10 | - "5432:5432" 11 | 12 | logging: 13 | options: 14 | max-size: 10m 15 | max-file: "3" 16 | 17 | webserver: 18 | build: . 
19 | restart: always 20 | depends_on: 21 | - postgres 22 | environment: 23 | - LOAD_EX=n 24 | - EXECUTOR=Local 25 | # TODO: create a separate schema for real data 26 | - GE_TUTORIAL_DB_URL=postgres://airflow:airflow@postgres:5432/airflow 27 | - GE_TUTORIAL_ROOT_PATH=/usr/local/airflow/ 28 | - GE_TUTORIAL_PROJECT_PATH=/usr/local/airflow/ 29 | logging: 30 | options: 31 | max-size: 10m 32 | max-file: "3" 33 | volumes: 34 | # TODO: Might be better to mount everything at once 35 | - ./airflow:/usr/local/airflow/dags 36 | - ./great_expectations_projects/final/great_expectations:/usr/local/airflow/great_expectations_projects/final/great_expectations 37 | - ./dbt:/usr/local/airflow/dbt 38 | - ./data:/usr/local/airflow/data 39 | - ./requirements.txt:/requirements.txt 40 | - ./example_dbt_profile.yml:/usr/local/airflow/.dbt/profiles.yml 41 | ports: 42 | - "8080:8080" 43 | - "8888:8888" 44 | command: webserver 45 | healthcheck: 46 | test: ["CMD-SHELL", "[ -f /usr/local/airflow/airflow-webserver.pid ]"] 47 | interval: 30s 48 | timeout: 30s 49 | retries: 3 50 | ge_data_docs: 51 | image: flashspys/nginx-static 52 | container_name: ge_data_docs 53 | ports: 54 | - 8081:80 55 | volumes: 56 | - ./great_expectations_projects/final/great_expectations/uncommitted/data_docs/local_site:/static -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/example_dbt_profile.yml: -------------------------------------------------------------------------------- 1 | # This is an example dbt_profile.yml file that should live in your .dbt 2 | # directory that's set up during `dbt init`. It is only included in this repo as an example. 3 | 4 | # For more information on how to configure this file, please see: 5 | # https://docs.getdbt.com/docs/profile 6 | 7 | ge_tutorials: 8 | outputs: 9 | dev: 10 | type: postgres 11 | threads: 1 12 | host: postgres 13 | port: 5432 14 | user: airflow 15 | pass: airflow 16 | dbname: airflow 17 | schema: public 18 | prod: 19 | type: postgres 20 | threads: 1 21 | host: postgres 22 | port: 5432 23 | user: airflow 24 | pass: airflow 25 | dbname: airflow 26 | schema: public 27 | target: dev 28 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/.gitignore: -------------------------------------------------------------------------------- 1 | uncommitted/ -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/config_variables.yml: -------------------------------------------------------------------------------- 1 | ge_comment_preservation_key: 1 2 | # This config file supports variable substitution which enables: 1) keeping 3 | # secrets out of source control & 2) environment-based configuration changes 4 | # such as staging vs prod. 5 | # 6 | # When GE encounters substitution syntax (like `my_key: ${my_value}` or 7 | # `my_key: $my_value`) in the config file, it will attempt to replace the value 8 | # of `my_key` with the value from an environment variable `my_value` or a 9 | # corresponding key read from the file specified using 10 | # `config_variables_file_path`. Environment variables take precedence. 11 | # 12 | # If the substitution value comes from the config variables file, it can be a 13 | # simple (non-nested) value or a nested value such as a dictionary. If it comes 14 | # from an environment variable, it must be a simple value. 
Read more at: 15 | # 16 | datawarehouse: 17 | drivername: postgres 18 | username: airflow 19 | password: airflow 20 | host: postgres 21 | database: airflow 22 | port: '5432' 23 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/expectations/count_providers_by_state/critical.json: -------------------------------------------------------------------------------- 1 | {"expectation_suite_name": "count_providers_by_state.critical", "data_asset_type": "Dataset", "expectations": [{"expectation_type": "expect_column_values_to_be_of_type", "kwargs": {"column": "count_providers", "type_": "BIGINT"}, "meta": {}}, {"expectation_type": "expect_column_values_to_not_be_null", "kwargs": {"column": "state_name"}, "meta": {}}, {"expectation_type": "expect_column_values_to_be_unique", "kwargs": {"column": "state_name"}, "meta": {}}], "meta": {"great_expectations.__version__": "0.9.2"}} -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/expectations/npi_small_db_table/critical.json: -------------------------------------------------------------------------------- 1 | {"data_asset_type": "Dataset", "expectation_suite_name": "npi_small_db_table.critical", "meta": {"great_expectations.__version__": "0.9.2"}, "expectations": [{"expectation_type": "expect_table_row_count_to_equal", "kwargs": {"value": {"$PARAMETER": "urn:great_expectations:validations:npi_small_file.critical:expect_table_row_count_to_be_between.result.observed_value"}}, "meta": {}}, {"expectation_type": "expect_table_column_count_to_equal", "kwargs": {"value": 7}, "meta": {}}, {"expectation_type": "expect_table_columns_to_match_ordered_list", "kwargs": {"column_list": ["npi", "entity_type_code", "organization_name", "last_name", "first_name", "state", "taxonomy_code"]}, "meta": {}}]} -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/expectations/npi_small_file/critical.json: -------------------------------------------------------------------------------- 1 | {"expectations": [{"meta": {}, "kwargs": {"min_value": 1, "max_value": 1000000}, "expectation_type": "expect_table_row_count_to_be_between"}, {"meta": {}, "kwargs": {"value": 7}, "expectation_type": "expect_table_column_count_to_equal"}, {"meta": {}, "kwargs": {"column_list": ["NPI", "Entity_Type_Code", "Organization_Name", "Last_Name", "First_Name", "State", "Taxonomy_Code"]}, "expectation_type": "expect_table_columns_to_match_ordered_list"}, {"meta": {}, "kwargs": {"column": "State", "mostly": 0.05}, "expectation_type": "expect_column_values_to_not_be_null"}, {"meta": {}, "kwargs": {"column": "State", "value_set": ["AE", "AK", "AL", "AP", "AR", "AS", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "GU", "HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MP", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK", "OR", "PA", "PR", "PUERTO RICO", "PW", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VI", "VT", "WA", "WI", "WV", "WY"], "mostly": 0.05}, "expectation_type": "expect_column_values_to_be_in_set"}, {"meta": {}, "kwargs": {"column": "NPI"}, "expectation_type": "expect_column_values_to_not_be_null"}], "meta": {"great_expectations.__version__": "0.9.2"}, "data_asset_type": "Dataset", "expectation_suite_name": 
"npi_small_file.critical"} -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/expectations/state_abbreviations_file/critical.json: -------------------------------------------------------------------------------- 1 | {"expectations": [{"kwargs": {"min_value": 49, "max_value": 69}, "expectation_type": "expect_table_row_count_to_be_between", "meta": {"SampleExpectationsDatasetProfiler": {"confidence": "very low"}}}, {"kwargs": {"value": 2}, "expectation_type": "expect_table_column_count_to_equal", "meta": {"SampleExpectationsDatasetProfiler": {"confidence": "very low"}}}, {"kwargs": {"column_list": ["name", "abbreviation"]}, "expectation_type": "expect_table_columns_to_match_ordered_list", "meta": {"SampleExpectationsDatasetProfiler": {"confidence": "very low"}}}, {"kwargs": {"column": "name"}, "expectation_type": "expect_column_values_to_not_be_null", "meta": {"SampleExpectationsDatasetProfiler": {"confidence": "very low"}}}, {"kwargs": {"column": "name", "min_value": 1}, "expectation_type": "expect_column_value_lengths_to_be_between", "meta": {"SampleExpectationsDatasetProfiler": {"confidence": "very low"}}}], "meta": {"great_expectations.__version__": "0.9.2", "columns": {"name": {"description": ""}, "abbreviation": {"description": ""}}, "notes": {"format": "markdown", "content": ["#### This is an _example_ suite\n\n- This suite was made by quickly glancing at 1000 rows of your data.\n- This is **not a production suite**. It is meant to show examples of expectations.\n- Because this suite was auto-generated using a very basic profiler that does not know your data like you do, many of the expectations may not be meaningful.\n"]}, "SampleExpectationsDatasetProfiler": {"created_by": "SampleExpectationsDatasetProfiler", "created_at": 1582843165.39407, "batch_kwargs": {"path": "/Users/eugenemandel/projects/ge_tutorials/great_expectations/../data/state_abbreviations.csv", "datasource": "input_files"}}}, "expectation_suite_name": "state_abbreviations_file.critical", "data_asset_type": "Dataset"} -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/great_expectations.yml: -------------------------------------------------------------------------------- 1 | # Welcome to Great Expectations! Always know what to expect from your data. 2 | # 3 | # Here you can define datasources, batch kwarg generators, integrations and 4 | # more. This file is intended to be committed to your repo. For help with 5 | # configuration please: 6 | # - Read our docs: https://docs.greatexpectations.io/en/latest/reference/data_context_reference.html#configuration 7 | # - Join our slack channel: http://greatexpectations.io/slack 8 | 9 | config_version: 2 10 | 11 | # Datasources tell Great Expectations where your data lives and how to get it. 12 | # You can use the CLI command `great_expectations datasource new` to help you 13 | # add a new datasource. 
Read more at https://docs.greatexpectations.io/en/latest/features/datasource.html 14 | datasources: 15 | input_files: 16 | data_asset_type: 17 | class_name: PandasDataset 18 | class_name: PandasDatasource 19 | module_name: great_expectations.datasource 20 | datawarehouse: 21 | credentials: ${datawarehouse} 22 | data_asset_type: 23 | class_name: SqlAlchemyDataset 24 | class_name: SqlAlchemyDatasource 25 | module_name: great_expectations.datasource 26 | config_variables_file_path: config_variables.yml 27 | 28 | # The plugins_directory will be added to your python path for custom modules 29 | # used to override and extend Great Expectations. 30 | plugins_directory: plugins/ 31 | 32 | # Validation Operators are customizable workflows that bundle the validation of 33 | # one or more expectation suites and subsequent actions. The example below 34 | # stores validation results and updates Data Docs. To read more about 35 | # customizing and extending these, read: https://docs.greatexpectations.io/en/latest/features/validation_operators_and_actions.html 36 | validation_operators: 37 | action_list_operator: 38 | class_name: ActionListValidationOperator 39 | action_list: 40 | - name: store_validation_result 41 | action: 42 | class_name: StoreValidationResultAction 43 | - name: store_evaluation_params 44 | action: 45 | class_name: StoreEvaluationParametersAction 46 | - name: update_data_docs 47 | action: 48 | class_name: UpdateDataDocsAction 49 | stores: 50 | # Stores are configurable places to store things like Expectations, Validations 51 | # Data Docs, and more. These are for advanced users only - most users can simply 52 | # leave this section alone. 53 | # 54 | # Three stores are required: expectations, validations, and 55 | # evaluation_parameters, and must exist with a valid store entry. Additional 56 | # stores can be configured for uses such as data_docs, validation_operators, etc. 57 | expectations_store: 58 | class_name: ExpectationsStore 59 | store_backend: 60 | class_name: TupleFilesystemStoreBackend 61 | base_directory: expectations/ 62 | 63 | validations_store: 64 | class_name: ValidationsStore 65 | store_backend: 66 | class_name: TupleFilesystemStoreBackend 67 | base_directory: uncommitted/validations/ 68 | 69 | evaluation_parameter_store: 70 | # Evaluation Parameters enable dynamic expectations. Read more here: 71 | # https://docs.greatexpectations.io/en/latest/reference/evaluation_parameters.html 72 | class_name: EvaluationParameterStore 73 | 74 | expectations_store_name: expectations_store 75 | validations_store_name: validations_store 76 | evaluation_parameter_store_name: evaluation_parameter_store 77 | 78 | data_docs_sites: 79 | # Data Docs make it simple to visualize data quality in your project. These 80 | # include Expectations, Validations & Profiles. They are built for all 81 | # Datasources from JSON artifacts in the local repo including validations & 82 | # profiles from the uncommitted directory.
Read more at https://docs.greatexpectations.io/en/latest/features/data_docs.html 83 | local_site: 84 | class_name: SiteBuilder 85 | # set to false to hide how-to buttons in Data Docs 86 | show_how_to_buttons: true 87 | store_backend: 88 | class_name: TupleFilesystemStoreBackend 89 | base_directory: uncommitted/data_docs/local_site/ 90 | site_index_builder: 91 | class_name: DefaultSiteIndexBuilder 92 | anonymous_usage_statistics: 93 | data_context_id: a3454240-fa69-4c9a-904b-726fac29c60b 94 | enabled: false 95 | notebooks: 96 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/notebooks/pandas/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "name": "stderr", 25 | "output_type": "stream", 26 | "text": [ 27 | "Old pybigquery driver version detected. Consider upgrading to 0.4.14 or later.\n" 28 | ] 29 | }, 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "2020-03-27T12:13:30-0700 - INFO - Great Expectations logging enabled at INFO level by JupyterUX module.\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "import json\n", 40 | "import great_expectations as ge\n", 41 | "import great_expectations.jupyter_ux\n", 42 | "from great_expectations.datasource.types import BatchKwargs\n", 43 | "from datetime import datetime" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## 1. Get a DataContext\n", 51 | "This represents your **project** that you just created using `great_expectations init`." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "context = ge.data_context.DataContext()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## 2. 
Choose an Expectation Suite\n", 68 | "\n", 69 | "List expectation suites that you created in your project" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "['npi_small_file.critical']" 81 | ] 82 | }, 83 | "execution_count": 3, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "context.list_expectation_suite_names()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 4, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "expectation_suite_name = 'npi_small_file.critical'# TODO: set to a name from the list above" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "## 3. Load a batch of data you want to validate\n", 106 | "\n", 107 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "['files_datasource']" 119 | ] 120 | }, 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "# list datasources of the type PandasDatasource in your project\n", 128 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 6, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "datasource_name = 'files_datasource'# TODO: set to a datasource name from above" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/html": [ 148 | "
\n", 149 | "\n", 162 | "\n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | "
NPIEntity_Type_CodeOrganization_NameLast_NameFirst_NameStateTaxonomy_Code
014579008392.0TEXAS CLINIC OF CHIROPRACTICNaNNaNTX111N00000X
112555190471.0NaNBRYANT-JONESMARIAFL261QH0700X
213660917461.0NaNJONESEBONYDC3747P1801X
312751826511.0NaNORNELASLUPECA101YA0400X
411943713441.0NaNWINTERSSTACYMD363L00000X
\n", 228 | "
" 229 | ], 230 | "text/plain": [ 231 | " NPI Entity_Type_Code Organization_Name Last_Name \\\n", 232 | "0 1457900839 2.0 TEXAS CLINIC OF CHIROPRACTIC NaN \n", 233 | "1 1255519047 1.0 NaN BRYANT-JONES \n", 234 | "2 1366091746 1.0 NaN JONES \n", 235 | "3 1275182651 1.0 NaN ORNELAS \n", 236 | "4 1194371344 1.0 NaN WINTERS \n", 237 | "\n", 238 | " First_Name State Taxonomy_Code \n", 239 | "0 NaN TX 111N00000X \n", 240 | "1 MARIA FL 261QH0700X \n", 241 | "2 EBONY DC 3747P1801X \n", 242 | "3 LUPE CA 101YA0400X \n", 243 | "4 STACY MD 363L00000X " 244 | ] 245 | }, 246 | "execution_count": 7, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "# If you would like to validate a file on a filesystem:\n", 253 | "batch_kwargs = {'path': \"/Users/eugenemandel/projects/ge_tutorials/data/npi_small.csv\", 'datasource': datasource_name}\n", 254 | "\n", 255 | "# # If you already loaded the data into a Pandas Data Frame:\n", 256 | "# batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 257 | "\n", 258 | "\n", 259 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 260 | "batch.head()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 8, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "2020-03-27T11:18:30-0700 - INFO - \t14 expectation(s) included in expectation_suite.\n" 273 | ] 274 | }, 275 | { 276 | "data": { 277 | "text/plain": [ 278 | "{\n", 279 | " \"evaluation_parameters\": {},\n", 280 | " \"statistics\": {\n", 281 | " \"evaluated_expectations\": 14,\n", 282 | " \"successful_expectations\": 14,\n", 283 | " \"unsuccessful_expectations\": 0,\n", 284 | " \"success_percent\": 100.0\n", 285 | " },\n", 286 | " \"results\": [\n", 287 | " {\n", 288 | " \"result\": {\n", 289 | " \"observed_value\": 18649\n", 290 | " },\n", 291 | " \"expectation_config\": {\n", 292 | " \"kwargs\": {\n", 293 | " \"min_value\": 18639,\n", 294 | " \"max_value\": 18659\n", 295 | " },\n", 296 | " \"expectation_type\": \"expect_table_row_count_to_be_between\",\n", 297 | " \"meta\": {\n", 298 | " \"SampleExpectationsDatasetProfiler\": {\n", 299 | " \"confidence\": \"very low\"\n", 300 | " }\n", 301 | " }\n", 302 | " },\n", 303 | " \"success\": true,\n", 304 | " \"exception_info\": {\n", 305 | " \"raised_exception\": false,\n", 306 | " \"exception_message\": null,\n", 307 | " \"exception_traceback\": null\n", 308 | " },\n", 309 | " \"meta\": {}\n", 310 | " },\n", 311 | " {\n", 312 | " \"result\": {\n", 313 | " \"observed_value\": 7\n", 314 | " },\n", 315 | " \"expectation_config\": {\n", 316 | " \"kwargs\": {\n", 317 | " \"value\": 7\n", 318 | " },\n", 319 | " \"expectation_type\": \"expect_table_column_count_to_equal\",\n", 320 | " \"meta\": {\n", 321 | " \"SampleExpectationsDatasetProfiler\": {\n", 322 | " \"confidence\": \"very low\"\n", 323 | " }\n", 324 | " }\n", 325 | " },\n", 326 | " \"success\": true,\n", 327 | " \"exception_info\": {\n", 328 | " \"raised_exception\": false,\n", 329 | " \"exception_message\": null,\n", 330 | " \"exception_traceback\": null\n", 331 | " },\n", 332 | " \"meta\": {}\n", 333 | " },\n", 334 | " {\n", 335 | " \"result\": {\n", 336 | " \"observed_value\": [\n", 337 | " \"NPI\",\n", 338 | " \"Entity_Type_Code\",\n", 339 | " \"Organization_Name\",\n", 340 | " \"Last_Name\",\n", 341 | " \"First_Name\",\n", 342 | " \"State\",\n", 343 | " \"Taxonomy_Code\"\n", 344 | " ]\n", 345 | " },\n", 346 | " 
\"expectation_config\": {\n", 347 | " \"kwargs\": {\n", 348 | " \"column_list\": [\n", 349 | " \"NPI\",\n", 350 | " \"Entity_Type_Code\",\n", 351 | " \"Organization_Name\",\n", 352 | " \"Last_Name\",\n", 353 | " \"First_Name\",\n", 354 | " \"State\",\n", 355 | " \"Taxonomy_Code\"\n", 356 | " ]\n", 357 | " },\n", 358 | " \"expectation_type\": \"expect_table_columns_to_match_ordered_list\",\n", 359 | " \"meta\": {\n", 360 | " \"SampleExpectationsDatasetProfiler\": {\n", 361 | " \"confidence\": \"very low\"\n", 362 | " }\n", 363 | " }\n", 364 | " },\n", 365 | " \"success\": true,\n", 366 | " \"exception_info\": {\n", 367 | " \"raised_exception\": false,\n", 368 | " \"exception_message\": null,\n", 369 | " \"exception_traceback\": null\n", 370 | " },\n", 371 | " \"meta\": {}\n", 372 | " },\n", 373 | " {\n", 374 | " \"result\": {\n", 375 | " \"element_count\": 18649,\n", 376 | " \"unexpected_count\": 491,\n", 377 | " \"unexpected_percent\": 2.6328489463241995,\n", 378 | " \"partial_unexpected_list\": []\n", 379 | " },\n", 380 | " \"expectation_config\": {\n", 381 | " \"kwargs\": {\n", 382 | " \"column\": \"Entity_Type_Code\",\n", 383 | " \"mostly\": 0.873671510536758\n", 384 | " },\n", 385 | " \"expectation_type\": \"expect_column_values_to_not_be_null\",\n", 386 | " \"meta\": {\n", 387 | " \"SampleExpectationsDatasetProfiler\": {\n", 388 | " \"confidence\": \"very low\"\n", 389 | " }\n", 390 | " }\n", 391 | " },\n", 392 | " \"success\": true,\n", 393 | " \"exception_info\": {\n", 394 | " \"raised_exception\": false,\n", 395 | " \"exception_message\": null,\n", 396 | " \"exception_traceback\": null\n", 397 | " },\n", 398 | " \"meta\": {}\n", 399 | " },\n", 400 | " {\n", 401 | " \"result\": {\n", 402 | " \"observed_value\": [\n", 403 | " 1.0,\n", 404 | " 2.0\n", 405 | " ],\n", 406 | " \"element_count\": 18649,\n", 407 | " \"missing_count\": 491,\n", 408 | " \"missing_percent\": 2.6328489463241995\n", 409 | " },\n", 410 | " \"expectation_config\": {\n", 411 | " \"kwargs\": {\n", 412 | " \"column\": \"Entity_Type_Code\",\n", 413 | " \"value_set\": [\n", 414 | " 1.0,\n", 415 | " 2.0\n", 416 | " ]\n", 417 | " },\n", 418 | " \"expectation_type\": \"expect_column_distinct_values_to_be_in_set\",\n", 419 | " \"meta\": {\n", 420 | " \"SampleExpectationsDatasetProfiler\": {\n", 421 | " \"confidence\": \"very low\"\n", 422 | " }\n", 423 | " }\n", 424 | " },\n", 425 | " \"success\": true,\n", 426 | " \"exception_info\": {\n", 427 | " \"raised_exception\": false,\n", 428 | " \"exception_message\": null,\n", 429 | " \"exception_traceback\": null\n", 430 | " },\n", 431 | " \"meta\": {}\n", 432 | " },\n", 433 | " {\n", 434 | " \"result\": {\n", 435 | " \"observed_value\": 0.0,\n", 436 | " \"element_count\": 18649,\n", 437 | " \"missing_count\": 491,\n", 438 | " \"missing_percent\": 2.6328489463241995\n", 439 | " },\n", 440 | " \"expectation_config\": {\n", 441 | " \"kwargs\": {\n", 442 | " \"column\": \"Entity_Type_Code\",\n", 443 | " \"partition_object\": {\n", 444 | " \"values\": [\n", 445 | " 1.0,\n", 446 | " 2.0\n", 447 | " ],\n", 448 | " \"weights\": [\n", 449 | " 0.812314131512281,\n", 450 | " 0.1876858684877189\n", 451 | " ]\n", 452 | " },\n", 453 | " \"threshold\": 0.6\n", 454 | " },\n", 455 | " \"expectation_type\": \"expect_column_kl_divergence_to_be_less_than\",\n", 456 | " \"meta\": {\n", 457 | " \"SampleExpectationsDatasetProfiler\": {\n", 458 | " \"confidence\": \"very low\"\n", 459 | " }\n", 460 | " }\n", 461 | " },\n", 462 | " \"success\": true,\n", 463 | " \"exception_info\": {\n", 464 | " 
\"raised_exception\": false,\n", 465 | " \"exception_message\": null,\n", 466 | " \"exception_traceback\": null\n", 467 | " },\n", 468 | " \"meta\": {}\n", 469 | " },\n", 470 | " {\n", 471 | " \"result\": {\n", 472 | " \"element_count\": 18649,\n", 473 | " \"unexpected_count\": 0,\n", 474 | " \"unexpected_percent\": 0.0,\n", 475 | " \"partial_unexpected_list\": []\n", 476 | " },\n", 477 | " \"expectation_config\": {\n", 478 | " \"kwargs\": {\n", 479 | " \"column\": \"NPI\"\n", 480 | " },\n", 481 | " \"expectation_type\": \"expect_column_values_to_not_be_null\",\n", 482 | " \"meta\": {\n", 483 | " \"SampleExpectationsDatasetProfiler\": {\n", 484 | " \"confidence\": \"very low\"\n", 485 | " }\n", 486 | " }\n", 487 | " },\n", 488 | " \"success\": true,\n", 489 | " \"exception_info\": {\n", 490 | " \"raised_exception\": false,\n", 491 | " \"exception_message\": null,\n", 492 | " \"exception_traceback\": null\n", 493 | " },\n", 494 | " \"meta\": {}\n", 495 | " },\n", 496 | " {\n", 497 | " \"result\": {\n", 498 | " \"observed_value\": 1003007766,\n", 499 | " \"element_count\": 18649,\n", 500 | " \"missing_count\": 0,\n", 501 | " \"missing_percent\": 0.0\n", 502 | " },\n", 503 | " \"expectation_config\": {\n", 504 | " \"kwargs\": {\n", 505 | " \"column\": \"NPI\",\n", 506 | " \"min_value\": 1003007765,\n", 507 | " \"max_value\": 1003007767\n", 508 | " },\n", 509 | " \"expectation_type\": \"expect_column_min_to_be_between\",\n", 510 | " \"meta\": {\n", 511 | " \"SampleExpectationsDatasetProfiler\": {\n", 512 | " \"confidence\": \"very low\"\n", 513 | " }\n", 514 | " }\n", 515 | " },\n", 516 | " \"success\": true,\n", 517 | " \"exception_info\": {\n", 518 | " \"raised_exception\": false,\n", 519 | " \"exception_message\": null,\n", 520 | " \"exception_traceback\": null\n", 521 | " },\n", 522 | " \"meta\": {}\n", 523 | " },\n", 524 | " {\n", 525 | " \"result\": {\n", 526 | " \"observed_value\": 1992999676,\n", 527 | " \"element_count\": 18649,\n", 528 | " \"missing_count\": 0,\n", 529 | " \"missing_percent\": 0.0\n", 530 | " },\n", 531 | " \"expectation_config\": {\n", 532 | " \"kwargs\": {\n", 533 | " \"column\": \"NPI\",\n", 534 | " \"min_value\": 1992999675,\n", 535 | " \"max_value\": 1992999677\n", 536 | " },\n", 537 | " \"expectation_type\": \"expect_column_max_to_be_between\",\n", 538 | " \"meta\": {\n", 539 | " \"SampleExpectationsDatasetProfiler\": {\n", 540 | " \"confidence\": \"very low\"\n", 541 | " }\n", 542 | " }\n", 543 | " },\n", 544 | " \"success\": true,\n", 545 | " \"exception_info\": {\n", 546 | " \"raised_exception\": false,\n", 547 | " \"exception_message\": null,\n", 548 | " \"exception_traceback\": null\n", 549 | " },\n", 550 | " \"meta\": {}\n", 551 | " },\n", 552 | " {\n", 553 | " \"result\": {\n", 554 | " \"observed_value\": 1500841664.0457933,\n", 555 | " \"element_count\": 18649,\n", 556 | " \"missing_count\": 0,\n", 557 | " \"missing_percent\": 0.0\n", 558 | " },\n", 559 | " \"expectation_config\": {\n", 560 | " \"kwargs\": {\n", 561 | " \"column\": \"NPI\",\n", 562 | " \"min_value\": 1500841663.0457933,\n", 563 | " \"max_value\": 1500841665.0457933\n", 564 | " },\n", 565 | " \"expectation_type\": \"expect_column_mean_to_be_between\",\n", 566 | " \"meta\": {\n", 567 | " \"SampleExpectationsDatasetProfiler\": {\n", 568 | " \"confidence\": \"very low\"\n", 569 | " }\n", 570 | " }\n", 571 | " },\n", 572 | " \"success\": true,\n", 573 | " \"exception_info\": {\n", 574 | " \"raised_exception\": false,\n", 575 | " \"exception_message\": null,\n", 576 | " 
\"exception_traceback\": null\n", 577 | " },\n", 578 | " \"meta\": {}\n", 579 | " },\n", 580 | " {\n", 581 | " \"result\": {\n", 582 | " \"observed_value\": 1508307745.0,\n", 583 | " \"element_count\": 18649,\n", 584 | " \"missing_count\": 0,\n", 585 | " \"missing_percent\": 0.0\n", 586 | " },\n", 587 | " \"expectation_config\": {\n", 588 | " \"kwargs\": {\n", 589 | " \"column\": \"NPI\",\n", 590 | " \"min_value\": 1508307744.0,\n", 591 | " \"max_value\": 1508307746.0\n", 592 | " },\n", 593 | " \"expectation_type\": \"expect_column_median_to_be_between\",\n", 594 | " \"meta\": {\n", 595 | " \"SampleExpectationsDatasetProfiler\": {\n", 596 | " \"confidence\": \"very low\"\n", 597 | " }\n", 598 | " }\n", 599 | " },\n", 600 | " \"success\": true,\n", 601 | " \"exception_info\": {\n", 602 | " \"raised_exception\": false,\n", 603 | " \"exception_message\": null,\n", 604 | " \"exception_traceback\": null\n", 605 | " },\n", 606 | " \"meta\": {}\n", 607 | " },\n", 608 | " {\n", 609 | " \"result\": {\n", 610 | " \"observed_value\": {\n", 611 | " \"quantiles\": [\n", 612 | " 0.05,\n", 613 | " 0.25,\n", 614 | " 0.5,\n", 615 | " 0.75,\n", 616 | " 0.95\n", 617 | " ],\n", 618 | " \"values\": [\n", 619 | " 1053339952,\n", 620 | " 1245889518,\n", 621 | " 1508307745,\n", 622 | " 1750668489,\n", 623 | " 1952551186\n", 624 | " ]\n", 625 | " },\n", 626 | " \"element_count\": 18649,\n", 627 | " \"missing_count\": 0,\n", 628 | " \"missing_percent\": 0.0\n", 629 | " },\n", 630 | " \"expectation_config\": {\n", 631 | " \"kwargs\": {\n", 632 | " \"column\": \"NPI\",\n", 633 | " \"quantile_ranges\": {\n", 634 | " \"quantiles\": [\n", 635 | " 0.05,\n", 636 | " 0.25,\n", 637 | " 0.5,\n", 638 | " 0.75,\n", 639 | " 0.95\n", 640 | " ],\n", 641 | " \"value_ranges\": [\n", 642 | " [\n", 643 | " 1053339951,\n", 644 | " 1053339953\n", 645 | " ],\n", 646 | " [\n", 647 | " 1245889517,\n", 648 | " 1245889519\n", 649 | " ],\n", 650 | " [\n", 651 | " 1508307744,\n", 652 | " 1508307746\n", 653 | " ],\n", 654 | " [\n", 655 | " 1750668488,\n", 656 | " 1750668490\n", 657 | " ],\n", 658 | " [\n", 659 | " 1952551185,\n", 660 | " 1952551187\n", 661 | " ]\n", 662 | " ]\n", 663 | " }\n", 664 | " },\n", 665 | " \"expectation_type\": \"expect_column_quantile_values_to_be_between\",\n", 666 | " \"meta\": {\n", 667 | " \"SampleExpectationsDatasetProfiler\": {\n", 668 | " \"confidence\": \"very low\"\n", 669 | " }\n", 670 | " }\n", 671 | " },\n", 672 | " \"success\": true,\n", 673 | " \"exception_info\": {\n", 674 | " \"raised_exception\": false,\n", 675 | " \"exception_message\": null,\n", 676 | " \"exception_traceback\": null\n", 677 | " },\n", 678 | " \"meta\": {}\n", 679 | " },\n", 680 | " {\n", 681 | " \"result\": {\n", 682 | " \"element_count\": 18649,\n", 683 | " \"unexpected_count\": 15241,\n", 684 | " \"unexpected_percent\": 81.72556169231594,\n", 685 | " \"partial_unexpected_list\": []\n", 686 | " },\n", 687 | " \"expectation_config\": {\n", 688 | " \"kwargs\": {\n", 689 | " \"column\": \"Organization_Name\",\n", 690 | " \"mostly\": 0.08274438307684065\n", 691 | " },\n", 692 | " \"expectation_type\": \"expect_column_values_to_not_be_null\",\n", 693 | " \"meta\": {\n", 694 | " \"SampleExpectationsDatasetProfiler\": {\n", 695 | " \"confidence\": \"very low\"\n", 696 | " }\n", 697 | " }\n", 698 | " },\n", 699 | " \"success\": true,\n", 700 | " \"exception_info\": {\n", 701 | " \"raised_exception\": false,\n", 702 | " \"exception_message\": null,\n", 703 | " \"exception_traceback\": null\n", 704 | " },\n", 705 | " \"meta\": {}\n", 706 | 
" },\n", 707 | " {\n", 708 | " \"result\": {\n", 709 | " \"element_count\": 18649,\n", 710 | " \"missing_count\": 15241,\n", 711 | " \"missing_percent\": 81.72556169231594,\n", 712 | " \"unexpected_count\": 0,\n", 713 | " \"unexpected_percent\": 0.0,\n", 714 | " \"unexpected_percent_nonmissing\": 0.0,\n", 715 | " \"partial_unexpected_list\": []\n", 716 | " },\n", 717 | " \"expectation_config\": {\n", 718 | " \"kwargs\": {\n", 719 | " \"column\": \"Organization_Name\",\n", 720 | " \"min_value\": 1\n", 721 | " },\n", 722 | " \"expectation_type\": \"expect_column_value_lengths_to_be_between\",\n", 723 | " \"meta\": {\n", 724 | " \"SampleExpectationsDatasetProfiler\": {\n", 725 | " \"confidence\": \"very low\"\n", 726 | " }\n", 727 | " }\n", 728 | " },\n", 729 | " \"success\": true,\n", 730 | " \"exception_info\": {\n", 731 | " \"raised_exception\": false,\n", 732 | " \"exception_message\": null,\n", 733 | " \"exception_traceback\": null\n", 734 | " },\n", 735 | " \"meta\": {}\n", 736 | " }\n", 737 | " ],\n", 738 | " \"success\": true,\n", 739 | " \"meta\": {\n", 740 | " \"great_expectations.__version__\": \"0.9.7+228.g7b410a57\",\n", 741 | " \"expectation_suite_name\": \"npi_small_file.critical\",\n", 742 | " \"run_id\": \"20200327T181830.633221Z\",\n", 743 | " \"batch_kwargs\": {\n", 744 | " \"path\": \"/Users/eugenemandel/projects/ge_tutorials/data/npi_small.csv\",\n", 745 | " \"datasource\": \"files_datasource\"\n", 746 | " },\n", 747 | " \"batch_markers\": {\n", 748 | " \"ge_load_time\": \"20200327T181801.219912Z\",\n", 749 | " \"pandas_data_fingerprint\": \"a5ebd04919bde23bcf25afadb9e661fb\"\n", 750 | " },\n", 751 | " \"batch_parameters\": null\n", 752 | " }\n", 753 | "}" 754 | ] 755 | }, 756 | "execution_count": 8, 757 | "metadata": {}, 758 | "output_type": "execute_result" 759 | } 760 | ], 761 | "source": [ 762 | "batch.validate()" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": 9, 768 | "metadata": {}, 769 | "outputs": [ 770 | { 771 | "data": { 772 | "text/plain": [ 773 | "great_expectations.dataset.pandas_dataset.PandasDataset" 774 | ] 775 | }, 776 | "execution_count": 9, 777 | "metadata": {}, 778 | "output_type": "execute_result" 779 | } 780 | ], 781 | "source": [ 782 | "type(batch)" 783 | ] 784 | }, 785 | { 786 | "cell_type": "markdown", 787 | "metadata": {}, 788 | "source": [ 789 | "## 4. 
Validate the batch with Validation Operators\n", 790 | "\n", 791 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 792 | "multiple expectation suites and the actions that should be taken after validation.\n", 793 | "\n", 794 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 795 | "\n", 796 | "* validating a group of batches that are logically related\n", 797 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 798 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 799 | "\n", 800 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": 8, 806 | "metadata": {}, 807 | "outputs": [ 808 | { 809 | "name": "stdout", 810 | "output_type": "stream", 811 | "text": [ 812 | "2020-03-27T12:13:41-0700 - INFO - \t14 expectation(s) included in expectation_suite.\n" 813 | ] 814 | } 815 | ], 816 | "source": [ 817 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 818 | "\n", 819 | "#Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", 820 | "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner (e.g., Airflow run id).\n", 821 | "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", 822 | "\n", 823 | "results = context.run_validation_operator(\n", 824 | "    \"action_list_operator\", \n", 825 | "    assets_to_validate=[batch], \n", 826 | "    run_id=run_id)" 827 | ] 828 | }, 829 | { 830 | "cell_type": "markdown", 831 | "metadata": {}, 832 | "source": [ 833 | "## 5. View the Validation Results in Data Docs\n", 834 | "\n", 835 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 836 | "\n", 837 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "context.open_data_docs()" 847 | ] 848 | }, 849 | { 850 | "cell_type": "markdown", 851 | "metadata": {}, 852 | "source": [ 853 | "## Congratulations! You ran Validations!\n", 854 | "\n", 855 | "## Next steps:\n", 856 | "\n", 857 | "### 1. Read about the typical workflow with Great Expectations:\n", 858 | "\n", 859 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 860 | "\n", 861 | "### 2. Explore the documentation & community\n", 862 | "\n", 863 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. 
Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 864 | ] 865 | }, 866 | { 867 | "cell_type": "code", 868 | "execution_count": null, 869 | "metadata": {}, 870 | "outputs": [], 871 | "source": [] 872 | } 873 | ], 874 | "metadata": { 875 | "kernelspec": { 876 | "display_name": "Python 3", 877 | "language": "python", 878 | "name": "python3" 879 | }, 880 | "language_info": { 881 | "codemirror_mode": { 882 | "name": "ipython", 883 | "version": 3 884 | }, 885 | "file_extension": ".py", 886 | "mimetype": "text/x-python", 887 | "name": "python", 888 | "nbconvert_exporter": "python", 889 | "pygments_lexer": "ipython3", 890 | "version": "3.7.0" 891 | }, 892 | "pycharm": { 893 | "stem_cell": { 894 | "cell_type": "raw", 895 | "metadata": { 896 | "collapsed": false 897 | }, 898 | "source": [] 899 | } 900 | } 901 | }, 902 | "nbformat": 4, 903 | "nbformat_minor": 4 904 | } 905 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/notebooks/spark/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "from datetime import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SparkDFDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SparkDFDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true') \n", 112 | "\n", 113 | "# If you already loaded the data into a PySpark Data Frame:\n", 114 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 115 | "\n", 116 | "\n", 117 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 118 | "batch.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## 4. Validate the batch with Validation Operators\n", 126 | "\n", 127 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 128 | "multiple expectation suites and the actions that should be taken after validation.\n", 129 | "\n", 130 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 131 | "\n", 132 | "* validating a group of batches that are logically related\n", 133 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 134 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 135 | "\n", 136 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 146 | "\n", 147 | "#Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", 148 | "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner (e.g., Airflow run id).\n", 149 | "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", 150 | "\n", 151 | "results = context.run_validation_operator(\n", 152 | " \"action_list_operator\", \n", 153 | " assets_to_validate=[batch], \n", 154 | " run_id=run_id)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "## 5. 
View the Validation Results in Data Docs\n", 162 | "\n", 163 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 164 | "\n", 165 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "context.open_data_docs()" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## Congratulations! You ran Validations!\n", 182 | "\n", 183 | "## Next steps:\n", 184 | "\n", 185 | "### 1. Read about the typical workflow with Great Expectations:\n", 186 | "\n", 187 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 188 | "\n", 189 | "### 2. Explore the documentation & community\n", 190 | "\n", 191 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [] 200 | } 201 | ], 202 | "metadata": { 203 | "kernelspec": { 204 | "display_name": "Python 3", 205 | "language": "python", 206 | "name": "python3" 207 | }, 208 | "language_info": { 209 | "codemirror_mode": { 210 | "name": "ipython", 211 | "version": 3 212 | }, 213 | "file_extension": ".py", 214 | "mimetype": "text/x-python", 215 | "name": "python", 216 | "nbconvert_exporter": "python", 217 | "pygments_lexer": "ipython3", 218 | "version": "3.7.0" 219 | }, 220 | "pycharm": { 221 | "stem_cell": { 222 | "cell_type": "raw", 223 | "source": [], 224 | "metadata": { 225 | "collapsed": false 226 | } 227 | } 228 | } 229 | }, 230 | "nbformat": 4, 231 | "nbformat_minor": 4 232 | } -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/notebooks/sql/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": 
[], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "from datetime import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SqlAlchemyDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SqlAlchemyDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate an entire table or view in your database's default schema:\n", 110 | "batch_kwargs = {'table': \"YOUR_TABLE\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you would like to validate an entire table or view from a non-default schema in your database:\n", 113 | "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "# If you would like to validate the result set of a query:\n", 116 | "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 121 | "batch.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## 4. 
Validate the batch with Validation Operators\n", 129 | "\n", 130 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 131 | "multiple expectation suites and the actions that should be taken after validation.\n", 132 | "\n", 133 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 134 | "\n", 135 | "* validating a group of batches that are logically related\n", 136 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 137 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 138 | "\n", 139 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 149 | "\n", 150 | "#Generate a run id, a timestamp, or a meaningful string that will help you refer to validation results. We recommend they be chronologically sortable.\n", 151 | "# Let's make a simple sortable timestamp. Note this could come from your pipeline runner (e.g., Airflow run id).\n", 152 | "run_id = datetime.utcnow().isoformat().replace(\":\", \"\") + \"Z\"\n", 153 | "\n", 154 | "results = context.run_validation_operator(\n", 155 | "    \"action_list_operator\", \n", 156 | "    assets_to_validate=[batch], \n", 157 | "    run_id=run_id)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## 5. View the Validation Results in Data Docs\n", 165 | "\n", 166 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 167 | "\n", 168 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "context.open_data_docs()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Congratulations! You ran Validations!\n", 185 | "\n", 186 | "## Next steps:\n", 187 | "\n", 188 | "### 1. Read about the typical workflow with Great Expectations:\n", 189 | "\n", 190 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 191 | "\n", 192 | "### 2. Explore the documentation & community\n", 193 | "\n", 194 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.7.0" 222 | }, 223 | "pycharm": { 224 | "stem_cell": { 225 | "cell_type": "raw", 226 | "source": [], 227 | "metadata": { 228 | "collapsed": false 229 | } 230 | } 231 | } 232 | }, 233 | "nbformat": 4, 234 | "nbformat_minor": 4 235 | } -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/great_expectations_projects/final/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/images/dbt_dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greatexpectationslabs/ge_tutorials/1b04332e4f9b4d8621a95e7aa2837c371f41e682/ge_dbt_airflow_tutorial/images/dbt_dag.png -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/images/enable_dag.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greatexpectationslabs/ge_tutorials/1b04332e4f9b4d8621a95e7aa2837c371f41e682/ge_dbt_airflow_tutorial/images/enable_dag.gif -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/images/pipeline_airflow_dag_with_ge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greatexpectationslabs/ge_tutorials/1b04332e4f9b4d8621a95e7aa2837c371f41e682/ge_dbt_airflow_tutorial/images/pipeline_airflow_dag_with_ge.png -------------------------------------------------------------------------------- /ge_dbt_airflow_tutorial/images/pipeline_airflow_dag_without_ge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/greatexpectationslabs/ge_tutorials/1b04332e4f9b4d8621a95e7aa2837c371f41e682/ge_dbt_airflow_tutorial/images/pipeline_airflow_dag_without_ge.png -------------------------------------------------------------------------------- 
/ge_dbt_airflow_tutorial/requirements.txt: -------------------------------------------------------------------------------- 1 | dbt 2 | great_expectations -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/README.md: -------------------------------------------------------------------------------- 1 | # Getting started with Great Expectations tutorial - v2 (Batch Kwargs) API 2 | 3 | This repository contains the final version of the "Getting started with Great Expectations" tutorial in the Great 4 | Expectations docs. This repo can be used as a demo and to explore a complete Great Expectations deployment. 5 | 6 | **THIS VERSION WAS CREATED WITH THE V2 (BATCH KWARGS) GREAT EXPECTATIONS API**, i.e. Great Expectations version 0.12.x and below. 7 | 8 | ## 1. How to run through the tutorial 9 | [Please follow the tutorial in our docs for instructions!](https://docs.greatexpectations.io/en/latest/guides/tutorials/getting_started.html) 10 | 11 | ## 2. How to use this repo to explore and demo Great Expectations 12 | 13 | ### The `data` directory 14 | 15 | The CSV files in the data directory are yellow taxi trip data that have been downloaded from the NYC taxi data website: 16 | * [TLC trip record data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) 17 | * [Data dictionary](https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf) 18 | 19 | We created 10,000-row samples (using the Pandas ``sample`` function) from the original CSV files for convenience and manually added some breaking changes (0s in the passenger_count column) to demonstrate potential data issues. 20 | 21 | In a future version of this tutorial, we might use "naturally occurring" data bugs :) 22 | 23 | ### The `great_expectations` directory 24 | Currently, this demo contains the following: 25 | * A `great_expectations.yml` file that's configured to use the top-level `data` directory as a Datasource. You will not need to set up anything to get it to work. 
26 | * A single Expectation Suite, `taxi.demo`, containing a handful of simple Expectations 27 | * A Checkpoint `my_chk` that is set up to run the suite against the January data set 28 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/.gitignore: -------------------------------------------------------------------------------- 1 | uncommitted/ -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/checkpoints/.ge_store_backend_id: -------------------------------------------------------------------------------- 1 | store_backend_id = 2d1df717-ce13-4988-a121-459c863a5072 -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/checkpoints/my_chk.yml: -------------------------------------------------------------------------------- 1 | name: my_chk 2 | config_version: 3 | module_name: great_expectations.checkpoint 4 | class_name: LegacyCheckpoint 5 | validation_operator_name: action_list_operator 6 | batches: 7 | - batch_kwargs: 8 | path: /Users/sam/code/ge_tutorials/getting_started_tutorial_final_v2_api/great_expectations/../../data/yellow_tripdata_sample_2019-01.csv 9 | datasource: data__dir 10 | data_asset_name: yellow_tripdata_sample_2019-01 11 | expectation_suite_names: 12 | - taxi.demo 13 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/expectations/.ge_store_backend_id: -------------------------------------------------------------------------------- 1 | store_backend_id = 73999387-e31a-4e53-a3b4-9bfb5acc285e -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/expectations/taxi/demo.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": "Dataset", 3 | "expectation_suite_name": "taxi.demo", 4 | "expectations": [ 5 | { 6 | "expectation_type": "expect_table_row_count_to_be_between", 7 | "kwargs": { 8 | "max_value": 11000, 9 | "min_value": 9000 10 | }, 11 | "meta": { 12 | "BasicSuiteBuilderProfiler": { 13 | "confidence": "very low" 14 | } 15 | } 16 | }, 17 | { 18 | "expectation_type": "expect_table_column_count_to_equal", 19 | "kwargs": { 20 | "value": 18 21 | }, 22 | "meta": { 23 | "BasicSuiteBuilderProfiler": { 24 | "confidence": "very low" 25 | } 26 | } 27 | }, 28 | { 29 | "expectation_type": "expect_table_columns_to_match_ordered_list", 30 | "kwargs": { 31 | "column_list": [ 32 | "vendor_id", 33 | "pickup_datetime", 34 | "dropoff_datetime", 35 | "passenger_count", 36 | "trip_distance", 37 | "rate_code_id", 38 | "store_and_fwd_flag", 39 | "pickup_location_id", 40 | "dropoff_location_id", 41 | "payment_type", 42 | "fare_amount", 43 | "extra", 44 | "mta_tax", 45 | "tip_amount", 46 | "tolls_amount", 47 | "improvement_surcharge", 48 | "total_amount", 49 | "congestion_surcharge" 50 | ] 51 | }, 52 | "meta": { 53 | "BasicSuiteBuilderProfiler": { 54 | "confidence": "very low" 55 | } 56 | } 57 | }, 58 | { 59 | "expectation_type": "expect_column_values_to_not_be_null", 60 | "kwargs": { 61 | "column": "passenger_count" 62 | }, 63 | "meta": { 64 | "BasicSuiteBuilderProfiler": { 65 | "confidence": "very low" 66 | } 67 | } 68 | }, 69 | { 70 | "expectation_type": "expect_column_distinct_values_to_be_in_set", 
71 | "kwargs": { 72 | "column": "passenger_count", 73 | "value_set": [ 74 | 1, 75 | 2, 76 | 3, 77 | 4, 78 | 5, 79 | 6 80 | ] 81 | }, 82 | "meta": { 83 | "BasicSuiteBuilderProfiler": { 84 | "confidence": "very low" 85 | } 86 | } 87 | }, 88 | { 89 | "expectation_type": "expect_column_kl_divergence_to_be_less_than", 90 | "kwargs": { 91 | "column": "passenger_count", 92 | "partition_object": { 93 | "values": [ 94 | 1, 95 | 2, 96 | 3, 97 | 4, 98 | 5, 99 | 6 100 | ], 101 | "weights": [ 102 | 0.7299, 103 | 0.1458, 104 | 0.039, 105 | 0.0186, 106 | 0.0415, 107 | 0.0252 108 | ] 109 | }, 110 | "threshold": 0.6 111 | }, 112 | "meta": { 113 | "BasicSuiteBuilderProfiler": { 114 | "confidence": "very low" 115 | } 116 | } 117 | } 118 | ], 119 | "meta": { 120 | "BasicSuiteBuilderProfiler": { 121 | "batch_kwargs": { 122 | "data_asset_name": "yellow_tripdata_sample_2019-01", 123 | "datasource": "data__dir", 124 | "path": "/Users/sam/code/ge_tutorials/getting_started_tutorial_stable_api_final/great_expectations/../data/yellow_tripdata_sample_2019-01.csv" 125 | }, 126 | "created_at": 1612842185.286588, 127 | "created_by": "BasicSuiteBuilderProfiler" 128 | }, 129 | "citations": [ 130 | { 131 | "batch_kwargs": { 132 | "data_asset_name": "yellow_tripdata_sample_2019-01", 133 | "datasource": "data__dir", 134 | "path": "/Users/sam/code/ge_tutorials/getting_started_tutorial_stable_api_final/great_expectations/../data/yellow_tripdata_sample_2019-01.csv" 135 | }, 136 | "batch_markers": { 137 | "ge_load_time": "20210209T034305.243453Z", 138 | "pandas_data_fingerprint": "c4f929e6d4fab001fedc9e075bf4b612" 139 | }, 140 | "batch_parameters": null, 141 | "citation_date": "20210209T034305.301394Z", 142 | "comment": "BasicSuiteBuilderProfiler added a citation based on the current batch." 143 | } 144 | ], 145 | "columns": { 146 | "congestion_surcharge": { 147 | "description": "" 148 | }, 149 | "dropoff_datetime": { 150 | "description": "" 151 | }, 152 | "dropoff_location_id": { 153 | "description": "" 154 | }, 155 | "extra": { 156 | "description": "" 157 | }, 158 | "fare_amount": { 159 | "description": "" 160 | }, 161 | "improvement_surcharge": { 162 | "description": "" 163 | }, 164 | "mta_tax": { 165 | "description": "" 166 | }, 167 | "passenger_count": { 168 | "description": "" 169 | }, 170 | "payment_type": { 171 | "description": "" 172 | }, 173 | "pickup_datetime": { 174 | "description": "" 175 | }, 176 | "pickup_location_id": { 177 | "description": "" 178 | }, 179 | "rate_code_id": { 180 | "description": "" 181 | }, 182 | "store_and_fwd_flag": { 183 | "description": "" 184 | }, 185 | "tip_amount": { 186 | "description": "" 187 | }, 188 | "tolls_amount": { 189 | "description": "" 190 | }, 191 | "total_amount": { 192 | "description": "" 193 | }, 194 | "trip_distance": { 195 | "description": "" 196 | }, 197 | "vendor_id": { 198 | "description": "" 199 | } 200 | }, 201 | "great_expectations_version": "0.13.9+1.g62265bff3.dirty", 202 | "notes": { 203 | "content": [ 204 | "_To add additional notes, edit the meta.notes.content field in the appropriate Expectation json file._" 205 | ], 206 | "format": "markdown" 207 | } 208 | } 209 | } -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/great_expectations.yml: -------------------------------------------------------------------------------- 1 | # Welcome to Great Expectations! Always know what to expect from your data. 
2 | # 3 | # Here you can define datasources, batch kwargs generators, integrations and 4 | # more. This file is intended to be committed to your repo. For help with 5 | # configuration please: 6 | # - Read our docs: https://docs.greatexpectations.io/en/latest/how_to_guides/spare_parts/data_context_reference.html#configuration 7 | # - Join our slack channel: http://greatexpectations.io/slack 8 | 9 | # config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility 10 | # It is auto-generated and usually does not need to be changed. 11 | config_version: 3.0 12 | 13 | # Datasources tell Great Expectations where your data lives and how to get it. 14 | # You can use the CLI command `great_expectations datasource new` to help you 15 | # add a new datasource. Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/datasource_reference.html 16 | datasources: 17 | data__dir: 18 | class_name: PandasDatasource 19 | batch_kwargs_generators: 20 | subdir_reader: 21 | class_name: SubdirReaderBatchKwargsGenerator 22 | base_directory: ../../data 23 | module_name: great_expectations.datasource 24 | data_asset_type: 25 | class_name: PandasDataset 26 | module_name: great_expectations.dataset 27 | 28 | # This config file supports variable substitution which enables: 1) keeping 29 | # secrets out of source control & 2) environment-based configuration changes 30 | # such as staging vs prod. 31 | # 32 | # When GE encounters substitution syntax (like `my_key: ${my_value}` or 33 | # `my_key: $my_value`) in the great_expectations.yml file, it will attempt 34 | # to replace the value of `my_key` with the value from an environment 35 | # variable `my_value` or a corresponding key read from this config file, 36 | # which is defined through the `config_variables_file_path`. 37 | # Environment variables take precedence over variables defined here. 38 | # 39 | # Substitution values defined here can be a simple (non-nested) value, 40 | # nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR}) 41 | # 42 | # 43 | # https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_data_contexts/how_to_use_a_yaml_file_or_environment_variables_to_populate_credentials.html 44 | 45 | 46 | config_variables_file_path: uncommitted/config_variables.yml 47 | 48 | # The plugins_directory will be added to your python path for custom modules 49 | # used to override and extend Great Expectations. 50 | plugins_directory: plugins/ 51 | 52 | stores: 53 | # Stores are configurable places to store things like Expectations, Validations 54 | # Data Docs, and more. These are for advanced users only - most users can simply 55 | # leave this section alone. 56 | # 57 | # Three stores are required: expectations, validations, and 58 | # evaluation_parameters, and must exist with a valid store entry. Additional 59 | # stores can be configured for uses such as data_docs, etc. 60 | expectations_store: 61 | class_name: ExpectationsStore 62 | store_backend: 63 | class_name: TupleFilesystemStoreBackend 64 | base_directory: expectations/ 65 | 66 | validations_store: 67 | class_name: ValidationsStore 68 | store_backend: 69 | class_name: TupleFilesystemStoreBackend 70 | base_directory: uncommitted/validations/ 71 | 72 | evaluation_parameter_store: 73 | # Evaluation Parameters enable dynamic expectations. 
Read more here: 74 | # https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html 75 | class_name: EvaluationParameterStore 76 | 77 | checkpoint_store: 78 | class_name: CheckpointStore 79 | store_backend: 80 | class_name: TupleFilesystemStoreBackend 81 | base_directory: checkpoints/ 82 | 83 | expectations_store_name: expectations_store 84 | validations_store_name: validations_store 85 | evaluation_parameter_store_name: evaluation_parameter_store 86 | checkpoint_store_name: checkpoint_store 87 | 88 | data_docs_sites: 89 | # Data Docs make it simple to visualize data quality in your project. These 90 | # include Expectations, Validations & Profiles. They are built for all 91 | # Datasources from JSON artifacts in the local repo including validations & 92 | # profiles from the uncommitted directory. Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/data_docs.html 93 | local_site: 94 | class_name: SiteBuilder 95 | # set to false to hide how-to buttons in Data Docs 96 | show_how_to_buttons: true 97 | store_backend: 98 | class_name: TupleFilesystemStoreBackend 99 | base_directory: uncommitted/data_docs/local_site/ 100 | site_index_builder: 101 | class_name: DefaultSiteIndexBuilder 102 | 103 | anonymous_usage_statistics: 104 | data_context_id: 73999387-e31a-4e53-a3b4-9bfb5acc285e 105 | enabled: true 106 | notebooks: 107 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/notebooks/pandas/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. 
Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type PandasDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you already loaded the data into a Pandas Data Frame:\n", 113 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "\n", 116 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 117 | "batch.head()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## 4. Validate the batch with Validation Operators\n", 125 | "\n", 126 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 127 | "multiple expectation suites and the actions that should be taken after validation.\n", 128 | "\n", 129 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 130 | "\n", 131 | "* validating a group of batches that are logically related\n", 132 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 133 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 134 | "\n", 135 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 145 | "\n", 146 | "\"\"\"\n", 147 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 148 | "arguments (or a dictionary with these keys). 
The run_name can be any string (this could come from your pipeline\n", 149 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 150 | "Note - any provided datetime will be assumed to be a UTC time. If no instantiation arguments are given, run_name will\n", 151 | "be None and run_time will default to the current UTC datetime.\n", 152 | "\"\"\"\n", 153 | "\n", 154 | "run_id = {\n", 155 | "    \"run_name\": \"some_string_that_uniquely_identifies_this_run\",  # insert your own run_name here\n", 156 | "    \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 157 | "}\n", 158 | "\n", 159 | "results = context.run_validation_operator(\n", 160 | "    \"action_list_operator\",\n", 161 | "    assets_to_validate=[batch],\n", 162 | "    run_id=run_id)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## 5. View the Validation Results in Data Docs\n", 170 | "\n", 171 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 172 | "\n", 173 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "context.open_data_docs()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## Congratulations! You ran Validations!\n", 190 | "\n", 191 | "## Next steps:\n", 192 | "\n", 193 | "### 1. Read about the typical workflow with Great Expectations:\n", 194 | "\n", 195 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 196 | "\n", 197 | "### 2. Explore the documentation & community\n", 198 | "\n", 199 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.7.0" 227 | }, 228 | "pycharm": { 229 | "stem_cell": { 230 | "cell_type": "raw", 231 | "source": [], 232 | "metadata": { 233 | "collapsed": false 234 | } 235 | } 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 4 240 | } 241 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/notebooks/spark/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SparkDFDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SparkDFDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true')\n", 112 | "\n", 113 | "# If you already loaded the data into a PySpark Data Frame:\n", 114 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 115 | "\n", 116 | "\n", 117 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 118 | "batch.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## 4. Validate the batch with Validation Operators\n", 126 | "\n", 127 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 128 | "multiple expectation suites and the actions that should be taken after validation.\n", 129 | "\n", 130 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 131 | "\n", 132 | "* validating a group of batches that are logically related\n", 133 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 134 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 135 | "\n", 136 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 146 | "\n", 147 | "\"\"\"\n", 148 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 149 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 150 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 151 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 152 | "be None and run_time will default to the current UTC datetime.\n", 153 | "\"\"\"\n", 154 | "\n", 155 | "run_id = {\n", 156 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 157 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 158 | "}\n", 159 | "\n", 160 | "results = context.run_validation_operator(\n", 161 | " \"action_list_operator\",\n", 162 | " assets_to_validate=[batch],\n", 163 | " run_id=run_id)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "## 5. View the Validation Results in Data Docs\n", 171 | "\n", 172 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 173 | "\n", 174 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "context.open_data_docs()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Congratulations! You ran Validations!\n", 191 | "\n", 192 | "## Next steps:\n", 193 | "\n", 194 | "### 1. Read about the typical workflow with Great Expectations:\n", 195 | "\n", 196 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 197 | "\n", 198 | "### 2. Explore the documentation & community\n", 199 | "\n", 200 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers."
201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "Python 3", 214 | "language": "python", 215 | "name": "python3" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": { 219 | "name": "ipython", 220 | "version": 3 221 | }, 222 | "file_extension": ".py", 223 | "mimetype": "text/x-python", 224 | "name": "python", 225 | "nbconvert_exporter": "python", 226 | "pygments_lexer": "ipython3", 227 | "version": "3.7.0" 228 | }, 229 | "pycharm": { 230 | "stem_cell": { 231 | "cell_type": "raw", 232 | "source": [], 233 | "metadata": { 234 | "collapsed": false 235 | } 236 | } 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/notebooks/sql/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SqlAlchemyDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SqlAlchemyDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate an entire table or view in your database's default schema:\n", 110 | "batch_kwargs = {'table': \"YOUR_TABLE\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you would like to validate an entire table or view from a non-default schema in your database:\n", 113 | "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "# If you would like to validate the result set of a query:\n", 116 | "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 121 | "batch.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## 4. Validate the batch with Validation Operators\n", 129 | "\n", 130 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 131 | "multiple expectation suites and the actions that should be taken after validation.\n", 132 | "\n", 133 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 134 | "\n", 135 | "* validating a group of batches that are logically related\n", 136 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 137 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 138 | "\n", 139 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 149 | "\n", 150 | "\"\"\"\n", 151 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 152 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 153 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 154 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 155 | "be None and run_time will default to the current UTC datetime.\n", 156 | "\"\"\"\n", 157 | "\n", 158 | "run_id = {\n", 159 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 160 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 161 | "}\n", 162 | "\n", 163 | "results = context.run_validation_operator(\n", 164 | " \"action_list_operator\",\n", 165 | " assets_to_validate=[batch],\n", 166 | " run_id=run_id)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## 5. View the Validation Results in Data Docs\n", 174 | "\n", 175 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 176 | "\n", 177 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "context.open_data_docs()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "## Congratulations! You ran Validations!\n", 194 | "\n", 195 | "## Next steps:\n", 196 | "\n", 197 | "### 1. Read about the typical workflow with Great Expectations:\n", 198 | "\n", 199 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 200 | "\n", 201 | "### 2. Explore the documentation & community\n", 202 | "\n", 203 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers."
204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [] 212 | } 213 | ], 214 | "metadata": { 215 | "kernelspec": { 216 | "display_name": "Python 3", 217 | "language": "python", 218 | "name": "python3" 219 | }, 220 | "language_info": { 221 | "codemirror_mode": { 222 | "name": "ipython", 223 | "version": 3 224 | }, 225 | "file_extension": ".py", 226 | "mimetype": "text/x-python", 227 | "name": "python", 228 | "nbconvert_exporter": "python", 229 | "pygments_lexer": "ipython3", 230 | "version": "3.7.0" 231 | }, 232 | "pycharm": { 233 | "stem_cell": { 234 | "cell_type": "raw", 235 | "source": [], 236 | "metadata": { 237 | "collapsed": false 238 | } 239 | } 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 4 244 | } 245 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v2_api/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css: -------------------------------------------------------------------------------- 1 | /*index page*/ 2 | .ge-index-page-site-name-title {} 3 | .ge-index-page-table-container {} 4 | .ge-index-page-table {} 5 | .ge-index-page-table-profiling-links-header {} 6 | .ge-index-page-table-expectations-links-header {} 7 | .ge-index-page-table-validations-links-header {} 8 | .ge-index-page-table-profiling-links-list {} 9 | .ge-index-page-table-profiling-links-item {} 10 | .ge-index-page-table-expectation-suite-link {} 11 | .ge-index-page-table-validation-links-list {} 12 | .ge-index-page-table-validation-links-item {} 13 | 14 | /*breadcrumbs*/ 15 | .ge-breadcrumbs {} 16 | .ge-breadcrumbs-item {} 17 | 18 | /*navigation sidebar*/ 19 | .ge-navigation-sidebar-container {} 20 | .ge-navigation-sidebar-content {} 21 | .ge-navigation-sidebar-title {} 22 | .ge-navigation-sidebar-link {} 23 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/README.md: -------------------------------------------------------------------------------- 1 | # Getting started with Great Expectations tutorial - v3 (Batch Request) API 2 | 3 | This repository contains the final version of the "Getting started with Great Expectations" tutorial in the Great 4 | Expectations docs. This repo can be used as a demo and to explore a complete Great Expectations deployment. 5 | 6 | **THIS VERSION WAS CREATED WITH THE V3 (BATCH REQUEST) GREAT EXPECTATIONS API**, which is available in Great Expectations 7 | version 0.13.x and above. 8 | 9 | ## 1. How to run through the tutorial 10 | [Please follow the tutorial in our docs for instructions!](https://docs.greatexpectations.io/en/latest/guides/tutorials/getting_started_v3_api.html) 11 | 12 | ## 2. How to use this repo to explore and demo Great Expectations 13 | 14 | ### The `data` directory 15 | 16 | The CSV files in the data directory are yellow taxi trip data that have been downloaded from the NYC taxi data website: 17 | * [TLC trip record data](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) 18 | * [Data dictionary](https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf) 19 | 20 | We created 10,000-row samples (using the Pandas ``sample`` function) from the original CSV files for convenience and manually added some breaking changes (0s in the passenger_count column) to demonstrate potential data issues.
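For reference, a sample like this can be produced with a few lines of Pandas. The sketch below is illustrative only: the input file name, random seed, and the exact fraction of corrupted rows are assumptions, not a record of the script that was actually used.

```python
import pandas as pd

# Load one month of the original TLC yellow taxi data
# (the full file downloaded from the NYC TLC site; name assumed).
df = pd.read_csv("yellow_tripdata_2019-01.csv")

# Draw a reproducible 10,000-row sample.
sample = df.sample(n=10_000, random_state=42)

# Manually inject a "breaking change": zero out passenger_count for a
# small fraction of rows so the Expectation Suite has something to catch.
bad_rows = sample.sample(frac=0.01, random_state=42).index
sample.loc[bad_rows, "passenger_count"] = 0

sample.to_csv("data/yellow_tripdata_sample_2019-01.csv", index=False)
```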
21 | 22 | In a future version of this tutorial, we might use "naturally occurring" data bugs :) 23 | 24 | ### The `great_expectations` directory 25 | Currently, this demo contains the following: 26 | * A `great_expectations.yml` file that's configured to use the top-level `data` directory as a Datasource. You will not need to set up anything to get it to work. 27 | * Two Expectation Suites, `taxi.demo` and `taxi.demo_with_custom_expectation`, each containing a handful of Expectations (the latter also includes a custom Expectation) 28 | * Two Checkpoints, `my_checkpoint` and `my_checkpoint_with_custom_expectation`, that are set up to run these suites against the February data set 29 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/.gitignore: -------------------------------------------------------------------------------- 1 | uncommitted/ -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/checkpoints/my_checkpoint.yml: -------------------------------------------------------------------------------- 1 | name: my_checkpoint 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: data__dir 25 | data_connector_name: data__dir_example_data_connector 26 | data_asset_name: yellow_tripdata_sample_2019-02.csv 27 | data_connector_query: 28 | index: -1 29 | expectation_suite_name: taxi.demo 30 | profilers: [] 31 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/checkpoints/my_checkpoint_with_custom_expectation.yml: -------------------------------------------------------------------------------- 1 | name: my_checkpoint_with_custom_expectation 2 | config_version: 1.0 3 | template_name: 4 | module_name: great_expectations.checkpoint 5 | class_name: Checkpoint 6 | run_name_template: '%Y%m%d-%H%M%S-my-run-name-template' 7 | expectation_suite_name: 8 | batch_request: 9 | action_list: 10 | - name: store_validation_result 11 | action: 12 | class_name: StoreValidationResultAction 13 | - name: store_evaluation_params 14 | action: 15 | class_name: StoreEvaluationParametersAction 16 | - name: update_data_docs 17 | action: 18 | class_name: UpdateDataDocsAction 19 | site_names: [] 20 | evaluation_parameters: {} 21 | runtime_configuration: {} 22 | validations: 23 | - batch_request: 24 | datasource_name: data__dir 25 | data_connector_name: data__dir_example_data_connector 26 | data_asset_name: yellow_tripdata_sample_2019-02.csv 27 | data_connector_query: 28 | index: -1 29 | expectation_suite_name: taxi.demo_with_custom_expectation 30 | profilers: [] 31 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/expectations/.ge_store_backend_id: -------------------------------------------------------------------------------- 1 | store_backend_id = 2c8220c3-63db-42c7-be97-b7909cc59f8b
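As a quick sketch of how a Checkpoint like `my_checkpoint` above can be run programmatically (assuming Great Expectations 0.13.8+ with new-style Checkpoints; the exact `run_checkpoint` signature and result shape may vary slightly across 0.13.x releases):

```python
import great_expectations as ge

# Instantiate the project's Data Context (run from inside
# getting_started_tutorial_final_v3_api/).
context = ge.data_context.DataContext()

# Execute the Checkpoint defined in checkpoints/my_checkpoint.yml, which
# validates the February sample against the taxi.demo suite.
result = context.run_checkpoint(checkpoint_name="my_checkpoint")
print("Validation succeeded:", result.success)
```

The same Checkpoint can also be run from the CLI, e.g. `great_expectations --v3-api checkpoint run my_checkpoint` (in the 0.13.x CLI the `--v3-api` flag was needed for v3-API commands).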
-------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/expectations/taxi/demo.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "taxi.demo", 4 | "expectations": [ 5 | { 6 | "expectation_type": "expect_table_columns_to_match_ordered_list", 7 | "kwargs": { 8 | "column_list": [ 9 | "vendor_id", 10 | "pickup_datetime", 11 | "dropoff_datetime", 12 | "passenger_count", 13 | "trip_distance", 14 | "rate_code_id", 15 | "store_and_fwd_flag", 16 | "pickup_location_id", 17 | "dropoff_location_id", 18 | "payment_type", 19 | "fare_amount", 20 | "extra", 21 | "mta_tax", 22 | "tip_amount", 23 | "tolls_amount", 24 | "improvement_surcharge", 25 | "total_amount", 26 | "congestion_surcharge" 27 | ] 28 | }, 29 | "meta": {} 30 | }, 31 | { 32 | "expectation_type": "expect_table_row_count_to_be_between", 33 | "kwargs": { 34 | "max_value": 10000, 35 | "min_value": 10000 36 | }, 37 | "meta": {} 38 | }, 39 | { 40 | "expectation_type": "expect_column_min_to_be_between", 41 | "kwargs": { 42 | "column": "passenger_count", 43 | "max_value": 1, 44 | "min_value": 1 45 | }, 46 | "meta": {} 47 | }, 48 | { 49 | "expectation_type": "expect_column_max_to_be_between", 50 | "kwargs": { 51 | "column": "passenger_count", 52 | "max_value": 6, 53 | "min_value": 6 54 | }, 55 | "meta": {} 56 | }, 57 | { 58 | "expectation_type": "expect_column_mean_to_be_between", 59 | "kwargs": { 60 | "column": "passenger_count", 61 | "max_value": 1.5716, 62 | "min_value": 1.5716 63 | }, 64 | "meta": {} 65 | }, 66 | { 67 | "expectation_type": "expect_column_median_to_be_between", 68 | "kwargs": { 69 | "column": "passenger_count", 70 | "max_value": 1.0, 71 | "min_value": 1.0 72 | }, 73 | "meta": {} 74 | }, 75 | { 76 | "expectation_type": "expect_column_quantile_values_to_be_between", 77 | "kwargs": { 78 | "allow_relative_error": "lower", 79 | "column": "passenger_count", 80 | "quantile_ranges": { 81 | "quantiles": [ 82 | 0.05, 83 | 0.25, 84 | 0.5, 85 | 0.75, 86 | 0.95 87 | ], 88 | "value_ranges": [ 89 | [ 90 | 1, 91 | 1 92 | ], 93 | [ 94 | 1, 95 | 1 96 | ], 97 | [ 98 | 1, 99 | 1 100 | ], 101 | [ 102 | 2, 103 | 2 104 | ], 105 | [ 106 | 5, 107 | 5 108 | ] 109 | ] 110 | } 111 | }, 112 | "meta": {} 113 | }, 114 | { 115 | "expectation_type": "expect_column_values_to_be_in_set", 116 | "kwargs": { 117 | "column": "passenger_count", 118 | "value_set": [ 119 | 1, 120 | 2, 121 | 3, 122 | 4, 123 | 5, 124 | 6 125 | ] 126 | }, 127 | "meta": {} 128 | }, 129 | { 130 | "expectation_type": "expect_column_values_to_not_be_null", 131 | "kwargs": { 132 | "column": "passenger_count" 133 | }, 134 | "meta": {} 135 | }, 136 | { 137 | "expectation_type": "expect_column_proportion_of_unique_values_to_be_between", 138 | "kwargs": { 139 | "column": "passenger_count", 140 | "max_value": 0.0006, 141 | "min_value": 0.0006 142 | }, 143 | "meta": {} 144 | }, 145 | { 146 | "expectation_type": "expect_column_values_to_be_in_type_list", 147 | "kwargs": { 148 | "column": "passenger_count", 149 | "type_list": [ 150 | "INTEGER", 151 | "integer", 152 | "int", 153 | "int_", 154 | "int8", 155 | "int16", 156 | "int32", 157 | "int64", 158 | "uint8", 159 | "uint16", 160 | "uint32", 161 | "uint64", 162 | "INT", 163 | "TINYINT", 164 | "BYTEINT", 165 | "SMALLINT", 166 | "BIGINT", 167 | "IntegerType", 168 | "LongType", 169 | "DECIMAL" 170 | ] 171 | }, 172 | "meta": {} 173 | } 174 | ], 175 | "meta": { 176 | 
"citations": [ 177 | { 178 | "batch_definition": null, 179 | "batch_kwargs": null, 180 | "batch_markers": null, 181 | "batch_parameters": null, 182 | "batch_request": { 183 | "data_asset_name": "yellow_tripdata_sample_2019-01.csv", 184 | "data_connector_name": "data__dir_example_data_connector", 185 | "datasource_name": "data__dir", 186 | "limit": 1000 187 | }, 188 | "batch_spec": null, 189 | "citation_date": "2021-04-25T22:43:50.694402Z", 190 | "comment": "Created suite added via CLI" 191 | } 192 | ], 193 | "great_expectations_version": "0.13.19" 194 | } 195 | } -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/expectations/taxi/demo_with_custom_expectation.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_asset_type": null, 3 | "expectation_suite_name": "taxi.demo_with_custom_expectation", 4 | "expectations": [ 5 | { 6 | "expectation_type": "expect_column_values_to_be_in_set", 7 | "kwargs": { 8 | "column": "passenger_count", 9 | "value_set": [ 10 | 1, 11 | 2, 12 | 3, 13 | 4, 14 | 5, 15 | 6 16 | ] 17 | }, 18 | "meta": {} 19 | }, 20 | { 21 | "expectation_type": "expect_column_max_to_be_between_custom", 22 | "kwargs": { 23 | "column": "passenger_count", 24 | "max_value": 6, 25 | "min_value": 1 26 | }, 27 | "meta": {} 28 | } 29 | ] 30 | } 31 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/great_expectations.yml: -------------------------------------------------------------------------------- 1 | # Welcome to Great Expectations! Always know what to expect from your data. 2 | # 3 | # Here you can define datasources, batch kwargs generators, integrations and 4 | # more. This file is intended to be committed to your repo. For help with 5 | # configuration please: 6 | # - Read our docs: https://docs.greatexpectations.io/en/latest/reference/spare_parts/data_context_reference.html#configuration 7 | # - Join our slack channel: http://greatexpectations.io/slack 8 | 9 | # config_version refers to the syntactic version of this config file, and is used in maintaining backwards compatibility 10 | # It is auto-generated and usually does not need to be changed. 11 | config_version: 3.0 12 | 13 | # Datasources tell Great Expectations where your data lives and how to get it. 14 | # You can use the CLI command `great_expectations datasource new` to help you 15 | # add a new datasource. Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/datasource.html 16 | datasources: 17 | data__dir: 18 | execution_engine: 19 | module_name: great_expectations.execution_engine 20 | class_name: PandasExecutionEngine 21 | module_name: great_expectations.datasource 22 | class_name: Datasource 23 | data_connectors: 24 | data__dir_example_data_connector: 25 | base_directory: ../../data 26 | default_regex: 27 | group_names: 28 | - data_asset_name 29 | pattern: (.*) 30 | module_name: great_expectations.datasource.data_connector 31 | class_name: InferredAssetFilesystemDataConnector 32 | 33 | # This config file supports variable substitution which enables: 1) keeping 34 | # secrets out of source control & 2) environment-based configuration changes 35 | # such as staging vs prod. 
36 | # 37 | # When GE encounters substitution syntax (like `my_key: ${my_value}` or 38 | # `my_key: $my_value`) in the great_expectations.yml file, it will attempt 39 | # to replace the value of `my_key` with the value from an environment 40 | # variable `my_value` or a corresponding key read from this config file, 41 | # which is defined through the `config_variables_file_path`. 42 | # Environment variables take precedence over variables defined here. 43 | # 44 | # Substitution values defined here can be a simple (non-nested) value, 45 | # nested value such as a dictionary, or an environment variable (i.e. ${ENV_VAR}) 46 | # 47 | # 48 | # https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_data_contexts/how_to_use_a_yaml_file_or_environment_variables_to_populate_credentials.html 49 | 50 | 51 | config_variables_file_path: uncommitted/config_variables.yml 52 | 53 | # The plugins_directory will be added to your Python path for custom modules 54 | # used to override and extend Great Expectations. 55 | plugins_directory: plugins/ 56 | 57 | stores: 58 | # Stores are configurable places to store things like Expectations, Validations, 59 | # Data Docs, and more. These are for advanced users only - most users can simply 60 | # leave this section alone. 61 | # 62 | # Three stores are required: expectations, validations, and 63 | # evaluation_parameters, and each must exist with a valid store entry. Additional 64 | # stores can be configured for uses such as data_docs, etc. 65 | expectations_store: 66 | class_name: ExpectationsStore 67 | store_backend: 68 | class_name: TupleFilesystemStoreBackend 69 | base_directory: expectations/ 70 | 71 | validations_store: 72 | class_name: ValidationsStore 73 | store_backend: 74 | class_name: TupleFilesystemStoreBackend 75 | base_directory: uncommitted/validations/ 76 | 77 | evaluation_parameter_store: 78 | # Evaluation Parameters enable dynamic expectations. Read more here: 79 | # https://docs.greatexpectations.io/en/latest/reference/core_concepts/evaluation_parameters.html 80 | class_name: EvaluationParameterStore 81 | 82 | checkpoint_store: 83 | class_name: CheckpointStore 84 | store_backend: 85 | class_name: TupleFilesystemStoreBackend 86 | suppress_store_backend_id: true 87 | base_directory: checkpoints/ 88 | 89 | expectations_store_name: expectations_store 90 | validations_store_name: validations_store 91 | evaluation_parameter_store_name: evaluation_parameter_store 92 | checkpoint_store_name: checkpoint_store 93 | 94 | data_docs_sites: 95 | # Data Docs make it simple to visualize data quality in your project. These 96 | # include Expectations, Validations & Profiles. They are built for all 97 | # Datasources from JSON artifacts in the local repo including validations & 98 | # profiles from the uncommitted directory.
Read more at https://docs.greatexpectations.io/en/latest/reference/core_concepts/data_docs.html 99 | local_site: 100 | class_name: SiteBuilder 101 | # set to false to hide how-to buttons in Data Docs 102 | show_how_to_buttons: true 103 | store_backend: 104 | class_name: TupleFilesystemStoreBackend 105 | base_directory: uncommitted/data_docs/local_site/ 106 | site_index_builder: 107 | class_name: DefaultSiteIndexBuilder 108 | 109 | anonymous_usage_statistics: 110 | enabled: true 111 | data_context_id: 2c8220c3-63db-42c7-be97-b7909cc59f8b 112 | notebooks: 113 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/notebooks/pandas/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a Pandas DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type PandasDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'PandasDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you already loaded the data into a Pandas Data Frame:\n", 113 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "\n", 116 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 117 | "batch.head()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## 4. Validate the batch with Validation Operators\n", 125 | "\n", 126 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 127 | "multiple expectation suites and the actions that should be taken after validation.\n", 128 | "\n", 129 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 130 | "\n", 131 | "* validating a group of batches that are logically related\n", 132 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 133 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 134 | "\n", 135 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 145 | "\n", 146 | "\"\"\"\n", 147 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 148 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 149 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 150 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 151 | "be None and run_time will default to the current UTC datetime.\n", 152 | "\"\"\"\n", 153 | "\n", 154 | "run_id = {\n", 155 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 156 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 157 | "}\n", 158 | "\n", 159 | "results = context.run_validation_operator(\n", 160 | " \"action_list_operator\",\n", 161 | " assets_to_validate=[batch],\n", 162 | " run_id=run_id)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## 5. View the Validation Results in Data Docs\n", 170 | "\n", 171 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 172 | "\n", 173 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "context.open_data_docs()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## Congratulations! You ran Validations!\n", 190 | "\n", 191 | "## Next steps:\n", 192 | "\n", 193 | "### 1. Read about the typical workflow with Great Expectations:\n", 194 | "\n", 195 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 196 | "\n", 197 | "### 2. Explore the documentation & community\n", 198 | "\n", 199 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers."
200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.7.0" 227 | }, 228 | "pycharm": { 229 | "stem_cell": { 230 | "cell_type": "raw", 231 | "source": [], 232 | "metadata": { 233 | "collapsed": false 234 | } 235 | } 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 4 240 | } 241 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/notebooks/spark/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data loaded into a PySpark DataFrame against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SparkDFDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SparkDFDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate a file on a filesystem:\n", 110 | "batch_kwargs = {'path': \"YOUR_FILE_PATH\", 'datasource': datasource_name}\n", 111 | "# To customize how Spark reads the file, you can add options under reader_options key in batch_kwargs (e.g., header='true')\n", 112 | "\n", 113 | "# If you already loaded the data into a PySpark Data Frame:\n", 114 | "batch_kwargs = {'dataset': \"YOUR_DATAFRAME\", 'datasource': datasource_name}\n", 115 | "\n", 116 | "\n", 117 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 118 | "batch.head()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## 4. Validate the batch with Validation Operators\n", 126 | "\n", 127 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 128 | "multiple expectation suites and the actions that should be taken after validation.\n", 129 | "\n", 130 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 131 | "\n", 132 | "* validating a group of batches that are logically related\n", 133 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 134 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 135 | "\n", 136 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 146 | "\n", 147 | "\"\"\"\n", 148 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 149 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 150 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 151 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n", 152 | "be None and run_time will default to the current UTC datetime.\n", 153 | "\"\"\"\n", 154 | "\n", 155 | "run_id = {\n", 156 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n", 157 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n", 158 | "}\n", 159 | "\n", 160 | "results = context.run_validation_operator(\n", 161 | " \"action_list_operator\",\n", 162 | " assets_to_validate=[batch],\n", 163 | " run_id=run_id)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "## 5. View the Validation Results in Data Docs\n", 171 | "\n", 172 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created that helps you communicate about your data with both machines and humans.\n", 173 | "\n", 174 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "context.open_data_docs()" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "## Congratulations! You ran Validations!\n", 191 | "\n", 192 | "## Next steps:\n", 193 | "\n", 194 | "### 1. Read about the typical workflow with Great Expectations:\n", 195 | "\n", 196 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n", 197 | "\n", 198 | "### 2. Explore the documentation & community\n", 199 | "\n", 200 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers."
201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "Python 3", 214 | "language": "python", 215 | "name": "python3" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": { 219 | "name": "ipython", 220 | "version": 3 221 | }, 222 | "file_extension": ".py", 223 | "mimetype": "text/x-python", 224 | "name": "python", 225 | "nbconvert_exporter": "python", 226 | "pygments_lexer": "ipython3", 227 | "version": "3.7.0" 228 | }, 229 | "pycharm": { 230 | "stem_cell": { 231 | "cell_type": "raw", 232 | "source": [], 233 | "metadata": { 234 | "collapsed": false 235 | } 236 | } 237 | } 238 | }, 239 | "nbformat": 4, 240 | "nbformat_minor": 4 241 | } 242 | -------------------------------------------------------------------------------- /getting_started_tutorial_final_v3_api/great_expectations/notebooks/sql/validation_playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Validation Playground\n", 8 | "\n", 9 | "**Watch** a [short tutorial video](https://greatexpectations.io/videos/getting_started/integrate_expectations) or **read** [the written tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data)\n", 10 | "\n", 11 | "#### This notebook assumes that you created at least one expectation suite in your project.\n", 12 | "#### Here you will learn how to validate data in a SQL database against an expectation suite.\n", 13 | "\n", 14 | "\n", 15 | "We'd love it if you **reach out for help on** the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import json\n", 25 | "import great_expectations as ge\n", 26 | "import great_expectations.jupyter_ux\n", 27 | "from great_expectations.datasource.types import BatchKwargs\n", 28 | "import datetime" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## 1. Get a DataContext\n", 36 | "This represents your **project** that you just created using `great_expectations init`." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "context = ge.data_context.DataContext()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 2. Choose an Expectation Suite\n", 53 | "\n", 54 | "List expectation suites that you created in your project" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "context.list_expectation_suite_names()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "expectation_suite_name = # TODO: set to a name from the list above" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## 3. 
Load a batch of data you want to validate\n", 80 | "\n", 81 | "To learn more about `get_batch`, see [this tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#load-a-batch-of-data-to-validate)\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# list datasources of the type SqlAlchemyDatasource in your project\n", 91 | "[datasource['name'] for datasource in context.list_datasources() if datasource['class_name'] == 'SqlAlchemyDatasource']" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "datasource_name = # TODO: set to a datasource name from above" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# If you would like to validate an entire table or view in your database's default schema:\n", 110 | "batch_kwargs = {'table': \"YOUR_TABLE\", 'datasource': datasource_name}\n", 111 | "\n", 112 | "# If you would like to validate an entire table or view from a non-default schema in your database:\n", 113 | "batch_kwargs = {'table': \"YOUR_TABLE\", \"schema\": \"YOUR_SCHEMA\", 'datasource': datasource_name}\n", 114 | "\n", 115 | "# If you would like to validate the result set of a query:\n", 116 | "# batch_kwargs = {'query': 'SELECT YOUR_ROWS FROM YOUR_TABLE', 'datasource': datasource_name}\n", 117 | "\n", 118 | "\n", 119 | "\n", 120 | "batch = context.get_batch(batch_kwargs, expectation_suite_name)\n", 121 | "batch.head()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "## 4. Validate the batch with Validation Operators\n", 129 | "\n", 130 | "`Validation Operators` provide a convenient way to bundle the validation of\n", 131 | "multiple expectation suites and the actions that should be taken after validation.\n", 132 | "\n", 133 | "When deploying Great Expectations in a **real data pipeline, you will typically discover these needs**:\n", 134 | "\n", 135 | "* validating a group of batches that are logically related\n", 136 | "* validating a batch against several expectation suites such as using a tiered pattern like `warning` and `failure`\n", 137 | "* doing something with the validation results (e.g., saving them for a later review, sending notifications in case of failures, etc.).\n", 138 | "\n", 139 | "[Read more about Validation Operators in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#save-validation-results)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "# This is an example of invoking a validation operator that is configured by default in the great_expectations.yml file\n", 149 | "\n", 150 | "\"\"\"\n", 151 | "Create a run_id. The run_id must be of type RunIdentifier, with optional run_name and run_time instantiation\n", 152 | "arguments (or a dictionary with these keys). The run_name can be any string (this could come from your pipeline\n", 153 | "runner, e.g. Airflow run id). The run_time can be either a dateutil parsable string or a datetime object.\n", 154 | "Note - any provided datetime will be assumed to be a UTC time. 
If no instantiation arguments are given, run_name will\n",
 155 | "be None and run_time will default to the current UTC datetime.\n",
 156 | "\"\"\"\n",
 157 | "\n",
 158 | "run_id = {\n",
 159 | " \"run_name\": \"some_string_that_uniquely_identifies_this_run\", # insert your own run_name here\n",
 160 | " \"run_time\": datetime.datetime.now(datetime.timezone.utc)\n",
 161 | "}\n",
 162 | "\n",
 163 | "results = context.run_validation_operator(\n",
 164 | " \"action_list_operator\",\n",
 165 | " assets_to_validate=[batch],\n",
 166 | " run_id=run_id)"
 167 | ]
 168 | },
 169 | {
 170 | "cell_type": "markdown",
 171 | "metadata": {},
 172 | "source": [
 173 | "## 5. View the Validation Results in Data Docs\n",
 174 | "\n",
 175 | "Let's now build and look at your Data Docs. These will now include a **data quality report** built from the `ValidationResults` you just created, which helps you communicate about your data with both machines and humans.\n",
 176 | "\n",
 177 | "[Read more about Data Docs in the tutorial](https://docs.greatexpectations.io/en/latest/tutorials/validate_data.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)"
 178 | ]
 179 | },
 180 | {
 181 | "cell_type": "code",
 182 | "execution_count": null,
 183 | "metadata": {},
 184 | "outputs": [],
 185 | "source": [
 186 | "context.open_data_docs()"
 187 | ]
 188 | },
 189 | {
 190 | "cell_type": "markdown",
 191 | "metadata": {},
 192 | "source": [
 193 | "## Congratulations! You ran Validations!\n",
 194 | "\n",
 195 | "## Next steps:\n",
 196 | "\n",
 197 | "### 1. Read about the typical workflow with Great Expectations:\n",
 198 | "\n",
 199 | "[typical workflow](https://docs.greatexpectations.io/en/latest/getting_started/typical_workflow.html?utm_source=notebook&utm_medium=validate_data#view-the-validation-results-in-data-docs)\n",
 200 | "\n",
 201 | "### 2. Explore the documentation & community\n",
 202 | "\n",
 203 | "You are now among the elite data professionals who know how to build robust descriptions of your data and protections for pipelines and machine learning models. Join the [**Great Expectations Slack Channel**](https://greatexpectations.io/slack) to see how others are wielding these superpowers."
 204 | ]
 205 | },
 206 | {
 207 | "cell_type": "code",
 208 | "execution_count": null,
 209 | "metadata": {},
 210 | "outputs": [],
 211 | "source": []
 212 | }
 213 | ],
 214 | "metadata": {
 215 | "kernelspec": {
 216 | "display_name": "Python 3",
 217 | "language": "python",
 218 | "name": "python3"
 219 | },
 220 | "language_info": {
 221 | "codemirror_mode": {
 222 | "name": "ipython",
 223 | "version": 3
 224 | },
 225 | "file_extension": ".py",
 226 | "mimetype": "text/x-python",
 227 | "name": "python",
 228 | "nbconvert_exporter": "python",
 229 | "pygments_lexer": "ipython3",
 230 | "version": "3.7.0"
 231 | },
 232 | "pycharm": {
 233 | "stem_cell": {
 234 | "cell_type": "raw",
 235 | "source": [],
 236 | "metadata": {
 237 | "collapsed": false
 238 | }
 239 | }
 240 | }
 241 | },
 242 | "nbformat": 4,
 243 | "nbformat_minor": 4
 244 | }
 245 | 
--------------------------------------------------------------------------------
/getting_started_tutorial_final_v3_api/great_expectations/plugins/column_custom_max_expectation.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Example of a custom expectation with a renderer.
 3 | 
 4 | This custom expectation can be run as part of a checkpoint with the script run_checkpoint_with_custom_expectation.py
 5 | in the getting_started_tutorial_final_v3_api directory, e.g. 
 6 | 
 7 | getting_started_tutorial_final_v3_api$ python run_checkpoint_with_custom_expectation.py
 8 | 
 9 | See corresponding documentation:
 10 | * https://docs.greatexpectations.io/en/latest/guides/how_to_guides/creating_and_editing_expectations/how_to_create_custom_expectations.html
 11 | * https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_data_docs/how_to_create_renderers_for_custom_expectations.html
 12 | """
 13 | 
 14 | from great_expectations.core import ExpectationConfiguration, ExpectationValidationResult
 15 | from great_expectations.execution_engine import (
 16 |     ExecutionEngine,
 17 |     PandasExecutionEngine,
 18 |     SparkDFExecutionEngine,
 19 |     SqlAlchemyExecutionEngine,
 20 | )
 21 | from great_expectations.expectations.expectation import ColumnExpectation
 22 | from great_expectations.expectations.metrics import (
 23 |     ColumnMetricProvider,
 24 |     column_aggregate_value, column_aggregate_partial,
 25 | )
 26 | from great_expectations.expectations.metrics.import_manager import F, sa
 27 | from great_expectations.expectations.util import render_evaluation_parameter_string
 28 | from great_expectations.render.renderer.renderer import renderer
 29 | from great_expectations.render.types import RenderedStringTemplateContent, RenderedTableContent, RenderedBulletListContent, RenderedGraphContent
 30 | from great_expectations.render.util import num_to_str, parse_row_condition_string_pandas_engine, substitute_none_for_missing
 31 | from great_expectations.exceptions import InvalidExpectationConfigurationError
 32 | from typing import Any, Dict, List, Optional, Union
 33 | 
 34 | 
 35 | class ColumnCustomMax(ColumnMetricProvider):
 36 |     """MetricProvider class for a custom aggregate max metric."""
 37 | 
 38 |     metric_name = "column.aggregate.custom.max"
 39 | 
 40 |     @column_aggregate_value(engine=PandasExecutionEngine)
 41 |     def _pandas(cls, column, **kwargs):
 42 |         """Pandas Max Implementation"""
 43 |         return column.max()
 44 | 
 45 |     @column_aggregate_partial(engine=SqlAlchemyExecutionEngine)
 46 |     def _sqlalchemy(cls, column, **kwargs):
 47 |         """SqlAlchemy Max Implementation"""
 48 |         return sa.func.max(column)
 49 | 
 50 |     @column_aggregate_partial(engine=SparkDFExecutionEngine)
 51 |     def _spark(cls, column, _table, _column_name, **kwargs):
 52 |         """Spark Max Implementation"""
 53 |         types = dict(_table.dtypes)  # column types are available for type-aware implementations (unused in this simple example)
 54 |         return F.max(column)
 55 | 
 56 | 
 57 | class ExpectColumnMaxToBeBetweenCustom(ColumnExpectation):
 58 |     # Setting necessary computation metric dependencies and defining kwargs, as well as assigning kwargs default values
 59 |     metric_dependencies = ("column.aggregate.custom.max",)
 60 |     success_keys = ("min_value", "strict_min", "max_value", "strict_max")
 61 | 
 62 |     # Default values
 63 |     default_kwarg_values = {
 64 |         "row_condition": None,
 65 |         "condition_parser": None,
 66 |         "min_value": None,
 67 |         "max_value": None,
 68 |         "strict_min": None,
 69 |         "strict_max": None,
 70 |         "mostly": 1
 71 |     }
 72 | 
 73 |     def _validate(
 74 |         self,
 75 |         configuration: ExpectationConfiguration,
 76 |         metrics: Dict,
 77 |         runtime_configuration: dict = None,
 78 |         execution_engine: ExecutionEngine = None,
 79 |     ):
 80 |         """Validates the given data against the set minimum and maximum value thresholds for the column max"""
 81 |         column_max = metrics["column.aggregate.custom.max"]
 82 | 
 83 |         # Obtaining components needed for validation
 84 |         min_value = self.get_success_kwargs(configuration).get("min_value")
 85 |         strict_min = self.get_success_kwargs(configuration).get("strict_min")
 86 |         max_value = self.get_success_kwargs(configuration).get("max_value")
 87 |         strict_max = self.get_success_kwargs(configuration).get("strict_max")
 88 | 
 89 |         # Checking if the observed max lies between the thresholds
 90 |         if min_value is not None:
 91 |             if strict_min:
 92 |                 above_min = column_max > min_value
 93 |             else:
 94 |                 above_min = column_max >= min_value
 95 |         else:
 96 |             above_min = True
 97 | 
 98 |         if max_value is not None:
 99 |             if strict_max:
 100 |                 below_max = column_max < max_value
 101 |             else:
 102 |                 below_max = column_max <= max_value
 103 |         else:
 104 |             below_max = True
 105 | 
 106 |         success = above_min and below_max
 107 | 
 108 |         return {"success": success, "result": {"observed_value": column_max}}
 109 | 
 110 |     def validate_configuration(self, configuration: Optional[ExpectationConfiguration]):
 111 |         """
 112 |         Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
 113 |         necessary configuration arguments have been provided for the validation of the expectation.
 114 | 
 115 |         Args:
 116 |             configuration (Optional[ExpectationConfiguration]): \
 117 |                 An optional Expectation Configuration entry that will be used to configure the expectation
 118 |         Returns:
 119 |             True if the configuration has been validated successfully. Otherwise, raises an exception
 120 |         """
 121 |         min_val = None
 122 |         max_val = None
 123 | 
 124 |         # Setting up a configuration
 125 |         super().validate_configuration(configuration)
 126 |         if configuration is None:
 127 |             configuration = self.configuration
 128 | 
 129 |         # Ensuring basic configuration parameters are properly set
 130 |         try:
 131 |             assert (
 132 |                 "column" in configuration.kwargs
 133 |             ), "'column' parameter is required for column expectations"
 134 |         except AssertionError as e:
 135 |             raise InvalidExpectationConfigurationError(str(e))
 136 | 
 137 |         # Validating that minimum and maximum values are of the proper format and type
 138 |         if "min_value" in configuration.kwargs:
 139 |             min_val = configuration.kwargs["min_value"]
 140 | 
 141 |         if "max_value" in configuration.kwargs:
 142 |             max_val = configuration.kwargs["max_value"]
 143 | 
 144 |         try:
 145 |             # Ensuring a proper interval has been provided
 146 |             assert (
 147 |                 min_val is not None or max_val is not None
 148 |             ), "min_value and max_value cannot both be None"
 149 |             assert min_val is None or isinstance(
 150 |                 min_val, (float, int)
 151 |             ), "Provided min threshold must be a number"
 152 |             assert max_val is None or isinstance(
 153 |                 max_val, (float, int)
 154 |             ), "Provided max threshold must be a number"
 155 |         except AssertionError as e:
 156 |             raise InvalidExpectationConfigurationError(str(e))
 157 | 
 158 |     @classmethod
 159 |     @renderer(renderer_type="renderer.prescriptive")
 160 |     @render_evaluation_parameter_string
 161 |     def _prescriptive_renderer(
 162 |         cls,
 163 |         configuration: ExpectationConfiguration = None,
 164 |         result: ExpectationValidationResult = None,
 165 |         language: str = None,
 166 |         runtime_configuration: dict = None,
 167 |         **kwargs,
 168 |     ) -> List[Union[dict, str, RenderedStringTemplateContent, RenderedTableContent, RenderedBulletListContent,
 169 |                     RenderedGraphContent, Any]]:
 170 |         runtime_configuration = runtime_configuration or {}
 171 |         include_column_name = runtime_configuration.get("include_column_name", True)
 172 |         include_column_name = (
 173 |             include_column_name if include_column_name is not None else True
 174 |         )
 175 |         styling = runtime_configuration.get("styling")
 176 |         # get params dict with all expected kwargs
 177 |         params = substitute_none_for_missing(
 178 |             configuration.kwargs,
 179 |             [
 180 |                 "column",
 181 |                 "min_value",
 182 |                 "max_value",
 183 |                 "mostly",
 184 |                 "row_condition",
 185 |                 "condition_parser",
 186 |                 "strict_min",
 187 |                 "strict_max",
 188 |             ],
 189 |         )
 190 | 
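 191 |         # (Editor's note, not part of the original tutorial:) the prescriptive renderer
 192 |         # turns the expectation's kwargs into the human-readable sentence shown in Data
 193 |         # Docs, e.g. "passenger_count maximum value must always be less than or equal to 6."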
 194 |         # Build the string template for this expectation
 195 |         if (params["min_value"] is None) and (params["max_value"] is None):
 196 |             template_str = "maximum value may take any numerical value."
 197 |         else:
 198 |             at_least_str = (
 199 |                 "greater than"
 200 |                 if params.get("strict_min") is True
 201 |                 else "greater than or equal to"
 202 |             )
 203 |             at_most_str = (
 204 |                 "less than" if params.get("strict_max") is True else "less than or equal to"
 205 |             )
 206 | 
 207 |             if params["mostly"] is not None:
 208 |                 params["mostly_pct"] = num_to_str(
 209 |                     params["mostly"] * 100, precision=15, no_scientific=True
 210 |                 )
 211 | 
 212 |                 if params["min_value"] is not None and params["max_value"] is not None:
 213 |                     template_str = f"maximum value must be {at_least_str} $min_value and {at_most_str} $max_value, at least $mostly_pct % of the time."
 214 | 
 215 |                 elif params["min_value"] is None:
 216 |                     template_str = f"maximum value must be {at_most_str} $max_value, at least $mostly_pct % of the time."
 217 | 
 218 |                 elif params["max_value"] is None:
 219 |                     template_str = f"maximum value must be {at_least_str} $min_value, at least $mostly_pct % of the time."
 220 |             else:
 221 |                 if params["min_value"] is not None and params["max_value"] is not None:
 222 |                     template_str = f"maximum value must always be {at_least_str} $min_value and {at_most_str} $max_value."
 223 | 
 224 |                 elif params["min_value"] is None:
 225 |                     template_str = f"maximum value must always be {at_most_str} $max_value."
 226 | 
 227 |                 elif params["max_value"] is None:
 228 |                     template_str = f"maximum value must always be {at_least_str} $min_value."
 229 | 
 230 |         if include_column_name:
 231 |             template_str = "$column " + template_str
 232 | 
 233 |         if params["row_condition"] is not None:
 234 |             (
 235 |                 conditional_template_str,
 236 |                 conditional_params,
 237 |             ) = parse_row_condition_string_pandas_engine(params["row_condition"])
 238 |             template_str = conditional_template_str + ", then " + template_str
 239 |             params.update(conditional_params)
 240 | 
 241 |         # return simple string
 242 |         return [
 243 |             RenderedStringTemplateContent(
 244 |                 **{
 245 |                     "content_block_type": "string_template",
 246 |                     "string_template": {
 247 |                         "template": template_str,
 248 |                         "params": params,
 249 |                         "styling": styling,
 250 |                     },
 251 |                 }
 252 |             )
 253 |         ]
 254 | 
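 255 | # --- Usage sketch (editor's addition, not part of the original tutorial) ---
 256 | # Defining ExpectColumnMaxToBeBetweenCustom above registers it with Great
 257 | # Expectations, so once this module is imported the expectation can be invoked
 258 | # on a Validator by its snake_case name, e.g. (assuming a validator for the
 259 | # taxi demo data, with hypothetical thresholds):
 260 | #
 261 | #     validator.expect_column_max_to_be_between_custom(
 262 | #         column="fare_amount", min_value=0, max_value=1000
 263 | #     )
 264 | #
 265 | # run_checkpoint_with_custom_expectation.py relies on the same
 266 | # registration-by-import behavior.
--------------------------------------------------------------------------------
/getting_started_tutorial_final_v3_api/great_expectations/plugins/custom_data_docs/styles/data_docs_custom_styles.css:
--------------------------------------------------------------------------------
 1 | /*index page*/
 2 | .ge-index-page-site-name-title {}
 3 | .ge-index-page-table-container {}
 4 | .ge-index-page-table {}
 5 | .ge-index-page-table-profiling-links-header {}
 6 | .ge-index-page-table-expectations-links-header {}
 7 | .ge-index-page-table-validations-links-header {}
 8 | .ge-index-page-table-profiling-links-list {}
 9 | .ge-index-page-table-profiling-links-item {}
 10 | .ge-index-page-table-expectation-suite-link {}
 11 | .ge-index-page-table-validation-links-list {}
 12 | .ge-index-page-table-validation-links-item {}
 13 | 
 14 | /*breadcrumbs*/
 15 | .ge-breadcrumbs {}
 16 | .ge-breadcrumbs-item {}
 17 | 
 18 | /*navigation sidebar*/
 19 | .ge-navigation-sidebar-container {}
 20 | .ge-navigation-sidebar-content {}
 21 | .ge-navigation-sidebar-title {}
 22 | .ge-navigation-sidebar-link {}
 23 | 
--------------------------------------------------------------------------------
/getting_started_tutorial_final_v3_api/run_checkpoint_with_custom_expectation.py: 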
--------------------------------------------------------------------------------
 1 | import great_expectations as ge
 2 | 
 3 | # Add the great_expectations directory to sys.path so the plugins package is importable
 4 | import sys, os
 5 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'great_expectations'))
 6 | 
 7 | from plugins import column_custom_max_expectation  # noqa: F401 -- importing the module registers the custom expectation
 8 | 
 9 | context = ge.get_context()
 10 | context.run_checkpoint(checkpoint_name="my_checkpoint_with_custom_expectation")
 11 | context.open_data_docs()
 12 | 
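 13 | # --- Optional sketch (editor's addition, not part of the original tutorial) ---
 14 | # To gate a pipeline on the validation outcome, capture the CheckpointResult
 15 | # returned by run_checkpoint; it exposes an overall `success` flag:
 16 | #
 17 | #     result = context.run_checkpoint(checkpoint_name="my_checkpoint_with_custom_expectation")
 18 | #     if not result.success:
 19 | #         raise SystemExit("Checkpoint validation failed")
--------------------------------------------------------------------------------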