├── README.md
└── main.py


/README.md:
--------------------------------------------------------------------------------
1 | # GreatExpectationsWithDatabricks
2 | Getting Great Expectations setup to run on DataBricks with Spark Dataframes.
3 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | from pyspark.sql import SparkSession, DataFrame
  3 | import os
  4 | import json
  5 | 
  6 | from great_expectations.core.batch import RuntimeBatchRequest
  7 | from great_expectations.profile.json_schema_profiler import JsonSchemaProfiler
  8 | from great_expectations.data_context import BaseDataContext
  9 | from great_expectations.data_context.types.base import (
 10 |     DataContextConfig,
 11 |     FilesystemStoreBackendDefaults,
 12 | )
 13 | 
 14 | 
 15 | trips_expect = {
 16 |   "properties": {},
 17 |   "type": 'object',
 18 |   "data_asset_type": '',
 19 |   "expectation_suite_name": "bikes",
 20 |   "expectations": [
 21 |     {
 22 |       "expectation_type": "expect_table_columns_to_match_ordered_list",
 23 |       "kwargs": {
 24 |         "column_list": [
 25 |           "ride_id", "rideable_type", "started_at", "ended_at", "start_station_name", "start_station_id",
 26 |             "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat", "end_lng", "member_casual"
 27 |         ]
 28 |       },
 29 |       "meta": {}
 30 |     },
 31 |     {
 32 |       "expectation_type": "expect_table_row_count_to_be_between",
 33 |       "kwargs": {
 34 |         "max_value": 1000000,
 35 |         "min_value": 1000
 36 |       },
 37 |       "meta": {}
 38 |     }
 39 |   ],
 40 |   "ge_cloud_id": '',
 41 |   "meta": {
 42 |     "citations": [
 43 |       {
 44 |         "batch_request": {
 45 |           "data_asset_name": "trip_data_batch",
 46 |           "data_connector_name": "DataFrame_Trips_Data_Connector",
 47 |           "datasource_name": "DataFrame_Trips_Source",
 48 |           "limit": 1000
 49 |         },
 50 |         "citation_date": "2022-06-02",
 51 |         "comment": "Created suite "
 52 |       }
 53 |     ],
 54 |     "great_expectations_version": "0.14.10"
 55 |   }
 56 | }
 57 | 
 58 | 
 59 | 
 60 | def prepare_ge_context(root_dir: str) -> BaseDataContext:
 61 |     data_context_config = DataContextConfig(
 62 |         store_backend_defaults=FilesystemStoreBackendDefaults(
 63 |             root_directory=root_dir
 64 |         ),
 65 |     )
 66 |     ge_context = BaseDataContext(project_config=data_context_config)
 67 |     return ge_context
 68 | 
 69 | 
 70 | def prepare_get_datasource(dname: str = 'DataFrame_Trips_Source') -> dict:
 71 |     ge_dataframe_datasource = {
 72 |         "name": dname,
 73 |         "class_name": "Datasource",
 74 |         "execution_engine": {"class_name": "SparkDFExecutionEngine"},
 75 |         "data_connectors": {
 76 |             "DataFrame_Trips_Data_Connector": {
 77 |                 "module_name": "great_expectations.datasource.data_connector",
 78 |                 "class_name": "RuntimeDataConnector",
 79 |                 "batch_identifiers": [
 80 |                     "trips_source",
 81 |                     "divvy_bike_trips",
 82 |                 ],
 83 |             }
 84 |         },
 85 |     }
 86 |     return ge_dataframe_datasource
 87 | 
 88 | 
 89 | def prepare_checkpoint() -> dict:
 90 |     ge_trip_data_checkpoint = "trip_check"
 91 |     checkpoint_config = {
 92 |         "name": ge_trip_data_checkpoint,
 93 |         "config_version": 1.0,
 94 |         "class_name": "SimpleCheckpoint",
 95 |         "run_name_template": "%Y%m%d-%H%M%S-trip-run",
 96 |     }
 97 |     return checkpoint_config
 98 | 
 99 | 
100 | def prepare_runtime_batch(df: DataFrame):
101 |     batch_request = RuntimeBatchRequest(
102 |         datasource_name="DataFrame_Trips_Source",
103 |         data_connector_name="DataFrame_Trips_Data_Connector",
104 |         data_asset_name="trip_data_batch",  # This can be anything that identifies this data_asset for you
105 |         batch_identifiers={
106 |             "trips_source": "trips_source",
107 |             "divvy_bike_trips": "divvy_bike_trips",
108 |         },
109 |         runtime_parameters={"batch_data": df},  # Your dataframe goes here
110 |     )
111 |     return batch_request
112 | 
113 | 
114 | def run_checkpoint(context, batch_request):
115 |     checkpoint_result = context.run_checkpoint(
116 |         checkpoint_name="trip_check",
117 |         validations=[
118 |             {
119 |                 "batch_request": batch_request,
120 |                 "expectation_suite_name": "bikes",
121 |             }
122 |         ],
123 |     )
124 |     return checkpoint_result
125 | 
126 | df = spark.read.format("csv")\
127 |     .option("header", "true")\
128 |     .option("inferSchema", "true")\
129 |     .load("s3a://confessions-of-a-data-guy/*divvy-tripdata.csv")
130 | 
131 | root_directory = "/dbfs/great_expectations/"
132 | # Prepare Great Expectations / storage on Databricks DBFS
133 | ge_context = prepare_ge_context(root_directory)
134 | 
135 | # Prepare DataFrame as Data Source Connector for GE.
136 | ge_context.add_datasource(**prepare_get_datasource())
137 | 
138 | # Prepare Checkpoint
139 | trips_check = prepare_checkpoint()
140 | ge_context.add_checkpoint(**trips_check)
141 | 
142 | # create and save expectation suite
143 | profiler = JsonSchemaProfiler()
144 | suite = profiler.profile(trips_expect, "bikes")
145 | ge_context.save_expectation_suite(suite)
146 | 
147 | # Prepare Batch and Validate
148 | trips_batch_request = prepare_runtime_batch(df)
149 | validation_results = run_checkpoint(ge_context, trips_batch_request)
150 | print(validation_results)
151 | 


--------------------------------------------------------------------------------