├── README.md └── main.py /README.md: -------------------------------------------------------------------------------- 1 | # GreatExpectationsWithDatabricks 2 | Getting Great Expectations setup to run on DataBricks with Spark Dataframes. 3 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from pyspark.sql import SparkSession, DataFrame 3 | import os 4 | import json 5 | 6 | from great_expectations.core.batch import RuntimeBatchRequest 7 | from great_expectations.profile.json_schema_profiler import JsonSchemaProfiler 8 | from great_expectations.data_context import BaseDataContext 9 | from great_expectations.data_context.types.base import ( 10 | DataContextConfig, 11 | FilesystemStoreBackendDefaults, 12 | ) 13 | 14 | 15 | trips_expect = { 16 | "properties": {}, 17 | "type": 'object', 18 | "data_asset_type": '', 19 | "expectation_suite_name": "bikes", 20 | "expectations": [ 21 | { 22 | "expectation_type": "expect_table_columns_to_match_ordered_list", 23 | "kwargs": { 24 | "column_list": [ 25 | "ride_id", "rideable_type", "started_at", "ended_at", "start_station_name", "start_station_id", 26 | "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat", "end_lng", "member_casual" 27 | ] 28 | }, 29 | "meta": {} 30 | }, 31 | { 32 | "expectation_type": "expect_table_row_count_to_be_between", 33 | "kwargs": { 34 | "max_value": 1000000, 35 | "min_value": 1000 36 | }, 37 | "meta": {} 38 | } 39 | ], 40 | "ge_cloud_id": '', 41 | "meta": { 42 | "citations": [ 43 | { 44 | "batch_request": { 45 | "data_asset_name": "trip_data_batch", 46 | "data_connector_name": "DataFrame_Trips_Data_Connector", 47 | "datasource_name": "DataFrame_Trips_Source", 48 | "limit": 1000 49 | }, 50 | "citation_date": "2022-06-02", 51 | "comment": "Created suite " 52 | } 53 | ], 54 | "great_expectations_version": "0.14.10" 55 | } 56 | } 57 | 58 | 59 | 60 | def prepare_ge_context(root_dir: str) -> BaseDataContext: 61 | data_context_config = DataContextConfig( 62 | store_backend_defaults=FilesystemStoreBackendDefaults( 63 | root_directory=root_dir 64 | ), 65 | ) 66 | ge_context = BaseDataContext(project_config=data_context_config) 67 | return ge_context 68 | 69 | 70 | def prepare_get_datasource(dname: str = 'DataFrame_Trips_Source') -> dict: 71 | ge_dataframe_datasource = { 72 | "name": dname, 73 | "class_name": "Datasource", 74 | "execution_engine": {"class_name": "SparkDFExecutionEngine"}, 75 | "data_connectors": { 76 | "DataFrame_Trips_Data_Connector": { 77 | "module_name": "great_expectations.datasource.data_connector", 78 | "class_name": "RuntimeDataConnector", 79 | "batch_identifiers": [ 80 | "trips_source", 81 | "divvy_bike_trips", 82 | ], 83 | } 84 | }, 85 | } 86 | return ge_dataframe_datasource 87 | 88 | 89 | def prepare_checkpoint() -> dict: 90 | ge_trip_data_checkpoint = "trip_check" 91 | checkpoint_config = { 92 | "name": ge_trip_data_checkpoint, 93 | "config_version": 1.0, 94 | "class_name": "SimpleCheckpoint", 95 | "run_name_template": "%Y%m%d-%H%M%S-trip-run", 96 | } 97 | return checkpoint_config 98 | 99 | 100 | def prepare_runtime_batch(df: DataFrame): 101 | batch_request = RuntimeBatchRequest( 102 | datasource_name="DataFrame_Trips_Source", 103 | data_connector_name="DataFrame_Trips_Data_Connector", 104 | data_asset_name="trip_data_batch", # This can be anything that identifies this data_asset for you 105 | batch_identifiers={ 106 | "trips_source": "trips_source", 107 | "divvy_bike_trips": "divvy_bike_trips", 108 | }, 109 | runtime_parameters={"batch_data": df}, # Your dataframe goes here 110 | ) 111 | return batch_request 112 | 113 | 114 | def run_checkpoint(context, batch_request): 115 | checkpoint_result = context.run_checkpoint( 116 | checkpoint_name="trip_check", 117 | validations=[ 118 | { 119 | "batch_request": batch_request, 120 | "expectation_suite_name": "bikes", 121 | } 122 | ], 123 | ) 124 | return checkpoint_result 125 | 126 | df = spark.read.format("csv")\ 127 | .option("header", "true")\ 128 | .option("inferSchema", "true")\ 129 | .load("s3a://confessions-of-a-data-guy/*divvy-tripdata.csv") 130 | 131 | root_directory = "/dbfs/great_expectations/" 132 | # Prepare Great Expectations / storage on Databricks DBFS 133 | ge_context = prepare_ge_context(root_directory) 134 | 135 | # Prepare DataFrame as Data Source Connector for GE. 136 | ge_context.add_datasource(**prepare_get_datasource()) 137 | 138 | # Prepare Checkpoint 139 | trips_check = prepare_checkpoint() 140 | ge_context.add_checkpoint(**trips_check) 141 | 142 | # create and save expectation suite 143 | profiler = JsonSchemaProfiler() 144 | suite = profiler.profile(trips_expect, "bikes") 145 | ge_context.save_expectation_suite(suite) 146 | 147 | # Prepare Batch and Validate 148 | trips_batch_request = prepare_runtime_batch(df) 149 | validation_results = run_checkpoint(ge_context, trips_batch_request) 150 | print(validation_results) 151 | --------------------------------------------------------------------------------