├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── dlt_debug.iml
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── LICENSE
├── README.md
├── demo_notebook
│   ├── dlt_debug_hybrid_demo.dbc
│   └── dlt_debug_hybrid_demo.py
├── dlt_with_debug
│   ├── __init__.py
│   ├── dlt_signatures.py
│   ├── helpers.py
│   └── v2.py
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/build/
/dist/
/dlt_with_debug.egg-info/
/venv/
.idea
__pycache__

--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml

--------------------------------------------------------------------------------
/.idea/dlt_debug.iml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)

--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)

--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)

--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)

--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)

--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this dump)

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Souvik Pratiher

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# DLT with Debug

Running DLT workflows from interactive notebooks.

# Table of Contents
1. [About the project](#about-the-project)
2. [Demo Notebook](#sample-demo-notebook)
3. [Installation](#installation)
4. [Usage](#usage)
5. [Sample Pipeline Example](#sample-dlt-with-debug-dlt-pipeline-example)
6. [Quick API guide](#quick-api-guide)
7. [Functionalities](#functionality)
8. [Limitation](#limitation)

## About The Project

Delta Live Tables (DLT) is a great way to design data pipelines while focusing only on the core business logic.
It makes the life of data engineers easier, but while development workflows are streamlined in DLT,
__*debugging and seeing how the data looks after each transformation step*__ in a typical DLT pipeline is very
tedious, because the DLT package is not available in our interactive environment.

Enter **dlt-with-debug**, a lightweight decorator utility which allows developers to do interactive
pipeline development with a single source code base for both DLT runs and non-DLT interactive notebook runs.
### Built With

- Python's builtins
- [globals()](https://docs.python.org/3/library/functions.html#globals)
- [exec()](https://docs.python.org/3/library/functions.html#exec)
- [decorators](https://docs.python.org/3/glossary.html#term-decorator)
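To make the mechanism concrete, here is a minimal sketch of how a decorator can combine `inspect.getsource()`, `globals()` and `exec()` to keep the undecorated function callable in the notebook namespace. The name `interactive_passthrough` is made up for illustration; this is not the library's actual implementation (that lives in `dlt_with_debug/v2.py`):

```python
from functools import wraps
from inspect import getsource


def interactive_passthrough(g_ns):
    # Hypothetical decorator factory, illustrative only.
    def true_decorator(f):
        src = getsource(f)
        # Drop decorator lines so exec() defines the bare, undecorated function.
        bare_src = "\n".join(
            line for line in src.splitlines()
            if not line.lstrip().startswith("@")
        )
        # Re-define the bare function in the caller's namespace (globals()),
        # so it stays directly callable in the notebook.
        exec(bare_src, g_ns)

        @wraps(f)
        def wrapped(*args, **kwargs):
            return f(*args, **kwargs)
        return wrapped
    return true_decorator
```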
### Sample Demo Notebook

[Click here](https://github.com/souvik-databricks/dlt-with-debug/tree/main/demo_notebook) for a sample notebook which you can import into your workspace to see the utility in action.


### Installation

pip install inside your Databricks notebook:

_**PyPI**_
```python
%pip install dlt-with-debug
```
### Prerequisites

- [Databricks](https://databricks.com/)
- [Delta Live Tables](https://databricks.com/product/delta-live-tables)
### Usage
- *In notebooks containing DLT jobs, the imports change slightly, as shown below, and the extra decorator
`@dltwithdebug(globals())` is added to each function.*

```python
# Imports
from dlt_with_debug import dltwithdebug, pipeline_id, showoutput

if pipeline_id:
    import dlt
else:
    from dlt_with_debug import dlt


# Now define your dlt code with one extra decorator "@dltwithdebug(globals())" added to it

@dlt.create_table(comment = "dlt pipeline example")
@dltwithdebug(globals())
def click_raw_bz():
    return (
        spark.read.option("header", "true").csv("dbfs:/FileStore/souvikpratiher/click.csv")
    )

# See the output
showoutput(click_raw_bz)

# Get the output data as a dataframe
df = click_raw_bz()
```
> **Note**:
> 1. Use the `dlt.create_table()` API instead of `dlt.table()`, as `dlt.table()` sometimes collides with `spark.table()`
in the global namespace.
> 2. Always pass the `globals()` namespace to the `dltwithdebug` decorator, like this: `@dltwithdebug(globals())`
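Because the wrapped function returns a plain PySpark DataFrame when run interactively, the usual DataFrame operations work on the result. A short illustrative follow-up on the function defined above (these checks are hypothetical, not part of the library):

```python
# Interactive sanity checks on the function defined above
df = click_raw_bz()

df.printSchema()   # inspect the inferred columns
print(df.count())  # number of rows in the raw load

# e.g. guard against an empty load before building the next step
assert df.count() > 0, "click_raw_bz returned no rows"
```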
---

### Sample `DLT with debug` DLT pipeline example

> **Code**:

Cmd 1
```python
%pip install -e git+https://github.com/souvik-databricks/dlt-with-debug.git#egg=dlt_with_debug
```
Cmd 2
```python
from pyspark.sql.functions import *
from pyspark.sql.types import *

# We are importing:
# dltwithdebug, as that's the entry point to interactive DLT workflows
# pipeline_id, to ensure we import the dlt package appropriate to the environment
# showoutput, a helper function for seeing the output result along with expectation metrics, if any are specified
from dlt_with_debug import dltwithdebug, pipeline_id, showoutput

if pipeline_id:
    import dlt
else:
    from dlt_with_debug import dlt
```
Cmd 3
```python
json_path = "/databricks-datasets/wikipedia-datasets/data-001/clickstream/raw-uncompressed-json/2015_2_clickstream.json"
```
Cmd 4
```python
# Notice we are using dlt.create_table instead of dlt.table

@dlt.create_table(
    comment="The raw wikipedia click stream dataset, ingested from /databricks-datasets.",
    table_properties={
        "quality": "bronze"
    }
)
@dltwithdebug(globals())
def clickstream_raw():
    return (
        spark.read.option("inferSchema", "true").json(json_path)
    )
```
Cmd 5
```python
# To display the result of the transformation,
# use showoutput(func_name);
# for example, here we are using showoutput(clickstream_raw)
showoutput(clickstream_raw)
```
![Alt Text](https://raw.githubusercontent.com/souvik-databricks/random_snaps/main/clck_raw.png)

Cmd 6
```python
@dlt.create_table(
    comment="Wikipedia clickstream dataset with cleaned-up datatypes / column names and quality expectations.",
    table_properties={
        "quality": "silver"
    }
)
@dlt.expect("valid_current_page", "current_page_id IS NOT NULL AND current_page_title IS NOT NULL")
@dlt.expect_or_fail("valid_count", "click_count > 0")
@dlt.expect_all({'valid_prev_page_id': "previous_page_id IS NOT NULL"})
@dltwithdebug(globals())
def clickstream_clean():
    return (
        dlt.read("clickstream_raw")
        .withColumn("current_page_id", expr("CAST(curr_id AS INT)"))
        .withColumn("click_count", expr("CAST(n AS INT)"))
        .withColumn("previous_page_id", expr("CAST(prev_id AS INT)"))
        .withColumnRenamed("curr_title", "current_page_title")
        .withColumnRenamed("prev_title", "previous_page_title")
        .select("current_page_id", "current_page_title", "click_count", "previous_page_id", "previous_page_title")
    )
```
Cmd 7
```python
showoutput(clickstream_clean)
```
![Alt Text](https://raw.githubusercontent.com/souvik-databricks/random_snaps/main/clck_clean.png)


---
> _Note that here you can also **see how many records the expectations will affect**._
---
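For reference, the expectation metrics printed by `showoutput` take the following shape (the record counts below are illustrative placeholders, not actual output from this dataset):

```
'expect' Expectation `valid_current_page` will affect 12 records which is 0.07% of total 17000 records
'expect_or_fail' Expectation `valid_count` will affect 0 records which is 0.0% of total 17000 records
'expect_all' Expectations `valid_prev_page_id` will affect 3500 records which is 20.59% of total 17000 records
```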
## The same sample `DLT with debug` pipeline executed as part of a Delta Live Tables pipeline
![Alt Text](https://i.ibb.co/VQzZsZR/Screenshot-2022-10-18-at-5-34-14-AM.png)

> Below we can see that the expectation results match up with the expectation metrics we got earlier from dltwithdebug
> with `showoutput(clickstream_clean)`.
> ![Expectation Results](https://raw.githubusercontent.com/souvik-databricks/random_snaps/main/expectations.png)
## Quick API guide

#### Table syntax

```python
@dlt.create_table(   # <-- Notice we are using dlt.create_table() instead of dlt.table()
    name="<name>",
    comment="<comment>",
    spark_conf={"<key>": "<value>"},
    table_properties={"<key>": "<value>", "<key>": "<value>"},
    path="<storage-location-path>",
    partition_cols=["<partition-column>", "<partition-column>"],
    schema="schema-definition",
    temporary=False)
@dlt.expect
@dlt.expect_or_fail
@dlt.expect_or_drop
@dlt.expect_all
@dlt.expect_all_or_drop
@dlt.expect_all_or_fail
@dltwithdebug(globals())   # <-- This dltwithdebug(globals()) needs to be added
def <function-name>():
    return (<query>)
```

#### View syntax

```python
@dlt.create_view(   # <-- Notice we are using dlt.create_view() instead of dlt.view()
    name="<name>",
    comment="<comment>")
@dlt.expect
@dlt.expect_or_fail
@dlt.expect_or_drop
@dlt.expect_all
@dlt.expect_all_or_drop
@dlt.expect_all_or_fail
@dltwithdebug(globals())   # <-- This dltwithdebug(globals()) needs to be added
def <function-name>():
    return (<query>)
```

#### Getting results syntax

```python
showoutput(function_name)   # <-- showoutput(function_name)
# Notice we are only passing the function name:
# the name of the function which is wrapped by the dlt decorators

# For example:
# @dlt.create_table()
# @dltwithdebug(globals())
# def step_one():
#     return spark.read.csv()

# showoutput(step_one)
```

#### Import syntax

```python
# We are importing:
# dltwithdebug, as that's the entry point to interactive DLT workflows
# pipeline_id, to ensure we import the dlt package appropriate to the environment
# showoutput, a helper function for seeing the output result along with expectation metrics, if any are specified
from dlt_with_debug import dltwithdebug, pipeline_id, showoutput

if pipeline_id:
    import dlt
else:
    from dlt_with_debug import dlt
```
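Putting the guide together, here is a small hypothetical end-to-end example; the table/view names and the source path are made up for illustration:

```python
@dlt.create_view(
    name="orders_raw_vw",
    comment="Hypothetical raw orders view")
@dltwithdebug(globals())
def orders_raw_vw():
    return spark.read.option("header", "true").csv("dbfs:/tmp/orders.csv")


@dlt.create_table(
    name="orders_clean",
    comment="Hypothetical cleaned orders table")
@dlt.expect("valid_order_id", "order_id IS NOT NULL")
@dltwithdebug(globals())
def orders_clean():
    # dlt.read resolves the upstream step by its function name
    return dlt.read("orders_raw_vw").dropDuplicates(["order_id"])


showoutput(orders_clean)
```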
## Functionality

As of now, the following DLT APIs are covered for interactive use:

1. **Currently available:**

   - `dlt.read`
   - `dlt.read_stream`
   - `dlt.create_table`
   - `dlt.create_view`
   - `dlt.table` <-- this one sometimes gets overridden by `spark.table`, so use `dlt.create_table` instead
   - `dlt.view`
   - `dlt.expect`
   - `dlt.expect_or_fail`
   - `dlt.expect_or_drop`
   - `dlt.expect_all`
   - `dlt.expect_all_or_drop`
   - `dlt.expect_all_or_fail`

2. **Will be covered in an upcoming release:**
   - `dlt.create_target_table`
   - `dlt.create_streaming_live_table`
   - `dlt.apply_changes`

## Limitation

`DLT with Debug` is a fully Python-based utility, and as such it doesn't support the `spark.table("LIVE.func_name")` syntax.

So instead of `spark.table("LIVE.func_name")`, use `dlt.read("func_name")` or `dlt.read_stream("func_name")`.

## License

Distributed under the MIT License.
**Drop a ⭐️ if you liked the project and it helped you to have a smoother experience while working with DLTs**

--------------------------------------------------------------------------------
/demo_notebook/dlt_debug_hybrid_demo.dbc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/souvik-databricks/dlt-with-debug/db7f9b3666d000d96908eb5cac60dddf86ac8c18/demo_notebook/dlt_debug_hybrid_demo.dbc

--------------------------------------------------------------------------------
/demo_notebook/dlt_debug_hybrid_demo.py:
--------------------------------------------------------------------------------
# Databricks notebook source
# MAGIC %pip install dlt-with-debug

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.types import *

from dlt_with_debug import dltwithdebug, pipeline_id, showoutput

if pipeline_id:
    import dlt
else:
    from dlt_with_debug import dlt

# COMMAND ----------

json_path = "/databricks-datasets/wikipedia-datasets/data-001/clickstream/raw-uncompressed-json/2015_2_clickstream.json"

# COMMAND ----------

@dlt.create_table(
    comment="The raw wikipedia click stream dataset, ingested from /databricks-datasets.",
    table_properties={
        "quality": "bronze"
    }
)
@dltwithdebug(globals())
def clickstream_raw():
    return (
        spark.read.option("inferSchema", "true").json(json_path)
    )

# COMMAND ----------

showoutput(clickstream_raw)

# COMMAND ----------

@dlt.create_table(
    comment="Wikipedia clickstream dataset with cleaned-up datatypes / column names and quality expectations.",
    table_properties={
        "quality": "silver"
    }
)
@dlt.expect("valid_current_page", "current_page_id IS NOT NULL AND current_page_title IS NOT NULL")
@dlt.expect_or_fail("valid_count", "click_count > 0")
@dlt.expect_all({'valid_prev_page_id': "previous_page_id IS NOT NULL"})
@dltwithdebug(globals())
def clickstream_clean():
    return (
        dlt.read("clickstream_raw")
        .withColumn("current_page_id", expr("CAST(curr_id AS INT)"))
        .withColumn("click_count", expr("CAST(n AS INT)"))
        .withColumn("previous_page_id", expr("CAST(prev_id AS INT)"))
        .withColumnRenamed("curr_title", "current_page_title")
        .withColumnRenamed("prev_title", "previous_page_title")
        .select("current_page_id", "current_page_title", "click_count", "previous_page_id", "previous_page_title")
    )

# COMMAND ----------

showoutput(clickstream_clean)

--------------------------------------------------------------------------------
/dlt_with_debug/__init__.py:
--------------------------------------------------------------------------------
from dlt_with_debug.v2 import dltwithdebug, pipeline_id, showoutput
import dlt_with_debug.dlt_signatures as dlt

--------------------------------------------------------------------------------
/dlt_with_debug/dlt_signatures.py:
--------------------------------------------------------------------------------
"""
This file contains the empty placeholder signatures of the dlt APIs
"""
from functools import wraps
from dlt_with_debug.helpers import undecorated
import builtins as orig

g_ns_for_placeholders = globals()
addglobals = lambda x: g_ns_for_placeholders.update(x)

def read(arg):
    return g_ns_for_placeholders[arg]()


def read_stream(arg):
    return g_ns_for_placeholders[arg]()


def table(name=None,
          comment=None,
          spark_conf=None,
          table_properties=None,
          path=None,
          partition_cols=None,
          schema=None,
          temporary=None):
    def true_decorator(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            return f(*args, **kwargs)

        return wrapped

    return true_decorator

create_table = table


def view(name=None,
         comment=None):
    def true_decorator(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            return f(*args, **kwargs)

        return wrapped

    return true_decorator

create_view = view

def get_name_inv_statement(f, name, inv):
    func = undecorated(f)
    count = func().filter(inv).count()
    total = func().count()
    stmt = f"Expectation `{name}` will affect {total-count} records which is {orig.round(((total-count)/total)*100,2)}% of total {total} records"
    return stmt


def expect(name=None,
           inv=None):
    def true_decorator(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            if name:
                stmt = "'expect' " + get_name_inv_statement(f, name, inv)
                print(stmt)
            return f(*args, **kwargs)

        return wrapped

    return true_decorator


def expect_or_drop(name=None,
                   inv=None):
    def true_decorator(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            if name:
                stmt = "'expect_or_drop' " + get_name_inv_statement(f, name, inv)
                print(stmt)
            return f(*args, **kwargs)

        return wrapped

    return true_decorator


def expect_or_fail(name=None,
                   inv=None):
    def true_decorator(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            if name:
                stmt = "'expect_or_fail' " + get_name_inv_statement(f, name, inv)
                print(stmt)
            return f(*args, **kwargs)

        return wrapped

    return true_decorator


def get_expectations_statement(f, expectations):
    func = undecorated(f)
    expec_lst = list(expectations.values())
    expec_lst = ["(" + str(i) + ")" for i in expec_lst]
    expec_cond = " AND ".join(expec_lst)
    count = func().filter(expec_cond).count()
    total = func().count()
    expec_txt = " AND ".join(list(expectations.keys()))
    stmt = f"Expectations `{expec_txt}` will affect {total-count} records which is {orig.round(((total-count) / total) * 100, 2)}% of total {total} records"
    return stmt


def expect_all(expectations=None):
    def true_decorator(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            if expectations:
                stmt = "'expect_all' " + get_expectations_statement(f, expectations)
                print(stmt)
            return f(*args, **kwargs)

        return wrapped

    return true_decorator


def expect_all_or_drop(expectations=None):
    def true_decorator(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            if expectations:
                stmt = "'expect_all_or_drop' " + get_expectations_statement(f, expectations)
                print(stmt)
            return f(*args, **kwargs)

        return wrapped

    return true_decorator


def expect_all_or_fail(expectations=None):
    def true_decorator(f):
        @wraps(f)
        def wrapped(*args, **kwargs):
            if expectations:
                stmt = "'expect_all_or_fail' " + get_expectations_statement(f, expectations)
"+get_expectations_statement(f,expectations) 151 | print(stmt) 152 | return f(*args, **kwargs) 153 | 154 | return wrapped 155 | 156 | return true_decorator 157 | -------------------------------------------------------------------------------- /dlt_with_debug/helpers.py: -------------------------------------------------------------------------------- 1 | from inspect import isfunction, ismethod, isclass 2 | 3 | 4 | def remove_dltwithdebug_decorator(code): 5 | parsed_code = code.split("\n") 6 | parsed_code = [i if (i.startswith('@dlt.table(') == False) else i.replace('@dlt.table(','@dlt.create_table(') for i in parsed_code ] 7 | parsed_code = [i for i in parsed_code if i.startswith('@dltwithdebug') == False] 8 | parsed_code = '\n'.join(parsed_code) 9 | return parsed_code 10 | 11 | 12 | def check_if_decorator(a): 13 | return isfunction(a) or ismethod(a) or isclass(a) 14 | 15 | 16 | def undecorated(o): 17 | """Unpack all decorators from a function, method or class""" 18 | if type(o) is type: 19 | return o 20 | try: 21 | closure = o.__closure__ 22 | except AttributeError: 23 | return 24 | if closure: 25 | for cell in closure: 26 | if cell.cell_contents is o: 27 | continue 28 | 29 | if check_if_decorator(cell.cell_contents): 30 | undecd = undecorated(cell.cell_contents) 31 | if undecd: 32 | return undecd 33 | else: 34 | return o 35 | else: 36 | return o 37 | -------------------------------------------------------------------------------- /dlt_with_debug/v2.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | from inspect import getsource 3 | from dlt_with_debug.helpers import undecorated, remove_dltwithdebug_decorator 4 | from dlt_with_debug.dlt_signatures import addglobals 5 | from pyspark.sql import SparkSession 6 | 7 | spark = SparkSession.getActiveSession() 8 | pipeline_id = spark.conf.get("pipelines.id", None) 9 | 10 | 11 | def dltwithdebug(g_ns): 12 | def true_decorator(f): 13 | @wraps(f) 14 | def wrapped(*args, **kwargs): 15 | if pipeline_id: 16 | return f(*args, **kwargs) 17 | else: 18 | f_undec = undecorated(f) 19 | code = getsource(f_undec) 20 | parsed_code = remove_dltwithdebug_decorator(code) 21 | addglobals(g_ns) 22 | exec(parsed_code, g_ns) 23 | return f(*args, **kwargs) 24 | return wrapped 25 | return true_decorator 26 | 27 | def showoutput(f): 28 | if not pipeline_id: 29 | df = f() 30 | df.display() 31 | else: 32 | None 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import codecs 3 | import os 4 | 5 | here = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | with codecs.open(os.path.join(here, "README.md"), encoding="utf-8") as fh: 8 | long_description = "\n" + fh.read() 9 | 10 | VERSION = '2.2' 11 | DESCRIPTION = 'Utility for running workflows leveraging delta live tables from interactive notebooks' 12 | 13 | # Setting up 14 | setup( 15 | name="dlt_with_debug", 16 | version=VERSION, 17 | author="Souvik Pratiher", 18 | url='https://github.com/souvik-databricks/dlt-with-debug', 19 | author_email="souvik.pratiher@databricks.com", 20 | description=DESCRIPTION, 21 | long_description_content_type="text/markdown", 22 | long_description=long_description, 23 | packages=find_packages(), 24 | keywords=['python3', 'delta live tables'], 25 | classifiers=[ 26 | "Development Status :: 5 - Production/Stable", 27 | "Intended Audience :: Developers", 
28 | "Programming Language :: Python :: 3" 29 | ] 30 | ) --------------------------------------------------------------------------------