├── .DS_Store ├── Data Apps ├── .DS_Store └── Data Management Intro Dash App │ ├── .DS_Store │ └── plotly_iot_demo │ ├── .DS_Store │ ├── etl_pipelines │ └── iot_dashboard_etl.sql │ ├── option_1_manual_ddl │ ├── .DS_Store │ ├── __pycache__ │ │ └── ddls.cpython-38.pyc │ ├── app.py │ ├── config.json │ └── ddls.py │ └── requirements.txt ├── Delta Optimizer ├── .DS_Store ├── DashAppFrontEnd │ └── .DS_Store ├── Instructions.md ├── Step 1_ Optimization Strategy Builder.py ├── Step 2_ Strategy Runner.py ├── Step 3_ Query History and Profile Analyzer.py ├── deltaoptimizer-1.4.1-py3-none-any.whl ├── deltaoptimizer-1.5.0-py3-none-any.whl └── deltaoptimizer-1.5.2-py3-none-any.whl ├── Design Patterns Notebooks ├── Advanced Notebooks │ ├── End to End Procedural Migration Pattern │ │ └── Procedural Migration Pattern with SCD2 Example.py │ ├── Multi-plexing with Autoloader │ │ └── Option 1: Actually Multi-plexing tables on write │ │ │ ├── Child Job Template.py │ │ │ └── Controller Job.py │ ├── Parallel Custom Named File Exports │ │ ├── Parallel File Exports - Python Version.py │ │ └── Parallel File Exports.py │ └── SCD Design Patterns │ │ └── Advanced CDC With SCD in Databricks.py ├── Step 1 - SQL EDW Pipeline.sql ├── Step 10 - Lakehouse Federation.py ├── Step 11 - SQL Orchestration in Production.py ├── Step 12 - SCD2 - SQL EDW Pipeline.sql ├── Step 13 - Migrating Identity Columns.sql ├── Step 14 - Using the Query Profile.sql ├── Step 2 - Optimize your Delta Tables.py ├── Step 3 - DLT Version Simple SQL EDW Pipeline.sql ├── Step 4 - Create Gold Layer Analytics Tables.sql ├── Step 5 - Unified Batch and Streaming.py ├── Step 6 - Streaming Table Design Patterns.sql ├── Step 7 - COPY INTO Loading Patterns.py ├── Step 8 - Liquid Clustering Delta Tables.py └── Step 9 - Using SQL Functions.py ├── LICENSE ├── README.md ├── Realtime Data Apps Workshop ├── Step 0 - Real Time Data Generator Simulator.py ├── Step 1 - Stream from Generator.py └── Step 2 - Create Gold Views for App Layer.sql ├── RedshiftDDLMigrator └── Redshift DDL Migrator.py ├── Using DBSQL Serverless Client Example.py ├── Using DBSQL Serverless Transaction Manager Example.py ├── Using Delta Helpers Notebook Example.py ├── Using Delta Logger Example.py ├── Using Delta Merge Helpers Example.py ├── Using Streaming Tables and MV Orchestrator.py ├── Using Transaction Manager Example.py └── helperfunctions ├── __init__.py ├── datavalidator.py ├── dbsqlclient.py ├── dbsqltransactions.py ├── deltahelpers.py ├── deltalogger.py ├── redshiftchecker.py ├── requirements.txt ├── stmvorchestrator.py └── transactions.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/.DS_Store -------------------------------------------------------------------------------- /Data Apps/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Data Apps/.DS_Store -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Data Apps/Data Management Intro Dash App/.DS_Store 
-------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Data Apps/Data Management Intro Dash App/plotly_iot_demo/.DS_Store -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/etl_pipelines/iot_dashboard_etl.sql: -------------------------------------------------------------------------------- 1 | TRUNCATE TABLE main.plotly_iot_dashboard.bronze_users; 2 | TRUNCATE TABLE main.plotly_iot_dashboard.bronze_sensors; 3 | 4 | 5 | -- DBTITLE 1,Incrementally Ingest Source Data from Raw Files 6 | COPY INTO main.plotly_iot_dashboard.bronze_sensors 7 | FROM (SELECT 8 | id::bigint AS Id, 9 | device_id::integer AS device_id, 10 | user_id::integer AS user_id, 11 | calories_burnt::decimal(10,2) AS calories_burnt, 12 | miles_walked::decimal(10,2) AS miles_walked, 13 | num_steps::decimal(10,2) AS num_steps, 14 | timestamp::timestamp AS timestamp, 15 | value AS value 16 | FROM "/databricks-datasets/iot-stream/data-device/") 17 | FILEFORMAT = json 18 | COPY_OPTIONS('force'='true') --option to be incremental or always load all files 19 | ; 20 | 21 | 22 | -- DBTITLE 1,Perform Upserts - Device Data 23 | MERGE INTO main.plotly_iot_dashboard.silver_sensors AS target 24 | USING (SELECT Id::integer, 25 | device_id::integer, 26 | user_id::integer, 27 | calories_burnt::decimal, 28 | miles_walked::decimal, 29 | num_steps::decimal, 30 | timestamp::timestamp, 31 | value::string 32 | FROM main.plotly_iot_dashboard.bronze_sensors) AS source 33 | ON source.Id = target.Id 34 | AND source.user_id = target.user_id 35 | AND source.device_id = target.device_id 36 | WHEN MATCHED THEN UPDATE SET 37 | target.calories_burnt = source.calories_burnt, 38 | target.miles_walked = source.miles_walked, 39 | target.num_steps = source.num_steps, 40 | target.timestamp = source.timestamp 41 | WHEN NOT MATCHED THEN INSERT *; 42 | 43 | --Truncate bronze batch once successfully loaded 44 | TRUNCATE TABLE main.plotly_iot_dashboard.bronze_sensors; 45 | 46 | -- COMMAND ---------- 47 | 48 | -- DBTITLE 1,Table Optimizations 49 | OPTIMIZE main.plotly_iot_dashboard.silver_sensors ZORDER BY (user_id, device_id, timestamp); 50 | 51 | -- COMMAND ---------- 52 | 53 | -- DBTITLE 1,Incrementally Ingest Raw User Data 54 | COPY INTO main.plotly_iot_dashboard.bronze_users 55 | FROM (SELECT 56 | userid::bigint AS user_id, 57 | gender AS gender, 58 | age::integer AS age, 59 | height::decimal(10,2) AS height, 60 | weight::decimal(10,2) AS weight, 61 | smoker AS smoker, 62 | familyhistory AS familyhistory, 63 | cholestlevs AS cholestlevs, 64 | bp AS bp, 65 | risk::decimal(10,2) AS risk, 66 | current_timestamp() AS update_timestamp 67 | FROM "/databricks-datasets/iot-stream/data-user/") 68 | FILEFORMAT = CSV 69 | FORMAT_OPTIONS('header'='true') 70 | COPY_OPTIONS('force'='true') --option to be incremental or always load all files 71 | ; 72 | 73 | 74 | MERGE INTO main.plotly_iot_dashboard.silver_users AS target 75 | USING (SELECT 76 | user_id::int, 77 | gender::string, 78 | age::int, 79 | height::decimal, 80 | weight::decimal, 81 | smoker, 82 | familyhistory, 83 | cholestlevs, 84 | bp, 85 | risk, 86 | update_timestamp 87 | FROM main.plotly_iot_dashboard.bronze_users) AS source 88 | ON source.user_id = 
target.user_id 89 | WHEN MATCHED THEN UPDATE SET 90 | target.gender = source.gender, 91 | target.age = source.age, 92 | target.height = source.height, 93 | target.weight = source.weight, 94 | target.smoker = source.smoker, 95 | target.familyhistory = source.familyhistory, 96 | target.cholestlevs = source.cholestlevs, 97 | target.bp = source.bp, 98 | target.risk = source.risk, 99 | target.update_timestamp = source.update_timestamp 100 | WHEN NOT MATCHED THEN INSERT *; 101 | 102 | --Truncate bronze batch once successfully loaded 103 | TRUNCATE TABLE main.plotly_iot_dashboard.bronze_users; 104 | 105 | 106 | 107 | 108 | -- Create Gold Table and Read via Reflection 109 | 110 | CREATE OR REPLACE TABLE main.plotly_iot_dashboard.gold_sensors 111 | AS 112 | (SELECT timestamp, 113 | -- Number of Steps 114 | (avg(`num_steps`) OVER ( 115 | ORDER BY timestamp 116 | ROWS BETWEEN 117 | 15 PRECEDING AND 118 | CURRENT ROW 119 | )) ::float AS SmoothedNumSteps30SecondMA, -- 30 second moving average 120 | 121 | (avg(`num_steps`) OVER ( 122 | ORDER BY timestamp 123 | ROWS BETWEEN 124 | 60 PRECEDING AND 125 | CURRENT ROW 126 | ))::float AS SmoothedNumSteps120SecondMA,--120 second moving average, 127 | -- Calories Burnt 128 | (avg(`calories_burnt`) OVER ( 129 | ORDER BY timestamp 130 | ROWS BETWEEN 131 | 15 PRECEDING AND 132 | CURRENT ROW 133 | )) ::float AS SmoothedCaloriesBurnt30SecondMA, -- 30 second moving average 134 | 135 | (avg(`calories_burnt`) OVER ( 136 | ORDER BY timestamp 137 | ROWS BETWEEN 138 | 60 PRECEDING AND 139 | CURRENT ROW 140 | ))::float AS SmoothedCaloriesBurnt120SecondMA --120 second moving average 141 | FROM main.plotly_iot_dashboard.silver_sensors 142 | ) -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/.DS_Store -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/__pycache__/ddls.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/__pycache__/ddls.cpython-38.pyc -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from databricks import sql 4 | from sqlalchemy.orm import declarative_base, Session, sessionmaker 5 | from sqlalchemy import Column, String, Integer, BOOLEAN, create_engine, select, DATE, DATETIME, TIMESTAMP, DECIMAL, case, func, Table, MetaData 6 | import ddls 7 | from dash import Dash, html, Input, Output, ctx, dcc, dash_table 8 | import pandas as pd 9 | import plotly.express as px 10 | import requests 11 | import dash_bootstrap_components as dbc 12 | 13 | 14 | with open('config.json') as w: 15 | 16 | conf = json.load(w) 17 | token = conf.get("token") 18 | http_path = conf.get("http_path") 19 | database = conf.get("database") 20 | 
host_name = conf.get("host_name") 21 | catalog = conf.get("catalog") 22 | 23 | 24 | 25 | ### Initialize Database Connection 26 | conn_str = f"databricks://token:{token}@{host_name}?http_path={http_path}&catalog={catalog}&schema={database}" 27 | extra_connect_args = { 28 | "_tls_verify_hostname": True, 29 | "_user_agent_entry": "PySQL Example Script", 30 | } 31 | engine = create_engine( 32 | conn_str, 33 | connect_args=extra_connect_args, 34 | ) 35 | 36 | 37 | 38 | ## Get Metadata from Config Files 39 | #ddls.Base.metadata.create_all(bind=engine) 40 | #ddls.Base.metadata.drop_all(bind=engine) 41 | 42 | tables_stmt = f"""SELECT * FROM {catalog}.INFORMATION_SCHEMA.TABLES 43 | WHERE table_schema = '{database}'""" 44 | 45 | 46 | tables_in_db = pd.read_sql_query(tables_stmt, engine) 47 | 48 | ### Core Dash App 49 | 50 | app = Dash(external_stylesheets=[dbc.themes.BOOTSTRAP]) 51 | 52 | ### Layout 53 | 54 | app.layout = html.Div([ 55 | html.Button('Build Database Tables', id='build_db_btn', n_clicks=0), 56 | html.Button('Drop Database Tables', id='drop_tabes_btn', n_clicks=0), 57 | html.Button('Get Table List', id='fetch_tables_btn', n_clicks=0), 58 | html.Button('Run ELT Pipeline', id='run_etl_pipe', n_clicks=0), 59 | html.Div(id='container-button-timestamp'), 60 | html.Br(), 61 | dcc.RadioItems( 62 | id="checklist", 63 | options=["num_steps", "calories_burnt"], 64 | value="num_steps", 65 | inline=True, 66 | 67 | ), 68 | dcc.Graph(id='BasicSensors'), 69 | html.Div([html.Br(), 70 | dcc.Graph(id='SmoothSensors')]), 71 | dash_table.DataTable(tables_in_db.to_dict('records'),[{"name": i, "id": i} for i in tables_in_db.columns], id='tbl'), 72 | html.Div(id='sch_tbl', className = 'sch_tbl') 73 | ]) 74 | 75 | 76 | 77 | @app.callback(Output('sch_tbl', 'children'), 78 | Input('tbl', 'data'), 79 | Input('tbl', 'active_cell')) 80 | def update_graphs(data, active_cell): 81 | row_num = active_cell.get("row") 82 | col_num = active_cell.get("column") 83 | col_name = active_cell.get("column_id") 84 | 85 | table_name_to_detail = tables_in_db.loc[tables_in_db.index[row_num], col_name] 86 | 87 | if col_name == 'table_name': 88 | schema_stmt = f"""DESCRIBE TABLE EXTENDED {catalog}.{database}.{table_name_to_detail}""" 89 | schema_table = pd.read_sql_query(schema_stmt, engine) 90 | cols_for_data_table = [{'name': i, 'id': i} for i in schema_table.columns] 91 | 92 | res_table = dash_table.DataTable( 93 | id='table', 94 | columns=cols_for_data_table, 95 | data =schema_table.to_dict("rows"), 96 | style_data={ 97 | 'color': 'blue', 98 | 'backgroundColor': 'white' 99 | }, 100 | style_data_conditional=[ 101 | { 102 | 'if': {'row_index': 'odd'}, 103 | 'backgroundColor': 'rgb(220, 220, 220)', 104 | } 105 | ], 106 | style_header={ 107 | 'backgroundColor': 'rgb(210, 210, 210)', 108 | 'color': 'black', 109 | 'fontWeight': 'bold' 110 | } 111 | ) 112 | 113 | 114 | return res_table 115 | 116 | else: 117 | msg = f"Please select a table name... 
Currently Selected: {table_name_to_detail}" 118 | 119 | res_msg = dbc.Alert(msg, color="primary") 120 | 121 | return msg 122 | 123 | 124 | ## Sensors Chat Callback -- No Reflection 125 | @app.callback( 126 | Output(component_id="BasicSensors", component_property="figure"), 127 | Input("checklist", "value") 128 | ) 129 | def update_graph(yaxis): 130 | 131 | device_base_table = ddls.Base.metadata.tables["silver_sensors"] 132 | user_base_table = ddls.Base.metadata.tables["silver_users"] 133 | 134 | ## ORM-based SQL Query with dynamic filters in the callback 135 | 136 | userstmt = (select(device_base_table.c.timestamp, device_base_table.c.num_steps, device_base_table.c.calories_burnt) 137 | .limit(100) 138 | ) 139 | 140 | 141 | ## Read data via pandas or just raw Dict/array 142 | ## TIPS: Always try to push the filtering/complex logic down to the system where the most data is filtered 143 | ## minimize data brought to client 144 | df = pd.read_sql_query(userstmt, engine).sort_values(by=['timestamp']) 145 | 146 | axis_labels = { 147 | "num_steps": "Total Daily Steps", 148 | } 149 | fig = px.line( 150 | df, 151 | x="timestamp", 152 | y=[f"{yaxis}"], 153 | markers=True, 154 | title=f"Comparative Daily Fitness Metrics by Demographic", 155 | ) 156 | 157 | 158 | ## Build Plot Figure and return 159 | 160 | return fig 161 | 162 | 163 | 164 | ## Smooth Sensors Callback for Line Graph - Via Reflection 165 | @app.callback( 166 | Output(component_id="SmoothSensors", component_property="figure"), 167 | Input("checklist", "value") 168 | ) 169 | def update_smooth_graph(yaxis): 170 | 171 | if yaxis == "num_steps": 172 | chart_cols = ["SmoothedNumSteps30SecondMA", "SmoothedNumSteps120SecondMA"] 173 | 174 | elif yaxis == "calories_burnt": 175 | chart_cols = ["SmoothedCaloriesBurnt30SecondMA", "SmoothedCaloriesBurnt120SecondMA"] 176 | 177 | # Reflect database properties into ``metadata``. 178 | #ddls.Base.metadata.reflect(engine=engine) 179 | 180 | ## !! this table is NOT manually defined in our Python object, and is instead read on the fly with reflection 181 | sensors_table= Table("gold_sensors", 182 | ddls.Base.metadata, 183 | Column("timestamp", TIMESTAMP), 184 | autoload=True, 185 | autoload_with=engine, 186 | extend_existing=True) 187 | 188 | # Instantiate a new ``FetchTable`` object to retrieve column objects by name. 189 | 190 | # Get a ``Column`` object from the desired ``Table`` object. 191 | yaxis_short_ma = sensors_table.columns[chart_cols[0]] 192 | yaxis_long_ma = sensors_table.columns[chart_cols[1]] 193 | time_col = sensors_table.columns["timestamp"] 194 | 195 | # Build a session-based query including filtering on text in ``column``. 196 | ma_statement = (select(time_col, yaxis_short_ma, yaxis_long_ma) 197 | .limit(100) 198 | ) 199 | 200 | # Build a Pandas ``DataFrame`` with results from the query. 
201 | df = pd.read_sql_query(ma_statement, engine).sort_values(by=['timestamp'], ascending=False) 202 | 203 | axis_labels = { 204 | "num_steps": "Total Daily Steps", 205 | } 206 | fig = px.line( 207 | df, 208 | x="timestamp", 209 | y=chart_cols, 210 | markers=True, 211 | title=f"Smoothed Moving Averages of Chosen Metric", 212 | ) 213 | 214 | 215 | ## Build Plot Figure and return 216 | 217 | return fig 218 | 219 | 220 | #### Run ELT Pipeline Callback 221 | @app.callback( 222 | Output('container-button-timestamp', 'children'), 223 | Input('build_db_btn', 'n_clicks'), 224 | Input('drop_tabes_btn', 'n_clicks'), 225 | Input('fetch_tables_btn', 'n_clicks'), 226 | Input('run_etl_pipe', 'n_clicks') 227 | ) 228 | def displayClick(btn1, btn2, btn3, btn4): 229 | msg = "No Database State Yet..." 230 | if 'build_db_btn' == ctx.triggered_id: 231 | 232 | ddls.Base.metadata.create_all(bind=engine) 233 | msg = "Database built!" 234 | 235 | elif 'drop_tabes_btn' == ctx.triggered_id: 236 | 237 | ddls.Base.metadata.drop_all(bind=engine) 238 | msg = "Database Dropped!" 239 | 240 | elif 'fetch_tables_btn' == ctx.triggered_id: 241 | tbls = list(ddls.Base.metadata.tables) 242 | msg = f"Here are the tables for {catalog}.{database}: {tbls}" 243 | 244 | elif 'run_etl_pipe' == ctx.triggered_id: 245 | 246 | ## Build and Trigger Databricks Jobs 247 | job_req = { 248 | "name": "Plotly_Backend_Pipeline", 249 | "email_notifications": { 250 | "no_alert_for_skipped_runs": "false" 251 | }, 252 | "webhook_notifications": {}, 253 | "timeout_seconds": 0, 254 | "max_concurrent_runs": 1, 255 | "tasks": [ 256 | { 257 | "task_key": "Plotly_Backend_Pipeline", 258 | "sql_task": { 259 | "query": { 260 | "query_id": "88c1412d-d2ca-43a1-9843-aec96b5b1586" 261 | }, 262 | "warehouse_id": "ead10bf07050390f" 263 | }, 264 | "timeout_seconds": 0, 265 | "email_notifications": {} 266 | } 267 | ], 268 | "format": "MULTI_TASK" 269 | } 270 | 271 | job_json = json.dumps(job_req) 272 | ## Get this from a secret or param 273 | headers_auth = {"Authorization":f"Bearer {token}"} 274 | uri = f"https://{host_name}/api/2.1/jobs/create" 275 | 276 | endp_resp = requests.post(uri, data=job_json, headers=headers_auth).json() 277 | 278 | ## Run Job 279 | job_id = endp_resp['job_id'] 280 | 281 | run_now_uri = f"https://{host_name}/api/2.1/jobs/run-now" 282 | 283 | job_run = {"job_id": job_id } 284 | job_run_json = json.dumps(job_run) 285 | 286 | run_resp = requests.post(run_now_uri, data=job_run_json, headers=headers_auth).json() 287 | 288 | 289 | msg = f"Pipeline Created and Ran with Job Id: {endp_resp['job_id']} \n run message: {run_resp}" 290 | 291 | 292 | 293 | return html.Div(msg) 294 | 295 | 296 | 297 | 298 | if __name__ == '__main__': 299 | app.run_server(debug=True) 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/config.json: -------------------------------------------------------------------------------- 1 | {"http_path": "/sql/1.0/endpoints/", 2 | "host_name": "", 3 | "token": "", 4 | "catalog": "main", 5 | "database": "plotly_iot_dashboard" 6 | } -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/ddls.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import json 4 | from databricks import sql 5 | from sqlalchemy.orm 
import declarative_base, Session 6 | from sqlalchemy import Column, String, Integer, BOOLEAN, create_engine, select, TIMESTAMP, DECIMAL, INTEGER, BIGINT, DATE 7 | 8 | ## Return new base class with the mapper initialized 9 | Base = declarative_base() 10 | 11 | 12 | ####### Bronze Tables 13 | 14 | 15 | class BronzeSensors(Base): 16 | 17 | __tablename__ = "bronze_sensors" 18 | 19 | Id = Column(BIGINT, primary_key=True) 20 | device_id = Column(INTEGER) 21 | user_id = Column(INTEGER) 22 | calories_burnt = Column(DECIMAL(10,2)) 23 | miles_walked = Column(DECIMAL(10,2)) 24 | num_steps = Column(DECIMAL(10,2)) 25 | timestamp = Column(TIMESTAMP) 26 | value = Column(String(1024)) 27 | 28 | 29 | class BronzeUsers(Base): 30 | 31 | __tablename__ = "bronze_users" 32 | 33 | user_id = Column(BIGINT, primary_key=True) 34 | gender = Column(String(10)) 35 | age = Column(INTEGER) 36 | height = Column(DECIMAL(10,2)) 37 | weight = Column(DECIMAL(10,2)) 38 | smoker = Column(String(4)) 39 | familyhistory = Column(String(100)) 40 | cholestlevs = Column(String(100)) 41 | bp = Column(String(50)) 42 | risk = Column(DECIMAL(10,2)) 43 | update_timestamp = Column(TIMESTAMP) 44 | 45 | 46 | 47 | 48 | ####### Silver Tables 49 | 50 | class SilverSensors(Base): 51 | 52 | __tablename__ = "silver_sensors" 53 | 54 | Id = Column(BIGINT, primary_key=True) 55 | device_id = Column(INTEGER) 56 | user_id = Column(INTEGER) 57 | calories_burnt = Column(DECIMAL(10,2)) 58 | miles_walked = Column(DECIMAL(10,2)) 59 | num_steps = Column(DECIMAL(10,2)) 60 | timestamp = Column(TIMESTAMP) 61 | value = Column(String(1024)) 62 | 63 | 64 | 65 | class SilverUsers(Base): 66 | 67 | __tablename__ = "silver_users" 68 | 69 | user_id = Column(BIGINT, primary_key=True) 70 | gender = Column(String(10)) 71 | age = Column(INTEGER) 72 | height = Column(DECIMAL(10,2)) 73 | weight = Column(DECIMAL(10,2)) 74 | smoker = Column(String(4)) 75 | familyhistory = Column(String(100)) 76 | cholestlevs = Column(String(100)) 77 | bp = Column(String(50)) 78 | risk = Column(DECIMAL(10,2)) 79 | update_timestamp = Column(TIMESTAMP) 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/requirements.txt: -------------------------------------------------------------------------------- 1 | dash 2 | dash_bootstrap_components 3 | databricks-sql-connector ==2.4.1 -------------------------------------------------------------------------------- /Delta Optimizer/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Delta Optimizer/.DS_Store -------------------------------------------------------------------------------- /Delta Optimizer/DashAppFrontEnd/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Delta Optimizer/DashAppFrontEnd/.DS_Store -------------------------------------------------------------------------------- /Delta Optimizer/Instructions.md: -------------------------------------------------------------------------------- 1 | # Delta Optimizer 2 | Automated Optimization System for a Delta-based Lakehouse running on Spark or Photon 3 | 4 | 5 | delta_io 6 | 7 | 8 | ## Purpose: 9 | The Delta optimizer scrapes and analyzes the query history in DBSQL via the Query History 
API, as well as the Delta transaction logs on one or many databases, and builds a data profile to determine the most important columns that each table should be Z-ordered by. This aims to drastically reduce the amount of manual discovery and tuning users must do to properly optimize their Delta tables, especially when the primary query interface is a DBSQL Warehouse (an analyst writing SQL or a BI tool that auto-generates SQL). This is especially important when BI tools pass auto-generated SQL to a DBSQL Warehouse, which makes it much more difficult to optimize tables manually at scale. 10 | 11 | 12 | ### How to run: 13 | 14 |
  • 1. Install the associated delta optimizer library whl file to a cluster 15 |
  • 2. Run the Step 1 Notebook with your database_names to monitor, workspace URL, warehouseIds to poll, and lookback period. You can schedule this as a job to run monthly (or as often as the query patterns might change) 16 |
  • 3. Run the Step 2 Notebook on a cluster similar in size to the one you would normally use to run an optimization job for your tables. If you are not sure, create a cluster similar in size to your dev environment. Most operations are incremental and not large, except for the first run (which may re-write entire tables). Then schedule this notebook as a job to run daily. 17 | 18 | ### Delta Optimizer Process: 19 | 20 |
  • 1. Gather Query History and calculate statistics on all columns for all tables (option to select a particular database) 21 |
  • 2. Read transaction logs and find merge predicates (if any) run for all tables in one or many databases 22 |
  • 3. Calculate Statistics and Rank Columns for each table for the Z-order strategy using runtime stats, occurrence stats, and cardinality stats 23 |
  • 4. Prepare and save a ready-to-use config delta table that can be ingested by a job or DLT to actually run the recommended OPTIMIZE/ANALYZE/TBLPROP commands
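
The config table produced in step 4 is what the `Step 2_ Strategy Runner.py` notebook consumes. As a minimal sketch of that consumption path (not the packaged runner itself; it assumes a Databricks notebook context where `spark` is available, the default `hive_metastore.delta_optimizer` output database, and the same result-column positions the Step 2 notebook uses), executing the recommendations is just a loop over the generated statements:

```python
# Minimal sketch: read the optimizer results table and run the generated commands.
# Column positions follow the Step 2 runner notebook: [3] = TBLPROPERTIES, [2] = OPTIMIZE, [4] = ANALYZE.
from deltaoptimizer import DeltaOptimizer

delta_optimizer = DeltaOptimizer(database_name="hive_metastore.delta_optimizer")

# The results table is small (one row per table), so collecting it is cheap.
config_rows = delta_optimizer.get_results().collect()

for row in config_rows:
    # Run in the recommended order: table properties first, then OPTIMIZE, then ANALYZE.
    for command in (row[3], row[2], row[4]):
        try:
            spark.sql(command)
        except Exception as e:
            print(f"Command failed, continuing with next: {command} -- {str(e)}")
```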
24 | 25 | 26 | 27 | 28 | 29 | 30 | ### Roadmap: 31 | 32 | #### General Roadmap: 33 | 34 |
  • 1. Separate optimization rules from code logic to make rules configurable 35 |
  • 2. Add option to run for the user, or simply provide a DBSQL Dashboard of recommendations to make suggestions OOTB 36 |
  • 3. Add table exception rules; allow users to decide which tables to auto-optimize and which to manually override if they want to optimize those tables on their own 37 |
  • 4. Dynamically figure out job configuration (cluster size / periodicity) of commands to run 38 | 39 | #### Query Statistics: 40 | 41 |
  • 1. Enable parsing of queries from not just DBSQL, but ALL clusters (jobs / all-purpose) 42 |
  • 2. Enable parameter selection for specifying specific databases (one or many) to scrape 43 |
  • 3. Enable pointing to a Git location to parse SQL files containing SELECT statements stored in Git 44 | 45 | #### Transaction Log Statistics: 46 | 47 |
  • 1. Add partition filtering and file size management - DONE 48 |
  • 2. Column Reordering into the first 32 columns (currently only re-orders recommended ZORDER columns) - IN PROGRESS 49 |
  • 3. Add Analyze Table STATS - DONE 50 | 51 | #### Ranking Statistics Algorithm: 52 | 53 |
  • 1. More robust standard scaling for statistics (right now it's 0-1 standard scaling partitioned by TABLE) 54 |
  • 2. Make the ranking system more intelligent - open-ended feedback is needed for ideas on making the ranking system more generalizable and nuanced 55 |
  • 3. Dynamically prune to the actual number of ZORDER columns best used (dependent first on cardinality). Do this possibly by tracking the distance between certain statistics (i.e. if ColA appears 3000 times and ColB appears 2900 times, use both, but if ColA appears 3000 times and ColB appears only 3 times, use only ColA). See the sketch below. 56 | 57 |
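
This pruning behavior is not implemented yet; the sketch below only illustrates the relative drop-off idea described in item 3 (the function name and the 0.25 threshold are made up for the example):

```python
# Illustrative only: keep ranked ZORDER candidates whose occurrence count is within
# some fraction of the top candidate, so 3000 vs 2900 keeps both columns while
# 3000 vs 3 keeps only the first. The threshold value here is arbitrary.
def prune_zorder_candidates(ranked_columns, min_relative_occurrence=0.25):
    """ranked_columns: list of (column_name, occurrence_count), sorted descending by count."""
    if not ranked_columns:
        return []

    top_count = ranked_columns[0][1]
    return [col for col, count in ranked_columns
            if top_count > 0 and count / top_count >= min_relative_occurrence]

# Keeps ColA and ColB, drops ColC
print(prune_zorder_candidates([("ColA", 3000), ("ColB", 2900), ("ColC", 3)]))
```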
58 | 59 | 60 | 61 | #### Execution Step 62 | 63 |
  • 1. Automatically create and schedule a job via the API that reads from the config with the provided notebook and runs at an interval selected by the user as a parameter 64 | 65 |
  • 2. Use DLT to generate DDLs and file sizes, and manage these OPTIMIZE statements automatically without actually needing to do ETL in DLT -------------------------------------------------------------------------------- /Delta Optimizer/Step 1_ Optimization Strategy Builder.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Delta Optimizer - Profiling Stage 5 | # MAGIC 6 | # MAGIC
      7 | # MAGIC 8 | # MAGIC
    • Polls the Query History API and gets the list of queries for a set of SQL Warehouses (this is incremental, so you just define a lookback period for the first time you poll) 9 | # MAGIC
    • Analyzes transaction logs for tables in set of databases (all by default) -- file size, partitions, merge predicates 10 | # MAGIC
    • Ranks columns and builds a unified optimization strategy 11 | # MAGIC 12 | # MAGIC
    13 | # MAGIC 14 | # MAGIC ### Permissions Required: 15 | # MAGIC 1. User running the delta optimizer must have CREATE DATABASE permission to create a delta optimizer instance (OR have it created upfront by an admin). 16 | # MAGIC 2. User running the delta optimizer must have READ permissions to ALL databases being profiled and optimized. 17 | # MAGIC 3. User running the delta optimizer must have usage permissions to all SQL Warehouses being profiled and optimized. 18 | # MAGIC 19 | # MAGIC ### Steps: 20 | # MAGIC 21 | # MAGIC 1. Decide where you want your Delta Optimizer Instance Output to live by setting Optimizer Output Database . You can have one or many optimizer instances, but each instances needs its own isolated database, they cannot share database namespaces. 22 | # MAGIC 23 | # MAGIC 2. Insert Server HostName . This is the root workspace url for your databricks workspace. This is NOT tied to any specific cluster. 24 | # MAGIC 25 | # MAGIC 3. Choose Catalog Filter Mode . There are 3 options: include_list, exclude_list, and all. include_list is default, which allows you to select the databases you want to profile and optimize. exclude_list will monitor ALL databases except the ones in the list. 'all' mode will monitor ALL databases. Note that the user running the delta optimizer must have read permissions to ALL databases selected no matter the mode. 26 | # MAGIC 27 | # MAGIC 4. List the databases to profile in the Catalog Names (csv)... parameter. This is either an include or exclude list. If mode = 'all', then this parameter is not used. 28 | # MAGIC 29 | # MAGIC 5. Choose Database Filter Mode . There are 3 options: include_list, exclude_list, and all. include_list is default, which allows you to select the databases you want to profile and optimize. exclude_list will monitor ALL databases except the ones in the list. 'all' mode will monitor ALL databases. Note that the user running the delta optimizer must have read permissions to ALL databases selected no matter the mode. 30 | # MAGIC 31 | # MAGIC 6. List the databases to profile in the Database Names (csv)... parameter. This is either an include or exclude list. If mode = 'all', then this parameter is not used. 32 | # MAGIC 33 | # MAGIC 7. Choose the Table Filter Mode . There are 3 options: include_list, exclude_list, and all. include_list is default, which allows you to select the subset of tables you want to profile and optimize. This most will ALWAYS operate within the subset of databases chosen from the Database Filter Mode . i.e. if the table you want is not included in the selected databases, no matter the mode, it will not be profiled and optimized. 34 | # MAGIC 35 | # MAGIC 8. List the tables to profile in the Table Filter List... parameter. This is either an include or exclude list. If mode = 'all', then this parameter is not used. 36 | # MAGIC 37 | # MAGIC 9. Fill out the list of SQL Warehouse IDs (csv list) to profile and extract query history from. This is how the optimizer will detect HOW your tables are being used in queries. It will use the Query History API to incrementally pull your queries for the selected SQL Warehouses and store them in the Delta optimizer database used. 38 | # MAGIC 39 | # MAGIC 10. Choose a Query History Lookback Period . This is ONLY for cold starts. This represents a lagging sample of days from today to pull query history for. After the first run, it picks up where it last left off automatically unless the Start Over? parameter = 'Yes'. 40 | # MAGIC 41 | # MAGIC 11. 
Optionally choose the Start Over? parameter. 'Yes' means it will truncate all historical state and re-profile history from scratch. 'No' means it will always pick up where it left off. 42 | # MAGIC 43 | # MAGIC 44 | # MAGIC ### KEY USER NOTES: 45 | # MAGIC 1. Think of the catalog/database/filter lists/modes like a funnel. No matter whether inclusion or exclusion mode for each level, the lower levels will always ONLY contain the subset that results from the previous. For example, if I am running for all catalogs except 'main', then in my database list, if there are any databases that live it 'main', they will not be optimized. 46 | # MAGIC 2. Database names should be fully qualified (catalog.database.table) 47 | # MAGIC 3. Table Filter List must be fully qualified (catalog.database.table) 48 | # MAGIC 4. If table filter mode is all, then the filter list can be blank, otherwise ensure that it is correct 49 | # MAGIC 50 | # MAGIC 51 | # MAGIC 52 | # MAGIC ### LIMITATIONS: 53 | # MAGIC 1. Currently it does NOT profile SQL queries run on Adhoc or Jobs clusters, only SQL Warehouses for now. This is on the roadmap to fix. 54 | # MAGIC 55 | # MAGIC ### Depedencies 56 | # MAGIC
  • Ensure that you either get a token as a secret or use a cluster with the env variable called DBX_TOKEN to authenticate to DBSQL 57 | 58 | # COMMAND ---------- 59 | 60 | from deltaoptimizer import DeltaProfiler, QueryProfiler, DeltaOptimizer 61 | import os 62 | 63 | # COMMAND ---------- 64 | 65 | # DBTITLE 1,Register and Retrieve DBX Auth Token 66 | DBX_TOKEN = "" 67 | 68 | # COMMAND ---------- 69 | 70 | # DBTITLE 1,Set up params before running 71 | ## Assume running in a Databricks notebook 72 | dbutils.widgets.dropdown("Query History Lookback Period (days)", defaultValue="3",choices=["1","3","7","14","30","60","90"]) 73 | dbutils.widgets.text("SQL Warehouse Ids (csv list)", "") 74 | dbutils.widgets.text("Server Hostname:", "") 75 | dbutils.widgets.dropdown("Start Over?","No", ["Yes","No"]) 76 | dbutils.widgets.text("Optimizer Output Database:", "hive_metastore.delta_optimizer") 77 | dbutils.widgets.text("Optimizer Output Location (optional):", "") 78 | dbutils.widgets.dropdown("Table Filter Mode", "all", ["all", "include_list", "exclude_list"]) 79 | dbutils.widgets.dropdown("Database Filter Mode", "all", ["all", "include_list", "exclude_list"]) 80 | dbutils.widgets.dropdown("Catalog Filter Mode", "all", ["all", "include_list", "exclude_list"]) 81 | dbutils.widgets.text("Table Filter List (catalog.database.table) (Csv List)", "") 82 | dbutils.widgets.text("Database Filter List (catalog.database) (Csv List)", "") 83 | dbutils.widgets.text("Catalog Filter List (Csv List)", "") 84 | 85 | # COMMAND ---------- 86 | 87 | # DBTITLE 1,Get Params to Variables 88 | lookbackPeriod = int(dbutils.widgets.get("Query History Lookback Period (days)")) 89 | warehouseIdsList = [i.strip() for i in dbutils.widgets.get("SQL Warehouse Ids (csv list)").split(",")] 90 | workspaceName = dbutils.widgets.get("Server Hostname:").strip() 91 | warehouse_ids = dbutils.widgets.get("SQL Warehouse Ids (csv list)") 92 | start_over = dbutils.widgets.get("Start Over?") 93 | table_filter_mode = dbutils.widgets.get("Table Filter Mode") 94 | database_filter_mode = dbutils.widgets.get("Database Filter Mode") 95 | catalog_filter_mode = dbutils.widgets.get("Catalog Filter Mode") 96 | table_filter_list = [i.strip() for i in dbutils.widgets.get("Table Filter List (catalog.database.table) (Csv List)").split(",")] 97 | database_filter_list = [i.strip() for i in dbutils.widgets.get("Database Filter List (catalog.database) (Csv List)").split(",")] 98 | catalog_filter_list = [i.strip() for i in dbutils.widgets.get("Catalog Filter List (Csv List)").split(",")] 99 | database_output = dbutils.widgets.get("Optimizer Output Database:").strip() 100 | 101 | if len(dbutils.widgets.get("Optimizer Output Location (optional):").strip()) > 0: 102 | database_location = dbutils.widgets.get("Optimizer Output Location (optional):").strip() 103 | else: 104 | database_location = None 105 | 106 | 107 | # COMMAND ---------- 108 | 109 | # DBTITLE 1,Initialize Core Optimizer Tables 110 | delta_optimizer = DeltaOptimizer(database_name=database_output, database_location=database_location) 111 | 112 | # COMMAND ---------- 113 | 114 | # DBTITLE 1,Delete Existing Results and Start Over Param 115 | if start_over == "Yes": 116 | delta_optimizer.truncate_delta_optimizer_results() 117 | 118 | # COMMAND ---------- 119 | 120 | # DBTITLE 1,Build Query History Profile 121 | ####### Step 1: Build Profile ####### 122 | ## Initialize Profiler 123 | 124 | ## catalogs_to_check_views should include ALL catalogs where views could live that you want to optimize underlying 
tables for 125 | ## Ideally they are just the same catalogs are your database names defined in the params so we try to parse for you to start there, but if you need to add, change the list here. 126 | 127 | ## NOTE: Query profiler doesnt really use database filter mode because it doesnt access the databases, only the SQL Query history API. 128 | 129 | query_profiler = QueryProfiler(workspaceName, 130 | warehouseIdsList, 131 | database_name=database_output, 132 | database_location=database_location, 133 | catalogs_to_check_views=catalog_filter_list, 134 | catalog_filter_mode=catalog_filter_mode, 135 | catalog_filter_list=catalog_filter_list, 136 | database_filter_mode=database_filter_mode, 137 | database_filter_list = database_filter_list, 138 | table_filter_mode=table_filter_mode, 139 | table_filter_list=table_filter_list, 140 | scrub_views=True) 141 | 142 | query_profiler.build_query_history_profile(dbx_token = DBX_TOKEN, mode='auto', lookback_period_days=lookbackPeriod) 143 | 144 | # COMMAND ---------- 145 | 146 | # DBTITLE 1,Run Delta Profiler 147 | ####### Step 2: Build stats from transaction logs/table data ####### 148 | 149 | ## Initialize class and pass in database csv string 150 | profiler = DeltaProfiler(catalog_filter_mode=catalog_filter_mode, 151 | catalog_filter_list=catalog_filter_list, 152 | database_filter_mode=database_filter_mode, 153 | database_filter_list = database_filter_list, 154 | table_filter_mode=table_filter_mode, 155 | table_filter_list=table_filter_list, 156 | database_name=database_output, 157 | database_location=database_location 158 | ) ## examples include 'default', 'mydb1,mydb2', 'all' or leave blank 159 | 160 | ## Get tables 161 | profiler.get_all_tables_to_monitor() 162 | 163 | ## Get predicate analysis for tables 164 | profiler.parse_stats_for_tables() 165 | 166 | ## Build final table output 167 | profiler.build_all_tables_stats() 168 | 169 | ## Generate cardinality stats 170 | profiler.build_cardinality_stats() 171 | 172 | 173 | # COMMAND ---------- 174 | 175 | # DBTITLE 1,Run Delta Optimizer 176 | ####### Step 3: Build Strategy and Rank ####### 177 | ## Build Strategy 178 | 179 | delta_optimizer = DeltaOptimizer(database_name=database_output, database_location=database_location) 180 | 181 | delta_optimizer.build_optimization_strategy() 182 | 183 | 184 | # COMMAND ---------- 185 | 186 | # DBTITLE 1,Return most up to date results! 187 | df = delta_optimizer.get_results() 188 | 189 | # COMMAND ---------- 190 | 191 | df.display() 192 | -------------------------------------------------------------------------------- /Delta Optimizer/Step 2_ Strategy Runner.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Run the output of recommended optimize statements as a single run or schedule as a periodic job 5 | # MAGIC 6 | # MAGIC

    Run this after the delta optimizer is finished

    7 | # MAGIC 8 | # MAGIC #### 3 Modes: 9 | # MAGIC 10 | # MAGIC
      1. include_all_tables: this mode optimizes all tables in the databases that the delta optimizer was provided at the profiling stage 11 | # MAGIC
        2. use_include_list : this mode only optimizes tables that you explicitly WANT to INCLUDE, which must be a subset of the databases monitored in the profiling stage. Must provide fully qualified table names for now (e.g. hive_metastore.iot_dashboard.silver_sensors, etc.). 12 | # MAGIC
          3. use_exclude_list : this mode optimizes all tables in the databases monitored EXCEPT the list provided. Must provide fully qualified table names for now. 13 | # MAGIC 14 | # MAGIC 15 | # MAGIC #### Roadmap: 16 | # MAGIC 17 | # MAGIC 1. Be more selective about the type of ANALYZE statements depending on table size and update frequency (less frequently updated tables don't need it as much) 18 | # MAGIC 2. Use DLT metaprogramming framework to run in parallel (performance implications) 19 | # MAGIC 3. Use Jobs API to automatically set up a daily / hourly job for this. This is NOT always recommended by default. The optimize timing greatly depends on the ETL pipelines 20 | # MAGIC 4. Dynamically decide how often to run ANALYZE TABLE commands based on table size mapping (job that does this for you) 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC 26 | # MAGIC ### Run Commands in Particular Order: 27 | # MAGIC 28 | # MAGIC
        • 1. ALTER TABLE 29 | # MAGIC
        • 2. Column Reordering 30 | # MAGIC
        • 3. OPTIMIZE TABLE 31 | # MAGIC
        • 4. ANALYZE TABLE 32 | 33 | # COMMAND ---------- 34 | 35 | from pyspark.sql.functions import * 36 | 37 | # COMMAND ---------- 38 | 39 | from deltaoptimizer import DeltaOptimizerBase, DeltaProfiler, QueryProfiler, DeltaOptimizer 40 | 41 | # COMMAND ---------- 42 | 43 | dbutils.widgets.dropdown("table_mode", "include_all_tables", ["include_all_tables", "use_exclude_list", "use_include_list"]) 44 | dbutils.widgets.text("exclude_list(csv)", "") 45 | dbutils.widgets.text("include_list(csv)", "") 46 | dbutils.widgets.text("Optimizer Output Database:", "hive_metastore.delta_optimizer") 47 | 48 | # COMMAND ---------- 49 | 50 | optimizer_location = dbutils.widgets.get("Optimizer Output Database:").strip() 51 | delta_optimizer = DeltaOptimizer(database_name=optimizer_location) 52 | 53 | # COMMAND ---------- 54 | 55 | ## This table by default has only 1 file, so it shouldn't be expensive to collect 56 | table_mode = dbutils.widgets.get("table_mode") 57 | include_table_list = [i.strip() for i in dbutils.widgets.get("include_list(csv)").split(",")] 58 | exclude_table_list = [i.strip() for i in dbutils.widgets.get("exclude_list(csv)").split(",")] 59 | 60 | if table_mode == "include_all_tables": 61 | config_row = (delta_optimizer.get_results() 62 | .collect() 63 | ) 64 | elif table_mode == "use_include_list": 65 | config_row = (delta_optimizer.get_results() 66 | .filter(col("TableName").isin(*include_table_list)) 67 | .collect() 68 | ) 69 | 70 | elif table_mode == "use_exclude_list": 71 | config_row = (delta_optimizer.get_results() 72 | .filter(~col("TableName").isin(*exclude_table_list)) 73 | .collect() 74 | ) 75 | 76 | # COMMAND ---------- 77 | 78 | # DBTITLE 1,Step 1 - Get Table Properties Config 79 | config_tbl_prop = [i[3] for i in config_row] 80 | 81 | print(f"Running {len(config_tbl_prop)} TBL PROPERTIES (file size and re-writes) commands: \n {config_tbl_prop}") 82 | 83 | # COMMAND ---------- 84 | 85 | # DBTITLE 1,Run TBL Properties Commands 86 | for i in config_tbl_prop: 87 | try: 88 | print(f"Running TABLE PROPERTIES command for {i}...") 89 | spark.sql(i) 90 | print(f"Completed TABLE PROPERTIES command for {i}!\n") 91 | 92 | except Exception as e: 93 | print(f"TABLE PROPERTIES failed with error: {str(e)}\n") 94 | 95 | # COMMAND ---------- 96 | 97 | 98 | 99 | # COMMAND ---------- 100 | 101 | # DBTITLE 1,Move Z-Order columns to front 102 | col_list = [i[5] for i in config_row] ## column re-order commands live at index 5 of the results 103 | print(col_list) 104 | ### This is a recursive step; ordering needs to happen one at a time 105 | ## Starting simple, just moving ZORDER cols to front, but this can become more nuanced 106 | for i in col_list: 107 | for j in i: 108 | try: 109 | 110 | spark.sql(j) 111 | print(f"Completed column order change for table {i} and column {j}") 112 | 113 | except Exception as e: 114 | print(f"Unable to change order (usually means it is an Id column and does not need reordering anyway...skipping to next columns) \n with error: {str(e)} \n ") 115 | 116 | 117 | # COMMAND ---------- 118 | 119 | # DBTITLE 1,Step 2 - Get config for OPTIMIZE Commands 120 | ## This table by default has only 1 file, so it shouldn't be expensive to collect 121 | config_optim = [i[2] for i in config_row] 122 | 123 | print(f"Running {len(config_optim)} OPTIMIZE commands: \n {config_optim}") 124 | 125 | # COMMAND ---------- 126 | 127 | # DBTITLE 1,Run through OPTIMIZE commands 128 | for i in config_optim: 129 | try: 130 | print(f"Running OPTIMIZE command for {i}...") 131 | spark.sql(i) 132 | print(f"Completed OPTIMIZE command 
for {i}!\n ") 133 | 134 | except Exception as e: 135 | print(f"Optimize failed with error: {str(e)}\n") 136 | 137 | 138 | # COMMAND ---------- 139 | 140 | # DBTITLE 1,Step 3 - Get Config for ANALYZE TABLE commands 141 | ## This table by default has only 1 file, so it shouldnt be expensive to collect 142 | config_tbl_stats = [i[4] for i in config_row] 143 | 144 | print(f"Running {len(config_tbl_stats)} TBL PROPERTIES (file size and re-writes) commands: \n {config_tbl_stats}") 145 | 146 | # COMMAND ---------- 147 | 148 | # DBTITLE 1,Run through Config for ANALYZE 149 | for i in config_tbl_stats: 150 | try: 151 | print(f"Running ANALYZE TABLE command for {i}...") 152 | spark.sql(i) 153 | print(f"Completed ANALYZE TABLE command for {i}!\n") 154 | 155 | except Exception as e: 156 | print(f"ANALYZE TABLE failed with error: {str(e)}\n") 157 | 158 | -------------------------------------------------------------------------------- /Delta Optimizer/Step 3_ Query History and Profile Analyzer.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Select from Results table to Look at Profiles of Queries, Tables, and Recommendations 5 | 6 | # COMMAND ---------- 7 | 8 | from pyspark.sql.functions import * 9 | 10 | # COMMAND ---------- 11 | 12 | from deltaoptimizer import DeltaOptimizerBase, DeltaProfiler, QueryProfiler, DeltaOptimizer 13 | import os 14 | 15 | # COMMAND ---------- 16 | 17 | dbutils.widgets.text("Optimizer Output Database", "hive_metastore.delta_optimizer") 18 | 19 | # COMMAND ---------- 20 | 21 | optimizer_location = dbutils.widgets.get("Optimizer Output Database").strip() 22 | delta_optimizer = DeltaOptimizer(database_name=optimizer_location) 23 | 24 | # COMMAND ---------- 25 | 26 | # DBTITLE 1,Get Most Recent Strategy Results 27 | # MAGIC %python 28 | # MAGIC ## This table by default has only 1 file, so it shouldnt be expensive to collect 29 | # MAGIC table_mode = dbutils.widgets.get("table_mode") 30 | # MAGIC include_table_list = [i.strip() for i in dbutils.widgets.get("include_list(csv)").split(",")] 31 | # MAGIC exclude_table_list = [i.strip() for i in dbutils.widgets.get("exclude_list(csv)").split(",")] 32 | # MAGIC 33 | # MAGIC if table_mode == "include_all_tables": 34 | # MAGIC df_results = (delta_optimizer.get_results() 35 | # MAGIC ) 36 | # MAGIC elif table_mode == "use_include_list": 37 | # MAGIC df_results = (delta_optimizer.get_results() 38 | # MAGIC .filter(col("TableName").isin(*include_table_list)) 39 | # MAGIC ) 40 | # MAGIC 41 | # MAGIC elif table_mode == "use_exclude_list": 42 | # MAGIC df_results = (delta_optimizer.get_results() 43 | # MAGIC .filter(~col("TableName").isin(*exclude_table_list)) 44 | # MAGIC ) 45 | # MAGIC 46 | # MAGIC 47 | # MAGIC df_results.display() 48 | 49 | # COMMAND ---------- 50 | 51 | # DBTITLE 1,Get Table Stats 52 | df = spark.sql(f""" 53 | 54 | SELECT * 55 | FROM {optimizer_location}.all_tables_table_stats 56 | 57 | """) 58 | 59 | df.display() 60 | 61 | 62 | # COMMAND ---------- 63 | 64 | # DBTITLE 1,Get Cardinality Stats 65 | 66 | df = spark.sql(f""" 67 | 68 | SELECT * 69 | FROM {optimizer_location}.all_tables_cardinality_stats 70 | WHERE IsUsedInReads = 1 OR IsUsedInWrites = 1 71 | """) 72 | 73 | df.display() 74 | 75 | # COMMAND ---------- 76 | 77 | # DBTITLE Register Unique Queries 78 | unqiue_queries = spark.sql(f"""SELECT * FROM {optimizer_location}.parsed_distinct_queries""") 79 | 
unqiue_queries.createOrReplaceTempView("unique_queries") 80 | 81 | # COMMAND ---------- 82 | 83 | # DBTITLE 1,Raw Query Runs Tables 84 | 85 | """ This table contains ALL queries for the monitored warehouses that have been run over time, so you can build all kinds of visualizations on that. These are NOT distinct queries, they are every single query run 86 | """ 87 | 88 | raw_queries_df = spark.sql(f""" 89 | 90 | SELECT *, 91 | from_unixtime(query_start_time_ms/1000) AS QueryStartTime, 92 | from_unixtime(query_end_time_ms/1000) AS QueryEndTime, 93 | duration/1000 AS QueryDurationSeconds 94 | FROM {optimizer_location}.raw_query_history_statistics 95 | 96 | """) 97 | 98 | raw_queries_df.createOrReplaceTempView("raw_queries") 99 | 100 | raw_queries_df.display() 101 | 102 | # COMMAND ---------- 103 | 104 | # DBTITLE 1,Most Expensive Queries in a all run history (user can add timestamp filter in a WHERE clause) 105 | # MAGIC %sql 106 | # MAGIC 107 | # MAGIC SELECT 108 | # MAGIC r.query_hash, 109 | # MAGIC r.query_text, 110 | # MAGIC SUM(r.duration/1000) AS TotalRuntimeOfQuery, 111 | # MAGIC AVG(r.duration/1000) AS AvgDurationOfQuery, 112 | # MAGIC COUNT(r.query_id) AS TotalRunsOfQuery, 113 | # MAGIC COUNT(r.query_id) / COUNT(DISTINCT date_trunc('day', QueryStartTime)) AS QueriesPerDay, 114 | # MAGIC SUM(r.duration/1000) / COUNT(DISTINCT date_trunc('day', QueryStartTime)) AS TotalRuntimePerDay 115 | # MAGIC FROM raw_queries r 116 | # MAGIC WHERE QueryStartTime >= (current_date() - 7) 117 | # MAGIC GROUP BY r.query_hash, r.query_text 118 | # MAGIC ORDER BY TotalRuntimePerDay DESC 119 | 120 | # COMMAND ---------- 121 | 122 | # DBTITLE 1,Query Runs Over Time - General 123 | # MAGIC %sql 124 | # MAGIC 125 | # MAGIC SELECT 126 | # MAGIC date_trunc('hour', QueryStartTime) AS Date, 127 | # MAGIC COUNT(query_id) AS TotalQueryRuns, 128 | # MAGIC AVG(QueryDurationSeconds) AS AvgQueryDurationSeconds 129 | # MAGIC FROM raw_queries 130 | # MAGIC GROUP BY date_trunc('hour', QueryStartTime) 131 | # MAGIC ORDER BY Date 132 | 133 | # COMMAND ---------- 134 | 135 | # DBTITLE 1,Top 10 Queries with Most Total Runtime Per Day (Duration * # times run) 136 | # MAGIC %sql 137 | # MAGIC 138 | # MAGIC WITH r AS ( 139 | # MAGIC SELECT 140 | # MAGIC date_trunc('day', r.QueryStartTime) AS Date, 141 | # MAGIC r.query_hash, 142 | # MAGIC SUM(r.duration/1000) AS TotalRuntimeOfQuery, 143 | # MAGIC AVG(r.duration/1000) AS AvgDurationOfQuery, 144 | # MAGIC COUNT(r.query_id) AS TotalRunsOfQuery 145 | # MAGIC FROM raw_queries r 146 | # MAGIC GROUP BY date_trunc('day', r.QueryStartTime), r.query_hash 147 | # MAGIC ), 148 | # MAGIC s as ( 149 | # MAGIC SELECT 150 | # MAGIC *, 151 | # MAGIC DENSE_RANK() OVER (PARTITION BY Date ORDER BY TotalRuntimeOfQuery DESC) AS PopularityRank 152 | # MAGIC FROM r 153 | # MAGIC ) 154 | # MAGIC SELECT 155 | # MAGIC uu.query_text, 156 | # MAGIC s.* 157 | # MAGIC FROM s 158 | # MAGIC LEFT JOIN unique_queries uu ON uu.query_hash = s.query_hash 159 | # MAGIC WHERE PopularityRank <= 10 160 | 161 | # COMMAND ---------- 162 | 163 | # DBTITLE 1,Top 10 Longest Running Queries By Day 164 | # MAGIC %sql 165 | # MAGIC 166 | # MAGIC WITH r AS ( 167 | # MAGIC SELECT 168 | # MAGIC date_trunc('day', r.QueryStartTime) AS Date, 169 | # MAGIC r.query_hash, 170 | # MAGIC SUM(r.duration/1000) AS TotalRuntimeOfQuery, 171 | # MAGIC AVG(r.duration/1000) AS AvgDurationOfQuery, 172 | # MAGIC COUNT(r.query_id) AS TotalRunsOfQuery 173 | # MAGIC FROM raw_queries r 174 | # MAGIC GROUP BY date_trunc('day', r.QueryStartTime), 
r.query_hash 175 | # MAGIC ), 176 | # MAGIC s as ( 177 | # MAGIC SELECT 178 | # MAGIC *, 179 | # MAGIC DENSE_RANK() OVER (PARTITION BY Date ORDER BY AvgDurationOfQuery DESC) AS PopularityRank 180 | # MAGIC FROM r 181 | # MAGIC ) 182 | # MAGIC SELECT 183 | # MAGIC uu.query_text, 184 | # MAGIC s.* 185 | # MAGIC FROM s 186 | # MAGIC LEFT JOIN unique_queries uu ON uu.query_hash = s.query_hash 187 | # MAGIC WHERE PopularityRank <= 10 188 | 189 | # COMMAND ---------- 190 | 191 | # DBTITLE 1,Top 10 Most OFTEN ran queries by Day 192 | # MAGIC %sql 193 | # MAGIC 194 | # MAGIC WITH r AS ( 195 | # MAGIC SELECT 196 | # MAGIC date_trunc('day', r.QueryStartTime) AS Date, 197 | # MAGIC r.query_hash, 198 | # MAGIC SUM(r.duration/1000) AS TotalRuntimeOfQuery, 199 | # MAGIC AVG(r.duration/1000) AS AvgDurationOfQuery, 200 | # MAGIC COUNT(r.query_id) AS TotalRunsOfQuery 201 | # MAGIC FROM raw_queries r 202 | # MAGIC GROUP BY date_trunc('day', r.QueryStartTime), r.query_hash 203 | # MAGIC ), 204 | # MAGIC s as ( 205 | # MAGIC SELECT 206 | # MAGIC *, 207 | # MAGIC DENSE_RANK() OVER (PARTITION BY Date ORDER BY TotalRunsOfQuery DESC) AS PopularityRank 208 | # MAGIC FROM r 209 | # MAGIC ) 210 | # MAGIC SELECT 211 | # MAGIC uu.query_text, 212 | # MAGIC s.* 213 | # MAGIC FROM s 214 | # MAGIC LEFT JOIN unique_queries uu ON uu.query_hash = s.query_hash 215 | # MAGIC WHERE PopularityRank <= 10 216 | 217 | # COMMAND ---------- 218 | 219 | # DBTITLE 1,Most Expensive Table MERGE / DELETE operations 220 | writes_df = spark.sql(f""" 221 | 222 | SELECT * 223 | FROM {optimizer_location}.write_statistics_merge_predicate 224 | 225 | """) 226 | 227 | writes_df.display() 228 | -------------------------------------------------------------------------------- /Delta Optimizer/deltaoptimizer-1.4.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Delta Optimizer/deltaoptimizer-1.4.1-py3-none-any.whl -------------------------------------------------------------------------------- /Delta Optimizer/deltaoptimizer-1.5.0-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Delta Optimizer/deltaoptimizer-1.5.0-py3-none-any.whl -------------------------------------------------------------------------------- /Delta Optimizer/deltaoptimizer-1.5.2-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Delta Optimizer/deltaoptimizer-1.5.2-py3-none-any.whl -------------------------------------------------------------------------------- /Design Patterns Notebooks/Advanced Notebooks/End to End Procedural Migration Pattern/Procedural Migration Pattern with SCD2 Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # End to End Procedural Programming for Data Warehousing Example for SCD2 Pipeline 5 | # MAGIC 6 | # MAGIC ## Overview: 7 | # MAGIC 8 | # MAGIC This notebook shows how to use the popular delta helper libraries to implement end to end procedural data warehousing using the following: 9 | # MAGIC 10 | # MAGIC 1. Simple Python + SQL for complex control flow 11 | # MAGIC 2. 
DeltaLogger for easy logging and error tracking 12 | # MAGIC 3. Serverless Client for Pushing SQL statements down to DBSQL Serverless Warehouse 13 | # MAGIC 4. Multi Statement Transaction Manager for SCD2 Multi statement upserts pushed to DBSQL Serverless 14 | # MAGIC 15 | # MAGIC 16 | # MAGIC ## Steps: 17 | # MAGIC 18 | # MAGIC 0. Initialize Logger 19 | # MAGIC 1. Create DDLS 20 | # MAGIC 2. COPY INTO Bronze Tables 21 | # MAGIC 3. MERGE Upserts (Multi Statement Transaction) 22 | # MAGIC 4. Operational / Historical Snapshots Gold Tables 23 | # MAGIC 5. Clean up staging tables 24 | # MAGIC 6. Complete / Fail Runs in Logger 25 | # MAGIC 26 | 27 | # COMMAND ---------- 28 | 29 | # DBTITLE 1,Medallion Architecture 30 | # MAGIC %md 31 | # MAGIC 32 | # MAGIC 33 | 34 | # COMMAND ---------- 35 | 36 | # DBTITLE 1,Optional, can build these libraries into wheel 37 | # MAGIC %pip install sqlglot 38 | 39 | # COMMAND ---------- 40 | 41 | # DBTITLE 1,Available Libraries for Procedural Management 42 | from helperfunctions.deltalogger import DeltaLogger ## Easy logging OOTB 43 | from helperfunctions.dbsqlclient import ServerlessClient ## Push Statement down to DBSQL from anyhere spark.sql() ==> serverless_client.sql() 44 | from helperfunctions.dbsqltransactions import DBSQLTransactionManager ## OOTB Multi-statement transactions to serverless SQL / DBSQL 45 | from helperfunctions.deltahelpers import DeltaHelpers, DeltaMergeHelpers ## For Temp Tables and Concurrent Merge statements 46 | 47 | # COMMAND ---------- 48 | 49 | # DBTITLE 1,Scope Session 50 | # MAGIC %sql 51 | # MAGIC CREATE DATABASE IF NOT EXISTS main.iot_dashboard; 52 | # MAGIC USE CATALOG main; 53 | # MAGIC USE SCHEMA iot_dashboard; 54 | 55 | # COMMAND ---------- 56 | 57 | # DBTITLE 1,Step 0: Initialize Logger and Serverless Client 58 | WAREHOUSE_ID = "475b94ddc7cd5211" 59 | HOST_NAME = "e2-demo-field-eng.cloud.databricks.com" 60 | #TOKEN = 61 | LOGGER_TABLE = 'main.iot_dashboard.logger' 62 | PIPELINE_PROCESS_NAME = 'iot_dashboard_scd2_end_to_end' 63 | 64 | ## Create Serverless Client 65 | serverless_client = ServerlessClient(warehouse_id=WAREHOUSE_ID, host_name=HOST_NAME) #token=TOKEN 66 | 67 | ## Create Delta Logger 68 | delta_logger = DeltaLogger(logger_table_name=LOGGER_TABLE, session_process_name=PIPELINE_PROCESS_NAME) # partition_cols=['start_date'], session_batch_id="12309821345" 69 | 70 | ## Optionally create transaction manager for multi statement transaction requirements (like SCD2 upserts) 71 | serverless_transaction_manager = DBSQLTransactionManager(warehouse_id=WAREHOUSE_ID, host_name=HOST_NAME) 72 | 73 | # COMMAND ---------- 74 | 75 | print(delta_logger.active_process_name) 76 | print(delta_logger.active_run_id) 77 | print(delta_logger.active_batch_id) 78 | 79 | # COMMAND ---------- 80 | 81 | # DBTITLE 1,Start Run with Delta Logger 82 | delta_logger.start_run(process_name='copy_into_command', batch_id="custom_batch_id") 83 | 84 | # COMMAND ---------- 85 | 86 | print(delta_logger.active_run_start_ts) 87 | print(delta_logger.active_run_status) 88 | 89 | # COMMAND ---------- 90 | 91 | # DBTITLE 1,Step 1: Create DDLs 92 | ddl_sql = """CREATE TABLE IF NOT EXISTS main.iot_dashboard.bronze_sensors_scd_2 93 | ( 94 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 95 | device_id INT, 96 | user_id INT, 97 | calories_burnt DECIMAL(10,2), 98 | miles_walked DECIMAL(10,2), 99 | num_steps DECIMAL(10,2), 100 | timestamp TIMESTAMP, 101 | value STRING, 102 | ingest_timestamp TIMESTAMP 103 | ) 104 | USING DELTA 105 | ; 106 | 107 | CREATE TABLE IF NOT 
EXISTS main.iot_dashboard.silver_sensors_scd_2 108 | ( 109 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 110 | device_id INT, 111 | user_id INT, 112 | calories_burnt DECIMAL(10,2), 113 | miles_walked DECIMAL(10,2), 114 | num_steps DECIMAL(10,2), 115 | timestamp TIMESTAMP, 116 | value STRING, 117 | ingest_timestamp TIMESTAMP, 118 | -- Processing Columns 119 | _start_timestamp TIMESTAMP, 120 | _end_timestamp TIMESTAMP, 121 | _batch_run_id STRING, 122 | _is_current BOOLEAN 123 | ) 124 | USING DELTA 125 | PARTITIONED BY (_is_current, user_id) 126 | TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported', 'delta.columnMapping.mode' = 'name') 127 | ; 128 | """ 129 | 130 | ## Simple Control Flow with Python, if/else, try/catch, for/while 131 | try: 132 | 133 | 134 | serverless_client.submit_multiple_sql_commands(ddl_sql) 135 | delta_logger.log_run_info(log_level='INFO', msg= f'DDLQuery runtime: 1 seconds') 136 | 137 | 138 | except Exception as e: 139 | 140 | delta_logger.log_run_info(log_level='CRITICAL', msg='Failed to create DDLS with error') 141 | 142 | raise(e) 143 | 144 | 145 | 146 | # COMMAND ---------- 147 | 148 | # DBTITLE 1,Step 2: Incrementally Ingest Source Data from Raw Files 149 | 150 | copy_into_sql = """ 151 | COPY INTO main.iot_dashboard.bronze_sensors_scd_2 152 | FROM (SELECT 153 | id::bigint AS Id, 154 | device_id::integer AS device_id, 155 | user_id::integer AS user_id, 156 | calories_burnt::decimal(10,2) AS calories_burnt, 157 | miles_walked::decimal(10,2) AS miles_walked, 158 | num_steps::decimal(10,2) AS num_steps, 159 | timestamp::timestamp AS timestamp, 160 | value AS value, -- This is a JSON object, 161 | now() AS ingest_timestamp 162 | FROM "/databricks-datasets/iot-stream/data-device/") 163 | FILEFORMAT = json -- csv, xml, txt, parquet, binary, etc. 164 | COPY_OPTIONS('force'='true') --'true' always loads all data it sees. 
option to be incremental or always load all files 165 | """ 166 | 167 | ## Simple Control Flow with Python, if/else, try/catch, for/while 168 | try: 169 | 170 | serverless_client.sql(copy_into_sql) 171 | 172 | batch_row_count = serverless_client.sql("SELECT COUNT(0) FROM main.iot_dashboard.bronze_sensors_scd_2").collect()[0][0] 173 | 174 | ## Log customer queryable metrics 175 | delta_logger.log_run_metric(run_metrics_dict={"Batch_Rows": batch_row_count}) 176 | 177 | delta_logger.log_run_info(msg = 'COPY INTO complete') 178 | 179 | except Exception as e: 180 | 181 | delta_logger.log_run_info(log_level='CRITICAL', msg='Failed to COPY INTO with error') 182 | raise(e) 183 | 184 | # COMMAND ---------- 185 | 186 | # DBTITLE 1,Step 3: Multi Statement Transaction: Perform SCD2 INSERT ONLY Upserts - Device Data 187 | mst_scd_transaction_sql = """ 188 | 189 | CREATE OR REPLACE TABLE main.iot_dashboard.temp_batch_to_insert 190 | AS 191 | WITH de_dup ( 192 | SELECT Id::integer, 193 | device_id::integer, 194 | user_id::integer, 195 | calories_burnt::decimal, 196 | miles_walked::decimal, 197 | num_steps::decimal, 198 | timestamp::timestamp, 199 | value::string, 200 | ingest_timestamp, 201 | ROW_NUMBER() OVER(PARTITION BY device_id, user_id, timestamp ORDER BY ingest_timestamp DESC) AS DupRank 202 | FROM main.iot_dashboard.bronze_sensors_scd_2 203 | ) 204 | 205 | SELECT Id, device_id, user_id, calories_burnt, miles_walked, num_steps, timestamp, value, ingest_timestamp, 206 | now() AS _start_timestamp, 207 | true AS _is_current, 208 | 1 AS _batch_run_id -- example batch run id 209 | FROM de_dup 210 | WHERE DupRank = 1 211 | ; 212 | 213 | MERGE INTO main.iot_dashboard.silver_sensors_scd_2 AS target 214 | USING ( 215 | 216 | SELECT updates.Id AS merge_key_id, 217 | updates.user_id AS merge_key_user_id, 218 | updates.device_id AS merge_key_device_id, 219 | updates.* --merge key can be built in whatever way makes sense to get unique rows 220 | FROM main.iot_dashboard.temp_batch_to_insert AS updates 221 | 222 | UNION ALL 223 | 224 | -- These rows will INSERT updated rows of existing records and new rows 225 | -- Setting the merge_key to NULL forces these rows to NOT MATCH and be INSERTed. 
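-- (Why the two source branches: the keyed rows above can only MATCH, which lets WHEN MATCHED expire
--  the old current row; the NULL-key rows that follow can only NOT MATCH, so each changed record is
--  re-inserted as its new current version.)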
226 | SELECT 227 | NULL AS merge_key_id, 228 | NULL AS merge_key_user_id, 229 | NULL AS merge_key_device_id, 230 | updates.* 231 | FROM main.iot_dashboard.temp_batch_to_insert AS updates 232 | INNER JOIN main.iot_dashboard.silver_sensors_scd_2 as target_table 233 | ON updates.Id = target_table.Id 234 | AND updates.user_id = target_table.user_id 235 | AND updates.device_id = target_table.device_id -- What makes the key unique 236 | -- This needs to be accounted for when deciding to expire existing rows 237 | WHERE updates.value <> target_table.value -- Only update if any of the data has changed 238 | 239 | ) AS source 240 | 241 | ON target.Id = source.merge_key_id 242 | AND target.user_id = source.merge_key_user_id 243 | AND target.device_id = source.merge_key_device_id 244 | 245 | WHEN MATCHED AND (target._is_current = true AND target.value <> source.value) THEN 246 | UPDATE SET 247 | target._end_timestamp = source._start_timestamp, -- start of new record is end of old record 248 | target._is_current = false 249 | 250 | WHEN NOT MATCHED THEN 251 | INSERT (id, device_id, user_id, calories_burnt, miles_walked, num_steps, value, timestamp, ingest_timestamp, _start_timestamp, _end_timestamp, _is_current, _batch_run_id) 252 | VALUES ( 253 | source.id, source.device_id, source.user_id, source.calories_burnt, source.miles_walked, source.num_steps, source.value, source.timestamp, 254 | source.ingest_timestamp, 255 | source._start_timestamp, -- start timestamp -- new records 256 | NULL ,-- end_timestamp 257 | source._is_current, -- is current record 258 | source._batch_run_id --example batch run id 259 | ) 260 | ; 261 | """ 262 | 263 | 264 | ## Simple Control Flow with Python, if/else, try/catch, for/while 265 | 266 | try: 267 | 268 | #serverless_client.submit_multiple_sql_commands(mst_scd_transaction_sql) 269 | serverless_transaction_manager = DBSQLTransactionManager(warehouse_id=WAREHOUSE_ID) 270 | serverless_transaction_manager.execute_dbsql_transaction(sql_string=str(mst_scd_transaction_sql), tables_to_manage=['main.iot_dashboard.temp_batch_to_insert', 'main.iot_dashboard.silver_sensors_scd_2']) 271 | 272 | except Exception as e: 273 | 274 | delta_logger.fail_run() 275 | 276 | raise(e) 277 | 278 | # COMMAND ---------- 279 | 280 | # DBTITLE 1,Step 4: Clean up and Optimize Tables 281 | 282 | ## Simple Control Flow with Python, if/else, try/catch, for/while 283 | try: 284 | 285 | serverless_client.sql("TRUNCATE TABLE main.iot_dashboard.temp_batch_to_insert") 286 | delta_logger.log_run_info(msg='Batch cleared!') 287 | 288 | except Exception as e: 289 | 290 | delta_logger.log_run_info(log_level='INFO', msg='couldnt find table, was already deleted') 291 | 292 | 293 | ## Optimize command 294 | try: 295 | 296 | serverless_client.sql("OPTIMIZE main.iot_dashboard.silver_sensors_scd_2 ZORDER BY (timestamp, device_id)") 297 | 298 | delta_logger.log_run_info(msg='Target tables optimized!') 299 | 300 | except Exception as e: 301 | 302 | ## For these operations, they are not critical to the pipeline successs, so just log the event and keep going 303 | delta_logger.log_run_info(log_level='WARN', msg='couldnt find table, this should exist or a conflect happens') 304 | raise(e) 305 | 306 | 307 | # COMMAND ---------- 308 | 309 | # DBTITLE 1,Create "Current" View 310 | gold_views_sql = """ 311 | CREATE OR REPLACE VIEW main.iot_dashboard.silver_sensors_current 312 | AS 313 | SELECT * FROM main.iot_dashboard.silver_sensors_scd_2 314 | WHERE _is_current = true; 315 | 316 | CREATE OR REPLACE VIEW 
main.iot_dashboard.silver_sensors_snapshot_as_of_2023_10_10_19_30_00 317 | AS 318 | -- Get more recent record for each record as of a specific version 319 | WITH de_dup ( 320 | SELECT Id::integer, 321 | device_id::integer, 322 | user_id::integer, 323 | calories_burnt::decimal, 324 | miles_walked::decimal, 325 | num_steps::decimal, 326 | timestamp::timestamp, 327 | value::string, 328 | ingest_timestamp, 329 | _start_timestamp, 330 | _is_current, 331 | _end_timestamp, 332 | ROW_NUMBER() OVER(PARTITION BY id ORDER BY _start_timestamp DESC) AS DupRank -- Get most recent record as of a specific point in time 333 | FROM main.iot_dashboard.silver_sensors_scd_2 334 | -- Point in time snapshot timestamp such as end of month 335 | WHERE _start_timestamp <= '2023-10-10T19:30:00'::timestamp 336 | ) 337 | 338 | SELECT * 339 | FROM de_dup 340 | WHERE DupRank = 1 341 | ; 342 | """ 343 | 344 | 345 | ## Optimize command 346 | try: 347 | 348 | serverless_client.submit_multiple_sql_commands(gold_views_sql) 349 | 350 | delta_logger.log_run_info(msg='Operational View Created!') 351 | 352 | except Exception as e: 353 | 354 | delta_logger.log_run_info(log_level='CRITICAL', msg='couldnt find table, all should exist') 355 | raise(e) 356 | 357 | 358 | # COMMAND ---------- 359 | 360 | # DBTITLE 1,Complete Run! 361 | delta_logger.complete_run() 362 | 363 | # COMMAND ---------- 364 | 365 | delta_logger.full_table_name 366 | 367 | # COMMAND ---------- 368 | 369 | # MAGIC %sql 370 | # MAGIC 371 | # MAGIC SELECT *, 372 | # MAGIC run_metadata:Batch_Rows -- our custom metrics we logged 373 | # MAGIC FROM main.iot_dashboard.logger 374 | # MAGIC ORDER BY run_id DESC 375 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Advanced Notebooks/Multi-plexing with Autoloader/Option 1: Actually Multi-plexing tables on write/Child Job Template.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Controller notebook 5 | # MAGIC 6 | # MAGIC Identifies and Orcestrates the sub jobs 7 | 8 | # COMMAND ---------- 9 | 10 | from pyspark.sql.functions import * 11 | from pyspark.sql.types import * 12 | 13 | # COMMAND ---------- 14 | 15 | # DBTITLE 1,Step 1: Logic to get unique list of events/sub directories that separate the different streams 16 | # Design considerations 17 | # Ideally the writer of the raw data will separate out event types by folder so you can use globPathFilters to create separate streams 18 | # If ALL events are in one data source, all streams will stream from 1 table and then will be filtered for that event in the stream. 
To avoid many file listings of the same file, enable useNotifications = true in autoloader 19 | 20 | # COMMAND ---------- 21 | 22 | # DBTITLE 1,Define Params 23 | dbutils.widgets.text("Input Root Path", "") 24 | dbutils.widgets.text("Parent Job Name", "") 25 | dbutils.widgets.text("Child Task Name", "") 26 | 27 | # COMMAND ---------- 28 | 29 | # DBTITLE 1,Get Params 30 | root_input_path = dbutils.widgets.get("Input Root Path") 31 | parent_job_name = dbutils.widgets.get("Parent Job Name") 32 | child_task_name = dbutils.widgets.get("Child Task Name") 33 | 34 | print(f"Root input path: {root_input_path}") 35 | print(f"Parent Job Name: {parent_job_name}") 36 | print(f"Event Task Name: {child_task_name}") 37 | 38 | # COMMAND ---------- 39 | 40 | # DBTITLE 1,Define Dynamic Checkpoint Path 41 | ## Eeach stream needs its own checkpoint, we can dynamically define that for each event/table we want to create / teast out 42 | 43 | checkpoint_path = f"dbfs:/checkpoints//{parent_job_name}/{child_task_name}/" 44 | 45 | # COMMAND ---------- 46 | 47 | # DBTITLE 1,Target Location Definitions 48 | spark.sql("""CREATE DATABASE IF NOT EXISTS iot_multiplexing_demo""") 49 | 50 | # COMMAND ---------- 51 | 52 | # DBTITLE 1,Use Whatever custom event filtering logic is needed 53 | filter_regex_string = "part-" + child_task_name + "*.json*" 54 | 55 | print(filter_regex_string) 56 | 57 | # COMMAND ---------- 58 | 59 | # DBTITLE 1,Read Stream 60 | input_df = (spark 61 | .readStream 62 | .format("text") 63 | .option("multiLine", "true") 64 | .option("pathGlobFilter", filter_regex_string) 65 | .load(root_input_path) 66 | .withColumn("inputFileName", input_file_name()) ## you can filter using .option("globPathFilter") as well here 67 | ) 68 | 69 | # COMMAND ---------- 70 | 71 | # DBTITLE 1,Transformation Logic on any events (can be conditional on event) 72 | transformed_df = (input_df 73 | .withColumn("EventName", lit(child_task_name)) 74 | .selectExpr("value:id::integer AS Id", 75 | "EventName", 76 | "value:user_id::integer AS UserId", 77 | "value:device_id::integer AS DeviceId", 78 | "value:num_steps::decimal AS NumberOfSteps", 79 | "value:miles_walked::decimal AS MilesWalked", 80 | "value:calories_burnt::decimal AS Calories", 81 | "value:timestamp::timestamp AS EventTimestamp", 82 | "current_timestamp() AS IngestionTimestamp", 83 | "inputFileName") 84 | 85 | ) 86 | 87 | # COMMAND ---------- 88 | 89 | # DBTITLE 1,Truncate this child stream and reload from all data 90 | 91 | dbutils.fs.rm(checkpoint_path, recurse=True) 92 | 93 | # COMMAND ---------- 94 | 95 | # DBTITLE 1,Dynamic Write Stream 96 | (transformed_df 97 | .writeStream 98 | .trigger(once=True) 99 | .option("checkpointLocation", checkpoint_path) 100 | .toTable(f"iot_multiplexing_demo.iot_stream_event_{child_task_name}") 101 | ) 102 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Advanced Notebooks/Parallel Custom Named File Exports/Parallel File Exports - Python Version.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Python Version 5 | # MAGIC ### Author: Cody Austin Davis 6 | # MAGIC ### Date: 2/22/2023 7 | # MAGIC 8 | # MAGIC This notebook shows users how to rename/move files from one s3 location/file name to another in parallel using spark. 
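# MAGIC
# MAGIC As a point of reference before the parallel version: for a handful of files the same move can be done serially on the driver with `dbutils.fs.mv` (a minimal sketch below, reusing the demo bucket path written later in this notebook); the UDF approach that follows is what scales once the file count grows.

# COMMAND ----------

# DBTITLE 1,Serial baseline: dbutils.fs.mv for a small number of files
## Minimal driver-only baseline (sketch). Uses the same demo bucket path as the rest of this notebook;
## adjust to your own location.
source_root = "s3://oetrta/codyaustindavis/parallelfile_source/"

for f in dbutils.fs.ls(source_root):
    ## skip directories (their names end with "/") and commit/metadata files
    if f.name.endswith("/") or f.name.startswith("_"):
        continue
    ## same rename convention as getRenamedFilePath below: drop the file into a /renamed/ subfolder
    dbutils.fs.mv(f.path, source_root + "renamed/" + f.name)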
9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %pip install boto 13 | 14 | # COMMAND ---------- 15 | 16 | # DBTITLE 1,Define Function To Dynamically Rename File Paths 17 | @udf("string") 18 | def getRenamedFilePath(source_path): 19 | 20 | source_path_root = "/".join(source_path.split("/")[:-1]) 21 | source_path_file_name = "".join(source_path.split("/")[-1]) 22 | 23 | ## insert any arbitrary file renaming logic here 24 | new_path_file_name = "/renamed/" + source_path_file_name 25 | 26 | new_path = source_path_root + new_path_file_name 27 | 28 | return new_path 29 | 30 | # COMMAND ---------- 31 | 32 | # DBTITLE 1,Create test data set not in dbutils 33 | (spark.read.json("dbfs:/databricks-datasets/iot-stream/data-device/") 34 | .write.format("json").mode("overwrite").save('s3://oetrta/codyaustindavis/parallelfile_source/') 35 | ) 36 | 37 | # COMMAND ---------- 38 | 39 | # DBTITLE 1,Define python udf that renames / copies files in parallel with boto or HTTP request 40 | from pyspark.sql.functions import * 41 | import boto3 42 | 43 | ## This UDF can be adjusted to accept access keys as another parameter 44 | 45 | @udf("string") 46 | def mv_s3_object(source_path, target_path): 47 | 48 | ## Get SOURCE bucket name and source path separately for boto 49 | source_bucket_name = "/".join(source_path.split("/")[0:3]).split("//")[1] 50 | source_file_path = "/".join(source_path.split("/")[3:]) 51 | 52 | ## Get TARGET bucket name and source path separately for boto 53 | target_bucket_name = "/".join(target_path.split("/")[0:3]).split("//")[1] 54 | target_file_path = "/".join(target_path.split("/")[3:]) 55 | 56 | ## Prep boto request copy params 57 | source_dict = {'Bucket': source_bucket_name, 'Key': source_file_path} 58 | 59 | ## Try copying the file over, return SUCCESS or error message in pyspark data frame 60 | s3 = boto3.resource('s3') 61 | msg = 'NOOP' 62 | try: 63 | s3.Object(target_bucket_name, target_file_path).copy_from(CopySource=source_dict) 64 | ## This delete is optional, you might want to separate this out into another job. 
This just represents the 2 commands to simulate a "move" 65 | s3.Object(source_bucket_name, source_file_path).delete() 66 | 67 | msg = 'SUCCESS' 68 | 69 | except Exception as e: 70 | msg = f'FAIL: {str(e)} \n BUCKET: {source_bucket_name}, SOURCE: {source_file_path}, TARGET: {target_file_path}' 71 | 72 | return msg 73 | 74 | # COMMAND ---------- 75 | 76 | # DBTITLE 1,Chose a source path (either dynamically or manually) and move / rename files with the udfs in parallel 77 | input_path_to_move = 's3://oetrta/codyaustindavis/parallelfile_source/' 78 | 79 | filesDf = (spark.createDataFrame(dbutils.fs.ls(input_path_to_move)) 80 | .filter(~col("name").startswith("_")) ## exclude out-of-scope files 81 | .withColumn("target_path", getRenamedFilePath(col("path"))) ## Python udf to create the new file path with any logic inside function 82 | .selectExpr("path AS source_path", "target_path") ## select 2 paths needed 83 | .withColumn("WasMoved", mv_s3_object(col("source_path"), col("target_path"))) ## Push source and target paths to udf to execute in parallel and return msg 84 | ) 85 | 86 | display(filesDf) 87 | 88 | # COMMAND ---------- 89 | 90 | # DBTITLE 1,Confirm rename 91 | dbutils.fs.ls("s3://oetrta/codyaustindavis/parallelfile_source/renamed/" 92 | ) 93 | 94 | # COMMAND ---------- 95 | 96 | # MAGIC %md 97 | # MAGIC 98 | # MAGIC # Scala Version 99 | 100 | # COMMAND ---------- 101 | 102 | # DBTITLE 1,Define scala file renaming function 103 | # MAGIC %scala 104 | # MAGIC 105 | # MAGIC 106 | # MAGIC def getNewFilePath(sourcePath: String): String = { 107 | # MAGIC val source_path = sourcePath; 108 | # MAGIC 109 | # MAGIC val slice_len = source_path.split("/").length - 1; 110 | # MAGIC val source_path_root = source_path.split("/").slice(0, slice_len); 111 | # MAGIC val source_path_file_name = source_path.split("/").last; 112 | # MAGIC 113 | # MAGIC // any arbitrary file rename logic 114 | # MAGIC val new_path_file_name = "renamed/"+source_path_file_name; 115 | # MAGIC 116 | # MAGIC 117 | # MAGIC val new_path = source_path_root.mkString("/") + "/" + new_path_file_name; 118 | # MAGIC 119 | # MAGIC return new_path 120 | # MAGIC } 121 | 122 | # COMMAND ---------- 123 | 124 | # DBTITLE 1,Test Scala File Renaming function 125 | # MAGIC %scala 126 | # MAGIC 127 | # MAGIC val test_new_path = getNewFilePath("dbfs:/databricks-datasets/iot-stream/data-device/part-00003.json.gz") 128 | # MAGIC 129 | # MAGIC println(test_new_path) 130 | 131 | # COMMAND ---------- 132 | 133 | # DBTITLE 1,Broadcast Configs to Executors 134 | # MAGIC %scala 135 | # MAGIC import org.apache.hadoop.fs 136 | # MAGIC 137 | # MAGIC // maybe we need to register access keys here? not sure yet. 
Still dealing with Auth issues 138 | # MAGIC val conf = new org.apache.spark.util.SerializableConfiguration(sc.hadoopConfiguration) 139 | # MAGIC 140 | # MAGIC val broadcastConf = sc.broadcast(conf) 141 | # MAGIC 142 | # MAGIC print(conf.value) 143 | 144 | # COMMAND ---------- 145 | 146 | # DBTITLE 1,Run file renaming and moving for each row (need to add AUTH) 147 | # MAGIC %scala 148 | # MAGIC 149 | # MAGIC import org.apache.hadoop.fs._ 150 | # MAGIC 151 | # MAGIC // root bucket of where original files were dropped 152 | # MAGIC val filesToCopy = dbutils.fs.ls("dbfs:/databricks-datasets/iot-stream/data-device/").map(_.path) 153 | # MAGIC 154 | # MAGIC spark.sparkContext.parallelize(filesToCopy).foreachPartition(rows => rows.foreach { 155 | # MAGIC 156 | # MAGIC file => 157 | # MAGIC 158 | # MAGIC println(file) 159 | # MAGIC val fromPath = new Path(file) 160 | # MAGIC 161 | # MAGIC val tempNewPath = getNewFilePath(file) 162 | # MAGIC 163 | # MAGIC val toPath = new Path(tempNewPath) 164 | # MAGIC 165 | # MAGIC val fromFs = toPath.getFileSystem(conf.value) 166 | # MAGIC 167 | # MAGIC val toFs = toPath.getFileSystem(conf.value) 168 | # MAGIC 169 | # MAGIC FileUtil.copy(fromFs, fromPath, toFs, toPath, false, conf.value) 170 | # MAGIC 171 | # MAGIC }) 172 | 173 | # COMMAND ---------- 174 | 175 | # DBTITLE 1,Look at files to Copy 176 | # MAGIC %scala 177 | # MAGIC 178 | # MAGIC val filesToCopy = dbutils.fs.ls("dbfs:/databricks-datasets/iot-stream/data-device/").map(_.path) 179 | # MAGIC 180 | # MAGIC 181 | # MAGIC val filesDf = spark.sparkContext.parallelize(filesToCopy).toDF() 182 | # MAGIC 183 | # MAGIC display(filesDf) 184 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Advanced Notebooks/Parallel Custom Named File Exports/Parallel File Exports.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,helper function to dynamically build target path for each file 3 | # MAGIC %scala 4 | # MAGIC 5 | # MAGIC 6 | # MAGIC def getNewFilePath(sourcePath: String): String = { 7 | # MAGIC val source_path = sourcePath; 8 | # MAGIC 9 | # MAGIC val slice_len = source_path.split("/").length - 1; 10 | # MAGIC val source_path_root = source_path.split("/").slice(0, slice_len); 11 | # MAGIC val source_path_file_name = source_path.split("/").last; 12 | # MAGIC 13 | # MAGIC // any arbitrary file rename logic 14 | # MAGIC val new_path_file_name = "renamed/"+source_path_file_name; 15 | # MAGIC val new_path = source_path_root.mkString("/") + "/" + new_path_file_name; 16 | # MAGIC 17 | # MAGIC return new_path 18 | # MAGIC } 19 | 20 | # COMMAND ---------- 21 | 22 | # DBTITLE 1,Test New Function to dynamically build target path for each row (file) 23 | # MAGIC %scala 24 | # MAGIC 25 | # MAGIC val test_new_path = getNewFilePath("dbfs:/databricks-datasets/iot-stream/data-device/part-00003.json.gz") 26 | # MAGIC 27 | # MAGIC println(test_new_path) 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %scala 32 | # MAGIC import org.apache.hadoop.fs 33 | # MAGIC 34 | # MAGIC // maybe we need to register access keys here? not sure yet. 
Still dealing with Auth issues 35 | # MAGIC val conf = new org.apache.spark.util.SerializableConfiguration(sc.hadoopConfiguration) 36 | # MAGIC 37 | # MAGIC val broadcastConf = sc.broadcast(conf) 38 | # MAGIC 39 | # MAGIC print(conf.value) 40 | 41 | # COMMAND ---------- 42 | 43 | # MAGIC %scala 44 | # MAGIC 45 | # MAGIC import org.apache.hadoop.fs._ 46 | # MAGIC 47 | # MAGIC // root bucket of where original files were dropped 48 | # MAGIC val filesToCopy = dbutils.fs.ls("dbfs:/databricks-datasets/iot-stream/data-device/").map(_.path) 49 | # MAGIC 50 | # MAGIC spark.sparkContext.parallelize(filesToCopy).foreachPartition(rows => rows.foreach { 51 | # MAGIC 52 | # MAGIC file => 53 | # MAGIC 54 | # MAGIC println(file) 55 | # MAGIC val fromPath = new Path(file) 56 | # MAGIC 57 | # MAGIC val tempNewPath = getNewFilePath(file) 58 | # MAGIC 59 | # MAGIC val toPath = new Path(tempNewPath) 60 | # MAGIC 61 | # MAGIC val fromFs = toPath.getFileSystem(conf.value) 62 | # MAGIC 63 | # MAGIC val toFs = toPath.getFileSystem(conf.value) 64 | # MAGIC 65 | # MAGIC FileUtil.copy(fromFs, fromPath, toFs, toPath, false, conf.value) 66 | # MAGIC 67 | # MAGIC }) 68 | 69 | # COMMAND ---------- 70 | 71 | # MAGIC %scala 72 | # MAGIC 73 | # MAGIC val filesToCopy = dbutils.fs.ls("dbfs:/databricks-datasets/iot-stream/data-device/").map(_.path) 74 | # MAGIC 75 | # MAGIC 76 | # MAGIC val filesDf = spark.sparkContext.parallelize(filesToCopy).toDF() 77 | # MAGIC 78 | # MAGIC display(filesDf) 79 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 1 - SQL EDW Pipeline.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC # This notebook generates a full data pipeline from databricks dataset - iot-stream 5 | -- MAGIC 6 | -- MAGIC ## This creates 2 tables: 7 | -- MAGIC 8 | -- MAGIC Database: iot_dashboard 9 | -- MAGIC 10 | -- MAGIC Tables: silver_sensors, silver_users 11 | -- MAGIC 12 | -- MAGIC Params: StartOver (Yes/No) - allows user to truncate and reload pipeline 13 | 14 | -- COMMAND ---------- 15 | 16 | -- DBTITLE 1,Medallion Architecture 17 | -- MAGIC %md 18 | -- MAGIC 19 | -- MAGIC 20 | 21 | -- COMMAND ---------- 22 | 23 | DROP DATABASE IF EXISTS iot_dashboard CASCADE; 24 | CREATE DATABASE IF NOT EXISTS iot_dashboard; 25 | USE iot_dashboard; 26 | 27 | -- COMMAND ---------- 28 | 29 | -- MAGIC %md 30 | -- MAGIC 31 | -- MAGIC # DDL Documentation: 32 | -- MAGIC 33 | -- MAGIC https://docs.databricks.com/spark/latest/spark-sql/language-manual/sql-ref-syntax-ddl-alter-table.html 34 | 35 | -- COMMAND ---------- 36 | 37 | CREATE TABLE IF NOT EXISTS iot_dashboard.bronze_sensors 38 | ( 39 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 40 | device_id INT, 41 | user_id INT, 42 | calories_burnt DECIMAL(10,2), 43 | miles_walked DECIMAL(10,2), 44 | num_steps DECIMAL(10,2), 45 | timestamp TIMESTAMP, 46 | value STRING 47 | ) 48 | USING DELTA 49 | TBLPROPERTIES("delta.targetFileSize"="128mb") 50 | -- Other helpful properties 51 | -- delta.dataSkippingNumIndexedCols -- decides how many columns are automatically tracked with statistics kepts (defaults to first 32) 52 | -- LOCATION "s3://bucket-name/data_lakehouse/tables/data/bronze/bronze_senors/" 53 | ; 54 | 55 | -- COMMAND ---------- 56 | 57 | -- DBTITLE 1,Look at Table Details 58 | DESCRIBE TABLE EXTENDED iot_dashboard.bronze_sensors 59 | 60 | -- COMMAND ---------- 61 | 62 | CREATE TABLE IF NOT EXISTS 
iot_dashboard.bronze_users 63 | ( 64 | userid BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1), 65 | gender STRING, 66 | age INT, 67 | height DECIMAL(10,2), 68 | weight DECIMAL(10,2), 69 | smoker STRING, 70 | familyhistory STRING, 71 | cholestlevs STRING, 72 | bp STRING, 73 | risk DECIMAL(10,2), 74 | update_timestamp TIMESTAMP 75 | ) 76 | USING DELTA 77 | TBLPROPERTIES("delta.targetFileSize"="128mb") 78 | --LOCATION s3:/// 79 | ; 80 | 81 | -- COMMAND ---------- 82 | 83 | -- MAGIC %md 84 | -- MAGIC ## Exhaustive list of all COPY INTO Options 85 | -- MAGIC https://docs.databricks.com/sql/language-manual/delta-copy-into.html#format-options-1 86 | 87 | -- COMMAND ---------- 88 | 89 | -- MAGIC %md 90 | -- MAGIC 91 | -- MAGIC ## New FEATURES IN DBR 11! 92 | -- MAGIC 93 | -- MAGIC 1. COPY INTO GENERIC TABLE 94 | -- MAGIC 2. DROP COLUMN STATEMENT 95 | -- MAGIC 3. Select all except: SELECT * EXCEPT (col1,...) FROM table 96 | -- MAGIC 97 | -- MAGIC https://docs.databricks.com/release-notes/runtime/11.0.html 98 | 99 | -- COMMAND ---------- 100 | 101 | 102 | --With DBR 11, we dont need to specify DDL first 103 | --CREATE TABLE IF NOT EXISTS iot_dashboard.bronze_sensors 104 | 105 | --COPY INTO iot_dashboard.bronze_sensors 106 | --FROM (SELECT 107 | -- id::bigint AS Id, 108 | -- device_id::integer AS device_id, 109 | -- user_id::integer AS user_id, 110 | -- calories_burnt::decimal(10,2) AS calories_burnt, 111 | -- miles_walked::decimal(10,2) AS miles_walked, 112 | -- num_steps::decimal(10,2) AS num_steps, 113 | -- timestamp::timestamp AS timestamp, 114 | -- value AS value -- This is a JSON object 115 | --FROM "/databricks-datasets/iot-stream/data-device/") 116 | --FILEFORMAT = json 117 | --COPY_OPTIONS('force'='true') -- 'false' -- process incrementally 118 | --option to be incremental or always load all files 119 | 120 | 121 | 122 | -- COMMAND ---------- 123 | 124 | -- DBTITLE 1,Incrementally Ingest Source Data from Raw Files 125 | COPY INTO iot_dashboard.bronze_sensors 126 | FROM (SELECT 127 | id::bigint AS Id, 128 | device_id::integer AS device_id, 129 | user_id::integer AS user_id, 130 | calories_burnt::decimal(10,2) AS calories_burnt, 131 | miles_walked::decimal(10,2) AS miles_walked, 132 | num_steps::decimal(10,2) AS num_steps, 133 | timestamp::timestamp AS timestamp, 134 | value AS value -- This is a JSON object 135 | FROM "/databricks-datasets/iot-stream/data-device/") 136 | FILEFORMAT = json -- csv, xml, txt, parquet, binary, etc. 137 | COPY_OPTIONS('force'='false') --'true' always loads all data it sees. option to be incremental or always load all files 138 | 139 | 140 | --Other Helpful copy options: 141 | /* 142 | PATTERN('[A-Za-z0-9].json') 143 | FORMAT_OPTIONS ('ignoreCorruptFiles' = 'true') -- skips bad files for more robust incremental loads 144 | COPY_OPTIONS ('mergeSchema' = 'true') 145 | 'ignoreChanges' = 'true' - ENSURE DOWNSTREAM PIPELINE CAN HANDLE DUPLICATE ALREADY PROCESSED RECORDS WITH MERGE/INSERT WHERE NOT EXISTS/Etc. 
146 | 'ignoreDeletes' = 'true' 147 | */; 148 | 149 | -- COMMAND ---------- 150 | 151 | -- DBTITLE 1,Create Silver Table for upserting updates 152 | CREATE TABLE IF NOT EXISTS iot_dashboard.silver_sensors 153 | ( 154 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 155 | device_id INT, 156 | user_id INT, 157 | calories_burnt DECIMAL(10,2), 158 | miles_walked DECIMAL(10,2), 159 | num_steps DECIMAL(10,2), 160 | timestamp TIMESTAMP, 161 | value STRING 162 | ) 163 | USING DELTA 164 | PARTITIONED BY (user_id) 165 | TBLPROPERTIES("delta.targetFileSize"="128mb") -- if update heavy, file sizes are great between 64-128 mbs. The more update heavy, the smaller the files (32-256mb) 166 | --LOCATION s3:/// -- Always specify location for production tables so you control where it lives in S3/ADLS/GCS 167 | -- Not specifying location parth will put table in DBFS, a managed bucket that cannot be accessed by apps outside of databricks 168 | ; 169 | 170 | -- COMMAND ---------- 171 | 172 | -- DBTITLE 1,Perform Upserts - Device Data 173 | MERGE INTO iot_dashboard.silver_sensors AS target 174 | USING ( 175 | WITH de_dup ( 176 | SELECT Id::integer, 177 | device_id::integer, 178 | user_id::integer, 179 | calories_burnt::decimal, 180 | miles_walked::decimal, 181 | num_steps::decimal, 182 | timestamp::timestamp, 183 | value::string, 184 | ROW_NUMBER() OVER(PARTITION BY device_id, user_id, timestamp ORDER BY timestamp DESC) AS DupRank 185 | FROM iot_dashboard.bronze_sensors 186 | ) 187 | 188 | SELECT Id, device_id, user_id, calories_burnt, miles_walked, num_steps, timestamp, value 189 | FROM de_dup 190 | WHERE DupRank = 1 191 | ) AS source 192 | ON source.Id = target.Id 193 | AND source.user_id = target.user_id 194 | AND source.device_id = target.device_id 195 | WHEN MATCHED THEN UPDATE SET 196 | target.calories_burnt = source.calories_burnt, 197 | target.miles_walked = source.miles_walked, 198 | target.num_steps = source.num_steps, 199 | target.timestamp = source.timestamp 200 | WHEN NOT MATCHED THEN INSERT *; 201 | 202 | -- This calculate table stats for all columns to ensure the optimizer can build the best plan 203 | -- THIS IS NOT INCREMENTAL 204 | ANALYZE TABLE iot_dashboard.silver_sensors COMPUTE STATISTICS FOR ALL COLUMNS; 205 | 206 | 207 | /* 208 | -- INCREMENTAL 209 | Two things will happen: 210 | 211 | 1. Files written into the table will be compacted into larger files - up to targetFileSize 212 | 2. Co-locate files by the ZORDER keys 213 | 214 | 215 | Choice Factors: 216 | 1. Use on column often utilized in joins, filters, etc. 217 | 2. 
High cardinality columns 218 | 219 | Recommended 1-3 columns, can do 5+ 220 | Order ZORDER cols in order of cardinality ascending 221 | 222 | */ 223 | 224 | OPTIMIZE iot_dashboard.silver_sensors ZORDER BY (timestamp); 225 | 226 | 227 | -- Truncate bronze batch once successfully loaded 228 | 229 | -- This is the classical batch design pattern - but we can also now use streaming tables 230 | 231 | TRUNCATE TABLE iot_dashboard.bronze_sensors; 232 | 233 | -- COMMAND ---------- 234 | 235 | DESCRIBE HISTORY iot_dashboard.silver_sensors 236 | 237 | -- COMMAND ---------- 238 | 239 | SELECT * FROM iot_dashboard.silver_sensors VERSION AS OF 1; 240 | 241 | -- COMMAND ---------- 242 | 243 | -- MAGIC %md 244 | -- MAGIC 245 | -- MAGIC ## Exhaustive list of optimizations on Delta Tables 246 | -- MAGIC https://docs.databricks.com/delta/optimizations/file-mgmt.html#set-a-target-size 247 | 248 | -- COMMAND ---------- 249 | 250 | -- MAGIC %md 251 | -- MAGIC 252 | -- MAGIC # Levels of optimization on Databricks 253 | -- MAGIC 254 | -- MAGIC ## Partitions - Do not over partition - usually ZORDERING covers what you need - even in big tables 255 | -- MAGIC ### File Sizes - smaller for BI heavy and update heavy tables 64mb to 128mb 256 | -- MAGIC #### Order of files -- ZORDER(col,col) / CLUSTER BY -- ZORDER on most used filtering/join columns, in order of cardinality like a funnel 257 | -- MAGIC ##### Indexes -- For highly selective queries - need to create index first then fill with data "needle in a haystack" 258 | 259 | -- COMMAND ---------- 260 | 261 | -- MAGIC %md 262 | -- MAGIC 263 | -- MAGIC ##### For partitions, make sure each partitions is at LEAST 10s of GB, otherwise, your partitions are too small 264 | 265 | -- COMMAND ---------- 266 | 267 | -- DBTITLE 1,Change Size of Files - will be changed when files are optimized 268 | ALTER TABLE iot_dashboard.silver_sensors SET TBLPROPERTIES ('delta.targetFileSize'='64mb'); 269 | 270 | -- COMMAND ---------- 271 | 272 | -- DBTITLE 1,Table Optimizations 273 | -- You want to optimize by high cardinality columns like ids, timestamps, strings 274 | OPTIMIZE iot_dashboard.silver_sensors ZORDER BY (device_id, timestamp); 275 | 276 | -- COMMAND ---------- 277 | 278 | -- MAGIC %md 279 | -- MAGIC 280 | -- MAGIC ## Details on Bloom Indexs Here: 281 | -- MAGIC https://docs.databricks.com/delta/optimizations/bloom-filters.html 282 | 283 | -- COMMAND ---------- 284 | 285 | --Bloom filters need to exist first, so if you add an index later you need to reprocess the files (an optimize, insert, etc.) 286 | --Ideally a column that is highly selective but not used in z-order (text, other timestamps, etc.) 287 | 288 | CREATE BLOOMFILTER INDEX 289 | ON TABLE iot_dashboard.silver_sensors 290 | FOR COLUMNS(device_id OPTIONS (fpp=0.1, numItems=50000000)) 291 | 292 | -- COMMAND ---------- 293 | 294 | -- DBTITLE 1,Select Semi Structured/Unstructred Data with JSON dot notation 295 | SELECT 296 | *, 297 | value:user_id::integer AS parsed_user, 298 | value:time_stamp::timestamp AS parsed_time -- Pro tip: You can do the same thing if reading in json via the text reader. 
Makes for highly flexible data ingestion 299 | FROM iot_dashboard.silver_sensors; 300 | 301 | -- COMMAND ---------- 302 | 303 | -- MAGIC %md 304 | -- MAGIC 305 | -- MAGIC ## Ingest User Data As Well 306 | 307 | -- COMMAND ---------- 308 | 309 | -- DBTITLE 1,Incrementally Ingest Raw User Data 310 | COPY INTO iot_dashboard.bronze_users 311 | FROM (SELECT 312 | userid::bigint AS userid, 313 | gender AS gender, 314 | age::integer AS age, 315 | height::decimal(10,2) AS height, 316 | weight::decimal(10,2) AS weight, 317 | smoker AS smoker, 318 | familyhistory AS familyhistory, 319 | cholestlevs AS cholestlevs, 320 | bp AS bp, 321 | risk::decimal(10,2) AS risk, 322 | current_timestamp() AS update_timestamp 323 | FROM "/databricks-datasets/iot-stream/data-user/") 324 | FILEFORMAT = CSV 325 | FORMAT_OPTIONS('header'='true') 326 | COPY_OPTIONS('force'='true') --option to be incremental or always load all files 327 | ; 328 | 329 | -- COMMAND ---------- 330 | 331 | CREATE TABLE IF NOT EXISTS iot_dashboard.silver_users 332 | ( 333 | userid BIGINT GENERATED BY DEFAULT AS IDENTITY, 334 | gender STRING, 335 | age INT, 336 | height DECIMAL(10,2), 337 | weight DECIMAL(10,2), 338 | smoker STRING, 339 | familyhistory STRING, 340 | cholestlevs STRING, 341 | bp STRING, 342 | risk DECIMAL(10,2), 343 | update_timestamp TIMESTAMP 344 | ) 345 | USING DELTA 346 | TBLPROPERTIES("delta.targetFileSize"="128mb") 347 | --LOCATION s3:/// -- Always specify path for production tables. 348 | ; 349 | 350 | -- COMMAND ---------- 351 | 352 | MERGE INTO iot_dashboard.silver_users AS target 353 | USING (SELECT 354 | userid::int, 355 | gender::string, 356 | age::int, 357 | height::decimal, 358 | weight::decimal, 359 | smoker, 360 | familyhistory, 361 | cholestlevs, 362 | bp, 363 | risk, 364 | update_timestamp 365 | FROM iot_dashboard.bronze_users) AS source 366 | ON source.userid = target.userid 367 | WHEN MATCHED THEN UPDATE SET 368 | target.gender = source.gender, 369 | target.age = source.age, 370 | target.height = source.height, 371 | target.weight = source.weight, 372 | target.smoker = source.smoker, 373 | target.familyhistory = source.familyhistory, 374 | target.cholestlevs = source.cholestlevs, 375 | target.bp = source.bp, 376 | target.risk = source.risk, 377 | target.update_timestamp = source.update_timestamp 378 | WHEN NOT MATCHED THEN INSERT *; 379 | 380 | --Truncate bronze batch once successfully loaded 381 | TRUNCATE TABLE iot_dashboard.bronze_users; 382 | 383 | -- COMMAND ---------- 384 | 385 | OPTIMIZE iot_dashboard.silver_users ZORDER BY (userid); 386 | 387 | -- COMMAND ---------- 388 | 389 | SELECT * FROM iot_dashboard.silver_users; 390 | 391 | -- COMMAND ---------- 392 | 393 | SELECT * FROM iot_dashboard.silver_sensors; 394 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 10 - Lakehouse Federation.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # Using Lakehouse Federation for a single Pane of Glass 3 | 4 | ## Topics 5 | 6 | 1. How to use Lakehouse Federation 7 | 2. Setting up new database 8 | 3. Performance management / considerations 9 | 4. 
Limitations 10 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 11 - SQL Orchestration in Production.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | ## Orchestrating SQL Pipelines in Production 3 | 4 | 1. SQL Tasks Types 5 | 2. Airflow Operator 6 | 3. DBSQL REST API / Pushdown Client 7 | 4. Single Node Jobs pattern 8 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 12 - SCD2 - SQL EDW Pipeline.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC # This notebook generates a full data pipeline from databricks dataset - iot-stream using INSERT ONLY SCD-2 Architecture 5 | -- MAGIC 6 | -- MAGIC ## This creates 2 tables: 7 | -- MAGIC 8 | -- MAGIC Database: iot_dashboard 9 | -- MAGIC 10 | -- MAGIC Tables: silver_sensors_silver, silver_sensors_bronze (raw updates) 11 | -- MAGIC 12 | -- MAGIC Params: StartOver (Yes/No) - allows user to truncate and reload pipeline 13 | 14 | -- COMMAND ---------- 15 | 16 | -- DBTITLE 1,Medallion Architecture 17 | -- MAGIC %md 18 | -- MAGIC 19 | -- MAGIC 20 | 21 | -- COMMAND ---------- 22 | 23 | DROP DATABASE IF EXISTS iot_dashboard CASCADE; 24 | CREATE DATABASE IF NOT EXISTS iot_dashboard; 25 | USE iot_dashboard; 26 | 27 | -- COMMAND ---------- 28 | 29 | -- MAGIC %md 30 | -- MAGIC 31 | -- MAGIC # DDL Documentation: 32 | -- MAGIC 33 | -- MAGIC https://docs.databricks.com/spark/latest/spark-sql/language-manual/sql-ref-syntax-ddl-alter-table.html 34 | 35 | -- COMMAND ---------- 36 | 37 | CREATE TABLE IF NOT EXISTS iot_dashboard.bronze_sensors_scd_2 38 | ( 39 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 40 | device_id INT, 41 | user_id INT, 42 | calories_burnt DECIMAL(10,2), 43 | miles_walked DECIMAL(10,2), 44 | num_steps DECIMAL(10,2), 45 | timestamp TIMESTAMP, 46 | value STRING, 47 | ingest_timestamp TIMESTAMP 48 | ) 49 | USING DELTA 50 | TBLPROPERTIES("delta.targetFileSize"="128mb") 51 | -- Other helpful properties 52 | -- delta.dataSkippingNumIndexedCols -- decides how many columns are automatically tracked with statistics kepts (defaults to first 32) 53 | -- LOCATION "s3://bucket-name/data_lakehouse/tables/data/bronze/bronze_senors/" 54 | ; 55 | 56 | -- COMMAND ---------- 57 | 58 | -- DBTITLE 1,Look at Table Details 59 | DESCRIBE TABLE EXTENDED iot_dashboard.bronze_sensors_scd_2 60 | 61 | -- COMMAND ---------- 62 | 63 | -- MAGIC %md 64 | -- MAGIC 65 | -- MAGIC ## New FEATURES IN DBR 11! 66 | -- MAGIC 67 | -- MAGIC 1. COPY INTO GENERIC TABLE 68 | -- MAGIC 2. DROP COLUMN STATEMENT 69 | -- MAGIC 3. Select all except: SELECT * EXCEPT (col1,...) 
FROM table 70 | -- MAGIC 71 | -- MAGIC https://docs.databricks.com/release-notes/runtime/11.0.html 72 | 73 | -- COMMAND ---------- 74 | 75 | 76 | --With DBR 11, we dont need to specify DDL first 77 | --CREATE TABLE IF NOT EXISTS iot_dashboard.bronze_sensors 78 | 79 | --COPY INTO iot_dashboard.bronze_sensors 80 | --FROM (SELECT 81 | -- id::bigint AS Id, 82 | -- device_id::integer AS device_id, 83 | -- user_id::integer AS user_id, 84 | -- calories_burnt::decimal(10,2) AS calories_burnt, 85 | -- miles_walked::decimal(10,2) AS miles_walked, 86 | -- num_steps::decimal(10,2) AS num_steps, 87 | -- timestamp::timestamp AS timestamp, 88 | -- value AS value -- This is a JSON object 89 | --FROM "/databricks-datasets/iot-stream/data-device/") 90 | --FILEFORMAT = json 91 | --COPY_OPTIONS('force'='true') -- 'false' -- process incrementally 92 | --option to be incremental or always load all files 93 | 94 | 95 | 96 | -- COMMAND ---------- 97 | 98 | -- DBTITLE 1,Incrementally Ingest Source Data from Raw Files 99 | COPY INTO iot_dashboard.bronze_sensors_scd_2 100 | FROM (SELECT 101 | id::bigint AS Id, 102 | device_id::integer AS device_id, 103 | user_id::integer AS user_id, 104 | calories_burnt::decimal(10,2) AS calories_burnt, 105 | miles_walked::decimal(10,2) AS miles_walked, 106 | num_steps::decimal(10,2) AS num_steps, 107 | timestamp::timestamp AS timestamp, 108 | value AS value, -- This is a JSON object, 109 | now() AS ingest_timestamp 110 | FROM "/databricks-datasets/iot-stream/data-device/") 111 | FILEFORMAT = json -- csv, xml, txt, parquet, binary, etc. 112 | COPY_OPTIONS('force'='true') --'true' always loads all data it sees. option to be incremental or always load all files 113 | 114 | 115 | --Other Helpful copy options: 116 | /* 117 | PATTERN('[A-Za-z0-9].json') 118 | FORMAT_OPTIONS ('ignoreCorruptFiles' = 'true') -- skips bad files for more robust incremental loads 119 | COPY_OPTIONS ('mergeSchema' = 'true') 120 | 'ignoreChanges' = 'true' - ENSURE DOWNSTREAM PIPELINE CAN HANDLE DUPLICATE ALREADY PROCESSED RECORDS WITH MERGE/INSERT WHERE NOT EXISTS/Etc. 121 | 'ignoreDeletes' = 'true' 122 | */; 123 | 124 | -- COMMAND ---------- 125 | 126 | -- DBTITLE 1,Create Silver Table for upserting updates 127 | CREATE TABLE IF NOT EXISTS iot_dashboard.silver_sensors_scd_2 128 | ( 129 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 130 | device_id INT, 131 | user_id INT, 132 | calories_burnt DECIMAL(10,2), 133 | miles_walked DECIMAL(10,2), 134 | num_steps DECIMAL(10,2), 135 | timestamp TIMESTAMP, 136 | value STRING, 137 | ingest_timestamp TIMESTAMP, 138 | -- Processing Columns 139 | _start_timestamp TIMESTAMP, 140 | _end_timestamp TIMESTAMP, 141 | _batch_run_id STRING, 142 | _is_current BOOLEAN 143 | ) 144 | USING DELTA 145 | PARTITIONED BY (_is_current) 146 | TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported', 'delta.columnMapping.mode' = 'name') -- if update heavy, file sizes are great between 64-128 mbs. 
The more update heavy, the smaller the files (32-256mb) 147 | --LOCATION s3:/// -- Always specify location for production tables so you control where it lives in S3/ADLS/GCS 148 | -- Not specifying location parth will put table in DBFS, a managed bucket that cannot be accessed by apps outside of databricks 149 | ; 150 | 151 | -- COMMAND ---------- 152 | 153 | -- DBTITLE 1,Check incoming batch for data 154 | SELECT * FROM iot_dashboard.bronze_sensors_scd_2 155 | 156 | -- COMMAND ---------- 157 | 158 | -- DBTITLE 1,Perform SCD2 INSERT ONLY Upserts - Device Data 159 | -- Step 1 - get state of the active batch 160 | 161 | --DECLARE OR REPLACE VARIABLE var_batch_id STRING = uuid(); 162 | 163 | -- Optional intra-batch pre insert/merge de-cup 164 | CREATE OR REPLACE TABLE iot_dashboard.temp_batch_to_insert 165 | AS 166 | WITH de_dup ( 167 | SELECT Id::integer, 168 | device_id::integer, 169 | user_id::integer, 170 | calories_burnt::decimal, 171 | miles_walked::decimal, 172 | num_steps::decimal, 173 | timestamp::timestamp, 174 | value::string, 175 | ingest_timestamp, 176 | ROW_NUMBER() OVER(PARTITION BY device_id, user_id, timestamp ORDER BY ingest_timestamp DESC) AS DupRank 177 | FROM iot_dashboard.bronze_sensors_scd_2 178 | ) 179 | 180 | SELECT Id, device_id, user_id, calories_burnt, miles_walked, num_steps, timestamp, value, ingest_timestamp, 181 | now() AS _start_timestamp, 182 | true AS _is_current, 183 | 1001 AS _batch_run_id -- example batch run id 184 | FROM de_dup 185 | WHERE DupRank = 1 186 | ; 187 | 188 | MERGE INTO iot_dashboard.silver_sensors_scd_2 AS target 189 | USING ( 190 | 191 | SELECT updates.Id AS merge_key_id, 192 | updates.user_id AS merge_key_user_id, 193 | updates.device_id AS merge_key_device_id, 194 | updates.* --merge key can be built in whatever way makes sense to get unique rows 195 | FROM iot_dashboard.temp_batch_to_insert AS updates 196 | 197 | UNION ALL 198 | 199 | -- These rows will INSERT updated rows of existing records and new rows 200 | -- Setting the merge_key to NULL forces these rows to NOT MATCH and be INSERTed. 
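-- In short: a changed record appears twice in this source - once with its real key, so WHEN MATCHED
-- can expire the old row, and once below with a NULL key, so it falls through to WHEN NOT MATCHED
-- and is re-inserted as the new current row.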
201 | SELECT 202 | NULL AS merge_key_id, 203 | NULL AS merge_key_user_id, 204 | NULL AS merge_key_device_id, 205 | updates.* 206 | FROM iot_dashboard.temp_batch_to_insert AS updates 207 | INNER JOIN iot_dashboard.silver_sensors_scd_2 as target_table 208 | ON updates.Id = target_table.Id 209 | AND updates.user_id = target_table.user_id 210 | AND updates.device_id = target_table.device_id -- What makes the key unique 211 | -- This needs to be accounted for when deciding to expire existing rows 212 | WHERE updates.value <> target_table.value -- Only update if any of the data has changed 213 | 214 | ) AS source 215 | 216 | ON target.Id = source.merge_key_id 217 | AND target.user_id = source.merge_key_user_id 218 | AND target.device_id = source.merge_key_device_id 219 | 220 | WHEN MATCHED AND (target._is_current = true AND target.value <> source.value) THEN 221 | UPDATE SET 222 | target._end_timestamp = source._start_timestamp, -- start of new record is end of old record 223 | target._is_current = false 224 | 225 | WHEN NOT MATCHED THEN 226 | INSERT (id, device_id, user_id, calories_burnt, miles_walked, num_steps, value, timestamp, ingest_timestamp, _start_timestamp, _end_timestamp, _is_current, _batch_run_id) 227 | VALUES ( 228 | source.id, source.device_id, source.user_id, source.calories_burnt, source.miles_walked, source.num_steps, source.value, source.timestamp, 229 | source.ingest_timestamp, 230 | source._start_timestamp, -- start timestamp -- new records 231 | NULL ,-- end_timestamp 232 | source._is_current, -- is current record 233 | source._batch_run_id --example batch run id 234 | ) 235 | ; 236 | 237 | -- This calculate table stats for all columns to ensure the optimizer can build the best plan 238 | -- THIS IS NOT INCREMENTAL 239 | ANALYZE TABLE iot_dashboard.silver_sensors_scd_2 COMPUTE STATISTICS FOR ALL COLUMNS; 240 | 241 | -- THIS IS INCREMENTAL 242 | OPTIMIZE iot_dashboard.silver_sensors_scd_2 ZORDER BY (timestamp, device_id); 243 | 244 | -- Truncate bronze batch once successfully loaded 245 | -- If succeeds remove temp table 246 | TRUNCATE TABLE iot_dashboard.temp_batch_to_insert; 247 | 248 | -- COMMAND ---------- 249 | 250 | DESCRIBE HISTORY iot_dashboard.silver_sensors_scd_2 251 | 252 | -- COMMAND ---------- 253 | 254 | -- DBTITLE 1,Select Raw Table 255 | SELECT * FROM iot_dashboard.silver_sensors_scd_2 256 | 257 | -- COMMAND ---------- 258 | 259 | -- DBTITLE 1,Get Amount of Expired Records 260 | SELECT 261 | _is_current AS ActiveRecord, 262 | COUNT(0) 263 | FROM iot_dashboard.silver_sensors_scd_2 264 | GROUP BY _is_current 265 | 266 | -- COMMAND ---------- 267 | 268 | -- DBTITLE 1,Look at various batch timelines over time 269 | SELECT 270 | `_start_timestamp` AS active_timestamp, 271 | COUNT(0) 272 | FROM iot_dashboard.silver_sensors_scd_2 273 | GROUP BY `_start_timestamp` 274 | ORDER BY active_timestamp 275 | 276 | -- COMMAND ---------- 277 | 278 | -- DBTITLE 1,Create "Current" View 279 | CREATE OR REPLACE VIEW iot_dashboard.silver_sensors_current 280 | AS 281 | SELECT * FROM iot_dashboard.silver_sensors_scd_2 282 | WHERE _is_current = true 283 | 284 | -- COMMAND ---------- 285 | 286 | -- DBTITLE 1,Create "Snapshotted Views" 287 | CREATE OR REPLACE VIEW iot_dashboard.silver_sensors_snapshot_as_of_2023_10_10_19_30_00 288 | AS 289 | -- Get more recent record for each record as of a specific version 290 | WITH de_dup ( 291 | SELECT Id::integer, 292 | device_id::integer, 293 | user_id::integer, 294 | calories_burnt::decimal, 295 | miles_walked::decimal, 296 | 
num_steps::decimal, 297 | timestamp::timestamp, 298 | value::string, 299 | ingest_timestamp, 300 | _start_timestamp, 301 | _is_current, 302 | _end_timestamp, 303 | ROW_NUMBER() OVER(PARTITION BY id ORDER BY _start_timestamp DESC) AS DupRank -- Get most recent record as of a specific point in time 304 | FROM iot_dashboard.silver_sensors_scd_2 305 | -- Point in time snapshot timestamp such as end of month 306 | WHERE _start_timestamp <= '2023-10-10T19:30:00'::timestamp 307 | ) 308 | 309 | SELECT * 310 | FROM de_dup 311 | WHERE DupRank = 1 312 | ; 313 | 314 | -- COMMAND ---------- 315 | 316 | -- DBTITLE 1,Look at snapshot of most recent version of each record at a point in time 317 | SELECT * 318 | FROM iot_dashboard.silver_sensors_snapshot_as_of_2023_10_10_19_30_00 319 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 13 - Migrating Identity Columns.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC ## How to migrate IDENTITY columns from a Data Warehouse to DBSQL / Delta Lakehouse 5 | -- MAGIC 6 | -- MAGIC ## Summary 7 | -- MAGIC Quick notebook showing how to properly migrate tables from a data warehouse to a Delta table where you want to retain the values of existing IDENTITY key values and ensure that the IDENTITY generation picks up from the most recent IDENTITY column value 8 | 9 | -- COMMAND ---------- 10 | 11 | -- MAGIC %md 12 | -- MAGIC 13 | -- MAGIC 14 | -- MAGIC ### Steps to migrate key properly 15 | -- MAGIC 16 | -- MAGIC 1. Create a table with id columns such as: GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1) 17 | -- MAGIC 2. Backfill existing data warehouse tables with an INSERT INTO / MERGE from a snapshot of the datawarehouse table 18 | -- MAGIC 3. Run command: ALTER TABLE main.default.identity_test ALTER COLUMN id SYNC IDENTITY; to ensure that the newly inserted values pick up where the data warehouse left off on key generation 19 | -- MAGIC 4. Insert new identity values with new pipelines (or leave out column and let it auto-generate) 20 | 21 | -- COMMAND ---------- 22 | 23 | -- DBTITLE 1,Simple End to End Example 24 | 25 | CREATE OR REPLACE TABLE main.default.identity_test ( 26 | id BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1), 27 | name STRING DEFAULT 'cody' 28 | ) 29 | TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported', 'delta.columnMapping.mode' = 'name') 30 | ; 31 | 32 | -- Simulate EDW migration load with existing keys 33 | INSERT INTO main.default.identity_test (id,name) 34 | VALUES (5, 'cody'), (6, 'davis'); 35 | 36 | 37 | SELECT * FROM main.default.identity_test; 38 | 39 | 40 | -- Simulate new load incrmentally 41 | 42 | INSERT INTO main.default.identity_test (name) 43 | VALUES ('cody_new'), ('davis_new'); 44 | 45 | -- BAD! 
ID keys get messed up 46 | SELECT * FROM main.default.identity_test; 47 | 48 | -- FIX 49 | ALTER TABLE main.default.identity_test ALTER COLUMN id SYNC IDENTITY; 50 | 51 | -- try again 52 | INSERT INTO main.default.identity_test (name) 53 | VALUES ('cody_fix'), ('davis_fix'); 54 | 55 | SELECT * FROM main.default.identity_test; 56 | 57 | 58 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 14 - Using the Query Profile.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- QUERY PROFILE DEMO 3 | -- SHOWS HOW TO SPOT AND RESOLVE BOTTLENECKS AND PERFORMANCE CHALLNEGES IN DBSQL WITH QUERY PROFILE 4 | 5 | /* DEMO FLOW 6 | 7 | 1. Show 4 Unoptimized Queries and Assess the query profile 8 | 2. Optimize the 4 queries for different needs with PARTITION, ZORDER, CLUSTER BY (optional) 9 | 3. Show Updated Query Profiles and improved bottlenecks 10 | 11 | 12 | Query Profile Things to Look for at the top level in the profile: 13 | 14 | 1. % Time in Photon - top level 15 | 2. % execution time vs optimizing/pruning files 16 | 3. Spilling - If ANY, than query is very bad or cluster is too small 17 | 18 | Node-level things to look for: 19 | 20 | 1. Dark colored nodes -- indicate where time/effort is going 21 | 2. Large arrows -- indicate high data transfer across nodes 22 | 3. File / Partition Pruning metrics 23 | 4. Runtime for each node (correlated to darkness of color) 24 | 25 | 26 | */ 27 | USE CATALOG main; 28 | USE DATABASE tpc_edw_demo; 29 | 30 | 31 | --============================================================================-- 32 | /* 33 | Step 1: Start with unoptimized data model 34 | 35 | Look at 4 queries: 36 | 1. Single point lookups -- Specific trade id -- File pruning bottleneck 37 | 2. Big Joins and Large Selects -- Showing different types of bottleneck -- Shuffle 38 | 2. Range lookups -- Analytics for a specific date range -- Complex file pruning and shuffle! 39 | 3. Aggregates -- Aggregates on a specific date range - more complex question -- More complex query plans 40 | 41 | */ 42 | 43 | 44 | --===== QUERY 1: Point selection -- look for specific trade id 45 | 46 | SELECT 47 | h.currenttradeid, 48 | h.currentprice, 49 | h.currentholding, 50 | h.currentholding*h.currentprice AS CurrentMarketValue, 51 | c.lastname, 52 | c.firstname, 53 | c.status 54 | FROM main.tpc_edw_demo.factholdings h 55 | INNER JOIN main.tpc_edw_demo.dimcustomer c ON c.sk_customerid = h.sk_customerid 56 | INNER JOIN main.tpc_edw_demo.dimcompany comp ON comp.sk_companyid = h.sk_companyid 57 | WHERE h.currenttradeid = 527764963 58 | AND c.status = 'Active' 59 | 60 | -- Look at unoptimized query profile - No file pruning! 
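-- A quick way to sanity-check this from SQL before opening the profile (a sketch; EXPLAIN only shows
-- whether the currenttradeid predicate is pushed down to the scan - the actual files pruned /
-- files read counts still come from the query profile linked below):
EXPLAIN FORMATTED
SELECT h.currenttradeid, h.currentprice, h.currentholding
FROM main.tpc_edw_demo.factholdings h
WHERE h.currenttradeid = 527764963;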
61 | 62 | -- Query 1 Profile Output: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiMi00MjZhLTEzNGYtODE3OC1hNGNmNjhmOGU1MzIQiLKmnLYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 63 | 64 | 65 | -- Look at Row and time spent output in the DAG in the query profile -- Fatter arrows mean more data movement and bottlenecks 66 | -- Darker nodes in profile show bottlenecks AUTOMATICALLY 67 | 68 | /* Scan Node for factholding 69 | 70 | Files pruned 0 71 | Files read 448 72 | Number of output batches 449 73 | Number of output rows 1 74 | Peak memory usage 3.50 GB 75 | Size of files pruned 0 76 | Size of files read 49.03 GB 77 | */ 78 | 79 | /* Scan Node for dimcompany 80 | Files pruned 0 81 | Files read 16 82 | Number of output batches 1,232 83 | Number of output rows 5,000,000 84 | Peak memory usage 288.37 MB 85 | Size of files pruned 0 86 | Size of files read 709.25 MB 87 | */ 88 | 89 | 90 | 91 | --===== QUERY 2: Big Joins and Select 92 | 93 | SELECT 94 | h.tradeid, 95 | h.currentprice, 96 | h.currentholding, 97 | h.currentholding*h.currentprice AS CurrentMarketValue, 98 | c.lastname, 99 | c.firstname, 100 | c.status, 101 | * 102 | FROM main.tpc_edw_demo.factholdings h 103 | INNER JOIN main.tpc_edw_demo.dimcustomer c ON c.sk_customerid = h.sk_customerid 104 | INNER JOIN main.tpc_edw_demo.dimcompany comp ON comp.sk_companyid = h.sk_companyid 105 | 106 | -- What is the bottleneck here? SHUFFLE 107 | -- Not only lots of rows, but lots of data to shuffle around 108 | 109 | -- Query 2 Profile Output: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiMy00ZmRjLTE2NjUtYmYwZC05ZWY5NGVlNzJkZjIQ4f3BnLYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 110 | 111 | 112 | 113 | --===== QUERY 3: Range Timestamp/Date Filters 114 | 115 | SELECT 116 | to_date(sk_dateid::string, "yyyyMMdd") AS Date, 117 | AVG(h.currentholding*h.currentprice) AS CurrentMarketValue, 118 | MAX(h.currentholding*h.currentprice) AS MaxHoldingValue 119 | FROM main.tpc_edw_demo.factholdings h 120 | INNER JOIN main.tpc_edw_demo.dimcustomer c ON c.sk_customerid = h.sk_customerid 121 | INNER JOIN main.tpc_edw_demo.dimcompany comp ON comp.sk_companyid = h.sk_companyid 122 | WHERE sk_dateid BETWEEN 20130101 AND 20131201 123 | GROUP BY to_date(sk_dateid::string, "yyyyMMdd") 124 | ORDER BY Date 125 | 126 | 127 | -- What is the bottleneck here? File Pruning on a range 128 | -- Lots of downstream aggregations, we want to minimize records BEFORE those transformations 129 | -- Look at time spent nodes and Rows node 130 | 131 | -- Query 3 Profile Output: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiNy0xYWU4LTExMjUtOTU3YS1mNjgyMGNhZGEwZmMQuLWlnbYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 132 | 133 | 134 | --===== QUERY 4: Complex Query Aggregates 135 | 136 | -- For a given year, who were the top 10 holding customers? 
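-- The query below answers this in three steps:
--   1. year_selected_holding: restrict factholdings to the year via the sk_dateid range and join the dims
--   2. holding_customer_agg:  SUM the market value per customer name
--   3. customer_rank:         DENSE_RANK on that total and keep the top 10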
137 | 138 | WITH year_selected_holding AS ( 139 | 140 | SELECT 141 | h.tradeid, 142 | h.currentprice, 143 | h.currentholding, 144 | h.currentholding*h.currentprice AS CurrentMarketValue, 145 | c.lastname, 146 | c.firstname, 147 | c.status, 148 | comp.name AS company_name, 149 | to_date(sk_dateid::string, "yyyyMMdd") AS Date 150 | FROM main.tpc_edw_demo.factholdings h 151 | INNER JOIN main.tpc_edw_demo.dimcustomer c ON c.sk_customerid = h.sk_customerid 152 | INNER JOIN main.tpc_edw_demo.dimcompany comp ON comp.sk_companyid = h.sk_companyid 153 | WHERE h.sk_dateid BETWEEN 20150101 AND 20151201 154 | ) 155 | , 156 | holding_customer_agg AS ( 157 | 158 | SELECT 159 | CONCAT(lastname, ', ', firstname) AS CustomerName, 160 | SUM(CurrentMarketValue) AS TotalHoldingsValue 161 | FROM year_selected_holding 162 | GROUP BY CONCAT(lastname, ', ', firstname) 163 | ), 164 | customer_rank AS ( 165 | 166 | SELECT 167 | *, 168 | DENSE_RANK() OVER (ORDER BY TotalHoldingsValue DESC) AS CustomerRank 169 | FROM holding_customer_agg 170 | ) 171 | SELECT * FROM customer_rank ORDER BY CustomerRank LIMIT 10 172 | 173 | -- What is bottleneck here? -- SHUFFLE 174 | -- No file pruning happening still 175 | 176 | -- Query 4 Profile Output: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiOS1jYjU2LTE3YTItYmFiMy0xNjY3YWRlMmRlOTcQ6%2FTrnbYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 177 | 178 | 179 | 180 | --============================================================================-- 181 | /* 182 | Step 2: OPTIMIZE / ZORDER the core source tables 183 | 184 | Questions to ask: 185 | 186 | 1. What did we filter on above? 187 | 2. What is reused in filters? 188 | 3. How can we do smarter joins to reduce those shuffle bottlenecks? 189 | 190 | */ 191 | 192 | -- Table: main.tpc_edw_demo.factholdings 193 | -- Columns Used Often in Filters: sk_dateid, currenttradeid 194 | 195 | OPTIMIZE main.tpc_edw_demo.factholdings ZORDER BY (sk_dateid, currenttradeid); 196 | -- Large fact table so be careful here, be selective 197 | ANALYZE TABLE main.tpc_edw_demo.factholdings COMPUTE STATISTICS FOR COLUMNS sk_dateid, currenttradeid, sk_customerid, sk_companyid; 198 | 199 | 200 | -- Table: main.tpc_edw_demo.dimcustomer 201 | -- Columns Used Often in Joins as dim tables: sk_customerid 202 | 203 | OPTIMIZE main.tpc_edw_demo.dimcustomer ZORDER BY (sk_customerid); 204 | -- Dim table so not really expensive to calculate 205 | ANALYZE TABLE main.tpc_edw_demo.dimcustomer COMPUTE STATISTICS FOR ALL COLUMNS; 206 | 207 | 208 | -- Table: main.tpc_edw_demo.dimcompany 209 | -- Columns Used Often in Joins as dim tables: sk_companyid 210 | 211 | OPTIMIZE main.tpc_edw_demo.dimcompany ZORDER BY (sk_companyid); 212 | -- Dim table so not really expensive to calculate 213 | ANALYZE TABLE main.tpc_edw_demo.dimcompany COMPUTE STATISTICS FOR ALL COLUMNS; 214 | 215 | 216 | 217 | 218 | --============================================================================-- 219 | /* 220 | Step 3: Look at updated query profiles! 221 | 222 | Updated Query Profiles re-run from above queries: 223 | 224 | 225 | 226 | Query 1 Profile: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiYi00MWVmLTExNDAtYWIwMi0yZDQwMjNmNWU0MTQQ4KKSnrYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 227 | 228 | Original Runtime: 3 seconds 229 | Optimize Runtime: 1.3 seconds 230 | 231 | WHAT IS DIFFERENT? 
-- Pruned MUCH more -- no more scanning all of the table 232 | 233 | Files pruned 1,018 234 | Files read 6 235 | Number of output batches 2 236 | Number of output rows 1 237 | Peak memory usage 98.89 MB 238 | Size of files pruned 47.88 GB 239 | Size of files read 293.59 MB 240 | 241 | 242 | ---===== 243 | 244 | Query 2 Profile: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiMy00ZmRjLTE2NjUtYmYwZC05ZWY5NGVlNzJkZjIQ4f3BnLYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 245 | 246 | Original Runtime: 5 min 39 seconds 247 | Optimized Runtime: 4 min 36 seconds 248 | 249 | WHAT IS DIFFERENT? 250 | Selecting everything from large tables is just wasteful :/ 251 | 252 | Shuffle node went down a little bit from 3.7 hours in total runtime to 3.4, but selecting that much data with no filters requires more tuning 253 | 254 | 255 | ---===== 256 | Query 3 Profile: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiYy1jOThlLTFiYjAtOWMwNS04NDIwYjA1MTE0M2QQ66%2B6nrYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 257 | 258 | Original Runtime: 9.2 seconds, 0 files pruned 259 | Optimized Runtime: 9.3 seconds, many files pruned, but now more work 260 | 261 | 262 | 263 | WHAT IS DIFFERENT? 264 | Files pruned 822 265 | Files read 202 266 | Number of output batches 57,690 267 | Number of output rows 224,054,102 268 | 269 | Look at shuffle 270 | 271 | 272 | 273 | ---===== 274 | Query 4 Profile: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiZC0wZmZlLTE1ZjgtOWNmNy0xNWU1OTVlYmM1NmQQh8vBnrYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 275 | 276 | Original Runtime: 8.5 seconds, 0 files pruned 277 | Optimized Runtime: 12.3 seconds, Many files pruned 278 | 279 | NEW BOTTLENECK: SCAN instead of shuffle. 280 | 281 | WHAT IS DIFFERENT? 282 | 283 | Files pruned 826 284 | Files read 198 285 | Number of output batches 56,824 286 | Number of output rows 224,077,524 287 | Peak memory usage 5.76 GB 288 | Size of files pruned 38.86 GB 289 | Size of files read 9.30 GB 290 | 291 | LESSON: When doing longer term queries, make sure the file sizing is in proportion to the queries that are run on it. 292 | There are trade offs for optimizing for a few longer "historical" queries vs many "current" queries. 293 | Typically current queries are prioritized since you can spin up a serverless backfill cluster. 
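(Added illustrative note, not part of the original demo) One way to act on this trade-off is to set an
explicit target file size on the fact table and re-run OPTIMIZE, for example:

  ALTER TABLE main.tpc_edw_demo.factholdings SET TBLPROPERTIES ('delta.targetFileSize' = '64mb');
  OPTIMIZE main.tpc_edw_demo.factholdings ZORDER BY (sk_dateid, currenttradeid);

Smaller target files tend to favor selective "current" lookups (more pruning), while larger files favor
big historical scans (fewer files to open). The 64mb value above is only an example to tune per workload.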
294 | 295 | */ 296 | 297 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 2 - Optimize your Delta Tables.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Delta Table Optimization Methods Tutorial 5 | # MAGIC 6 | # MAGIC This notebook walks through the various methods and consideration when tuning / optimizing Delta tables in SQL 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %md 11 | # MAGIC 12 | # MAGIC ## Delta Tables Optimization Knobs 13 | # MAGIC 14 | # MAGIC ### File Sizes 15 | # MAGIC 16 | # MAGIC #### COMPACTION - OPTIMIZE 17 | # MAGIC 18 | # MAGIC ##### ZORDER / CLUSTER BY (liquid tables) 19 | # MAGIC 20 | # MAGIC ###### Bloom Filter 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC 26 | # MAGIC ### Optimizing for UPSERTS 27 | # MAGIC 28 | # MAGIC Commands 4-8 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %sql 33 | # MAGIC DROP TABLE IF EXISTS iot_dashboard.bronze_sensors_optimization; 34 | # MAGIC CREATE OR REPLACE TABLE iot_dashboard.bronze_sensors_optimization 35 | # MAGIC USING DELTA 36 | # MAGIC TBLPROPERTIES("delta.targetFileSize"="2mb") --2-128 mb for tables with heavy updates or if used for BI 37 | # MAGIC AS 38 | # MAGIC (SELECT * FROM iot_dashboard.silver_sensors LIMIT 10000) --Only load a subset for sample MERGE; 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %sql 43 | # MAGIC DROP TABLE IF EXISTS iot_dashboard.silver_sensors_optimization; 44 | # MAGIC CREATE OR REPLACE TABLE iot_dashboard.silver_sensors_optimization 45 | # MAGIC USING DELTA 46 | # MAGIC TBLPROPERTIES("delta.targetFileSize"="2mb") 47 | # MAGIC AS 48 | # MAGIC SELECT * fROM iot_dashboard.silver_sensors; 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %sql 53 | # MAGIC 54 | # MAGIC MERGE INTO iot_dashboard.silver_sensors_optimization AS target 55 | # MAGIC USING (SELECT Id::integer, 56 | # MAGIC device_id::integer, 57 | # MAGIC user_id::integer, 58 | # MAGIC calories_burnt::decimal, 59 | # MAGIC miles_walked::decimal, 60 | # MAGIC num_steps::decimal, 61 | # MAGIC timestamp::timestamp, 62 | # MAGIC value::string 63 | # MAGIC FROM iot_dashboard.bronze_sensors_optimization) AS source 64 | # MAGIC ON source.Id = target.Id 65 | # MAGIC AND source.user_id = target.user_id 66 | # MAGIC AND source.device_id = target.device_id 67 | # MAGIC AND target.timestamp > now() - INTERVAL 2 hours 68 | # MAGIC WHEN MATCHED THEN UPDATE SET 69 | # MAGIC target.calories_burnt = source.calories_burnt, 70 | # MAGIC target.miles_walked = source.miles_walked, 71 | # MAGIC target.num_steps = source.num_steps, 72 | # MAGIC target.timestamp = source.timestamp 73 | # MAGIC WHEN NOT MATCHED THEN INSERT *; 74 | # MAGIC 75 | # MAGIC -- Without optimizing tables 8.82 seconds 76 | # MAGIC -- After optimizing by merge columns 19 seconds 77 | 78 | # COMMAND ---------- 79 | 80 | # MAGIC %md 81 | # MAGIC 82 | # MAGIC ## Run CREATE/REPLACE and MERGE statements, track runtime, then run OPTIMIZE statement and run all create/merge statements again to look at spark plan differences 83 | 84 | # COMMAND ---------- 85 | 86 | # MAGIC %sql 87 | # MAGIC 88 | # MAGIC -- You want to optimize by high cardinality columns like ids, timestamps, strings 89 | # MAGIC -- ON MERGE COLUMNS, then timeseries columns, then commonly used columns in queries 90 | # MAGIC 91 | # MAGIC --This operation is incremental 92 | # MAGIC --OPTIMIZE iot_dashboard.bronze_sensors_test1 
ZORDER BY (Id, user_id, device_id); 93 | # MAGIC OPTIMIZE iot_dashboard.silver_sensors_optimization ZORDER BY (user_id, device_id, Id); 94 | 95 | # COMMAND ---------- 96 | 97 | # MAGIC %md 98 | # MAGIC 99 | # MAGIC ## What about queries on this table? 100 | # MAGIC 101 | # MAGIC 1. ZORDER by commonly joined columns 102 | # MAGIC 2. Partition by larger chunks only if needed 103 | # MAGIC 3. Keep important columns in front of tables 104 | # MAGIC 4. For highly selective queries, use bloom indexes 105 | 106 | # COMMAND ---------- 107 | 108 | # MAGIC %md 109 | # MAGIC 110 | # MAGIC ## Exercise 1: Change optimization strategies for single point filters 111 | 112 | # COMMAND ---------- 113 | 114 | # MAGIC %sql 115 | # MAGIC OPTIMIZE iot_dashboard.silver_sensors_optimization ZORDER BY (user_id); 116 | # MAGIC 117 | # MAGIC -- by user_id, timestamp -- 8 files pruned 118 | # MAGIC -- by just user id selecting on user_id -- 34 files pruned (1 read) all but one 119 | # MAGIC -- by just timestamp -- no files pruned when selecting on user_id 120 | 121 | # COMMAND ---------- 122 | 123 | # DBTITLE 1,Create gold aggregate VIEW 124 | # MAGIC %sql 125 | # MAGIC 126 | # MAGIC CREATE OR REPLACE VIEW iot_dashboard.hourly_summary_statistics 127 | # MAGIC AS 128 | # MAGIC SELECT user_id, 129 | # MAGIC date_trunc('hour', timestamp) AS HourBucket, 130 | # MAGIC AVG(num_steps) AS AvgNumStepsAcrossDevices, 131 | # MAGIC AVG(calories_burnt) AS AvgCaloriesBurnedAcrossDevices, 132 | # MAGIC AVG(miles_walked) AS AvgMilesWalkedAcrossDevices 133 | # MAGIC FROM iot_dashboard.silver_sensors_optimization 134 | # MAGIC GROUP BY user_id,date_trunc('hour', timestamp) -- wrapping a function around a column 135 | # MAGIC ORDER BY HourBucket 136 | 137 | # COMMAND ---------- 138 | 139 | # DBTITLE 1,Exercise 1: Tuning for single column queries 140 | # MAGIC %sql 141 | # MAGIC 142 | # MAGIC -- LOOK AT BEFORE AND AFTER QUERIES for OPTIMIZE PRE/POST 143 | # MAGIC 144 | # MAGIC -- After optimize look at user_id files pruned 145 | # MAGIC -- by user_id, timestamp -- 8 files pruned 146 | # MAGIC -- by just user id selecting on user_id -- 34 files pruned (1 read) all but one 147 | # MAGIC -- by just timestamp -- no files pruned when selecting on user_is 148 | # MAGIC 149 | # MAGIC -- POST OPTIMIZE SCAN METRICS 150 | # MAGIC --number of files pruned 33 151 | # MAGIC -- number of files read 1 152 | # MAGIC 153 | # MAGIC SELECT * FROM iot_dashboard.hourly_summary_statistics WHERe user_id = 1 154 | 155 | # COMMAND ---------- 156 | 157 | # MAGIC %md 158 | # MAGIC 159 | # MAGIC ## Exercise 2: Multi-dimensional filters and optimzation 160 | 161 | # COMMAND ---------- 162 | 163 | # MAGIC %sql 164 | # MAGIC 165 | # MAGIC 166 | # MAGIC SELECT MIN(HourBucket), MAX(HourBucket) 167 | # MAGIC FROM iot_dashboard.hourly_summary_statistics 168 | 169 | # COMMAND ---------- 170 | 171 | # MAGIC %sql 172 | # MAGIC OPTIMIZE iot_dashboard.silver_sensors_optimization ZORDER BY (user_id, timestamp); 173 | # MAGIC 174 | # MAGIC -- by user_id, timestamp -- 2 files pruned, 29 scanned 175 | # MAGIC -- by timestamp, user_id -- does order matter? 2 files pruned, 29 scanned, - not really 176 | # MAGIC -- How to make this more selective? 
-- Hour bucket is abstracting the filter pushdown, lets try just the raw table 177 | 178 | # COMMAND ---------- 179 | 180 | # DBTITLE 1,Exercise 2: Optimizing Multi-dimensional queries 181 | # MAGIC %sql 182 | # MAGIC 183 | # MAGIC SELECT * 184 | # MAGIC FROM iot_dashboard.hourly_summary_statistics 185 | # MAGIC WHERE user_id = 1 186 | # MAGIC AND HourBucket BETWEEN "2018-07-22T00:00:00.000+0000" AND "2018-07-22T01:00:00.000+0000" 187 | 188 | # COMMAND ---------- 189 | 190 | # DBTITLE 1,Lesson learned -- let Delta do the filtering first, then group and aggregate -- subqueries are actually better 191 | # MAGIC %sql 192 | # MAGIC 193 | # MAGIC -- Look at SPARK QUERY PLAN SCAN node 194 | # MAGIC -- How many files are pruned/read? 195 | # MAGIC -- Try optimizing the table on different columns (1,2,3) -- see what happens! 196 | # MAGIC --28 pruned, 3 files read 197 | # MAGIC 198 | # MAGIC SELECT * 199 | # MAGIC FROM iot_dashboard.silver_sensors_optimization 200 | # MAGIC WHERE user_id = 1 201 | # MAGIC AND timestamp BETWEEN "2018-07-22T00:00:00.000+0000"::timestamp AND "2018-07-22T01:00:00.000+0000"::timestamp 202 | 203 | # COMMAND ---------- 204 | 205 | # DBTITLE 1,Automate Certain Pushdown Filter Rules in VIEWs 206 | # MAGIC %sql 207 | # MAGIC 208 | # MAGIC CREATE OR REPLACE VIEW iot_dashboard.test_filter_pushdown 209 | # MAGIC AS 210 | # MAGIC WITH raw_pushdown AS 211 | # MAGIC ( 212 | # MAGIC SELECT * 213 | # MAGIC FROM iot_dashboard.silver_sensors_optimization 214 | # MAGIC WHERE user_id = 1 215 | # MAGIC AND timestamp BETWEEN "2018-07-22T00:00:00.000+0000"::timestamp AND "2018-07-22T01:00:00.000+0000"::timestamp 216 | # MAGIC ) 217 | # MAGIC SELECT user_id, 218 | # MAGIC date_trunc('hour', timestamp) AS HourBucket, 219 | # MAGIC AVG(num_steps) AS AvgNumStepsAcrossDevices, 220 | # MAGIC AVG(calories_burnt) AS AvgCaloriesBurnedAcrossDevices, 221 | # MAGIC AVG(miles_walked) AS AvgMilesWalkedAcrossDevices 222 | # MAGIC FROM raw_pushdown 223 | # MAGIC GROUP BY user_id,date_trunc('hour', timestamp) 224 | # MAGIC ORDER BY HourBucket 225 | 226 | # COMMAND ---------- 227 | 228 | # MAGIC %sql 229 | # MAGIC 230 | # MAGIC -- Now pruning is automatically done and manual users do not have to remember each time for common views 231 | # MAGIC SELECT * FROM iot_dashboard.test_filter_pushdown 232 | 233 | # COMMAND ---------- 234 | 235 | # DBTITLE 1,Efficacy on More Complex VIEWs 236 | # MAGIC %sql 237 | # MAGIC 238 | # MAGIC CREATE OR REPLACE VIEW iot_dashboard.smoothed_hourly_statistics 239 | # MAGIC AS 240 | # MAGIC SELECT *, 241 | # MAGIC -- Number of Steps 242 | # MAGIC (avg(`AvgNumStepsAcrossDevices`) OVER ( 243 | # MAGIC ORDER BY `HourBucket` 244 | # MAGIC ROWS BETWEEN 245 | # MAGIC 4 PRECEDING AND 246 | # MAGIC CURRENT ROW 247 | # MAGIC )) ::float AS SmoothedNumSteps4HourMA, -- 4 hour moving average 248 | # MAGIC 249 | # MAGIC (avg(`AvgNumStepsAcrossDevices`) OVER ( 250 | # MAGIC ORDER BY `HourBucket` 251 | # MAGIC ROWS BETWEEN 252 | # MAGIC 24 PRECEDING AND 253 | # MAGIC CURRENT ROW 254 | # MAGIC ))::float AS SmoothedNumSteps12HourMA --24 hour moving average 255 | # MAGIC , 256 | # MAGIC -- Calories Burned 257 | # MAGIC (avg(`AvgCaloriesBurnedAcrossDevices`) OVER ( 258 | # MAGIC ORDER BY `HourBucket` 259 | # MAGIC ROWS BETWEEN 260 | # MAGIC 4 PRECEDING AND 261 | # MAGIC CURRENT ROW 262 | # MAGIC ))::float AS SmoothedCalsBurned4HourMA, -- 4 hour moving average 263 | # MAGIC 264 | # MAGIC (avg(`AvgCaloriesBurnedAcrossDevices`) OVER ( 265 | # MAGIC ORDER BY `HourBucket` 266 | # MAGIC ROWS BETWEEN 267 
| # MAGIC 24 PRECEDING AND 268 | # MAGIC CURRENT ROW 269 | # MAGIC ))::float AS SmoothedCalsBurned12HourMA --24 hour moving average, 270 | # MAGIC , 271 | # MAGIC -- Miles Walked 272 | # MAGIC (avg(`AvgMilesWalkedAcrossDevices`) OVER ( 273 | # MAGIC ORDER BY `HourBucket` 274 | # MAGIC ROWS BETWEEN 275 | # MAGIC 4 PRECEDING AND 276 | # MAGIC CURRENT ROW 277 | # MAGIC ))::float AS SmoothedMilesWalked4HourMA, -- 4 hour moving average 278 | # MAGIC 279 | # MAGIC (avg(`AvgMilesWalkedAcrossDevices`) OVER ( 280 | # MAGIC ORDER BY `HourBucket` 281 | # MAGIC ROWS BETWEEN 282 | # MAGIC 24 PRECEDING AND 283 | # MAGIC CURRENT ROW 284 | # MAGIC ))::float AS SmoothedMilesWalked12HourMA --24 hour moving average 285 | # MAGIC FROM iot_dashboard.hourly_summary_statistics 286 | 287 | # COMMAND ---------- 288 | 289 | # DBTITLE 1,File Pruning on Complex VIEWs 290 | # MAGIC %sql 291 | # MAGIC 292 | # MAGIC -- How are files being pruned in the SCAN node? 293 | # MAGIC SELECt * FROM iot_dashboard.smoothed_hourly_statistics WHERE user_id = 1 294 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 3 - DLT Version Simple SQL EDW Pipeline.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC # This notebook generates a full data pipeline from databricks dataset - iot-stream 5 | -- MAGIC 6 | -- MAGIC #### Define the SQL - Add as a library to a DLT pipeline, and run the pipeline! 7 | -- MAGIC 8 | -- MAGIC ## This creates 2 tables: 9 | -- MAGIC 10 | -- MAGIC Database: iot_dashboard 11 | -- MAGIC 12 | -- MAGIC Tables: silver_sensors, silver_users 13 | -- MAGIC 14 | -- MAGIC Params: StartOver (Yes/No) - allows user to truncate and reload pipeline 15 | 16 | -- COMMAND ---------- 17 | 18 | -- MAGIC %md 19 | -- MAGIC 20 | -- MAGIC ## This is built as a library for a Delta Live Tables pipeline 21 | 22 | -- COMMAND ---------- 23 | 24 | -- MAGIC %md 25 | -- MAGIC ## Exhaustive list of all cloud_files STREAMING LIVE TABLE options 26 | -- MAGIC https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-incremental-data.html#language-sql 27 | 28 | -- COMMAND ---------- 29 | 30 | -- DBTITLE 1,Incrementally Ingest Source Data from Raw Files 31 | --No longer need a separate copy into statement, you can use the Databricks Autoloader directly in SQL by using the cloud_files function 32 | -- OPTIONALLY defined DDL in the table definition 33 | CREATE OR REFRESH STREAMING LIVE TABLE bronze_sensors 34 | ( 35 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 36 | device_id INT, 37 | user_id INT, 38 | calories_burnt DECIMAL(10,2), 39 | miles_walked DECIMAL(10,2), 40 | num_steps DECIMAL(10,2), 41 | timestamp TIMESTAMP, 42 | value STRING, 43 | CONSTRAINT has_device EXPECT (device_id IS NOT NULL) ON VIOLATION DROP ROW , 44 | CONSTRAINT has_user EXPECT(user_id IS NOT NULL) ON VIOLATION DROP ROW, 45 | CONSTRAINT has_data EXPECT(num_steps IS NOT NULL) -- with no violation rule, nothing happens, we just track quality in DLT 46 | ) 47 | TBLPROPERTIES("delta.targetFileSize"="128mb", 48 | "pipelines.autoOptimize.managed"="true", 49 | "pipelines.autoOptimize.zOrderCols"="create_timestamp,device_id,user_id", 50 | "pipelines.trigger.interval"="1 hour") 51 | AS 52 | SELECT 53 | id::bigint AS Id, 54 | device_id::integer AS device_id, 55 | user_id::integer AS user_id, 56 | calories_burnt::decimal(10,2) AS calories_burnt, 57 | miles_walked::decimal(10,2) AS miles_walked, 
58 | num_steps::decimal(10,2) AS num_steps, 59 | timestamp::timestamp AS timestamp, 60 | value AS value 61 | FROM cloud_files("/databricks-datasets/iot-stream/data-device/", "json") 62 | -- First 2 params of cloud_files are always input file path and format, then rest are map object of optional params 63 | -- To make incremental - Add STREAMING keyword before LIVE TABLE 64 | ; 65 | 66 | 67 | 68 | -- COMMAND ---------- 69 | 70 | -- MAGIC %md 71 | -- MAGIC 72 | -- MAGIC ## Process Change data with updates or deletes 73 | -- MAGIC API Docs: https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html 74 | -- MAGIC 75 | -- MAGIC 76 | -- MAGIC ### Automatically store change as SCD 1 or SCD 2 Type changes 77 | -- MAGIC 78 | -- MAGIC SCD 1/2 Docs: https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#language-sql 79 | 80 | -- COMMAND ---------- 81 | 82 | -- DBTITLE 1,Incremental upsert data into target silver layer 83 | -- Create and populate the target table. 84 | CREATE OR REFRESH STREAMING LIVE TABLE silver_sensors 85 | ( 86 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 87 | device_id INT, 88 | user_id INT, 89 | calories_burnt DECIMAL(10,2), 90 | miles_walked DECIMAL(10,2), 91 | num_steps DECIMAL(10,2), 92 | timestamp TIMESTAMP, 93 | value STRING) 94 | TBLPROPERTIES("delta.targetFileSize"="128mb", 95 | "quality"="silver", 96 | "pipelines.autoOptimize.managed"="true", 97 | "pipelines.autoOptimize.zOrderCols"="create_timestamp,device_id,user_id", 98 | "pipelines.trigger.interval"="1 hour" 99 | ); 100 | 101 | -- COMMAND ---------- 102 | 103 | -- DBTITLE 1,Actually run CDC Transformation Operation 104 | APPLY CHANGES INTO 105 | LIVE.silver_sensors 106 | FROM 107 | STREAM(LIVE.bronze_sensors) -- use STREAM to get change feed, use LIVE to get DAG source table 108 | KEYS 109 | (user_id, device_id) -- Identical to the ON statement in MERGE, can be 1 of many keys 110 | --APPLY AS DELETE WHEN 111 | -- operation = "DELETE" --Need if you have a operation columnd that specifies "APPEND"/"UPDATE"/"DELETE" like true CDC data 112 | SEQUENCE BY 113 | timestamp 114 | COLUMNS * EXCEPT 115 | (Id) --For auto increment keys, exclude the updates cause you dont want to replace Ids of auto_id columns 116 | -- Optionally exclude columns like metadata or operation types, by default, UPDATE * is the operation 117 | STORED AS 118 | SCD TYPE 1 -- [SCD TYPE 2] will expire updated originals 119 | 120 | -- COMMAND ---------- 121 | 122 | -- MAGIC %md 123 | -- MAGIC 124 | -- MAGIC ## FULL REFRESH EXAMPLE - Ingest Full User Data Set Each Load 125 | 126 | -- COMMAND ---------- 127 | 128 | -- DBTITLE 1,FulltIngest Raw User Data 129 | CREATE OR REPLACE STREAMING LIVE TABLE silver_users 130 | ( -- REPLACE truncates the checkpoint each time and loads from scratch every time 131 | userid BIGINT GENERATED BY DEFAULT AS IDENTITY, 132 | gender STRING, 133 | age INT, 134 | height DECIMAL(10,2), 135 | weight DECIMAL(10,2), 136 | smoker STRING, 137 | familyhistory STRING, 138 | cholestlevs STRING, 139 | bp STRING, 140 | risk DECIMAL(10,2), 141 | update_timestamp TIMESTAMP, 142 | CONSTRAINT has_user EXPECT (userid IS NOT NULL) ON VIOLATION DROP ROW 143 | ) 144 | TBLPROPERTIES("delta.targetFileSize"="128mb", 145 | "quality"="silver", 146 | "pipelines.autoOptimize.managed"="true", 147 | "pipelines.autoOptimize.zOrderCols"="userid", 148 | "pipelines.trigger.interval"="1 day" 149 | ) 150 | AS (SELECT 151 | userid::bigint AS userid, 152 | gender AS gender, 153 | age::integer AS age, 
154 | height::decimal(10,2) AS height, 155 | weight::decimal(10,2) AS weight, 156 | smoker AS smoker, 157 | familyhistory AS familyhistory, 158 | cholestlevs AS cholestlevs, 159 | bp AS bp, 160 | risk::decimal(10,2) AS risk, 161 | current_timestamp() AS update_timestamp 162 | FROM cloud_files("/databricks-datasets/iot-stream/data-user/","csv", map( 'header', 'true')) 163 | ) 164 | ; 165 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 4 - Create Gold Layer Analytics Tables.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC ## Create Gold Layer Tables that aggregate and clean up the data for BI / ML 5 | 6 | -- COMMAND ---------- 7 | 8 | CREATE OR REPLACE TABLE iot_dashboard.hourly_summary_statistics 9 | AS 10 | SELECT user_id, 11 | date_trunc('hour', timestamp) AS HourBucket, 12 | AVG(num_steps)::float AS AvgNumStepsAcrossDevices, 13 | AVG(calories_burnt)::float AS AvgCaloriesBurnedAcrossDevices, 14 | AVG(miles_walked)::float AS AvgMilesWalkedAcrossDevices 15 | FROM iot_dashboard.silver_sensors 16 | GROUP BY user_id,date_trunc('hour', timestamp) 17 | ORDER BY HourBucket; 18 | 19 | 20 | CREATE OR REPLACE TABLE iot_dashboard.smoothed_hourly_statistics 21 | AS 22 | SELECT *, 23 | -- Number of Steps 24 | (avg(`AvgNumStepsAcrossDevices`) OVER ( 25 | ORDER BY `HourBucket` 26 | ROWS BETWEEN 27 | 4 PRECEDING AND 28 | CURRENT ROW 29 | )) ::float AS SmoothedNumSteps4HourMA, -- 4 hour moving average 30 | 31 | (avg(`AvgNumStepsAcrossDevices`) OVER ( 32 | ORDER BY `HourBucket` 33 | ROWS BETWEEN 34 | 24 PRECEDING AND 35 | CURRENT ROW 36 | ))::float AS SmoothedNumSteps12HourMA --24 hour moving average 37 | , 38 | -- Calories Burned 39 | (avg(`AvgCaloriesBurnedAcrossDevices`) OVER ( 40 | ORDER BY `HourBucket` 41 | ROWS BETWEEN 42 | 4 PRECEDING AND 43 | CURRENT ROW 44 | ))::float AS SmoothedCalsBurned4HourMA, -- 4 hour moving average 45 | 46 | (avg(`AvgCaloriesBurnedAcrossDevices`) OVER ( 47 | ORDER BY `HourBucket` 48 | ROWS BETWEEN 49 | 24 PRECEDING AND 50 | CURRENT ROW 51 | ))::float AS SmoothedCalsBurned12HourMA --24 hour moving average, 52 | , 53 | -- Miles Walked 54 | (avg(`AvgMilesWalkedAcrossDevices`) OVER ( 55 | ORDER BY `HourBucket` 56 | ROWS BETWEEN 57 | 4 PRECEDING AND 58 | CURRENT ROW 59 | ))::float AS SmoothedMilesWalked4HourMA, -- 4 hour moving average 60 | 61 | (avg(`AvgMilesWalkedAcrossDevices`) OVER ( 62 | ORDER BY `HourBucket` 63 | ROWS BETWEEN 64 | 24 PRECEDING AND 65 | CURRENT ROW 66 | ))::float AS SmoothedMilesWalked12HourMA --24 hour moving average 67 | FROM iot_dashboard.hourly_summary_statistics 68 | 69 | -- COMMAND ---------- 70 | 71 | -- DBTITLE 1,Build Visuals in DBSQL, Directly in Notebook, or in any BI tool! 72 | SELECT * FROM iot_dashboard.smoothed_hourly_statistics WHERE user_id = 1 73 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 7 - COPY INTO Loading Patterns.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Materlized Views 5 | # MAGIC 6 | # MAGIC Patterns and Best Practices 7 | # MAGIC 8 | # MAGIC 9 | # MAGIC 1. Create Materialized View 10 | # MAGIC 2. Optimize Materialized View 11 | # MAGIC 3. Check / Monitor Performance of MV 12 | # MAGIC 4. 
When to NOT use MVs 13 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 8 - Liquid Clustering Delta Tables.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Deep Dive on Liquid Clustering Delta Tables 5 | # MAGIC 6 | # MAGIC ### Topics 7 | # MAGIC 8 | # MAGIC 1. How to create and optimize liquid tables 9 | # MAGIC 2. How to merge/update/delete data from liquid tables 10 | # MAGIC 3. VACUUM/PURGE/REORG on Liqiud tables 11 | # MAGIC 4. Performance Measurement 12 | # MAGIC 5. When to use ZORDER/Partitions vs Liquid 13 | # MAGIC 6. Liquid Limitations 14 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 9 - Using SQL Functions.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # SQL Functions Topic Deep Dive 3 | 4 | ## Topics 5 | 6 | 1. How to use SQL functions 7 | 2. Different languages - Python/SQL 8 | 3. Variables, etc. 9 | 4. Using Models in SQL functions 10 | 5. AI Functions 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Cody Austin Davis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # edw-best-practices 2 | ## Git Repo for EDW Best Practice Assets on the Lakehouse 3 | 4 | This Git Project Provides a framework of example notebooks that aims to show any typical data warehousing SQL users how to built pipelines and analytics on the Lakeshouse. Broken out in 4 steps, the notebooks will walk the user through a single use case that they can run in their own Databricks environment leading them through the data maturity curve as follows: 5 | 6 |
        • 1. Step 1 - Build a classical batch-oriented SQL pipeline with best practices on the Lakehouse 7 | 8 |
        • 2. Step 2 - Build the above in Delta Live Tables and automate all orchestration 9 | 10 |
        • 3. Step 3 - Build and analyze summary analytics tables 11 | 12 |
        • 4. Step 4 - Create gold views 13 | 14 |
        • 5. Step 5 - Convert an old batch pipeline to a Streaming pipeline 15 | 16 | 17 | This Git repo also provides some examples of more advanced use cases like using the Delta Change Data Feed. 18 | 19 | This Git repo also provides some helper functions that make ETL easier in production pipelines. 20 | 21 | -------------------------------------------------------------------------------- /Realtime Data Apps Workshop/Step 0 - Real Time Data Generator Simulator.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ### This notebook simulates a real-time feed from an IoT Device 5 | # MAGIC 6 | # MAGIC Notes: 7 | # MAGIC 8 | # MAGIC
        • 1. Starts with an initial batch of the earliest data from the databricks-datasets/iot-stream 9 | # MAGIC
        • 2. Allows user to truncate and reload simulated streaming data 10 | # MAGIC
        • 3. Allows user to decide how often to drop files to simulate different update frequencies 11 | 12 | # COMMAND ---------- 13 | 14 | spark.conf.set("spark.sql.shuffle.partitions", "32") 15 | 16 | # COMMAND ---------- 17 | 18 | from pyspark.sql.functions import * 19 | 20 | # COMMAND ---------- 21 | 22 | # DBTITLE 1,Define Source and Sink Paths 23 | source_data_path = "/databricks-datasets/iot-stream/data-device/" 24 | target_data_path = "dbfs:/Filestore/real-time-data-demo/iot_dashboard/" 25 | 26 | # COMMAND ---------- 27 | 28 | # DBTITLE 1,Get all records, order by timestamp, and drop 1 at time 29 | df = (spark.read.json(source_data_path).orderBy("timestamp") 30 | .withColumn("second", date_trunc("second", col("timestamp"))) 31 | ) 32 | 33 | # COMMAND ---------- 34 | 35 | dbutils.widgets.text("Second Frequency (Integer)", "1") 36 | dbutils.widgets.text("Starting Record Batch Size", "1000") 37 | dbutils.widgets.dropdown("Start Over Each Run", "Yes", ["Yes", "No"]) 38 | dbutils.widgets.text("Records Per Trigger (Integer):", "1000") 39 | dbutils.widgets.dropdown("Run Mode", "Real Time", ["Real Time", "Historical Stream"]) 40 | 41 | run_mode = dbutils.widgets.get("Run Mode") 42 | start_over = dbutils.widgets.get("Start Over Each Run") 43 | drop_periodicity = int(dbutils.widgets.get("Second Frequency (Integer)")) 44 | start_batch_size = int(dbutils.widgets.get("Starting Record Batch Size")) 45 | records_per_trigger = int(dbutils.widgets.get("Records Per Trigger (Integer):")) 46 | 47 | print(f"Run Mode: {run_mode}... \n Generating {records_per_trigger} records every {drop_periodicity} seconds starting with {start_batch_size} records. \n Start over each run?: {start_over}") 48 | 49 | # COMMAND ---------- 50 | 51 | from pyspark.sql import Window 52 | from pyspark.sql.functions import * 53 | import time 54 | 55 | # COMMAND ---------- 56 | 57 | # DBTITLE 1,Sort Data to Drop files in order of timeframe to simulate real-time 58 | historical_overSpec = Window.orderBy("timestamp") 59 | realtime_overSpec = Window.orderBy("second") 60 | 61 | prepped_df = (df.withColumn("row_num", row_number().over(historical_overSpec)) ## For 62 | .withColumn("sec_rank", dense_rank().over(realtime_overSpec)) 63 | ) 64 | 65 | # COMMAND ---------- 66 | 67 | # DBTITLE 1,Write Starting Batch to get initial state 68 | ## Start over each time 69 | 70 | if start_over == "Yes": 71 | print("Truncating and reloading source data...") 72 | dbutils.fs.rm(target_data_path, recurse=True) 73 | 74 | 75 | ## Write initial batch size 76 | 77 | if run_mode == "Historical Stream": 78 | 79 | ## This separates data in batches by #rows 80 | initial_batch = prepped_df.filter(col("row_num") <= lit(start_batch_size)).select("value").coalesce(1) 81 | initial_batch.write.text(f"{target_data_path}initial_batch_0_{start_batch_size}.json") 82 | 83 | elif run_mode == "Real Time": 84 | 85 | # This separates data in batches by seconds 86 | initial_batch = prepped_df.filter(col("sec_rank") <= lit(start_batch_size)).select("value").coalesce(1) 87 | initial_batch.write.text(f"{target_data_path}initial_batch_0_{start_batch_size}.json") 88 | 89 | # COMMAND ---------- 90 | 91 | # DBTITLE 1,Load Incremental Records in order of timestamp after initial batch 92 | if run_mode == "Historical Stream": 93 | 94 | max_val = prepped_df.agg(max("row_num")).collect()[0][0] 95 | batches = list(range(start_batch_size, max_val, records_per_trigger)) 96 | 97 | 98 | coalesced_prepped_df = prepped_df.coalesce(1) 99 | 100 | for i, j in enumerate(batches): 101 | 102 | 
print(i) 103 | print(f"Dropping batch {i} from records {j} --> {batches[i+1]}") 104 | 105 | start_rec = j 106 | end_rec = batches[i+1] 107 | 108 | incremental_df = (coalesced_prepped_df 109 | .filter((col("row_num") > lit(start_rec)) & (col("row_num") <= lit(end_rec))) 110 | .coalesce(1) 111 | .orderBy("row_num").select("value") 112 | ) 113 | incremental_df.write.text(f"{target_data_path}batch_{i}_from_{start_rec}_to_{end_rec}.json") 114 | 115 | time.sleep(drop_periodicity) 116 | 117 | 118 | elif run_mode == "Real Time": 119 | 120 | max_val = prepped_df.agg(max("sec_rank")).collect()[0][0] 121 | 122 | ## Dropping X seconds of data at a time proportional to the real drop rate 123 | batches = list(range(start_batch_size, max_val, drop_periodicity)) 124 | 125 | 126 | coalesced_prepped_df = prepped_df.coalesce(1) 127 | 128 | for i, j in enumerate(batches): 129 | 130 | print(i) 131 | print(f"Dropping batch {i} from records {j} --> {batches[i+1]}") 132 | 133 | start_rec = j 134 | end_rec = batches[i+1] 135 | 136 | incremental_df = (coalesced_prepped_df 137 | .filter((col("sec_rank") > lit(start_rec)) & (col("sec_rank") <= lit(end_rec))) 138 | .coalesce(1) 139 | .orderBy("sec_rank").select("value") 140 | ) 141 | incremental_df.write.text(f"{target_data_path}batch_{i}_from_{start_rec}_to_{end_rec}.json") 142 | 143 | time.sleep(drop_periodicity) 144 | -------------------------------------------------------------------------------- /Realtime Data Apps Workshop/Step 2 - Create Gold Views for App Layer.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC ## Building Production Data Apps - Last Mile BI on Databricks and Dash 5 | -- MAGIC 6 | -- MAGIC Dash apps: https://dash.gallery/Portal/ 7 | -- MAGIC 8 | -- MAGIC 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %md 13 | -- MAGIC 14 | -- MAGIC 15 | -- MAGIC 16 | -- MAGIC 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md 21 | -- MAGIC 22 | -- MAGIC ## Dashboard Recommendations 23 | -- MAGIC 24 | -- MAGIC 1. Pushdown timestamp filters as much as possible (especially now that insert order is preserved) 25 | -- MAGIC 2. Bring back as little data as necessary 26 | -- MAGIC 3. 
Make the Lakehouse do all the work 27 | 28 | -- COMMAND ---------- 29 | 30 | -- DBTITLE 1,Generate View with Heavy Logic 31 | -- We can decide to build directly on bronze or on silver for higher quality data 32 | 33 | CREATE OR REPLACE VIEW real_time_iot_dashboard.gold_sensors 34 | AS 35 | ( 36 | WITH water_mark AS ((SELECT MAX(timestamp) FROM real_time_iot_dashboard.bronze_sensors)) 37 | 38 | SELECT timestamp, 39 | -- Number of Steps 40 | (avg(`num_steps`) OVER ( 41 | ORDER BY timestamp 42 | ROWS BETWEEN 43 | 15 PRECEDING AND 44 | CURRENT ROW 45 | )) ::float AS SmoothedNumSteps30SecondMA, -- 30 second moving average 46 | 47 | (avg(`num_steps`) OVER ( 48 | ORDER BY timestamp 49 | ROWS BETWEEN 50 | 60 PRECEDING AND 51 | CURRENT ROW 52 | ))::float AS SmoothedNumSteps120SecondMA,--120 second moving average, 53 | -- Calories Burnt 54 | (avg(`calories_burnt`) OVER ( 55 | ORDER BY timestamp 56 | ROWS BETWEEN 57 | 15 PRECEDING AND 58 | CURRENT ROW 59 | )) ::float AS SmoothedCaloriesBurnt30SecondMA, -- 30 second moving average 60 | 61 | (avg(`calories_burnt`) OVER ( 62 | ORDER BY timestamp 63 | ROWS BETWEEN 64 | 60 PRECEDING AND 65 | CURRENT ROW 66 | ))::float AS SmoothedCaloriesBurnt120SecondMA --120 second moving average 67 | FROM real_time_iot_dashboard.bronze_sensors 68 | WHERE timestamp >= ((SELECT * FROM water_mark) - INTERVAL '15 MINUTES') -- In real time, you would use current_timestamp, but this is synthetic old data 69 | ORDER BY timestamp DESC 70 | ) 71 | 72 | -- COMMAND ---------- 73 | 74 | CREATE OR REPLACE VIEW real_time_iot_dashboard.gold_sensors_stateful 75 | AS 76 | SELECT EventStart as timestamp, 77 | num_steps AS SmoothedNumSteps30SecondMA, -- 30 second moving average 78 | 79 | (avg(`num_steps`) OVER ( 80 | ORDER BY EventStart 81 | ROWS BETWEEN 82 | 30 PRECEDING AND 83 | CURRENT ROW 84 | ))::float AS SmoothedNumSteps120SecondMA,--120 second moving average, 85 | -- Calories Burnt 86 | calories_burnt AS SmoothedCaloriesBurnt30SecondMA, -- 30 second moving average 87 | 88 | (avg(`calories_burnt`) OVER ( 89 | ORDER BY EventStart 90 | ROWS BETWEEN 91 | 30 PRECEDING AND 92 | CURRENT ROW 93 | ))::float AS SmoothedCaloriesBurnt120SecondMA --120 second moving average 94 | FROM real_time_iot_dashboard.silver_sensors_stateful ss 95 | WHERE 96 | --Use partition pruning to ignore data as it ages 97 | ss.Date = ((SELECT MAX(Date) FROM real_time_iot_dashboard.silver_sensors_stateful)) 98 | AND ss.EventStart >= ((SELECT MAX(EventStart) FROM real_time_iot_dashboard.silver_sensors_stateful) - INTERVAL '15 MINUTES') 99 | ORDER BY timestamp DESC 100 | LIMIT 200 101 | 102 | -- COMMAND ---------- 103 | 104 | -- MAGIC %sql 105 | -- MAGIC 106 | -- MAGIC SELECT * FROM real_time_iot_dashboard.gold_sensors_stateful 107 | 108 | -- COMMAND ---------- 109 | 110 | -- DBTITLE 1,Example of Dashboard Client Side Query 111 | SELECT * 112 | FROM real_time_iot_dashboard.gold_sensors 113 | LIMIT 1000 114 | 115 | -- COMMAND ---------- 116 | 117 | -- DBTITLE 1,Embed this into a Dash Callback to create automatically refreshing tables that trigger when the table updates 118 | WITH log AS 119 | (DESCRIBE HISTORY real_time_iot_dashboard.bronze_sensors 120 | ), 121 | state AS ( 122 | SELECT 123 | version, 124 | timestamp, 125 | operation 126 | FROM log 127 | WHERE (timestamp >= current_timestamp() - INTERVAL '24 hours') 128 | AND operation IN ('MERGE', 'WRITE', 'DELETE', 'STREAMING UPDATE') 129 | ORDER By version DESC 130 | ), 131 | comparison AS ( 132 | SELECT DISTINCT 133 | s1.version, 134 | s1.timestamp, 135 | 
s1.operation, 136 | LAG(version) OVER (ORDER BY version) AS Previous_Version, 137 | LAG(timestamp) OVER (ORDER BY timestamp) AS Previous_Timestamp 138 | FROM state AS s1 139 | ORDER BY version DESC) 140 | 141 | SELECT 142 | date_trunc('hour', timestamp) AS HourBlock, 143 | AVG(timestamp::double - Previous_Timestamp::double) AS AvgUpdateFrequencyInSeconds 144 | FROM comparison 145 | GROUP BY date_trunc('hour', timestamp) 146 | ORDER BY HourBlock 147 | -------------------------------------------------------------------------------- /RedshiftDDLMigrator/Redshift DDL Migrator.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Redshift --> Databricks DDL Migrator 5 | # MAGIC 6 | # MAGIC ### v000 (PROTOTYPE) 7 | # MAGIC ### Author: Cody Austin Davis 8 | # MAGIC ### Date: 8/13/2022 9 | # MAGIC 10 | # MAGIC 11 | # MAGIC #### DEPENDENCIES: 12 | # MAGIC 13 | # MAGIC
        • 1. Must first create a table/view in Redshift that contains all historical DDL statements. This statement can be found from AWS here. You can name the table whatever you would like and supply the table name in this notebook. 14 | # MAGIC
        • 2. Must install the Redshift <> Databricks Jar file to the cluster on Databricks found here: Amazon Redshift Connector 15 | # MAGIC 16 | # MAGIC 17 | # MAGIC #### ROADMAP: 18 | # MAGIC 19 | # MAGIC
        • CALL OUT EDGE CASES: SUPER data type, timezones 20 | # MAGIC
        • Parse external tables - from Redshift 21 | # MAGIC
        • Edge data types (timezone, encoding, etc.) 22 | # MAGIC
        • Translate primary key generation object and automatically run the sync command (stretch goal) 23 | # MAGIC
        • Translate default values in DDL statements 24 | # MAGIC
        • Make Identity column generation more robust (translate increments, etc.) 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %pip install sqlparse 29 | # MAGIC %pip install sql-metadata 30 | 31 | # COMMAND ---------- 32 | 33 | import json 34 | import sqlparse 35 | from sql_metadata import Parser 36 | from pyspark.sql.functions import * 37 | 38 | # COMMAND ---------- 39 | 40 | redshift_user = dbutils.secrets.get(scope='rm_redshift', key = 'username') ## Supply your own secret values or raw keys here for username and password 41 | redshift_password = dbutils.secrets.get(scope='rm_redshift', key = 'password') 42 | 43 | # COMMAND ---------- 44 | 45 | hostname_redshift = '' 46 | port_redshift = '5439' 47 | tempdir_redshift_unloads = '' 48 | iam_role_redshift = '' 49 | database = "" 50 | print(f"Running testing off: {hostname_redshift}") 51 | 52 | # COMMAND ---------- 53 | 54 | # DBTITLE 1,Get DDL Admin View if not exists 55 | dbutils.widgets.text("Redshift DDL Table Name", "") 56 | redshift_table_name = dbutils.widgets.get("Redshift DDL Table Name") 57 | 58 | 59 | dbutils.widgets.text("Redshift Schemas to migrate(csv)", "") 60 | schemas_to_migrate = [i.strip() for i in dbutils.widgets.get("Redshift Schemas to migrate(csv)").split(",") if len(i) > 0] 61 | 62 | if len(schemas_to_migrate) == 0: 63 | schemas_to_migrate = "All" 64 | 65 | print(f"Extracting DDL from the following table: {redshift_table_name}") 66 | print(f"Migrating the following schemas: {schemas_to_migrate}") 67 | 68 | 69 | # COMMAND ---------- 70 | 71 | redshift_url = f"jdbc:redshift://{hostname_redshift}:{port_redshift}/{database}?user={redshift_user}&password={redshift_password}&ssl=true&sslfactory=org.postgresql.ssl.NonValidatingFactory" 72 | 73 | # COMMAND ---------- 74 | 75 | ## Pull and Aggregate mode recent DDL statement for all tables, and optionally filter for a set of schemas 76 | 77 | rsh_query = f"""SELECT LISTAGG(CASE WHEN LEN(RTRIM(ddl)) = 0 THEN ddl ELSE RTRIM(ddl) END) WITHIN GROUP (ORDER BY seq) as query_statement, schemaname, tablename 78 | FROM {redshift_table_name} GROUP BY schemaname, tablename""" 79 | 80 | 81 | 82 | if schemas_to_migrate == "All": 83 | view_create = (spark.read.format("com.databricks.spark.redshift") 84 | .option("url", redshift_url) 85 | .option("query", rsh_query) 86 | .option("tempdir", tempdir_redshift_unloads) 87 | .option("aws_iam_role", iam_role_redshift) 88 | .load() 89 | ) 90 | else: 91 | view_create = (spark.read.format("com.databricks.spark.redshift") 92 | .option("url", redshift_url) 93 | .option("query", rsh_query) 94 | .option("tempdir", tempdir_redshift_unloads) 95 | .option("aws_iam_role", iam_role_redshift) 96 | .load() 97 | .filter(col("schemaname").isin(*schemas_to_migrate)) 98 | ) 99 | 100 | # COMMAND ---------- 101 | 102 | spark.sql("""CREATE DATABASE IF NOT EXISTS redshift_migration;""") 103 | 104 | # COMMAND ---------- 105 | 106 | # MAGIC %md 107 | # MAGIC 108 | # MAGIC 109 | # MAGIC Output: 110 | # MAGIC 111 | # MAGIC {"", "optimize_command": ""},...} 112 | # MAGIC Query text, command Id, rawSql String, run timestamp, recency rank, table_name, clean DDL, clean OPTIMIZE command 113 | 114 | # COMMAND ---------- 115 | 116 | @udf("string") 117 | def getCreateStatementOnly(sqlString): 118 | try: 119 | resultStr = sqlString.partition("CREATE")[1] + sqlString.partition("CREATE")[2].partition(";")[0] 120 | return resultStr 121 | except: 122 | resultStr = '' 123 | return resultStr 124 | 125 | 126 | 127 | def getCreateStatementOnlyPython(sqlString): 128 | try: 129 | 
resultStr = sqlString.partition("CREATE")[1] + sqlString.partition("CREATE")[2].partition(";")[0] 130 | return resultStr 131 | except: 132 | resultStr = '' 133 | return resultStr 134 | 135 | # COMMAND ---------- 136 | 137 | # DBTITLE 1,Parsing Functions 138 | import re 139 | 140 | def get_table_name(tokens): 141 | for token in reversed(tokens): 142 | if token.ttype is None: 143 | return token.value 144 | return "" 145 | 146 | ## Get zorder cols from DIST and SORT KEYS 147 | 148 | ## Allow ZORDER cols to be empty (no ZORDER, just optimize) 149 | def get_zorder_cols(tokens): 150 | 151 | zorder_keys = [] 152 | dist_cols = [] 153 | sort_cols = [] 154 | for i, t in enumerate(tokens): 155 | 156 | if re.search('distkey', str(t).lower()): 157 | dc = str(tokens[i+1]) 158 | dist_cols = [i.strip() for i in re.sub("[\t\n]", "", dc[dc.find("(")+1:dc.find(")")]).split(",")] 159 | #print(f"found dist key! {dist_cols}") 160 | 161 | if re.search('sortkey', str(t).lower()): 162 | sc = str(tokens[i+1]) 163 | sort_cols = [i.strip() for i in re.sub("[\t\n]", "", sc[sc.find("(")+1:sc.find(")")]).split(",")] 164 | #print(f"found sort key! {sort_cols}") 165 | 166 | ## TO DO: Make need to automate the ordering of these cols since they will go into a Z ORDER 167 | 168 | zorder_keys = list(set(dist_cols + sort_cols)) 169 | 170 | return zorder_keys or [] 171 | 172 | ### See if columns is an identity column or not 173 | 174 | def is_identity_column(token): 175 | has_id_cols = False 176 | 177 | if re.search('identity', str(token).lower()): 178 | dc = str(token) 179 | has_id_cols = True 180 | return has_id_cols 181 | 182 | return has_id_cols 183 | 184 | 185 | 186 | ## Spark UDF function 187 | @udf("string") 188 | def getDDLFromSQLString(sqlString): 189 | 190 | cleanSqlString = getCreateStatementOnlyPython(sqlString) 191 | parse = sqlparse.parse(cleanSqlString) 192 | 193 | ## For each statement in the sql string (can be thousands, parse SQL String and built DDL expression and optimize statement) 194 | final_ddl_json = {} 195 | 196 | try: 197 | for stmt in parse: 198 | # Get all the tokens except whitespaces 199 | tokens = [t for t in sqlparse.sql.TokenList(stmt.tokens) if t.ttype != sqlparse.tokens.Whitespace] 200 | is_create_stmt = False 201 | 202 | zorder_cols = get_zorder_cols(tokens) 203 | 204 | for i, token in enumerate(tokens): 205 | # Check if create statement 206 | if token.match(sqlparse.tokens.DDL, 'CREATE'): 207 | is_create_stmt = True 208 | continue 209 | 210 | 211 | # If it was a create statement and the current token starts with "(" 212 | if is_create_stmt and token.value.startswith("("): 213 | # Get the table name by looking at the tokens in reverse order till you find 214 | # a token with None type 215 | 216 | ## Get Table Info 217 | table_name = get_table_name(tokens[:i]) 218 | #print (f"table: {table_name}") 219 | 220 | ### Get Column Info 221 | txt = token.value 222 | 223 | ## Split on comma but only if not in parentheses (eg. 
NUMERIC(10,2)) 224 | s = txt[1:txt.rfind(")")].replace("\n","") 225 | #columns = re.split(r',\s*(?![^()]*\))', s) 226 | columns = re.split(r"(?<=[^\d+()]),(?![^()]*\))", s) 227 | 228 | ## Prep for rebuilding SQL String 229 | target_ddl_array = [] 230 | 231 | for column in columns: 232 | c = ' '.join(column.split()).split() 233 | c_name = c[0].replace('\"',"") 234 | c_type = c[1] # For condensed type information 235 | 236 | c_type_full = " ".join(c[1:]) # For detailed type information ## Do not do this for stage 1 of migrator 237 | ## Check for identity generation column 238 | is_id = is_identity_column(c_type_full) 239 | 240 | ## Make identity column if id col found in Redshift 241 | ## !!! USER MUST RUN ID SYNC WHEN MOVING ACTUAL EXISTING IDS ON FIRST BACKFILL FROM REDSHIFT!!! 242 | if is_id is True: 243 | c_type = "BIGINT" + " GENERATED BY DEFAULT AS IDENTITY" 244 | 245 | #print (f"column: {c_name}") 246 | #print (f"date type: {c_type}") 247 | 248 | ## Rebuild String for DBX 249 | clean_col = c_name + " " + c_type 250 | 251 | if clean_col.lower() == 'primary key': 252 | pass 253 | else: 254 | target_ddl_array.append(clean_col) 255 | 256 | #print(f"Table columns: {target_ddl_array}") 257 | #print(f"Z ORDER Columns: {zorder_cols}") 258 | 259 | ## Build entire statement 260 | full_ddl_string = f"CREATE TABLE IF NOT EXISTS {table_name} ({','.join(target_ddl_array)});" 261 | 262 | if len(zorder_cols) >= 1: 263 | full_optimize_string = f"OPTIMIZE {table_name} ZORDER BY ({','.join(zorder_cols)});" 264 | else: 265 | full_optimize_string = f"OPTIMIZE {table_name};" 266 | 267 | #print(full_ddl_string) 268 | #print(full_optimize_string) 269 | #print ("---"*20) 270 | 271 | final_ddl_json = {"table_name": table_name, "ddl": full_ddl_string, "optimize_command": full_optimize_string} 272 | 273 | break 274 | except: 275 | pass 276 | 277 | return json.dumps(final_ddl_json) 278 | 279 | # COMMAND ---------- 280 | 281 | # MAGIC %md 282 | # MAGIC 283 | # MAGIC ## TO DO: 284 | # MAGIC 285 | # MAGIC 1. Pull out database and table from results 286 | # MAGIC 2. Get most recent DDL statement for each table 287 | # MAGIC 3. Write command to auto run commands and migrate entire DDL in 1 command 288 | # MAGIC 4. 
Make incremental and merge results into target table 289 | 290 | # COMMAND ---------- 291 | 292 | (view_create.withColumn("ParsedDDL", getDDLFromSQLString(col("query_statement"))) 293 | ## Get most recent table ddl command 294 | ## merge into target table (just truncating and reloading right now) 295 | ## Add separate command to run all statements 296 | .write 297 | .format("delta") 298 | .option("overwriteSchema", "true") 299 | .mode("overwrite") 300 | .saveAsTable("redshift_migration.redshift_ddl_to_databricks") 301 | ) 302 | 303 | # COMMAND ---------- 304 | 305 | spark.sql("""SELECT query_statement, ParsedDDL:ddl, ParsedDDL:optimize_command FROM redshift_migration.redshift_ddl_to_databricks""") 306 | -------------------------------------------------------------------------------- /Using DBSQL Serverless Client Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install -r helperfunctions/requirements.txt 3 | 4 | # COMMAND ---------- 5 | 6 | from helperfunctions.dbsqlclient import ServerlessClient 7 | 8 | # COMMAND ---------- 9 | 10 | # DBTITLE 1,Example Inputs For Client 11 | 12 | 13 | token = None ## optional 14 | host_name = None ## optional 15 | warehouse_id = "475b94ddc7cd5211" 16 | 17 | ## Single Query Example 18 | sql_statement = "SELECT concat_ws('-', M.id, N.id, random()) as ID FROM range(1000) AS M, range(1000) AS N LIMIT 10000000" 19 | 20 | ## Multi Query Example 21 | multi_statement = "SELECT 1; SELECT 2; SELECT concat_ws('-', M.id, N.id, random()) as ID FROM range(1000) AS M, range(1000) AS N LIMIT 10000000" 22 | 23 | # COMMAND ---------- 24 | 25 | serverless_client = ServerlessClient(warehouse_id = warehouse_id, token=token, host_name=host_name) ## token=, host_name=verbose=True for print statements and other debugging messages 26 | 27 | # COMMAND ---------- 28 | 29 | # DBTITLE 1,Basic sql drop-in command 30 | """ 31 | Optional Params: 32 | 1. full_results 33 | 2. use_catalog = - this is a command specific USE CATALOG statement for the single SQL command 34 | 3. use_schema = - this is a command specific USE SCHEMA 35 | 36 | """ 37 | 38 | result_df = serverless_client.sql(sql_statement = sql_statement) ## OPTIONAL: use_catalog="hive_metastore", use_schema="default" 39 | 40 | # COMMAND ---------- 41 | 42 | # DBTITLE 1,Multi Statement Command - No Results just Status - Recommended for production 43 | """ 44 | Optional Params: 45 | 1. full_results 46 | 2. use_catalog = - this is a command specific USE CATALOG statement for the single SQL command 47 | 3. use_schema = - this is a command specific USE SCHEMA 48 | 49 | """ 50 | 51 | result = serverless_client.submit_multiple_sql_commands(sql_statements = multi_statement, full_results=False) #session_catalog, session_schema are also optional parameters that will simulate a USE statement. 
True full_results just returns the whole API response for each query 52 | 53 | # COMMAND ---------- 54 | 55 | # DBTITLE 1,Multi Statement Command Returning Results of Last Command - Best for simple processes 56 | result_multi_df = serverless_client.submit_multiple_sql_commands_last_results(sql_statements = multi_statement) 57 | 58 | # COMMAND ---------- 59 | 60 | display(result_multi_df) 61 | 62 | # COMMAND ---------- 63 | 64 | # DBTITLE 1,If Multi Statement Fails, this is how to access the result chain 65 | ## The function save the state of each command in the chain, even if it fails to return results for troubleshooting 66 | 67 | last_saved_multi_statement_state = serverless_client.multi_statement_result_state 68 | print(last_saved_multi_statement_state) 69 | -------------------------------------------------------------------------------- /Using DBSQL Serverless Transaction Manager Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install -r helperfunctions/requirements.txt 3 | 4 | # COMMAND ---------- 5 | 6 | from helperfunctions.dbsqltransactions import DBSQLTransactionManager 7 | 8 | # COMMAND ---------- 9 | 10 | # DBTITLE 1,Example Inputs For Client 11 | token = None ## optional 12 | host_name = None ## optional 13 | warehouse_id = "475b94ddc7cd5211" 14 | 15 | # COMMAND ---------- 16 | 17 | # DBTITLE 1,Example Multi Statement Transaction 18 | sqlString = """ 19 | USE CATALOG hive_metastore; 20 | 21 | CREATE SCHEMA IF NOT EXISTS iot_dashboard; 22 | 23 | USE SCHEMA iot_dashboard; 24 | 25 | -- Create Tables 26 | CREATE OR REPLACE TABLE iot_dashboard.bronze_sensors 27 | ( 28 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 29 | device_id INT, 30 | user_id INT, 31 | calories_burnt DECIMAL(10,2), 32 | miles_walked DECIMAL(10,2), 33 | num_steps DECIMAL(10,2), 34 | timestamp TIMESTAMP, 35 | value STRING 36 | ) 37 | USING DELTA 38 | TBLPROPERTIES("delta.targetFileSize"="128mb"); 39 | 40 | CREATE OR REPLACE TABLE iot_dashboard.silver_sensors 41 | ( 42 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 43 | device_id INT, 44 | user_id INT, 45 | calories_burnt DECIMAL(10,2), 46 | miles_walked DECIMAL(10,2), 47 | num_steps DECIMAL(10,2), 48 | timestamp TIMESTAMP, 49 | value STRING 50 | ) 51 | USING DELTA 52 | PARTITIONED BY (user_id) 53 | TBLPROPERTIES("delta.targetFileSize"="128mb"); 54 | 55 | -- Statement 1 -- the load 56 | COPY INTO iot_dashboard.bronze_sensors 57 | FROM (SELECT 58 | id::bigint AS Id, 59 | device_id::integer AS device_id, 60 | user_id::integer AS user_id, 61 | calories_burnt::decimal(10,2) AS calories_burnt, 62 | miles_walked::decimal(10,2) AS miles_walked, 63 | num_steps::decimal(10,2) AS num_steps, 64 | timestamp::timestamp AS timestamp, 65 | value AS value -- This is a JSON object 66 | FROM "/databricks-datasets/iot-stream/data-device/") 67 | FILEFORMAT = json 68 | COPY_OPTIONS('force'='true') -- 'false' -- process incrementally 69 | --option to be incremental or always load all files 70 | ; 71 | 72 | -- Statement 2 73 | MERGE INTO iot_dashboard.silver_sensors AS target 74 | USING (SELECT Id::integer, 75 | device_id::integer, 76 | user_id::integer, 77 | calories_burnt::decimal, 78 | miles_walked::decimal, 79 | num_steps::decimal, 80 | timestamp::timestamp, 81 | value::string 82 | FROM iot_dashboard.bronze_sensors) AS source 83 | ON source.Id = target.Id 84 | AND source.user_id = target.user_id 85 | AND source.device_id = target.device_id 86 | WHEN MATCHED THEN UPDATE SET 87 | 
target.calories_burnt = source.calories_burnt, 88 | target.miles_walked = source.miles_walked, 89 | target.num_steps = source.num_steps, 90 | target.timestamp = source.timestamp 91 | WHEN NOT MATCHED THEN INSERT *; 92 | 93 | OPTIMIZE iot_dashboard.silver_sensors ZORDER BY (timestamp); 94 | 95 | -- This calculate table stats for all columns to ensure the optimizer can build the best plan 96 | -- Statement 3 97 | 98 | ANALYZE TABLE iot_dashboard.silver_sensors COMPUTE STATISTICS FOR ALL COLUMNS; 99 | 100 | CREATE OR REPLACE TABLE hourly_summary_statistics 101 | AS 102 | SELECT user_id, 103 | date_trunc('hour', timestamp) AS HourBucket, 104 | AVG(num_steps)::float AS AvgNumStepsAcrossDevices, 105 | AVG(calories_burnt)::float AS AvgCaloriesBurnedAcrossDevices, 106 | AVG(miles_walked)::float AS AvgMilesWalkedAcrossDevices 107 | FROM silver_sensors 108 | GROUP BY user_id,date_trunc('hour', timestamp) 109 | ORDER BY HourBucket; 110 | 111 | -- Statement 4 112 | -- Truncate bronze batch once successfully loaded 113 | TRUNCATE TABLE bronze_sensors; 114 | """ 115 | 116 | # COMMAND ---------- 117 | 118 | serverless_client_t = DBSQLTransactionManager(warehouse_id = warehouse_id, mode="inferred_altered_tables") ## token=, host_name=verbose=True for print statements and other debugging messages 119 | 120 | # COMMAND ---------- 121 | 122 | # DBTITLE 1,Submitting the Multi Statement Transaction to Serverless SQL Warehouse 123 | """ 124 | PARAMS: 125 | warehouse_id --> Required, the SQL warehouse to submit statements 126 | mode -> selected_tables, inferred_altered_tables 127 | token --> optional, will try to get one for the user 128 | host_name --> optional, will try to infer same workspace url 129 | 130 | 131 | execute_sql_transaction params: 132 | return_type --> "message", "last_results". "message" will return status of query chain. "last_result" will run all statements and return the last results of the final query in the chain 133 | 134 | """ 135 | 136 | result_df = serverless_client_t.execute_dbsql_transaction(sql_string = sqlString) 137 | -------------------------------------------------------------------------------- /Using Delta Helpers Notebook Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Using Delta Helpers Materialization Class. 5 | # MAGIC 6 | # MAGIC This class is for the purpose of materializing tables with delta onto cloud storage. This is often helpful for debugging and for simplifying longer, more complex query pipelines that would otherwise require highly nested CTE statements. Often times, the plan is simplified and performane is improved by removing the lazy evaluation and creating "checkpoint" steps with a materialized temp_db. Currently spark temp tables are NOT materialized, and thus not evaluated until called which is identical to a subquery. 7 | # MAGIC 8 | # MAGIC #### Initialization 9 | # MAGIC 10 | # MAGIC
        • deltaHelpers = DeltaHelpers(temp_root_path= "dbfs:/delta_temp_db", db_name="delta_temp") - The parameters shown are the defaults and can be changed to a custom db name or S3 path 11 | # MAGIC 12 | # MAGIC #### There are 4 methods: 13 | # MAGIC 14 | # MAGIC
        • createOrReplaceTempDeltaTable(df: DataFrame, table_name: String) - This creates or replaces a materialized Delta table in the default DBFS location or in your provided S3 path 15 | # MAGIC
        • appendToTempDeltaTable(df: DataFrame, table_name: String) - This appends to an existing Delta table, or creates a new one if it does not exist, in DBFS or your provided S3 path 16 | # MAGIC
        • removeTempDeltaTable(table_name) - This removes the delta table from your delta_temp database session 17 | # MAGIC
        • removeAllTempTablesForSession() - This truncates the initialized temp_db session. It does NOT run a DROP DATABASE command because the database can be global. It only removes the session path it creates. 18 | 19 | # COMMAND ---------- 20 | 21 | # MAGIC %pip install -r helperfunctions/requirements.txt 22 | 23 | # COMMAND ---------- 24 | 25 | # DBTITLE 1,Import 26 | from helperfunctions.deltahelpers import DeltaHelpers 27 | 28 | # COMMAND ---------- 29 | 30 | # DBTITLE 1,Initialize 31 | ## 2 Params [Optional - db_name, temp_root_path] 32 | deltaHelpers = DeltaHelpers() 33 | 34 | # COMMAND ---------- 35 | 36 | # DBTITLE 1,Create or Replace Temp Delta Table 37 | df = spark.read.format("json").load("/databricks-datasets/iot-stream/data-device/") 38 | 39 | ## Methods return the cached dataframe so you can continue on as needed without reloading source each time AND you can reference in SQL (better for foreachBatch) 40 | ## No longer lazy -- this calls an action 41 | df = deltaHelpers.createOrReplaceTempDeltaTable(df, "iot_data") 42 | 43 | ## Build ML Models 44 | 45 | display(df) 46 | 47 | # COMMAND ---------- 48 | 49 | # DBTITLE 1,Read cached table quickly in python or SQL 50 | # MAGIC %sql 51 | # MAGIC -- Read cahced table quickly in python or SQL 52 | # MAGIC SELECT * FROM delta_temp.iot_data 53 | 54 | # COMMAND ---------- 55 | 56 | df.count() 57 | 58 | # COMMAND ---------- 59 | 60 | # DBTITLE 1,Append to Temp Delta Table 61 | ## Data is 1,000,000 rows 62 | df_doubled = deltaHelpers.appendToTempDeltaTable(df, "iot_data") 63 | 64 | ## Be CAREFUL HERE! Since the function calls an action, it is NOT lazily evaluated. So running it multiple times can append the same data 65 | df_doubled.count() 66 | 67 | # COMMAND ---------- 68 | 69 | # MAGIC %sql 70 | # MAGIC 71 | # MAGIC DESCRIBE HISTORY delta_temp.iot_data 72 | 73 | # COMMAND ---------- 74 | 75 | # DBTITLE 1,Remove Temp Delta Table 76 | deltaHelpers.removeTempDeltaTable("iot_data") 77 | 78 | # COMMAND ---------- 79 | 80 | # MAGIC %sql 81 | # MAGIC 82 | # MAGIC SELECT * FROM delta_temp.iot_data 83 | 84 | # COMMAND ---------- 85 | 86 | # DBTITLE 1,Truncate Session 87 | ## Deletes all tables in session path but does not drop that delta_temp database 88 | deltaHelpers.removeAllTempTablesForSession() 89 | -------------------------------------------------------------------------------- /Using Delta Logger Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Delta Logger - How to use 5 | # MAGIC 6 | # MAGIC Purpose: This notebook utilizes the delta logger library to automatically and easiy log general pipeline information all in one place for any data pipeline. 
7 | # MAGIC 8 | # MAGIC All logger tables have a standard default schema DDL: 9 | # MAGIC 10 | # MAGIC CREATE TABLE IF NOT EXISTS {full_table_name} ( 11 | # MAGIC run_id BIGINT GENERATED BY DEFAULT AS IDENTITY, 12 | # MAGIC batch_id STRING, 13 | # MAGIC session_process_name STRING NOT NULL, 14 | # MAGIC process_name STRING NOT NULL, 15 | # MAGIC status STRING NOT NULL, -- RUNNING, FAIL, SUCCESS, STALE 16 | # MAGIC start_timestamp TIMESTAMP NOT NULL, 17 | # MAGIC end_timestamp TIMESTAMP, 18 | # MAGIC duration_seconds DECIMAL, 19 | # MAGIC duration_ms DECIMAL, 20 | # MAGIC run_metadata STRING, -- String formatted like JSON 21 | # MAGIC update_timestamp TIMESTAMP, 22 | # MAGIC update_date DATE GENERATED ALWAYS AS (update_timestamp::date), 23 | # MAGIC start_date DATE GENERATED ALWAYS AS (start_timestamp::date), 24 | # MAGIC end_date DATE GENERATED ALWAYS AS (end_timestamp::date) 25 | # MAGIC ) 26 | # MAGIC USING DELTA 27 | # MAGIC 28 | # MAGIC ## Overivew 29 | # MAGIC The Delta logger is organized into Sessions, Processes, and Runs. 30 | # MAGIC A session is just like a Spark Session. It is an attempt at running a particular job/task. It is scoped like an active session each time the delta_logger is initialized. 31 | # MAGIC 32 | # MAGIC A session can then have one or many proesses running inside it. This is to allow for nested tracking of specific actions/processes within a Databricks job/notebook. By default, if a custom process name is not provided when starting a run, the session_process_name = active_process_name. 33 | # MAGIC 34 | # MAGIC Then, each process (can be 1 or many for each session/batch) can perform a run. A run is the smallest atomic unit. It is an ever-incrementing attempt at running a process. Runs then have start times, status, end times, metadata, etc. 35 | # MAGIC 36 | # MAGIC 37 | # MAGIC ## Initialize 38 | # MAGIC delta_logger = DeltaLogger(logger_table="main.iot_dashboard.pipeline_logs", 39 | # MAGIC session_process_name="iot_pipeline", 40 | # MAGIC batch_id = None ## Optional - allows user to pass in custom session batch id, by default a uuid is created for measuring a session id. 41 | # MAGIC logger_location=None, ## Optional location of the underlying table. S3/ADLS/GCS/dbfs path. 42 | # MAGIC partition_col:[str] = None ## Optional list of custom partition columns for the table. Allows user to customerize their logger to their query and management needs. 43 | # MAGIC ) 44 | # MAGIC 45 | # MAGIC - logger_table is the logging table you want to store and reference. You can create and manage as many logger tables as you would like. If you initilize a DeltaLogger and that table does not exist, it will create it for you. 46 | # MAGIC - session_process_name OPTIONAL - Users can log events/runs and pass the process_name into each event, or they can simply define it at the session level this way. This will default to using the session_process_name passed in here for the whole session. It can be overridden anytime. You can also use this to log child processes within a session by starting/completing runs with additional process names while using the session process name as the parent. And example is provided in this notebook. 47 | # MAGIC - logger_location OPTIONAL - default = None. This is an override for specifying a specific object storage location for where the user wants the table to live. If not provided, it will be a managed table by default (recommended). 
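As a quick illustration of how initialization, watermarking, and run tracking fit together, here is a minimal, hedged sketch of an incremental MERGE wrapped in a logged run. It follows the parameter and method names used in this notebook and in the method list below (exact signatures may vary), and the bronze/silver sensor tables are just the example tables used elsewhere in this repo.

from helperfunctions.deltalogger import DeltaLogger

delta_logger = DeltaLogger(logger_table_name="main.iot_dashboard.logger")

delta_logger.start_run(process_name="incremental_merge")

try:
    ## Watermark: timestamp of the last successful run for this process
    ## (defaults to '1900-01-01 00:00:00' if the process has never succeeded)
    watermark_ts = delta_logger.get_last_successful_run_timestamp(process_name="incremental_merge")

    new_rows = spark.sql(f"SELECT * FROM main.iot_dashboard.bronze_sensors WHERE timestamp > '{watermark_ts}'")
    new_rows.createOrReplaceTempView("incremental_batch")

    spark.sql("""
      MERGE INTO main.iot_dashboard.silver_sensors AS target
      USING incremental_batch AS source
      ON source.Id = target.Id AND source.user_id = target.user_id AND source.device_id = target.device_id
      WHEN MATCHED THEN UPDATE SET *
      WHEN NOT MATCHED THEN INSERT *
    """)

    delta_logger.log_run_metric(run_metrics_dict={"Rows_Affected": new_rows.count()})
    delta_logger.complete_run(process_name="incremental_merge")

except Exception as e:
    delta_logger.fail_run(process_name="incremental_merge", msg=str(e))
    raise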
48 | # MAGIC 49 | # MAGIC ## Methods: 50 | # MAGIC 51 | # MAGIC For most methods: -- if process_name not provided, will use session. If cannot find process_name, will error. 52 | # MAGIC 53 | # MAGIC - create_logger() -- creates a logger table if not exists. This also optimizes the table since it is used in initlialization. 54 | # MAGIC - drop_logger() -- drops the logger table attached to the session 55 | # MAGIC - truncate_logger() -- clears an existing logger table 56 | # MAGIC - start_run(process_name: Optional, msg: Optional) 57 | # MAGIC - fail_run(process_name: Optional, msg: Optional) 58 | # MAGIC - complete_run(process_name: Optional, msg: Optional) 59 | # MAGIC - log_run_metric(process_name: Optional, run_metric_dic:dict[str]) 60 | # MAGIC - log_run_info(log_level = 'INFO', msg = None) 61 | # MAGIC - get_last_successful_run_id(proces_name: Optional) -- If no previous successful run, return -1 62 | # MAGIC - get_last_successful_run_timestamp(process_name: Optional) -- If no previous successful run for the process, defaults to "1900-01-01 00:00:00" 63 | # MAGIC - get_last_run_id(process_name: Optional) -- Get last run id regardless of status, if none return -1 64 | # MAGIC - get_last_run_timestamp(process_name: Optional) -- Get last run timestamp , If no previous run for the process, defaults to "1900-01-01 00:00:00" 65 | # MAGIC - get_last_failed_run_id(process_name: Optional) 66 | # MAGIC - get_last_failed_run_timestamp(prcoess_name: Optional) 67 | # MAGIC - clean_stale_runs(process_name: Optional) -- Will mark any runs without and end timestamp in the running state to "STALE" and give them an end timestamp. This ONLY happens when a new run is created and the runs are < the max existing RUNNING run id 68 | # MAGIC - optimize_log(process_name:Optional, zorderCols=["end_timestamp", "start_timestamp", "run_id"]) -- Optimizes the underlying log table for a particular process name a ZORDERs by input col list 69 | # MAGIC ### Limitations / Considerations 70 | # MAGIC 1. Currently supports 1 concurrent run per process_name for a given delta table. If you want to run concurrent pipelines, you need to create separate process names for them. This is meant to be a simple run and logging tracking solution for EDW pipelines. 71 | # MAGIC 72 | # MAGIC 2. User can pass in the fully qualified table name, use the spark session defaults, or pass in catalog and database overrides to the parameters. Pick one. 73 | # MAGIC 74 | 75 | # COMMAND ---------- 76 | 77 | # MAGIC %md 78 | # MAGIC 79 | # MAGIC ## Design Patterns In this Example 80 | # MAGIC 81 | # MAGIC 1. Use for Basic error handling, tracking of runs of various processes 82 | # MAGIC 2. Use for watermarking loading patterns. i.e. Creating a new run automatically pulls the most recent previous successful run and provide a "watermark" variable you can utilize for incremental loading. Use delta_logger.get_last_succes 83 | # MAGIC 3. Use with DBSQL Client and Transaction Manager Together for end to end 84 | 85 | # COMMAND ---------- 86 | 87 | # DBTITLE 1,Import Logger 88 | from helperfunctions.deltalogger import DeltaLogger 89 | 90 | # COMMAND ---------- 91 | 92 | # DBTITLE 1,Initialize a Delta Logger (creates logger table referenced in not exists) 93 | ## Session_process_name - Name for the session of a notebook. By default it is the notebook path. 94 | ## Session_batch_id - Id for the session. By default it is a generated uuid for each delta_logger initialization (session). Can customize to any string. 
95 | ## partition_cols - Customer partition columns for a Delta table. By default the partition columns are: ['start_date', 'session_process_name', 'process_name'] 96 | 97 | ## All date colummns are auto-generated columns that are based on the timestamp columns in the table. 98 | 99 | delta_logger = DeltaLogger(logger_table_name="main.iot_dashboard.logger") 100 | 101 | # COMMAND ---------- 102 | 103 | # DBTITLE 1,Start A Run 104 | ## process_name - Optional additionl / sub process name within session. By default process_name is the same as the session process name 105 | ## batch_id - Optional Batch Id 106 | 107 | delta_logger.start_run() 108 | 109 | 110 | # COMMAND ---------- 111 | 112 | # DBTITLE 1,Get Active Run Info 113 | print(delta_logger.session_process_name) 114 | print(delta_logger.active_process_name) 115 | print(delta_logger.active_run_id) 116 | print(delta_logger.active_run_end_ts) 117 | print(delta_logger.active_run_start_ts) 118 | print(delta_logger.active_run_status) 119 | print(delta_logger.active_run_metadata) 120 | 121 | # COMMAND ---------- 122 | 123 | # DBTITLE 1,Log a Custom Named Metrics to Reference in Queries 124 | ## Seems to cancel out metrics when a manual process id is defined 125 | 126 | delta_logger.log_run_metric(run_metrics_dict={"Rows_Affected": 10000, "Percent_Success": 1}) 127 | 128 | # COMMAND ---------- 129 | 130 | # DBTITLE 1,Watermarking Example Baked into Logger with Process Run Start Times 131 | watermark_ts = delta_logger.get_most_recent_success_run_start_time() 132 | 133 | print(watermark_ts) 134 | 135 | # COMMAND ---------- 136 | 137 | # DBTITLE 1,Log Run Info/Messages 138 | delta_logger.log_run_info(log_level='INFO', msg = "This step did some weird stuff") 139 | 140 | # COMMAND ---------- 141 | 142 | delta_logger.log_run_info(log_level='WARN', msg = "This step did some weird stuff") 143 | 144 | # COMMAND ---------- 145 | 146 | # DBTITLE 1,Complete a Run 147 | delta_logger.complete_run() 148 | ## delta_logger.fail_run() 149 | 150 | # COMMAND ---------- 151 | 152 | # MAGIC %md 153 | # MAGIC 154 | # MAGIC ### Run a custom child/sub process name run within a session 155 | 156 | # COMMAND ---------- 157 | 158 | ## Start a customer process name run within a session 159 | 160 | ## This starts a run with this sub-process and registers the process_name as the active process 161 | delta_logger.start_run(process_name='MERGE STEP') 162 | 163 | 164 | # COMMAND ---------- 165 | 166 | delta_logger.log_run_metric(run_metrics_dict={"Rows_Affected": 40124, "Percent_Success": 0.5}) 167 | 168 | # COMMAND ---------- 169 | 170 | delta_logger.complete_run(process_name='MERGE STEP') 171 | 172 | # COMMAND ---------- 173 | 174 | # MAGIC %md 175 | # MAGIC 176 | # MAGIC ## Analyze and Use the Logs! 
177 | 178 | # COMMAND ---------- 179 | 180 | # DBTITLE 1,Select From Logger in order of events DESC 181 | # MAGIC %sql 182 | # MAGIC 183 | # MAGIC SELECT * 184 | # MAGIC FROM main.iot_dashboard.logger 185 | # MAGIC ORDER BY run_id DESC 186 | 187 | # COMMAND ---------- 188 | 189 | # DBTITLE 1,Analyze Custom Logged Metrics 190 | # MAGIC %sql 191 | # MAGIC 192 | # MAGIC SELECT 193 | # MAGIC session_process_name, 194 | # MAGIC process_name, 195 | # MAGIC date_trunc('HOUR', start_timestamp) AS EventHour, 196 | # MAGIC AVG(run_metadata:Rows_Affected) AS AvgRowsProcessed -- We can use our custom metrics in SQL Queries and Dashboards 197 | # MAGIC FROM main.iot_dashboard.logger 198 | # MAGIC GROUP BY 199 | # MAGIC session_process_name, 200 | # MAGIC process_name, 201 | # MAGIC date_trunc('HOUR', start_timestamp) 202 | # MAGIC ORDER BY EventHour 203 | 204 | # COMMAND ---------- 205 | 206 | # DBTITLE 1,Check the partition columns of this logger 207 | delta_logger.logger_partition_cols 208 | 209 | # COMMAND ---------- 210 | 211 | # DBTITLE 1,Check the ZORDER columns of this logger 212 | delta_logger.logger_zorder_cols 213 | 214 | # COMMAND ---------- 215 | 216 | # DBTITLE 1,Use the Delta Partitions/ZORDER cols To Easily Query Large Logger Table 217 | # MAGIC %sql 218 | # MAGIC 219 | # MAGIC -- Using Partition Pruning 220 | # MAGIC SELECT 221 | # MAGIC * 222 | # MAGIC FROM main.iot_dashboard.logger 223 | # MAGIC WHERE start_date = '2023-11-02'::DATE 224 | # MAGIC AND session_process_name = '/Repos/cody.davis@databricks.com/edw-best-practices/Using Delta Logger Example' 225 | # MAGIC 226 | 227 | # COMMAND ---------- 228 | 229 | # DBTITLE 1,Using the ZORDER cols to do analysis over time 230 | # MAGIC %sql 231 | # MAGIC 232 | # MAGIC 233 | # MAGIC -- Using Partition Pruning 234 | # MAGIC SELECT 235 | # MAGIC * 236 | # MAGIC FROM main.iot_dashboard.logger 237 | # MAGIC WHERE start_date = '2023-11-02'::DATE 238 | # MAGIC AND session_process_name = '/Repos/cody.davis@databricks.com/edw-best-practices/Using Delta Logger Example' 239 | # MAGIC AND start_timestamp BETWEEN (now() - INTERVAL 2 DAYS) AND now(); 240 | # MAGIC 241 | # MAGIC 242 | # MAGIC -- Using Partition Pruning 243 | # MAGIC SELECT 244 | # MAGIC * 245 | # MAGIC FROM main.iot_dashboard.logger 246 | # MAGIC WHERE start_date = '2023-11-02'::DATE 247 | # MAGIC AND session_process_name = '/Repos/cody.davis@databricks.com/edw-best-practices/Using Delta Logger Example' 248 | # MAGIC AND run_id BETWEEN 1 AND 5 249 | -------------------------------------------------------------------------------- /Using Delta Merge Helpers Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Delta Merge Helpers: 5 | # MAGIC 6 | # MAGIC This is class with a set of static methods that help the user easily perform retry statements on operataions that may be cause a lot of conflicting transactions (usually in MERGE / UPDATE statements). 7 | # MAGIC 8 | # MAGIC
        • 1 Method: retrySqlStatement(spark: SparkSession, operation_name: String, sqlStatement: String) - the spark param is your existing Spark session, the operation name is simply an operation to identify your transaction, the sqlStatement parameter is the SQL statement you want to retry. 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %pip install -r helperfunctions/requirements.txt 13 | 14 | # COMMAND ---------- 15 | 16 | from helperfunctions.deltahelpers import DeltaMergeHelpers 17 | 18 | # COMMAND ---------- 19 | 20 | 21 | sql_statement = """ 22 | MERGE INTO iot_dashboard.silver_sensors AS target 23 | USING (SELECT Id::integer, 24 | device_id::integer, 25 | user_id::integer, 26 | calories_burnt::decimal, 27 | miles_walked::decimal, 28 | num_steps::decimal, 29 | timestamp::timestamp, 30 | value::string 31 | FROM iot_dashboard.bronze_sensors) AS source 32 | ON source.Id = target.Id 33 | AND source.user_id = target.user_id 34 | AND source.device_id = target.device_id 35 | WHEN MATCHED THEN UPDATE SET 36 | target.calories_burnt = source.calories_burnt, 37 | target.miles_walked = source.miles_walked, 38 | target.num_steps = source.num_steps, 39 | target.timestamp = source.timestamp 40 | WHEN NOT MATCHED THEN INSERT *; 41 | """ 42 | 43 | DeltaMergeHelpers.retrySqlStatement(spark, "merge_sensors", sqlStatement=sql_statement) 44 | -------------------------------------------------------------------------------- /Using Streaming Tables and MV Orchestrator.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## This library helps orchestrate Streaming tables in conjunction with other tables that may depend on synchronous updated from the streaming table for classical EDW loading patterns 5 | # MAGIC 6 | # MAGIC ## Assumptions / Best Practices 7 | # MAGIC 8 | # MAGIC 1. Assumes ST is NOT SCHEDULED in the CREATE STATEMENT (externally orchestrated) (that is a different loading pattern that is not as common in classical EDW) 9 | # MAGIC 10 | # MAGIC 2. Assumes that one or many pipelines are dependent upon the successful CREATe OR REFRESH of the streaming table, so this library will simply block the tasks from moving the job onto the rest of the DAG to ensure the downstream tasks actually read from the table when it finishes updated 11 | # MAGIC 12 | # MAGIC 3. This works best with a single node "Driver" notebook loading sql files from Git similar to how airflow would orchestrate locally. The single job node would then call spark.sql() to run the CREATE OR REFRESH and then you arent needing a warehouse and a DLT pipeline in the job for streaming refreshes. 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC 18 | # MAGIC ## Library Steps 19 | # MAGIC 20 | # MAGIC ### This library only takes in 1 sql statement at a time, this is because if there are multiple and only some pass and others fail, then it would not be correct failing or passing the whole statement. Each ST/MV must be done separately. This can be done by simply calling the static methods multiple times. 21 | # MAGIC 22 | # MAGIC 1. Parse Streaming Table / MV Create / Refresh commmand 23 | # MAGIC 2. Identify ST / MV table(s) for that command 24 | # MAGIC 3. Run SQL command - CREATE / REFRESH ST/MV 25 | # MAGIC 4. DESCRIBE DETAIL to get pipelines.pipelineId metadata 26 | # MAGIC 5. Perform REST API Call to check for in-progress Refreshes 27 | # MAGIC 6. 
Poll and block statement chain from "finishing" until all pipelines identified are in either "PASS/FAIL" 28 | # MAGIC 7. If statement PASSES - then complete and return 29 | # MAGIC 8. If statement FAILS - then throw REFRESH FAIL exception 30 | 31 | # COMMAND ---------- 32 | 33 | from helperfunctions.stmvorchestrator import orchestrate_stmv_statement 34 | 35 | # COMMAND ---------- 36 | 37 | sql_statement = """ 38 | CREATE OR REFRESH STREAMING TABLE main.iot_dashboard.streaming_tables_raw_data 39 | AS SELECT 40 | id::bigint AS Id, 41 | device_id::integer AS device_id, 42 | user_id::integer AS user_id, 43 | calories_burnt::decimal(10,2) AS calories_burnt, 44 | miles_walked::decimal(10,2) AS miles_walked, 45 | num_steps::decimal(10,2) AS num_steps, 46 | timestamp::timestamp AS timestamp, 47 | value AS value -- This is a JSON object 48 | FROM STREAM read_files('dbfs:/databricks-datasets/iot-stream/data-device/*.json*', 49 | format => 'json', 50 | maxFilesPerTrigger => 12 -- what does this do when you 51 | ) 52 | """ 53 | 54 | # COMMAND ---------- 55 | 56 | orchestrate_stmv_statement(spark, dbutils, sql_statement=sql_statement) 57 | -------------------------------------------------------------------------------- /Using Transaction Manager Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## TO DO: 5 | # MAGIC 6 | # MAGIC 1. Continue to add edge cases on affected tables: RESTORE TABLE, OPTIMIZE 7 | # MAGIC 2. Ensure shapshot versions are created for tables that do not exists. if transaction fails and snapshot is -1, then run DROP TABLE IF EXISTS statement. 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %pip install -r helperfunctions/requirements.txt 12 | 13 | # COMMAND ---------- 14 | 15 | from helperfunctions.transactions import Transaction 16 | 17 | # COMMAND ---------- 18 | 19 | # DBTITLE 1,Example SQL Transaction Block 20 | sqlString = """ 21 | USE CATALOG hive_metastore; 22 | 23 | CREATE SCHEMA IF NOT EXISTS iot_dashboard; 24 | 25 | USE SCHEMA iot_dashboard; 26 | 27 | -- Create Tables 28 | CREATE OR REPLACE TABLE iot_dashboard.bronze_sensors 29 | ( 30 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 31 | device_id INT, 32 | user_id INT, 33 | calories_burnt DECIMAL(10,2), 34 | miles_walked DECIMAL(10,2), 35 | num_steps DECIMAL(10,2), 36 | timestamp TIMESTAMP, 37 | value STRING 38 | ) 39 | USING DELTA 40 | TBLPROPERTIES("delta.targetFileSize"="128mb"); 41 | 42 | CREATE OR REPLACE TABLE iot_dashboard.silver_sensors 43 | ( 44 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 45 | device_id INT, 46 | user_id INT, 47 | calories_burnt DECIMAL(10,2), 48 | miles_walked DECIMAL(10,2), 49 | num_steps DECIMAL(10,2), 50 | timestamp TIMESTAMP, 51 | value STRING 52 | ) 53 | USING DELTA 54 | PARTITIONED BY (user_id) 55 | TBLPROPERTIES("delta.targetFileSize"="128mb"); 56 | 57 | -- Statement 1 -- the load 58 | COPY INTO iot_dashboard.bronze_sensors 59 | FROM (SELECT 60 | id::bigint AS Id, 61 | device_id::integer AS device_id, 62 | user_id::integer AS user_id, 63 | calories_burnt::decimal(10,2) AS calories_burnt, 64 | miles_walked::decimal(10,2) AS miles_walked, 65 | num_steps::decimal(10,2) AS num_steps, 66 | timestamp::timestamp AS timestamp, 67 | value AS value -- This is a JSON object 68 | FROM "/databricks-datasets/iot-stream/data-device/") 69 | FILEFORMAT = json 70 | COPY_OPTIONS('force'='true') -- 'false' -- process incrementally 71 | --option to be incremental or always load all files 72 | ; 73 
| 74 | -- Statement 2 75 | MERGE INTO iot_dashboard.silver_sensors AS target 76 | USING (SELECT Id::integer, 77 | device_id::integer, 78 | user_id::integer, 79 | calories_burnt::decimal, 80 | miles_walked::decimal, 81 | num_steps::decimal, 82 | timestamp::timestamp, 83 | value::string 84 | FROM iot_dashboard.bronze_sensors) AS source 85 | ON source.Id = target.Id 86 | AND source.user_id = target.user_id 87 | AND source.device_id = target.device_id 88 | WHEN MATCHED THEN UPDATE SET 89 | target.calories_burnt = source.calories_burnt, 90 | target.miles_walked = source.miles_walked, 91 | target.num_steps = source.num_steps, 92 | target.timestamp = source.timestamp 93 | WHEN NOT MATCHED THEN INSERT *; 94 | 95 | OPTIMIZE iot_dashboard.silver_sensors ZORDER BY (timestamp); 96 | 97 | -- This calculate table stats for all columns to ensure the optimizer can build the best plan 98 | -- Statement 3 99 | 100 | ANALYZE TABLE iot_dashboard.silver_sensors COMPUTE STATISTICS FOR ALL COLUMNS; 101 | 102 | CREATE OR REPLACE TABLE hourly_summary_statistics 103 | AS 104 | SELECT user_id, 105 | date_trunc('hour', timestamp) AS HourBucket, 106 | AVG(num_steps)::float AS AvgNumStepsAcrossDevices, 107 | AVG(calories_burnt)::float AS AvgCaloriesBurnedAcrossDevices, 108 | AVG(miles_walked)::float AS AvgMilesWalkedAcrossDevices 109 | FROM silver_sensors 110 | GROUP BY user_id,date_trunc('hour', timestamp) 111 | ORDER BY HourBucket; 112 | 113 | -- Statement 4 114 | -- Truncate bronze batch once successfully loaded 115 | TRUNCATE TABLE bronze_sensors; 116 | """ 117 | 118 | # COMMAND ---------- 119 | 120 | # MAGIC %md 121 | # MAGIC 122 | # MAGIC ## 3 Primary Ways to Do a Transaction 123 | # MAGIC 124 | # MAGIC 1. SQL - selected_tables: This allows the user to explicitly control which exact tables get snapshotted and rolledback - good for production where lots of jobs are running. 125 | # MAGIC 126 | # MAGIC 2. SQL - inferred_selected_tables This uses SQL Glot to automatically find tables that would be altered from the SQL inside the transaction block, and will snapshot those tables. Great for simplicity but should be checked in a test before moving to production 127 | # MAGIC 128 | # MAGIC 3. Python - call .begin_transaction() and rollback_transaction() methods manually do manage a transaction state. This allows for more complex logic outside of a contiguous multi statement SQL block 129 | 130 | # COMMAND ---------- 131 | 132 | # MAGIC %md 133 | # MAGIC 134 | # MAGIC ## Method 1: SQL - selected_tables 135 | 136 | # COMMAND ---------- 137 | 138 | # DBTITLE 1,Initialize Transaction Class - Manually Define Selected Tables 139 | x = Transaction(mode="selected_tables", uc_default=False) 140 | 141 | # COMMAND ---------- 142 | 143 | # DBTITLE 1,Execute a multi statement SQL transaction from a SQL string - Manually Defining 144 | ## This method is great because to do not need to rollback manually, it is handled for you 145 | ## This statement auto-commmits on success. If you do not want that, you can write pyspark or regular SQL outside of this method and then manually rollback 146 | x.execute_sql_transaction(sqlString, tables_to_manage=["hive_metastore.iot_dashboard.silver_sensors"]) 147 | 148 | # COMMAND ---------- 149 | 150 | # MAGIC %md 151 | # MAGIC ## Method 2: SQL - inferred_altered_tables 152 | 153 | # COMMAND ---------- 154 | 155 | y = Transaction(mode="inferred_altered_tables", uc_default=False) ## uc_default=True if you want to infer schema with main as default instead of hive_metastore. 
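## A rough sketch of the idea behind mode="inferred_altered_tables": parse the SQL block with
## sqlglot (already listed in helperfunctions/requirements.txt) and collect the target table of each
## write statement. The actual AlteredTableParser in helperfunctions/transactions.py may handle more
## cases (USE statements, session defaults, snapshots), so treat this as an illustration only.

import sqlglot
from sqlglot import exp

def infer_altered_tables(sql_string, dialect="databricks"):
    """Return the (possibly partially qualified) names of tables a SQL block writes to."""
    altered = set()
    write_nodes = (exp.Create, exp.Insert, exp.Merge, exp.Update, exp.Delete, exp.Drop)

    for statement in sqlglot.parse(sql_string, read=dialect):
        if statement is not None and isinstance(statement, write_nodes):
            ## The first Table node under a write statement is its target
            target = statement.find(exp.Table)
            if target is not None:
                altered.add(".".join(part for part in (target.catalog, target.db, target.name) if part))
    return altered

## Hypothetical example:
## infer_altered_tables("MERGE INTO iot_dashboard.silver_sensors AS t USING b AS s ON t.Id = s.Id WHEN MATCHED THEN UPDATE SET t.value = s.value")
## returns the MERGE target, {'iot_dashboard.silver_sensors'}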
156 | 157 | # COMMAND ---------- 158 | 159 | ## This statement auto-commmits on success. If you do not want that, you can write pyspark or regular SQL outside of this method and then manually rollback 160 | 161 | y.execute_sql_transaction(sqlString) 162 | 163 | # COMMAND ---------- 164 | 165 | # MAGIC %md 166 | # MAGIC 167 | # MAGIC ## Method 3: Python 168 | # MAGIC 169 | # MAGIC Call transaction begin and rollback and do any logic in between 170 | 171 | # COMMAND ---------- 172 | 173 | # DBTITLE 1,Begin Transaction in Python 174 | x.begin_transaction(tables_to_snapshot=["hive_metastore.iot_dashbaord.silver_sensors"]) 175 | 176 | # COMMAND ---------- 177 | 178 | ##### Do a bunch of logic here, any logic at all 179 | ##### 180 | 181 | # COMMAND ---------- 182 | 183 | # DBTITLE 1,Get Transaction Snapshot Info 184 | x.get_transaction_snapshot() 185 | 186 | # COMMAND ---------- 187 | 188 | # DBTITLE 1,Manually rollback a transaction from most recent explicit snapshot for tables 189 | ### If you use the SQL execute method, it auto commits!! So you cannot roll back once it succeed. It will do it automatically. You can still use all the manual methods if you want to opt out of auto handling the rollback/committ process 190 | x.rollback_transaction() 191 | -------------------------------------------------------------------------------- /helperfunctions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/helperfunctions/__init__.py -------------------------------------------------------------------------------- /helperfunctions/dbsqltransactions.py: -------------------------------------------------------------------------------- 1 | from helperfunctions.dbsqlclient import ServerlessClient 2 | from helperfunctions.transactions import Transaction, TransactionException, AlteredTableParser 3 | import warnings 4 | 5 | class DBSQLTransactionManager(Transaction): 6 | 7 | def __init__(self, warehouse_id, mode="selected_tables", uc_default=False, host_name=None, token=None): 8 | 9 | super().__init__(mode=mode, uc_default=uc_default) 10 | self.host_name = host_name 11 | self.token = token 12 | self.warehouse_id = warehouse_id 13 | 14 | ## other state 15 | self.use_sessions = None 16 | 17 | return 18 | 19 | 20 | ### Execute multi statment SQL, now we can implement this easier for Serverless or not Serverless 21 | def execute_dbsql_transaction(self, sql_string, tables_to_manage=[], force=False, return_type="message"): 22 | 23 | ## return_type = message (returns status messages), last_result (returns the result of the last command in the sql chain) 24 | ## If force= True, then if transaction manager fails to find tables, then it runs the SQL anyways 25 | ## You do not NEED to run SQL this way to rollback a transaction, 26 | ## but it automatically breaks up multiple statements in one SQL file into a series of spark.sql() commands 27 | 28 | serverless_client = ServerlessClient(warehouse_id = self.warehouse_id, token=self.token, host_name=self.host_name) ## token=, host_name=verbose=True for print statements and other debugging messages 29 | 30 | current_catalog = serverless_client.spark.sql("SELECT current_catalog()").collect()[0][0] 31 | current_schema = serverless_client.spark.sql("SELECT current_schema()").collect()[0][0] 32 | 33 | ## Add default USE session scopes if USE statement were defined outside of the SQL string in the same spark session 34 | 35 | try: 
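        ## Build a command-scoped "USE <current_catalog>.<current_schema>" prefix from the caller's
        ## active Spark session, so that unqualified table names in the submitted SQL resolve on the
        ## SQL warehouse the same way they would in the notebook session that built the string.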
36 | 37 | default_use_session_scope = 'USE ' + current_catalog + '.' + current_schema + '; ' 38 | 39 | ## Default to defaults if for some reason session level fetching fails 40 | except: 41 | if self.uc_default: 42 | default_use_session_scope = 'USE main.default; ' 43 | elif not self.uc_default: 44 | default_use_session_scope = 'USE hive_metastore.uc_default; ' 45 | else: 46 | raise(ValueError("Unable to infer current session and uc_default is not True or False. True = main.default, False = hive_metastore.default as the default base session")) 47 | 48 | scoped_sql_string = default_use_session_scope + sql_string 49 | 50 | result_df = None 51 | 52 | stmts = [i.strip() for i in scoped_sql_string.split(";") if len(i.strip()) >0] 53 | 54 | ## Save to class state 55 | self.raw_sql_statement = scoped_sql_string 56 | self.sql_statement_list = stmts 57 | 58 | success_tables = False 59 | 60 | try: 61 | 62 | self.begin_dynamic_transaction(tables_to_manage=tables_to_manage) 63 | success_tables = True 64 | 65 | except Exception as e: 66 | print(f"FAILED: failed to acquire tables with errors: {str(e)}") 67 | 68 | 69 | ## If succeeded or force = True, then run the SQL 70 | if success_tables or force: 71 | if success_tables == False and force == True: 72 | warnings.warn("WARNING: Failed to acquire tables but force flag = True, so SQL statement will run anyways") 73 | 74 | ## Run the Transaction Logic with Serverless Client 75 | 76 | try: 77 | print(f"TRANSACTION IN PROGRESS ...Running multi statement SQL transaction now\n") 78 | 79 | ###!! Since the DBSQL execution API does not understand multiple statements, we need to submit the USE commands in the correct order manually. This is done with the AlteredTableParser() 80 | 81 | ### Get the USE session tree and submit SQL statements according to that tree 82 | parser = AlteredTableParser() 83 | parser.parse_sql_chain_for_altered_tables(self.sql_statement_list) 84 | self.use_sessions = parser.get_use_session_tree() 85 | 86 | for i in self.use_sessions: 87 | 88 | session_catalog = i.get("session_cat") 89 | session_db = i.get("session_db") 90 | use_session_statemnts = i.get("sql_statements") 91 | 92 | #print(use_session_statemnts) 93 | 94 | for s in use_session_statemnts: 95 | 96 | single_st = s.get("statement") 97 | 98 | print(f"\nRunning \n {single_st}") 99 | 100 | if single_st is not None: 101 | 102 | ## Submit the single command with the session USE scoped commands from the Parser Tree 103 | ## OPTION 1: return status message 104 | if return_type == "message": 105 | 106 | result_df = serverless_client.submit_multiple_sql_commands(sql_statements=single_st, use_catalog=session_catalog, use_schema=session_db) 107 | 108 | elif return_type == "last_result": 109 | 110 | result_df = serverless_client.submit_multiple_sql_commands_last_results(sql_statements=single_st, use_catalog=session_catalog, use_schema=session_db) 111 | 112 | else: 113 | result_df = None 114 | print("No run mode selected, select 'message' or 'last_results'") 115 | 116 | 117 | print(f"\n TRANSACTION SUCCEEDED: Multi Statement SQL Transaction Successfull! Updating Snapshot\n ") 118 | self.commit_transaction() 119 | 120 | 121 | ## Return results after committing sucesss outside of the for loop 122 | return result_df 123 | 124 | 125 | except Exception as e: 126 | print(f"\n TRANSACTION FAILED to run all statements... 
ROLLING BACK \n") 127 | self.rollback_transaction() 128 | print(f"Rollback successful!") 129 | 130 | raise(e) 131 | 132 | else: 133 | 134 | raise(TransactionException(message="Failed to acquire tables and force=False, not running process.", errors="Failed to acquire tables and force=False, not running process.")) 135 | -------------------------------------------------------------------------------- /helperfunctions/deltahelpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import re 4 | import os 5 | from datetime import datetime, timedelta 6 | import uuid 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql.functions import col, count, lit, max 9 | from pyspark.sql.types import * 10 | 11 | 12 | ### Helps Materialize temp tables during ETL pipelines 13 | class DeltaHelpers(): 14 | 15 | 16 | def __init__(self, db_name="delta_temp", temp_root_path="dbfs:/delta_temp_db"): 17 | 18 | self.spark = SparkSession.getActiveSession() 19 | self.db_name = db_name 20 | self.temp_root_path = temp_root_path 21 | 22 | self.dbutils = None 23 | 24 | #if self.spark.conf.get("spark.databricks.service.client.enabled") == "true": 25 | try: 26 | from pyspark.dbutils import DBUtils 27 | self.dbutils = DBUtils(self.spark) 28 | 29 | except: 30 | 31 | import IPython 32 | self.dbutils = IPython.get_ipython().user_ns["dbutils"] 33 | 34 | self.session_id =self.dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get() 35 | self.temp_env = self.temp_root_path + self.session_id 36 | self.spark.sql(f"""DROP DATABASE IF EXISTS {self.db_name} CASCADE;""") 37 | self.spark.sql(f"""CREATE DATABASE IF NOT EXISTS {self.db_name} LOCATION '{self.temp_env}'; """) 38 | print(f"Initializing Root Temp Environment: {self.db_name} at {self.temp_env}") 39 | 40 | return 41 | 42 | 43 | def createOrReplaceTempDeltaTable(self, df, table_name): 44 | 45 | tblObj = {} 46 | new_table_id = table_name 47 | write_path = self.temp_env + new_table_id 48 | 49 | self.spark.sql(f"DROP TABLE IF EXISTS {self.db_name}.{new_table_id}") 50 | self.dbutils.fs.rm(write_path, recurse=True) 51 | 52 | df.write.format("delta").mode("overwrite").option("path", write_path).saveAsTable(f"{self.db_name}.{new_table_id}") 53 | 54 | persisted_df = self.spark.read.format("delta").load(write_path) 55 | return persisted_df 56 | 57 | def appendToTempDeltaTable(self, df, table_name): 58 | 59 | tblObj = {} 60 | new_table_id = table_name 61 | write_path = self.temp_env + new_table_id 62 | 63 | df.write.format("delta").mode("append").option("path", write_path).saveAsTable(f"{self.db_name}.{new_table_id}") 64 | 65 | persisted_df = self.spark.read.format("delta").load(write_path) 66 | return persisted_df 67 | 68 | def removeTempDeltaTable(self, table_name): 69 | 70 | table_path = self.temp_env + table_name 71 | self.dbutils.fs.rm(table_path, recurse=True) 72 | self.spark.sql(f"""DROP TABLE IF EXISTS {self.db_name}.{table_name}""") 73 | 74 | print(f"Temp Table: {table_name} has been deleted.") 75 | return 76 | 77 | def removeAllTempTablesForSession(self): 78 | 79 | self.dbutils.fs.rm(self.temp_env, recurse=True) 80 | ##spark.sql(f"""DROP DATABASE IF EXISTS {self.db_name} CASCADE""") This temp db name COULD be global, never delete without separate method 81 | print(f"All temp tables in the session have been removed: {self.temp_env}") 82 | return 83 | 84 | 85 | 86 | class SchemaHelpers(): 87 | 88 | def __init__(): 89 | import json 90 | return 91 | 92 | @staticmethod 93 | 
def getDDLString(structObj): 94 | import json 95 | ddl = [] 96 | for c in json.loads(structObj.json()).get("fields"): 97 | 98 | name = c.get("name") 99 | dType = c.get("type") 100 | ddl.append(f"{name}::{dType} AS {name}") 101 | 102 | final_ddl = ", ".join(ddl) 103 | return final_ddl 104 | 105 | @staticmethod 106 | def getDDLList(structObj): 107 | import json 108 | ddl = [] 109 | for c in json.loads(structObj.json()).get("fields"): 110 | 111 | name = c.get("name") 112 | dType = c.get("type") 113 | ddl.append(f"{name}::{dType} AS {name}") 114 | 115 | return ddl 116 | 117 | @staticmethod 118 | def getFlattenedSqlExprFromValueColumn(structObj): 119 | import json 120 | ddl = [] 121 | for c in json.loads(structObj.json()).get("fields"): 122 | 123 | name = c.get("name") 124 | dType = c.get("type") 125 | ddl.append(f"value:{name}::{dType} AS {name}") 126 | 127 | return ddl 128 | 129 | 130 | 131 | 132 | class DeltaMergeHelpers(): 133 | 134 | def __init__(self): 135 | return 136 | 137 | @staticmethod 138 | def retrySqlStatement(spark, operationName, sqlStatement, maxRetries = 10, maxSecondsBetweenAttempts=60): 139 | 140 | import time 141 | maxRetries = maxRetries 142 | numRetries = 0 143 | maxWaitTime = maxSecondsBetweenAttempts 144 | ### Does not check for existence, ensure that happens before merge 145 | 146 | while numRetries <= maxRetries: 147 | 148 | try: 149 | 150 | print(f"SQL Statement Attempt for {operationName} #{numRetries + 1}...") 151 | 152 | spark.sql(sqlStatement) 153 | 154 | print(f"SQL Statement Attempt for {operationName} #{numRetries + 1} Successful!") 155 | break 156 | 157 | except Exception as e: 158 | error_msg = str(e) 159 | 160 | print(f"Failed SQL Statment Attmpet for {operationName} #{numRetries} with error: {error_msg}") 161 | 162 | numRetries += 1 163 | if numRetries > maxRetries: 164 | break 165 | 166 | waitTime = waitTime = 2**(numRetries-1) ## Wait longer up to max wait time for failed operations 167 | 168 | if waitTime > maxWaitTime: 169 | waitTime = maxWaitTime 170 | 171 | print(f"Waiting {waitTime} seconds before next attempt on {operationName}...") 172 | time.sleep(waitTime) -------------------------------------------------------------------------------- /helperfunctions/redshiftchecker.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import col, count, lit, date_trunc 3 | 4 | class RedshiftChecker(): 5 | 6 | 7 | """ 8 | 9 | This class reads a table with optional filters from a Redshift environment, validates schema, rows counts, data types, and returns a diff 10 | Dependencies: 11 | 1. Assumes Redshift Connector is installed on the running cluster 12 | 2. Assumes cluster has IAM Instance profile access to the requested Databricks tables 13 | 14 | ## TO DO: 15 | 1. Add Data Type Comparisons 16 | 2. 
Add Row-level comparisons 17 | """ 18 | 19 | def __init__(self, connectionString, iamRole, tempDir): 20 | 21 | print(f"Initialized Redshift Data Checker") 22 | self.spark = SparkSession.getActiveSession() 23 | self.connectionString = connectionString 24 | self.iamRole = iamRole 25 | self.tempDir = tempDir 26 | 27 | #### Build a Query and return the result 28 | 29 | def getSpark(self): 30 | return self.spark 31 | 32 | def getQuery(self, tableName, dateFilterColumn=None, startDateTime=None, endDateTime=None, limit=None): 33 | 34 | tableName = tableName 35 | 36 | sqlQuery = f"""SELECT * FROM {tableName}""" 37 | dateFilterColumn = dateFilterColumn 38 | startDateTime = startDateTime 39 | endDateTime = endDateTime 40 | 41 | try: 42 | 43 | if dateFilterColumn is not None: 44 | 45 | if (endDateTime is not None) and (startDateTime is not None): 46 | sqlFilter = f""" WHERE {dateFilterColumn} BETWEEN 47 | (CASE WHEN '{startDateTime}' = 'None' THEN now() ELSE '{startDateTime}'::timestamp END) 48 | AND 49 | (CASE WHEN '{endDateTime}' = 'None' THEN now() ELSE '{endDateTime}'::timestamp END)""" 50 | filteredQuery = sqlQuery + sqlFilter 51 | 52 | elif startDateTime is not None: 53 | sqlFilter = f""" WHERE {dateFilterColumn} BETWEEN 54 | (CASE WHEN '{startDateTime}' = 'None' THEN now() ELSE '{startDateTime}'::timestamp END) 55 | AND 56 | now()""" 57 | filteredQuery = sqlQuery + sqlFilter 58 | 59 | else: 60 | filteredQuery = sqlQuery 61 | 62 | else: 63 | filteredQuery = sqlQuery 64 | 65 | ## filteredQuery 66 | ## Limit query if supplied 67 | if isinstance(limit, int): 68 | limitStr = f""" LIMIT {limit}""" 69 | finalQuery = filteredQuery + limitStr 70 | 71 | elif limit is None: 72 | finalQuery = filteredQuery 73 | else: 74 | finalQuery = filteredQuery 75 | print("No valid limit provided... 
not limiting table...") 76 | 77 | except Exception as e: 78 | print(f"ERROR: Please provide a valid date filter or limit: {str(e)}") 79 | 80 | return finalQuery 81 | 82 | #### Get Redshift Table from a query 83 | def getRedshiftQueryResult(self, query): 84 | 85 | rsh_query = query 86 | redshift_df = ( self.spark.read 87 | .format("com.databricks.spark.redshift") 88 | .option("url", self.connectionString) 89 | .option("query", rsh_query) 90 | .option("tempdir", self.tempDir) 91 | .option("aws_iam_role", self.iamRole) 92 | .load() 93 | ) 94 | 95 | return redshift_df 96 | 97 | #### Get Databricks Table from a query 98 | def getDatabricksQueryResults(self, query): 99 | 100 | dbx_query = query 101 | databricks_df = self.spark.sql(dbx_query) 102 | 103 | return databricks_df 104 | 105 | #### Get Databricks Table 106 | def getDatabricksTable(self, tableName, dateFilterColumn=None, startDateTime=None, endDateTime=None, limit=None): 107 | 108 | finalQuery = self.getQuery(tableName, dateFilterColumn, startDateTime, endDateTime, limit) 109 | databricks_df = self.getDatabricksQueryResults(finalQuery) 110 | return databricks_df 111 | 112 | #### Get Redshift Table 113 | def getRedshiftTable(self, tableName, dateFilterColumn=None, startDateTime=None, endDateTime=None, limit=None): 114 | 115 | finalQuery = self.getQuery(tableName, dateFilterColumn, startDateTime, endDateTime, limit) 116 | redshift_df = self.getRedshiftQueryResult(finalQuery) 117 | return redshift_df 118 | 119 | 120 | def compareColumnsOfTable(self, redshiftTableName, databricksTableName): 121 | 122 | redshift_table = self.getRedshiftTable(redshiftTableName).columns 123 | dbx_table = self.getDatabricksTable(databricksTableName).columns 124 | 125 | int_cols = ','.join(list(set(redshift_table).intersection(set(dbx_table)))) 126 | in_dbx_not_redshift = ','.join([i for i in dbx_table if i not in int_cols]) 127 | in_redshift_not_dbx = ','.join([i for i in redshift_table if i not in int_cols]) 128 | 129 | cols_schema = ['in_both', 'in_redshift_not_databricks', 'in_databricks_not_redshift'] 130 | data = [[int_cols, in_redshift_not_dbx, in_dbx_not_redshift]] 131 | 132 | cols_comp_df = self.spark.createDataFrame(data, cols_schema) 133 | 134 | return cols_comp_df 135 | 136 | 137 | def compareRowCountOfTable(self, redsfhitTableName, databricksTableName, dateFilterColumn=None, startDateTime=None, endDateTime = None, limit=None, groupByAgg='all'): 138 | 139 | from pyspark.sql.functions import date_trunc 140 | ## Group by agg options 141 | #None -- All Rows will be counted and compared 142 | #all -- same as None, all rows will be counted 143 | #day -- All rows within the range will be counted and grouped by day 144 | #hour -- All rows within the range will be counted and grouped by hour 145 | #minute -- All rows within the range will be counted and grouped by minute 146 | 147 | ## If dateFilter column is None, just count whole table 148 | redshift_table = self.getRedshiftTable(redsfhitTableName, dateFilterColumn, startDateTime, endDateTime, limit) 149 | dbx_table = self.getDatabricksTable(databricksTableName, dateFilterColumn, startDateTime, endDateTime, limit) 150 | 151 | if (groupByAgg.lower() == 'all') or (groupByAgg is None) or (dateFilterColumn is None): 152 | 153 | red_times = (redshift_table 154 | .agg(count("*").alias("RedshiftRowCount")) 155 | .withColumn("condition", lit("Full Table Row Counts")) 156 | ) 157 | 158 | dbx_times = (dbx_table 159 | .agg(count("*").alias("DatabricksRowCount")) 160 | .withColumn("condition", lit("Full Table Row 
Counts")) 161 | ) 162 | 163 | final_df = red_times.join(dbx_times, on="condition", how="full_outer") 164 | return final_df 165 | 166 | elif groupByAgg.lower() in ['day', 'hour', 'minute', 'month', 'year']: 167 | 168 | red_times = (redshift_table 169 | .withColumn("date_col", date_trunc(groupByAgg, col(dateFilterColumn))) 170 | .groupBy("date_col") 171 | .agg(count(dateFilterColumn).alias("RedshiftRowCount")) 172 | .orderBy("date_col") 173 | ) 174 | 175 | dbx_times = (dbx_table 176 | .withColumn("date_col", date_trunc(groupByAgg, col(dateFilterColumn))) 177 | .groupBy("date_col") 178 | .agg(count(dateFilterColumn).alias("DatabricksRowCount")) 179 | .orderBy("date_col") 180 | ) 181 | 182 | 183 | final_df = red_times.join(dbx_times, on="date_col", how="full_outer") 184 | return final_df 185 | 186 | else: 187 | print("ERROR: please provide valid grouping, or dont provide one at all :)") 188 | return -------------------------------------------------------------------------------- /helperfunctions/requirements.txt: -------------------------------------------------------------------------------- 1 | sqlglot 2 | pyarrow -------------------------------------------------------------------------------- /helperfunctions/stmvorchestrator.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | import time 4 | 5 | 6 | ## Function to block Create or REFRESH of ST or MV statements to wait until it is finishing before moving to next task 7 | 8 | ## Similar to the awaitTermination() method in a streaming pipeline 9 | 10 | ## Only supports 1 sql statement at a time on purpose 11 | 12 | def orchestrate_stmv_statement(spark, dbutils, sql_statement, host_name=None, token=None): 13 | 14 | host_name = None 15 | token = None 16 | 17 | ## Infer hostname from same workspace 18 | if host_name is not None: 19 | host_name = host_name 20 | 21 | else: 22 | host_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None).replace("https://", "") 23 | 24 | ## Automatically get user token if none provided 25 | if token is not None: 26 | token = token 27 | else: 28 | token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) 29 | 30 | 31 | ## Get current catalogs/schemas from outside USE commands 32 | current_schema = spark.sql("SELECT current_schema()").collect()[0][0] 33 | current_catalog = spark.sql("SELECT current_catalog()").collect()[0][0] 34 | 35 | if current_catalog == 'spark_catalog': 36 | current_catalog = 'hive_metastore' 37 | 38 | 39 | ## Check for multiple statements, if more than 1, than raise too many statement exception 40 | all_statements = re.split(";", sql_statement) 41 | 42 | if (len(all_statements) > 1): 43 | print("WARNING: There are more than one statements in this sql command, this function will just pick and try to run the first statement and ignore the rest.") 44 | 45 | 46 | sql_statement = all_statements[0] 47 | 48 | 49 | try: 50 | 51 | ## Get table/mv that is being refreshed 52 | table_match = re.split("CREATE OR REFRESH STREAMING TABLE\s|REFRESH STREAMING TABLE\s|CREATE OR REFRESH MATERIALIZED VIEW\s|REFRESH MATERIALIZED VIEW\s", sql_statement.upper())[1].split(" ")[0] 53 | 54 | except Exception as e: 55 | 56 | ## If it was not able to find a REFRESH statement, ignore and unblock the operation and move on (i.e. if its not an ST/MV or if its just a CREATE) 57 | 58 | print("WARNING: No ST / MV Refresh statements found. 
Moving on.") 59 | return 60 | 61 | ## If ST/MV refresh was found 62 | 63 | if (len(table_match.split(".")) == 3): 64 | ## fully qualified, dont change it 65 | pass 66 | elif (len(table_match.split(".")) == 2): 67 | table_match = current_catalog + "." + table_match 68 | 69 | elif(len(table_match.split(".")) == 1): 70 | table_match = current_catalog + "." + current_schema + "." + table_match 71 | 72 | 73 | ## Step 2 - Execute SQL Statement 74 | spark.sql(sql_statement) 75 | 76 | 77 | ## Step 3 - Get pipeline Id for table 78 | active_pipeline_id = (spark.sql(f"DESCRIBE DETAIL {table_match}") 79 | .selectExpr("properties").take(1)[0][0] 80 | .get("pipelines.pipelineId") 81 | ) 82 | 83 | ## Poll for pipeline status 84 | 85 | 86 | current_state = "UNKNOWN" 87 | 88 | ## Pipeline is active 89 | while current_state not in ("FAILED", "IDLE"): 90 | 91 | url = "https://" + host_name + "/api/2.0/pipelines/" 92 | headers_auth = {"Authorization":f"Bearer {token}"} 93 | 94 | check_status_resp = requests.get(url + active_pipeline_id , headers=headers_auth).json() 95 | 96 | current_state = check_status_resp.get("state") 97 | 98 | if current_state == "IDLE": 99 | print(f"STMV Pipeline {active_pipeline_id} completed! \n Moving on") 100 | return 101 | 102 | elif current_state == "FAILED": 103 | raise(BaseException(f"PIPELINE {active_pipeline_id} FAILED!")) 104 | 105 | 106 | else: 107 | ## Wait before polling again 108 | ## TODO: Do exponential backoff 109 | time.sleep(5) 110 | 111 | --------------------------------------------------------------------------------