├── .DS_Store ├── Data Apps ├── .DS_Store └── Data Management Intro Dash App │ ├── .DS_Store │ └── plotly_iot_demo │ ├── .DS_Store │ ├── etl_pipelines │ └── iot_dashboard_etl.sql │ ├── option_1_manual_ddl │ ├── .DS_Store │ ├── __pycache__ │ │ └── ddls.cpython-38.pyc │ ├── app.py │ ├── config.json │ └── ddls.py │ └── requirements.txt ├── Delta Optimizer ├── .DS_Store ├── DashAppFrontEnd │ └── .DS_Store ├── Instructions.md ├── Step 1_ Optimization Strategy Builder.py ├── Step 2_ Strategy Runner.py ├── Step 3_ Query History and Profile Analyzer.py ├── deltaoptimizer-1.4.1-py3-none-any.whl ├── deltaoptimizer-1.5.0-py3-none-any.whl └── deltaoptimizer-1.5.2-py3-none-any.whl ├── Design Patterns Notebooks ├── Advanced Notebooks │ ├── End to End Procedural Migration Pattern │ │ └── Procedural Migration Pattern with SCD2 Example.py │ ├── Multi-plexing with Autoloader │ │ └── Option 1: Actually Multi-plexing tables on write │ │ │ ├── Child Job Template.py │ │ │ └── Controller Job.py │ ├── Parallel Custom Named File Exports │ │ ├── Parallel File Exports - Python Version.py │ │ └── Parallel File Exports.py │ └── SCD Design Patterns │ │ └── Advanced CDC With SCD in Databricks.py ├── Step 1 - SQL EDW Pipeline.sql ├── Step 10 - Lakehouse Federation.py ├── Step 11 - SQL Orchestration in Production.py ├── Step 12 - SCD2 - SQL EDW Pipeline.sql ├── Step 13 - Migrating Identity Columns.sql ├── Step 14 - Using the Query Profile.sql ├── Step 2 - Optimize your Delta Tables.py ├── Step 3 - DLT Version Simple SQL EDW Pipeline.sql ├── Step 4 - Create Gold Layer Analytics Tables.sql ├── Step 5 - Unified Batch and Streaming.py ├── Step 6 - Streaming Table Design Patterns.sql ├── Step 7 - COPY INTO Loading Patterns.py ├── Step 8 - Liquid Clustering Delta Tables.py └── Step 9 - Using SQL Functions.py ├── LICENSE ├── README.md ├── Realtime Data Apps Workshop ├── Step 0 - Real Time Data Generator Simulator.py ├── Step 1 - Stream from Generator.py └── Step 2 - Create Gold Views for App Layer.sql ├── RedshiftDDLMigrator └── Redshift DDL Migrator.py ├── Using DBSQL Serverless Client Example.py ├── Using DBSQL Serverless Transaction Manager Example.py ├── Using Delta Helpers Notebook Example.py ├── Using Delta Logger Example.py ├── Using Delta Merge Helpers Example.py ├── Using Streaming Tables and MV Orchestrator.py ├── Using Transaction Manager Example.py └── helperfunctions ├── __init__.py ├── datavalidator.py ├── dbsqlclient.py ├── dbsqltransactions.py ├── deltahelpers.py ├── deltalogger.py ├── redshiftchecker.py ├── requirements.txt ├── stmvorchestrator.py └── transactions.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/.DS_Store -------------------------------------------------------------------------------- /Data Apps/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Data Apps/.DS_Store -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Data Apps/Data Management Intro Dash App/.DS_Store 
-------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Data Apps/Data Management Intro Dash App/plotly_iot_demo/.DS_Store -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/etl_pipelines/iot_dashboard_etl.sql: -------------------------------------------------------------------------------- 1 | TRUNCATE TABLE main.plotly_iot_dashboard.bronze_users; 2 | TRUNCATE TABLE main.plotly_iot_dashboard.bronze_sensors; 3 | 4 | 5 | -- DBTITLE 1,Incrementally Ingest Source Data from Raw Files 6 | COPY INTO main.plotly_iot_dashboard.bronze_sensors 7 | FROM (SELECT 8 | id::bigint AS Id, 9 | device_id::integer AS device_id, 10 | user_id::integer AS user_id, 11 | calories_burnt::decimal(10,2) AS calories_burnt, 12 | miles_walked::decimal(10,2) AS miles_walked, 13 | num_steps::decimal(10,2) AS num_steps, 14 | timestamp::timestamp AS timestamp, 15 | value AS value 16 | FROM "/databricks-datasets/iot-stream/data-device/") 17 | FILEFORMAT = json 18 | COPY_OPTIONS('force'='true') --option to be incremental or always load all files 19 | ; 20 | 21 | 22 | -- DBTITLE 1,Perform Upserts - Device Data 23 | MERGE INTO main.plotly_iot_dashboard.silver_sensors AS target 24 | USING (SELECT Id::integer, 25 | device_id::integer, 26 | user_id::integer, 27 | calories_burnt::decimal, 28 | miles_walked::decimal, 29 | num_steps::decimal, 30 | timestamp::timestamp, 31 | value::string 32 | FROM main.plotly_iot_dashboard.bronze_sensors) AS source 33 | ON source.Id = target.Id 34 | AND source.user_id = target.user_id 35 | AND source.device_id = target.device_id 36 | WHEN MATCHED THEN UPDATE SET 37 | target.calories_burnt = source.calories_burnt, 38 | target.miles_walked = source.miles_walked, 39 | target.num_steps = source.num_steps, 40 | target.timestamp = source.timestamp 41 | WHEN NOT MATCHED THEN INSERT *; 42 | 43 | --Truncate bronze batch once successfully loaded 44 | TRUNCATE TABLE main.plotly_iot_dashboard.bronze_sensors; 45 | 46 | -- COMMAND ---------- 47 | 48 | -- DBTITLE 1,Table Optimizations 49 | OPTIMIZE main.plotly_iot_dashboard.silver_sensors ZORDER BY (user_id, device_id, timestamp); 50 | 51 | -- COMMAND ---------- 52 | 53 | -- DBTITLE 1,Incrementally Ingest Raw User Data 54 | COPY INTO main.plotly_iot_dashboard.bronze_users 55 | FROM (SELECT 56 | userid::bigint AS user_id, 57 | gender AS gender, 58 | age::integer AS age, 59 | height::decimal(10,2) AS height, 60 | weight::decimal(10,2) AS weight, 61 | smoker AS smoker, 62 | familyhistory AS familyhistory, 63 | cholestlevs AS cholestlevs, 64 | bp AS bp, 65 | risk::decimal(10,2) AS risk, 66 | current_timestamp() AS update_timestamp 67 | FROM "/databricks-datasets/iot-stream/data-user/") 68 | FILEFORMAT = CSV 69 | FORMAT_OPTIONS('header'='true') 70 | COPY_OPTIONS('force'='true') --option to be incremental or always load all files 71 | ; 72 | 73 | 74 | MERGE INTO main.plotly_iot_dashboard.silver_users AS target 75 | USING (SELECT 76 | user_id::int, 77 | gender::string, 78 | age::int, 79 | height::decimal, 80 | weight::decimal, 81 | smoker, 82 | familyhistory, 83 | cholestlevs, 84 | bp, 85 | risk, 86 | update_timestamp 87 | FROM main.plotly_iot_dashboard.bronze_users) AS source 88 | ON source.user_id = 
target.user_id 89 | WHEN MATCHED THEN UPDATE SET 90 | target.gender = source.gender, 91 | target.age = source.age, 92 | target.height = source.height, 93 | target.weight = source.weight, 94 | target.smoker = source.smoker, 95 | target.familyhistory = source.familyhistory, 96 | target.cholestlevs = source.cholestlevs, 97 | target.bp = source.bp, 98 | target.risk = source.risk, 99 | target.update_timestamp = source.update_timestamp 100 | WHEN NOT MATCHED THEN INSERT *; 101 | 102 | --Truncate bronze batch once successfully loaded 103 | TRUNCATE TABLE main.plotly_iot_dashboard.bronze_users; 104 | 105 | 106 | 107 | 108 | -- Create Gold Table and Read via Reflection 109 | 110 | CREATE OR REPLACE TABLE main.plotly_iot_dashboard.gold_sensors 111 | AS 112 | (SELECT timestamp, 113 | -- Number of Steps 114 | (avg(`num_steps`) OVER ( 115 | ORDER BY timestamp 116 | ROWS BETWEEN 117 | 15 PRECEDING AND 118 | CURRENT ROW 119 | )) ::float AS SmoothedNumSteps30SecondMA, -- 30 second moving average 120 | 121 | (avg(`num_steps`) OVER ( 122 | ORDER BY timestamp 123 | ROWS BETWEEN 124 | 60 PRECEDING AND 125 | CURRENT ROW 126 | ))::float AS SmoothedNumSteps120SecondMA,--120 second moving average, 127 | -- Calories Burnt 128 | (avg(`calories_burnt`) OVER ( 129 | ORDER BY timestamp 130 | ROWS BETWEEN 131 | 15 PRECEDING AND 132 | CURRENT ROW 133 | )) ::float AS SmoothedCaloriesBurnt30SecondMA, -- 30 second moving average 134 | 135 | (avg(`calories_burnt`) OVER ( 136 | ORDER BY timestamp 137 | ROWS BETWEEN 138 | 60 PRECEDING AND 139 | CURRENT ROW 140 | ))::float AS SmoothedCaloriesBurnt120SecondMA --120 second moving average 141 | FROM main.plotly_iot_dashboard.silver_sensors 142 | ) -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/.DS_Store -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/__pycache__/ddls.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/__pycache__/ddls.cpython-38.pyc -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from databricks import sql 4 | from sqlalchemy.orm import declarative_base, Session, sessionmaker 5 | from sqlalchemy import Column, String, Integer, BOOLEAN, create_engine, select, DATE, DATETIME, TIMESTAMP, DECIMAL, case, func, Table, MetaData 6 | import ddls 7 | from dash import Dash, html, Input, Output, ctx, dcc, dash_table 8 | import pandas as pd 9 | import plotly.express as px 10 | import requests 11 | import dash_bootstrap_components as dbc 12 | 13 | 14 | with open('config.json') as w: 15 | 16 | conf = json.load(w) 17 | token = conf.get("token") 18 | http_path = conf.get("http_path") 19 | database = conf.get("database") 20 | 
host_name = conf.get("host_name") 21 | catalog = conf.get("catalog") 22 | 23 | 24 | 25 | ### Initialize Database Connection 26 | conn_str = f"databricks://token:{token}@{host_name}?http_path={http_path}&catalog={catalog}&schema={database}" 27 | extra_connect_args = { 28 | "_tls_verify_hostname": True, 29 | "_user_agent_entry": "PySQL Example Script", 30 | } 31 | engine = create_engine( 32 | conn_str, 33 | connect_args=extra_connect_args, 34 | ) 35 | 36 | 37 | 38 | ## Get Metadata from Config Files 39 | #ddls.Base.metadata.create_all(bind=engine) 40 | #ddls.Base.metadata.drop_all(bind=engine) 41 | 42 | tables_stmt = f"""SELECT * FROM {catalog}.INFORMATION_SCHEMA.TABLES 43 | WHERE table_schema = '{database}'""" 44 | 45 | 46 | tables_in_db = pd.read_sql_query(tables_stmt, engine) 47 | 48 | ### Core Dash App 49 | 50 | app = Dash(external_stylesheets=[dbc.themes.BOOTSTRAP]) 51 | 52 | ### Layout 53 | 54 | app.layout = html.Div([ 55 | html.Button('Build Database Tables', id='build_db_btn', n_clicks=0), 56 | html.Button('Drop Database Tables', id='drop_tabes_btn', n_clicks=0), 57 | html.Button('Get Table List', id='fetch_tables_btn', n_clicks=0), 58 | html.Button('Run ELT Pipeline', id='run_etl_pipe', n_clicks=0), 59 | html.Div(id='container-button-timestamp'), 60 | html.Br(), 61 | dcc.RadioItems( 62 | id="checklist", 63 | options=["num_steps", "calories_burnt"], 64 | value="num_steps", 65 | inline=True, 66 | 67 | ), 68 | dcc.Graph(id='BasicSensors'), 69 | html.Div([html.Br(), 70 | dcc.Graph(id='SmoothSensors')]), 71 | dash_table.DataTable(tables_in_db.to_dict('records'),[{"name": i, "id": i} for i in tables_in_db.columns], id='tbl'), 72 | html.Div(id='sch_tbl', className = 'sch_tbl') 73 | ]) 74 | 75 | 76 | 77 | @app.callback(Output('sch_tbl', 'children'), 78 | Input('tbl', 'data'), 79 | Input('tbl', 'active_cell')) 80 | def update_graphs(data, active_cell): 81 | row_num = active_cell.get("row") 82 | col_num = active_cell.get("column") 83 | col_name = active_cell.get("column_id") 84 | 85 | table_name_to_detail = tables_in_db.loc[tables_in_db.index[row_num], col_name] 86 | 87 | if col_name == 'table_name': 88 | schema_stmt = f"""DESCRIBE TABLE EXTENDED {catalog}.{database}.{table_name_to_detail}""" 89 | schema_table = pd.read_sql_query(schema_stmt, engine) 90 | cols_for_data_table = [{'name': i, 'id': i} for i in schema_table.columns] 91 | 92 | res_table = dash_table.DataTable( 93 | id='table', 94 | columns=cols_for_data_table, 95 | data =schema_table.to_dict("rows"), 96 | style_data={ 97 | 'color': 'blue', 98 | 'backgroundColor': 'white' 99 | }, 100 | style_data_conditional=[ 101 | { 102 | 'if': {'row_index': 'odd'}, 103 | 'backgroundColor': 'rgb(220, 220, 220)', 104 | } 105 | ], 106 | style_header={ 107 | 'backgroundColor': 'rgb(210, 210, 210)', 108 | 'color': 'black', 109 | 'fontWeight': 'bold' 110 | } 111 | ) 112 | 113 | 114 | return res_table 115 | 116 | else: 117 | msg = f"Please select a table name... 
Currently Selected: {table_name_to_detail}" 118 | 119 | res_msg = dbc.Alert(msg, color="primary") 120 | 121 | return msg 122 | 123 | 124 | ## Sensors Chat Callback -- No Reflection 125 | @app.callback( 126 | Output(component_id="BasicSensors", component_property="figure"), 127 | Input("checklist", "value") 128 | ) 129 | def update_graph(yaxis): 130 | 131 | device_base_table = ddls.Base.metadata.tables["silver_sensors"] 132 | user_base_table = ddls.Base.metadata.tables["silver_users"] 133 | 134 | ## ORM-based SQL Query with dynamic filters in the callback 135 | 136 | userstmt = (select(device_base_table.c.timestamp, device_base_table.c.num_steps, device_base_table.c.calories_burnt) 137 | .limit(100) 138 | ) 139 | 140 | 141 | ## Read data via pandas or just raw Dict/array 142 | ## TIPS: Always try to push the filtering/complex logic down to the system where the most data is filtered 143 | ## minimize data brought to client 144 | df = pd.read_sql_query(userstmt, engine).sort_values(by=['timestamp']) 145 | 146 | axis_labels = { 147 | "num_steps": "Total Daily Steps", 148 | } 149 | fig = px.line( 150 | df, 151 | x="timestamp", 152 | y=[f"{yaxis}"], 153 | markers=True, 154 | title=f"Comparative Daily Fitness Metrics by Demographic", 155 | ) 156 | 157 | 158 | ## Build Plot Figure and return 159 | 160 | return fig 161 | 162 | 163 | 164 | ## Smooth Sensors Callback for Line Graph - Via Reflection 165 | @app.callback( 166 | Output(component_id="SmoothSensors", component_property="figure"), 167 | Input("checklist", "value") 168 | ) 169 | def update_smooth_graph(yaxis): 170 | 171 | if yaxis == "num_steps": 172 | chart_cols = ["SmoothedNumSteps30SecondMA", "SmoothedNumSteps120SecondMA"] 173 | 174 | elif yaxis == "calories_burnt": 175 | chart_cols = ["SmoothedCaloriesBurnt30SecondMA", "SmoothedCaloriesBurnt120SecondMA"] 176 | 177 | # Reflect database properties into ``metadata``. 178 | #ddls.Base.metadata.reflect(engine=engine) 179 | 180 | ## !! this table is NOT manually defined in our Python object, and is instead read on the fly with reflection 181 | sensors_table= Table("gold_sensors", 182 | ddls.Base.metadata, 183 | Column("timestamp", TIMESTAMP), 184 | autoload=True, 185 | autoload_with=engine, 186 | extend_existing=True) 187 | 188 | # Instantiate a new ``FetchTable`` object to retrieve column objects by name. 189 | 190 | # Get a ``Column`` object from the desired ``Table`` object. 191 | yaxis_short_ma = sensors_table.columns[chart_cols[0]] 192 | yaxis_long_ma = sensors_table.columns[chart_cols[1]] 193 | time_col = sensors_table.columns["timestamp"] 194 | 195 | # Build a session-based query including filtering on text in ``column``. 196 | ma_statement = (select(time_col, yaxis_short_ma, yaxis_long_ma) 197 | .limit(100) 198 | ) 199 | 200 | # Build a Pandas ``DataFrame`` with results from the query. 
201 | df = pd.read_sql_query(ma_statement, engine).sort_values(by=['timestamp'], ascending=False) 202 | 203 | axis_labels = { 204 | "num_steps": "Total Daily Steps", 205 | } 206 | fig = px.line( 207 | df, 208 | x="timestamp", 209 | y=chart_cols, 210 | markers=True, 211 | title=f"Smoothed Moving Averages of Chosen Metric", 212 | ) 213 | 214 | 215 | ## Build Plot Figure and return 216 | 217 | return fig 218 | 219 | 220 | #### Run ELT Pipeline Callback 221 | @app.callback( 222 | Output('container-button-timestamp', 'children'), 223 | Input('build_db_btn', 'n_clicks'), 224 | Input('drop_tabes_btn', 'n_clicks'), 225 | Input('fetch_tables_btn', 'n_clicks'), 226 | Input('run_etl_pipe', 'n_clicks') 227 | ) 228 | def displayClick(btn1, btn2, btn3, btn4): 229 | msg = "No Database State Yet..." 230 | if 'build_db_btn' == ctx.triggered_id: 231 | 232 | ddls.Base.metadata.create_all(bind=engine) 233 | msg = "Database built!" 234 | 235 | elif 'drop_tabes_btn' == ctx.triggered_id: 236 | 237 | ddls.Base.metadata.drop_all(bind=engine) 238 | msg = "Database Dropped!" 239 | 240 | elif 'fetch_tables_btn' == ctx.triggered_id: 241 | tbls = list(ddls.Base.metadata.tables) 242 | msg = f"Here are the tables for {catalog}.{database}: {tbls}" 243 | 244 | elif 'run_etl_pipe' == ctx.triggered_id: 245 | 246 | ## Build and Trigger Databricks Jobs 247 | job_req = { 248 | "name": "Plotly_Backend_Pipeline", 249 | "email_notifications": { 250 | "no_alert_for_skipped_runs": "false" 251 | }, 252 | "webhook_notifications": {}, 253 | "timeout_seconds": 0, 254 | "max_concurrent_runs": 1, 255 | "tasks": [ 256 | { 257 | "task_key": "Plotly_Backend_Pipeline", 258 | "sql_task": { 259 | "query": { 260 | "query_id": "88c1412d-d2ca-43a1-9843-aec96b5b1586" 261 | }, 262 | "warehouse_id": "ead10bf07050390f" 263 | }, 264 | "timeout_seconds": 0, 265 | "email_notifications": {} 266 | } 267 | ], 268 | "format": "MULTI_TASK" 269 | } 270 | 271 | job_json = json.dumps(job_req) 272 | ## Get this from a secret or param 273 | headers_auth = {"Authorization":f"Bearer {token}"} 274 | uri = f"https://{host_name}/api/2.1/jobs/create" 275 | 276 | endp_resp = requests.post(uri, data=job_json, headers=headers_auth).json() 277 | 278 | ## Run Job 279 | job_id = endp_resp['job_id'] 280 | 281 | run_now_uri = f"https://{host_name}/api/2.1/jobs/run-now" 282 | 283 | job_run = {"job_id": job_id } 284 | job_run_json = json.dumps(job_run) 285 | 286 | run_resp = requests.post(run_now_uri, data=job_run_json, headers=headers_auth).json() 287 | 288 | 289 | msg = f"Pipeline Created and Ran with Job Id: {endp_resp['job_id']} \n run message: {run_resp}" 290 | 291 | 292 | 293 | return html.Div(msg) 294 | 295 | 296 | 297 | 298 | if __name__ == '__main__': 299 | app.run_server(debug=True) 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/config.json: -------------------------------------------------------------------------------- 1 | {"http_path": "/sql/1.0/endpoints/", 2 | "host_name": "", 3 | "token": "", 4 | "catalog": "main", 5 | "database": "plotly_iot_dashboard" 6 | } -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/option_1_manual_ddl/ddls.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import json 4 | from databricks import sql 5 | from sqlalchemy.orm 
import declarative_base, Session 6 | from sqlalchemy import Column, String, Integer, BOOLEAN, create_engine, select, TIMESTAMP, DECIMAL, INTEGER, BIGINT, DATE 7 | 8 | ## Return new base class with the mapper initialized 9 | Base = declarative_base() 10 | 11 | 12 | ####### Bronze Tables 13 | 14 | 15 | class BronzeSensors(Base): 16 | 17 | __tablename__ = "bronze_sensors" 18 | 19 | Id = Column(BIGINT, primary_key=True) 20 | device_id = Column(INTEGER) 21 | user_id = Column(INTEGER) 22 | calories_burnt = Column(DECIMAL(10,2)) 23 | miles_walked = Column(DECIMAL(10,2)) 24 | num_steps = Column(DECIMAL(10,2)) 25 | timestamp = Column(TIMESTAMP) 26 | value = Column(String(1024)) 27 | 28 | 29 | class BronzeUsers(Base): 30 | 31 | __tablename__ = "bronze_users" 32 | 33 | user_id = Column(BIGINT, primary_key=True) 34 | gender = Column(String(10)) 35 | age = Column(INTEGER) 36 | height = Column(DECIMAL(10,2)) 37 | weight = Column(DECIMAL(10,2)) 38 | smoker = Column(String(4)) 39 | familyhistory = Column(String(100)) 40 | cholestlevs = Column(String(100)) 41 | bp = Column(String(50)) 42 | risk = Column(DECIMAL(10,2)) 43 | update_timestamp = Column(TIMESTAMP) 44 | 45 | 46 | 47 | 48 | ####### Silver Tables 49 | 50 | class SilverSensors(Base): 51 | 52 | __tablename__ = "silver_sensors" 53 | 54 | Id = Column(BIGINT, primary_key=True) 55 | device_id = Column(INTEGER) 56 | user_id = Column(INTEGER) 57 | calories_burnt = Column(DECIMAL(10,2)) 58 | miles_walked = Column(DECIMAL(10,2)) 59 | num_steps = Column(DECIMAL(10,2)) 60 | timestamp = Column(TIMESTAMP) 61 | value = Column(String(1024)) 62 | 63 | 64 | 65 | class SilverUsers(Base): 66 | 67 | __tablename__ = "silver_users" 68 | 69 | user_id = Column(BIGINT, primary_key=True) 70 | gender = Column(String(10)) 71 | age = Column(INTEGER) 72 | height = Column(DECIMAL(10,2)) 73 | weight = Column(DECIMAL(10,2)) 74 | smoker = Column(String(4)) 75 | familyhistory = Column(String(100)) 76 | cholestlevs = Column(String(100)) 77 | bp = Column(String(50)) 78 | risk = Column(DECIMAL(10,2)) 79 | update_timestamp = Column(TIMESTAMP) 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /Data Apps/Data Management Intro Dash App/plotly_iot_demo/requirements.txt: -------------------------------------------------------------------------------- 1 | dash 2 | dash_bootstrap_components 3 | databricks-sql-connector ==2.4.1 -------------------------------------------------------------------------------- /Delta Optimizer/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Delta Optimizer/.DS_Store -------------------------------------------------------------------------------- /Delta Optimizer/DashAppFrontEnd/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Delta Optimizer/DashAppFrontEnd/.DS_Store -------------------------------------------------------------------------------- /Delta Optimizer/Instructions.md: -------------------------------------------------------------------------------- 1 | # Delta Optimizer 2 | Automated Optimization System for a Delta-based Lakehouse running on Spark or Photon 3 | 4 | 5 | delta_io 6 | 7 | 8 | ## Purpose: 9 | The Delta optimizer scrapes and analyzes the query history in DBSQL via the Query History 
API, as well as the Delta transaction logs on one or many databases, and builds a data profile to determine the most important columns that each table should be Z-ordered by. This aims to drastically reduce the amount of manual discovery and tuning users must do to properly optimize their Delta tables, especially when the primary query interface is a DBSQL Warehouse (an analyst writing SQL or a BI tool that auto-generates SQL). This is especially important when BI tools pass auto-generated SQL to a DBSQL Warehouse, which makes it much more difficult to optimize tables manually at scale. 10 | 11 | 12 | ### How to run: 13 | 14 |
  • 1. Install the associated delta optimizer library whl file to a cluster 15 |
  • 2. Run the Step 1 Notebook with your database_names to monitor, workspace URL, warehouseIds to poll, and lookback period. You can schedule this as a job to run monthly (or as often as the query patterns might change) 16 |
  • 3. Run the Step 2 Notebook on a cluster similar in size to the one you would normally use to run an optimization job for your tables. If you are not sure, create a cluster similar in size to your dev environment. Most operations are incremental and not large, except for the first run (which may re-write entire tables). Then schedule this notebook as a job to run daily. 17 | 18 | ### Delta Optimizer Process: 19 | 20 |
  • 1. Gather Query History and calculate statistics on all columns for all tables (option to select a particular database) 21 |
  • 2. Read transaction logs and find merge predicates (if any) run for all tables in one or many databases 22 |
  • 3. Calculate Statistics and Rank Columns for each table for the Z-order strategy using runtime stats, occurrence stats, and cardinality stats 23 |
  • 4. Prepare and save a ready-to-use config delta table that can be ingested by a job or DLT to actually run the recommended OPTIMIZE/ANALYZE/TBLPROP commands
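
The config table produced in step 4 is what the `Step 2_ Strategy Runner.py` notebook consumes. As a minimal sketch of that consumption path (not the packaged runner itself; it assumes a Databricks notebook context where `spark` is available, the default `hive_metastore.delta_optimizer` output database, and the same result-column positions the Step 2 notebook uses), executing the recommendations is just a loop over the generated statements:

```python
# Minimal sketch: read the optimizer results table and run the generated commands.
# Column positions follow the Step 2 runner notebook: [3] = TBLPROPERTIES, [2] = OPTIMIZE, [4] = ANALYZE.
from deltaoptimizer import DeltaOptimizer

delta_optimizer = DeltaOptimizer(database_name="hive_metastore.delta_optimizer")

# The results table is small (one row per table), so collecting it is cheap.
config_rows = delta_optimizer.get_results().collect()

for row in config_rows:
    # Run in the recommended order: table properties first, then OPTIMIZE, then ANALYZE.
    for command in (row[3], row[2], row[4]):
        try:
            spark.sql(command)
        except Exception as e:
            print(f"Command failed, continuing with next: {command} -- {str(e)}")
```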
24 | 25 | 26 | 27 | 28 | 29 | 30 | ### Roadmap: 31 | 32 | #### General Roadmap: 33 | 34 |
  • 1. Separate optimization rules from code logic to make rules configurable 35 |
  • 2. Add option to run for the user, or simply provide a DBSQL Dashboard of recommendations to make suggestions OOTB 36 |
  • 3. Add table exception rules; allow users to decide which tables to auto-optimize and which to manually override if they want to optimize those tables on their own 37 |
  • 4. Dynamically figure out job configuration (cluster size / periodicity) of commands to run 38 | 39 | #### Query Statistics: 40 | 41 |
  • 1. Enable parsing of queries from not just DBSQL, but ALL clusters (jobs / all-purpose) 42 |
  • 2. Enable parameter selection for specifying specific databases (one or many) to scrape 43 |
  • 3. Enable pointing to a Git location to parse SQL files containing SELECT statements stored in Git 44 | 45 | #### Transaction Log Statistics: 46 | 47 |
  • 1. Add partition filtering and file size management - DONE 48 |
  • 2. Column Reordering into the first 32 columns (currently only re-orders recommended ZORDER columns) - IN PROGRESS 49 |
  • 3. Add Analyze Table STATS - DONE 50 | 51 | #### Ranking Statistics Algorithm: 52 | 53 |
  • 1. More robust standard scaling for statistics (right now it's 0-1 standard scaling partitioned by TABLE) 54 |
  • 2. Make the ranking system more intelligent - open-ended feedback is needed for ideas on making the ranking system more generalizable and nuanced 55 |
  • 3. Dynamically prune to the actual number of ZORDER columns best used (dependent first on cardinality). Do this possibly by tracking the distance between certain statistics (i.e. if ColA appears 3000 times and ColB appears 2900 times, use both, but if ColA appears 3000 times and ColB appears only 3 times, use only ColA). See the sketch below. 56 | 57 |
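
This pruning behavior is not implemented yet; the sketch below only illustrates the relative drop-off idea described in item 3 (the function name and the 0.25 threshold are made up for the example):

```python
# Illustrative only: keep ranked ZORDER candidates whose occurrence count is within
# some fraction of the top candidate, so 3000 vs 2900 keeps both columns while
# 3000 vs 3 keeps only the first. The threshold value here is arbitrary.
def prune_zorder_candidates(ranked_columns, min_relative_occurrence=0.25):
    """ranked_columns: list of (column_name, occurrence_count), sorted descending by count."""
    if not ranked_columns:
        return []

    top_count = ranked_columns[0][1]
    return [col for col, count in ranked_columns
            if top_count > 0 and count / top_count >= min_relative_occurrence]

# Keeps ColA and ColB, drops ColC
print(prune_zorder_candidates([("ColA", 3000), ("ColB", 2900), ("ColC", 3)]))
```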
58 | 59 | 60 | 61 | #### Execution Step 62 | 63 |
  • 1. Automatically create and schedule a job via the API that reads from the config with the provided notebook and runs at an interval selected by the user as a parameter 64 | 65 |
  • 2. Use DLT to generate DDLs and file sizes, and manage these OPTIMIZE statements automatically without actually needing to do ETL in DLT -------------------------------------------------------------------------------- /Delta Optimizer/Step 1_ Optimization Strategy Builder.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Delta Optimizer - Profiling Stage 5 | # MAGIC 6 | # MAGIC
      7 | # MAGIC 8 | # MAGIC
    • Polls the Query History API and gets the list of queries for a set of SQL Warehouses (this is incremental, so you just define a lookback period for the first time you poll) 9 | # MAGIC
    • Analyzes transaction logs for tables in set of databases (all by default) -- file size, partitions, merge predicates 10 | # MAGIC
    • Ranks columns and builds a unified optimization strategy 11 | # MAGIC 12 | # MAGIC
    13 | # MAGIC 14 | # MAGIC ### Permissions Required: 15 | # MAGIC 1. User running the delta optimizer must have CREATE DATABASE permission to create a delta optimizer instance (OR have it created upfront by an admin). 16 | # MAGIC 2. User running the delta optimizer must have READ permissions to ALL databases being profiled and optimized. 17 | # MAGIC 3. User running the delta optimizer must have usage permissions to all SQL Warehouses being profiled and optimized. 18 | # MAGIC 19 | # MAGIC ### Steps: 20 | # MAGIC 21 | # MAGIC 1. Decide where you want your Delta Optimizer Instance Output to live by setting Optimizer Output Database . You can have one or many optimizer instances, but each instances needs its own isolated database, they cannot share database namespaces. 22 | # MAGIC 23 | # MAGIC 2. Insert Server HostName . This is the root workspace url for your databricks workspace. This is NOT tied to any specific cluster. 24 | # MAGIC 25 | # MAGIC 3. Choose Catalog Filter Mode . There are 3 options: include_list, exclude_list, and all. include_list is default, which allows you to select the databases you want to profile and optimize. exclude_list will monitor ALL databases except the ones in the list. 'all' mode will monitor ALL databases. Note that the user running the delta optimizer must have read permissions to ALL databases selected no matter the mode. 26 | # MAGIC 27 | # MAGIC 4. List the databases to profile in the Catalog Names (csv)... parameter. This is either an include or exclude list. If mode = 'all', then this parameter is not used. 28 | # MAGIC 29 | # MAGIC 5. Choose Database Filter Mode . There are 3 options: include_list, exclude_list, and all. include_list is default, which allows you to select the databases you want to profile and optimize. exclude_list will monitor ALL databases except the ones in the list. 'all' mode will monitor ALL databases. Note that the user running the delta optimizer must have read permissions to ALL databases selected no matter the mode. 30 | # MAGIC 31 | # MAGIC 6. List the databases to profile in the Database Names (csv)... parameter. This is either an include or exclude list. If mode = 'all', then this parameter is not used. 32 | # MAGIC 33 | # MAGIC 7. Choose the Table Filter Mode . There are 3 options: include_list, exclude_list, and all. include_list is default, which allows you to select the subset of tables you want to profile and optimize. This most will ALWAYS operate within the subset of databases chosen from the Database Filter Mode . i.e. if the table you want is not included in the selected databases, no matter the mode, it will not be profiled and optimized. 34 | # MAGIC 35 | # MAGIC 8. List the tables to profile in the Table Filter List... parameter. This is either an include or exclude list. If mode = 'all', then this parameter is not used. 36 | # MAGIC 37 | # MAGIC 9. Fill out the list of SQL Warehouse IDs (csv list) to profile and extract query history from. This is how the optimizer will detect HOW your tables are being used in queries. It will use the Query History API to incrementally pull your queries for the selected SQL Warehouses and store them in the Delta optimizer database used. 38 | # MAGIC 39 | # MAGIC 10. Choose a Query History Lookback Period . This is ONLY for cold starts. This represents a lagging sample of days from today to pull query history for. After the first run, it picks up where it last left off automatically unless the Start Over? parameter = 'Yes'. 40 | # MAGIC 41 | # MAGIC 11. 
Optionally choose the Start Over? parameter. 'Yes' means it will truncate all historical state and re-profile history from scratch. 'No' means it will always pick up where it left off. 42 | # MAGIC 43 | # MAGIC 44 | # MAGIC ### KEY USER NOTES: 45 | # MAGIC 1. Think of the catalog/database/filter lists/modes like a funnel. No matter whether inclusion or exclusion mode for each level, the lower levels will always ONLY contain the subset that results from the previous. For example, if I am running for all catalogs except 'main', then in my database list, if there are any databases that live it 'main', they will not be optimized. 46 | # MAGIC 2. Database names should be fully qualified (catalog.database.table) 47 | # MAGIC 3. Table Filter List must be fully qualified (catalog.database.table) 48 | # MAGIC 4. If table filter mode is all, then the filter list can be blank, otherwise ensure that it is correct 49 | # MAGIC 50 | # MAGIC 51 | # MAGIC 52 | # MAGIC ### LIMITATIONS: 53 | # MAGIC 1. Currently it does NOT profile SQL queries run on Adhoc or Jobs clusters, only SQL Warehouses for now. This is on the roadmap to fix. 54 | # MAGIC 55 | # MAGIC ### Depedencies 56 | # MAGIC
  • Ensure that you either get a token as a secret or use a cluster with the env variable called DBX_TOKEN to authenticate to DBSQL 57 | 58 | # COMMAND ---------- 59 | 60 | from deltaoptimizer import DeltaProfiler, QueryProfiler, DeltaOptimizer 61 | import os 62 | 63 | # COMMAND ---------- 64 | 65 | # DBTITLE 1,Register and Retrieve DBX Auth Token 66 | DBX_TOKEN = "" 67 | 68 | # COMMAND ---------- 69 | 70 | # DBTITLE 1,Set up params before running 71 | ## Assume running in a Databricks notebook 72 | dbutils.widgets.dropdown("Query History Lookback Period (days)", defaultValue="3",choices=["1","3","7","14","30","60","90"]) 73 | dbutils.widgets.text("SQL Warehouse Ids (csv list)", "") 74 | dbutils.widgets.text("Server Hostname:", "") 75 | dbutils.widgets.dropdown("Start Over?","No", ["Yes","No"]) 76 | dbutils.widgets.text("Optimizer Output Database:", "hive_metastore.delta_optimizer") 77 | dbutils.widgets.text("Optimizer Output Location (optional):", "") 78 | dbutils.widgets.dropdown("Table Filter Mode", "all", ["all", "include_list", "exclude_list"]) 79 | dbutils.widgets.dropdown("Database Filter Mode", "all", ["all", "include_list", "exclude_list"]) 80 | dbutils.widgets.dropdown("Catalog Filter Mode", "all", ["all", "include_list", "exclude_list"]) 81 | dbutils.widgets.text("Table Filter List (catalog.database.table) (Csv List)", "") 82 | dbutils.widgets.text("Database Filter List (catalog.database) (Csv List)", "") 83 | dbutils.widgets.text("Catalog Filter List (Csv List)", "") 84 | 85 | # COMMAND ---------- 86 | 87 | # DBTITLE 1,Get Params to Variables 88 | lookbackPeriod = int(dbutils.widgets.get("Query History Lookback Period (days)")) 89 | warehouseIdsList = [i.strip() for i in dbutils.widgets.get("SQL Warehouse Ids (csv list)").split(",")] 90 | workspaceName = dbutils.widgets.get("Server Hostname:").strip() 91 | warehouse_ids = dbutils.widgets.get("SQL Warehouse Ids (csv list)") 92 | start_over = dbutils.widgets.get("Start Over?") 93 | table_filter_mode = dbutils.widgets.get("Table Filter Mode") 94 | database_filter_mode = dbutils.widgets.get("Database Filter Mode") 95 | catalog_filter_mode = dbutils.widgets.get("Catalog Filter Mode") 96 | table_filter_list = [i.strip() for i in dbutils.widgets.get("Table Filter List (catalog.database.table) (Csv List)").split(",")] 97 | database_filter_list = [i.strip() for i in dbutils.widgets.get("Database Filter List (catalog.database) (Csv List)").split(",")] 98 | catalog_filter_list = [i.strip() for i in dbutils.widgets.get("Catalog Filter List (Csv List)").split(",")] 99 | database_output = dbutils.widgets.get("Optimizer Output Database:").strip() 100 | 101 | if len(dbutils.widgets.get("Optimizer Output Location (optional):").strip()) > 0: 102 | database_location = dbutils.widgets.get("Optimizer Output Location (optional):").strip() 103 | else: 104 | database_location = None 105 | 106 | 107 | # COMMAND ---------- 108 | 109 | # DBTITLE 1,Initialize Core Optimizer Tables 110 | delta_optimizer = DeltaOptimizer(database_name=database_output, database_location=database_location) 111 | 112 | # COMMAND ---------- 113 | 114 | # DBTITLE 1,Delete Existing Results and Start Over Param 115 | if start_over == "Yes": 116 | delta_optimizer.truncate_delta_optimizer_results() 117 | 118 | # COMMAND ---------- 119 | 120 | # DBTITLE 1,Build Query History Profile 121 | ####### Step 1: Build Profile ####### 122 | ## Initialize Profiler 123 | 124 | ## catalogs_to_check_views should include ALL catalogs where views could live that you want to optimize underlying 
tables for 125 | ## Ideally they are just the same catalogs are your database names defined in the params so we try to parse for you to start there, but if you need to add, change the list here. 126 | 127 | ## NOTE: Query profiler doesnt really use database filter mode because it doesnt access the databases, only the SQL Query history API. 128 | 129 | query_profiler = QueryProfiler(workspaceName, 130 | warehouseIdsList, 131 | database_name=database_output, 132 | database_location=database_location, 133 | catalogs_to_check_views=catalog_filter_list, 134 | catalog_filter_mode=catalog_filter_mode, 135 | catalog_filter_list=catalog_filter_list, 136 | database_filter_mode=database_filter_mode, 137 | database_filter_list = database_filter_list, 138 | table_filter_mode=table_filter_mode, 139 | table_filter_list=table_filter_list, 140 | scrub_views=True) 141 | 142 | query_profiler.build_query_history_profile(dbx_token = DBX_TOKEN, mode='auto', lookback_period_days=lookbackPeriod) 143 | 144 | # COMMAND ---------- 145 | 146 | # DBTITLE 1,Run Delta Profiler 147 | ####### Step 2: Build stats from transaction logs/table data ####### 148 | 149 | ## Initialize class and pass in database csv string 150 | profiler = DeltaProfiler(catalog_filter_mode=catalog_filter_mode, 151 | catalog_filter_list=catalog_filter_list, 152 | database_filter_mode=database_filter_mode, 153 | database_filter_list = database_filter_list, 154 | table_filter_mode=table_filter_mode, 155 | table_filter_list=table_filter_list, 156 | database_name=database_output, 157 | database_location=database_location 158 | ) ## examples include 'default', 'mydb1,mydb2', 'all' or leave blank 159 | 160 | ## Get tables 161 | profiler.get_all_tables_to_monitor() 162 | 163 | ## Get predicate analysis for tables 164 | profiler.parse_stats_for_tables() 165 | 166 | ## Build final table output 167 | profiler.build_all_tables_stats() 168 | 169 | ## Generate cardinality stats 170 | profiler.build_cardinality_stats() 171 | 172 | 173 | # COMMAND ---------- 174 | 175 | # DBTITLE 1,Run Delta Optimizer 176 | ####### Step 3: Build Strategy and Rank ####### 177 | ## Build Strategy 178 | 179 | delta_optimizer = DeltaOptimizer(database_name=database_output, database_location=database_location) 180 | 181 | delta_optimizer.build_optimization_strategy() 182 | 183 | 184 | # COMMAND ---------- 185 | 186 | # DBTITLE 1,Return most up to date results! 187 | df = delta_optimizer.get_results() 188 | 189 | # COMMAND ---------- 190 | 191 | df.display() 192 | -------------------------------------------------------------------------------- /Delta Optimizer/Step 2_ Strategy Runner.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Run the output of recommended optimize statements as a single run or schedule as a periodic job 5 | # MAGIC 6 | # MAGIC

    Run this after the delta optimizer is finished

    7 | # MAGIC 8 | # MAGIC #### 3 Modes: 9 | # MAGIC 10 | # MAGIC
      1. include_all_tables: this mode optimizes all tables in the databases that the delta optimizer was provided at the profiling stage 11 | # MAGIC
        2. use_include_list : this mode only optimizes tables that you explicitly WANT to INCLUDE, which must be a subset of the databases monitored in the profiling stage. Must provide fully qualified table names for now (e.g. hive_metastore.iot_dashboard.silver_sensors, etc.). 12 | # MAGIC
          3. use_exclude_list : this mode optimizes all tables in the databases monitored EXCEPT the list provided. Must provide fully qualified table names for now. 13 | # MAGIC 14 | # MAGIC 15 | # MAGIC #### Roadmap: 16 | # MAGIC 17 | # MAGIC 1. Be more selective about the type of ANALYZE statements depending on table size and update frequency (less frequently updated tables don't need it as much) 18 | # MAGIC 2. Use DLT metaprogramming framework to run in parallel (performance implications) 19 | # MAGIC 3. Use Jobs API to automatically set up a daily / hourly job for this. This is NOT always recommended by default. The optimize timing greatly depends on the ETL pipelines 20 | # MAGIC 4. Dynamically decide how often to run ANALYZE TABLE commands based on table size mapping (job that does this for you) 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC 26 | # MAGIC ### Run Commands in Particular Order: 27 | # MAGIC 28 | # MAGIC
        • 1. ALTER TABLE 29 | # MAGIC
        • 2. Column Reordering 30 | # MAGIC
        • 3. OPTIMIZE TABLE 31 | # MAGIC
        • 4. ANALYZE TABLE 32 | 33 | # COMMAND ---------- 34 | 35 | from pyspark.sql.functions import * 36 | 37 | # COMMAND ---------- 38 | 39 | from deltaoptimizer import DeltaOptimizerBase, DeltaProfiler, QueryProfiler, DeltaOptimizer 40 | 41 | # COMMAND ---------- 42 | 43 | dbutils.widgets.dropdown("table_mode", "include_all_tables", ["include_all_tables", "use_exclude_list", "use_include_list"]) 44 | dbutils.widgets.text("exclude_list(csv)", "") 45 | dbutils.widgets.text("include_list(csv)", "") 46 | dbutils.widgets.text("Optimizer Output Database:", "hive_metastore.delta_optimizer") 47 | 48 | # COMMAND ---------- 49 | 50 | optimizer_location = dbutils.widgets.get("Optimizer Output Database:").strip() 51 | delta_optimizer = DeltaOptimizer(database_name=optimizer_location) 52 | 53 | # COMMAND ---------- 54 | 55 | ## This table by default has only 1 file, so it shouldn't be expensive to collect 56 | table_mode = dbutils.widgets.get("table_mode") 57 | include_table_list = [i.strip() for i in dbutils.widgets.get("include_list(csv)").split(",")] 58 | exclude_table_list = [i.strip() for i in dbutils.widgets.get("exclude_list(csv)").split(",")] 59 | 60 | if table_mode == "include_all_tables": 61 | config_row = (delta_optimizer.get_results() 62 | .collect() 63 | ) 64 | elif table_mode == "use_include_list": 65 | config_row = (delta_optimizer.get_results() 66 | .filter(col("TableName").isin(*include_table_list)) 67 | .collect() 68 | ) 69 | 70 | elif table_mode == "use_exclude_list": 71 | config_row = (delta_optimizer.get_results() 72 | .filter(~col("TableName").isin(*exclude_table_list)) 73 | .collect() 74 | ) 75 | 76 | # COMMAND ---------- 77 | 78 | # DBTITLE 1,Step 1 - Get Table Properties Config 79 | config_tbl_prop = [i[3] for i in config_row] 80 | 81 | print(f"Running {len(config_tbl_prop)} TBL PROPERTIES (file size and re-writes) commands: \n {config_tbl_prop}") 82 | 83 | # COMMAND ---------- 84 | 85 | # DBTITLE 1,Run TBL Properties Commands 86 | for i in config_tbl_prop: 87 | try: 88 | print(f"Running TABLE PROPERTIES command for {i}...") 89 | spark.sql(i) 90 | print(f"Completed TABLE PROPERTIES command for {i}!\n") 91 | 92 | except Exception as e: 93 | print(f"TABLE PROPERTIES failed with error: {str(e)}\n") 94 | 95 | # COMMAND ---------- 96 | 97 | 98 | 99 | # COMMAND ---------- 100 | 101 | # DBTITLE 1,Move Z-Order columns to front 102 | col_list = [i[5] for i in config_row] ## column re-order commands live at index 5 of the results 103 | print(col_list) 104 | ### This is a recursive step; ordering needs to happen one at a time 105 | ## Starting simple, just moving ZORDER cols to front, but this can become more nuanced 106 | for i in col_list: 107 | for j in i: 108 | try: 109 | 110 | spark.sql(j) 111 | print(f"Completed column order change for table {i} and column {j}") 112 | 113 | except Exception as e: 114 | print(f"Unable to change order (usually means it is an Id column and does not need reordering anyway...skipping to next columns) \n with error: {str(e)} \n ") 115 | 116 | 117 | # COMMAND ---------- 118 | 119 | # DBTITLE 1,Step 2 - Get config for OPTIMIZE Commands 120 | ## This table by default has only 1 file, so it shouldn't be expensive to collect 121 | config_optim = [i[2] for i in config_row] 122 | 123 | print(f"Running {len(config_optim)} OPTIMIZE commands: \n {config_optim}") 124 | 125 | # COMMAND ---------- 126 | 127 | # DBTITLE 1,Run through OPTIMIZE commands 128 | for i in config_optim: 129 | try: 130 | print(f"Running OPTIMIZE command for {i}...") 131 | spark.sql(i) 132 | print(f"Completed OPTIMIZE command 
for {i}!\n ") 133 | 134 | except Exception as e: 135 | print(f"Optimize failed with error: {str(e)}\n") 136 | 137 | 138 | # COMMAND ---------- 139 | 140 | # DBTITLE 1,Step 3 - Get Config for ANALYZE TABLE commands 141 | ## This table by default has only 1 file, so it shouldnt be expensive to collect 142 | config_tbl_stats = [i[4] for i in config_row] 143 | 144 | print(f"Running {len(config_tbl_stats)} TBL PROPERTIES (file size and re-writes) commands: \n {config_tbl_stats}") 145 | 146 | # COMMAND ---------- 147 | 148 | # DBTITLE 1,Run through Config for ANALYZE 149 | for i in config_tbl_stats: 150 | try: 151 | print(f"Running ANALYZE TABLE command for {i}...") 152 | spark.sql(i) 153 | print(f"Completed ANALYZE TABLE command for {i}!\n") 154 | 155 | except Exception as e: 156 | print(f"ANALYZE TABLE failed with error: {str(e)}\n") 157 | 158 | -------------------------------------------------------------------------------- /Delta Optimizer/Step 3_ Query History and Profile Analyzer.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Select from Results table to Look at Profiles of Queries, Tables, and Recommendations 5 | 6 | # COMMAND ---------- 7 | 8 | from pyspark.sql.functions import * 9 | 10 | # COMMAND ---------- 11 | 12 | from deltaoptimizer import DeltaOptimizerBase, DeltaProfiler, QueryProfiler, DeltaOptimizer 13 | import os 14 | 15 | # COMMAND ---------- 16 | 17 | dbutils.widgets.text("Optimizer Output Database", "hive_metastore.delta_optimizer") 18 | 19 | # COMMAND ---------- 20 | 21 | optimizer_location = dbutils.widgets.get("Optimizer Output Database").strip() 22 | delta_optimizer = DeltaOptimizer(database_name=optimizer_location) 23 | 24 | # COMMAND ---------- 25 | 26 | # DBTITLE 1,Get Most Recent Strategy Results 27 | # MAGIC %python 28 | # MAGIC ## This table by default has only 1 file, so it shouldnt be expensive to collect 29 | # MAGIC table_mode = dbutils.widgets.get("table_mode") 30 | # MAGIC include_table_list = [i.strip() for i in dbutils.widgets.get("include_list(csv)").split(",")] 31 | # MAGIC exclude_table_list = [i.strip() for i in dbutils.widgets.get("exclude_list(csv)").split(",")] 32 | # MAGIC 33 | # MAGIC if table_mode == "include_all_tables": 34 | # MAGIC df_results = (delta_optimizer.get_results() 35 | # MAGIC ) 36 | # MAGIC elif table_mode == "use_include_list": 37 | # MAGIC df_results = (delta_optimizer.get_results() 38 | # MAGIC .filter(col("TableName").isin(*include_table_list)) 39 | # MAGIC ) 40 | # MAGIC 41 | # MAGIC elif table_mode == "use_exclude_list": 42 | # MAGIC df_results = (delta_optimizer.get_results() 43 | # MAGIC .filter(~col("TableName").isin(*exclude_table_list)) 44 | # MAGIC ) 45 | # MAGIC 46 | # MAGIC 47 | # MAGIC df_results.display() 48 | 49 | # COMMAND ---------- 50 | 51 | # DBTITLE 1,Get Table Stats 52 | df = spark.sql(f""" 53 | 54 | SELECT * 55 | FROM {optimizer_location}.all_tables_table_stats 56 | 57 | """) 58 | 59 | df.display() 60 | 61 | 62 | # COMMAND ---------- 63 | 64 | # DBTITLE 1,Get Cardinality Stats 65 | 66 | df = spark.sql(f""" 67 | 68 | SELECT * 69 | FROM {optimizer_location}.all_tables_cardinality_stats 70 | WHERE IsUsedInReads = 1 OR IsUsedInWrites = 1 71 | """) 72 | 73 | df.display() 74 | 75 | # COMMAND ---------- 76 | 77 | # DBTITLE Register Unique Queries 78 | unqiue_queries = spark.sql(f"""SELECT * FROM {optimizer_location}.parsed_distinct_queries""") 79 | 
unqiue_queries.createOrReplaceTempView("unique_queries") 80 | 81 | # COMMAND ---------- 82 | 83 | # DBTITLE 1,Raw Query Runs Tables 84 | 85 | """ This table contains ALL queries for the monitored warehouses that have been run over time, so you can build all kinds of visualizations on that. These are NOT distinct queries, they are every single query run 86 | """ 87 | 88 | raw_queries_df = spark.sql(f""" 89 | 90 | SELECT *, 91 | from_unixtime(query_start_time_ms/1000) AS QueryStartTime, 92 | from_unixtime(query_end_time_ms/1000) AS QueryEndTime, 93 | duration/1000 AS QueryDurationSeconds 94 | FROM {optimizer_location}.raw_query_history_statistics 95 | 96 | """) 97 | 98 | raw_queries_df.createOrReplaceTempView("raw_queries") 99 | 100 | raw_queries_df.display() 101 | 102 | # COMMAND ---------- 103 | 104 | # DBTITLE 1,Most Expensive Queries in a all run history (user can add timestamp filter in a WHERE clause) 105 | # MAGIC %sql 106 | # MAGIC 107 | # MAGIC SELECT 108 | # MAGIC r.query_hash, 109 | # MAGIC r.query_text, 110 | # MAGIC SUM(r.duration/1000) AS TotalRuntimeOfQuery, 111 | # MAGIC AVG(r.duration/1000) AS AvgDurationOfQuery, 112 | # MAGIC COUNT(r.query_id) AS TotalRunsOfQuery, 113 | # MAGIC COUNT(r.query_id) / COUNT(DISTINCT date_trunc('day', QueryStartTime)) AS QueriesPerDay, 114 | # MAGIC SUM(r.duration/1000) / COUNT(DISTINCT date_trunc('day', QueryStartTime)) AS TotalRuntimePerDay 115 | # MAGIC FROM raw_queries r 116 | # MAGIC WHERE QueryStartTime >= (current_date() - 7) 117 | # MAGIC GROUP BY r.query_hash, r.query_text 118 | # MAGIC ORDER BY TotalRuntimePerDay DESC 119 | 120 | # COMMAND ---------- 121 | 122 | # DBTITLE 1,Query Runs Over Time - General 123 | # MAGIC %sql 124 | # MAGIC 125 | # MAGIC SELECT 126 | # MAGIC date_trunc('hour', QueryStartTime) AS Date, 127 | # MAGIC COUNT(query_id) AS TotalQueryRuns, 128 | # MAGIC AVG(QueryDurationSeconds) AS AvgQueryDurationSeconds 129 | # MAGIC FROM raw_queries 130 | # MAGIC GROUP BY date_trunc('hour', QueryStartTime) 131 | # MAGIC ORDER BY Date 132 | 133 | # COMMAND ---------- 134 | 135 | # DBTITLE 1,Top 10 Queries with Most Total Runtime Per Day (Duration * # times run) 136 | # MAGIC %sql 137 | # MAGIC 138 | # MAGIC WITH r AS ( 139 | # MAGIC SELECT 140 | # MAGIC date_trunc('day', r.QueryStartTime) AS Date, 141 | # MAGIC r.query_hash, 142 | # MAGIC SUM(r.duration/1000) AS TotalRuntimeOfQuery, 143 | # MAGIC AVG(r.duration/1000) AS AvgDurationOfQuery, 144 | # MAGIC COUNT(r.query_id) AS TotalRunsOfQuery 145 | # MAGIC FROM raw_queries r 146 | # MAGIC GROUP BY date_trunc('day', r.QueryStartTime), r.query_hash 147 | # MAGIC ), 148 | # MAGIC s as ( 149 | # MAGIC SELECT 150 | # MAGIC *, 151 | # MAGIC DENSE_RANK() OVER (PARTITION BY Date ORDER BY TotalRuntimeOfQuery DESC) AS PopularityRank 152 | # MAGIC FROM r 153 | # MAGIC ) 154 | # MAGIC SELECT 155 | # MAGIC uu.query_text, 156 | # MAGIC s.* 157 | # MAGIC FROM s 158 | # MAGIC LEFT JOIN unique_queries uu ON uu.query_hash = s.query_hash 159 | # MAGIC WHERE PopularityRank <= 10 160 | 161 | # COMMAND ---------- 162 | 163 | # DBTITLE 1,Top 10 Longest Running Queries By Day 164 | # MAGIC %sql 165 | # MAGIC 166 | # MAGIC WITH r AS ( 167 | # MAGIC SELECT 168 | # MAGIC date_trunc('day', r.QueryStartTime) AS Date, 169 | # MAGIC r.query_hash, 170 | # MAGIC SUM(r.duration/1000) AS TotalRuntimeOfQuery, 171 | # MAGIC AVG(r.duration/1000) AS AvgDurationOfQuery, 172 | # MAGIC COUNT(r.query_id) AS TotalRunsOfQuery 173 | # MAGIC FROM raw_queries r 174 | # MAGIC GROUP BY date_trunc('day', r.QueryStartTime), 
r.query_hash 175 | # MAGIC ), 176 | # MAGIC s as ( 177 | # MAGIC SELECT 178 | # MAGIC *, 179 | # MAGIC DENSE_RANK() OVER (PARTITION BY Date ORDER BY AvgDurationOfQuery DESC) AS PopularityRank 180 | # MAGIC FROM r 181 | # MAGIC ) 182 | # MAGIC SELECT 183 | # MAGIC uu.query_text, 184 | # MAGIC s.* 185 | # MAGIC FROM s 186 | # MAGIC LEFT JOIN unique_queries uu ON uu.query_hash = s.query_hash 187 | # MAGIC WHERE PopularityRank <= 10 188 | 189 | # COMMAND ---------- 190 | 191 | # DBTITLE 1,Top 10 Most OFTEN ran queries by Day 192 | # MAGIC %sql 193 | # MAGIC 194 | # MAGIC WITH r AS ( 195 | # MAGIC SELECT 196 | # MAGIC date_trunc('day', r.QueryStartTime) AS Date, 197 | # MAGIC r.query_hash, 198 | # MAGIC SUM(r.duration/1000) AS TotalRuntimeOfQuery, 199 | # MAGIC AVG(r.duration/1000) AS AvgDurationOfQuery, 200 | # MAGIC COUNT(r.query_id) AS TotalRunsOfQuery 201 | # MAGIC FROM raw_queries r 202 | # MAGIC GROUP BY date_trunc('day', r.QueryStartTime), r.query_hash 203 | # MAGIC ), 204 | # MAGIC s as ( 205 | # MAGIC SELECT 206 | # MAGIC *, 207 | # MAGIC DENSE_RANK() OVER (PARTITION BY Date ORDER BY TotalRunsOfQuery DESC) AS PopularityRank 208 | # MAGIC FROM r 209 | # MAGIC ) 210 | # MAGIC SELECT 211 | # MAGIC uu.query_text, 212 | # MAGIC s.* 213 | # MAGIC FROM s 214 | # MAGIC LEFT JOIN unique_queries uu ON uu.query_hash = s.query_hash 215 | # MAGIC WHERE PopularityRank <= 10 216 | 217 | # COMMAND ---------- 218 | 219 | # DBTITLE 1,Most Expensive Table MERGE / DELETE operations 220 | writes_df = spark.sql(f""" 221 | 222 | SELECT * 223 | FROM {optimizer_location}.write_statistics_merge_predicate 224 | 225 | """) 226 | 227 | writes_df.display() 228 | -------------------------------------------------------------------------------- /Delta Optimizer/deltaoptimizer-1.4.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Delta Optimizer/deltaoptimizer-1.4.1-py3-none-any.whl -------------------------------------------------------------------------------- /Delta Optimizer/deltaoptimizer-1.5.0-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Delta Optimizer/deltaoptimizer-1.5.0-py3-none-any.whl -------------------------------------------------------------------------------- /Delta Optimizer/deltaoptimizer-1.5.2-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/Delta Optimizer/deltaoptimizer-1.5.2-py3-none-any.whl -------------------------------------------------------------------------------- /Design Patterns Notebooks/Advanced Notebooks/End to End Procedural Migration Pattern/Procedural Migration Pattern with SCD2 Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # End to End Procedural Programming for Data Warehousing Example for SCD2 Pipeline 5 | # MAGIC 6 | # MAGIC ## Overview: 7 | # MAGIC 8 | # MAGIC This notebook shows how to use the popular delta helper libraries to implement end to end procedural data warehousing using the following: 9 | # MAGIC 10 | # MAGIC 1. Simple Python + SQL for complex control flow 11 | # MAGIC 2. 
DeltaLogger for easy logging and error tracking 12 | # MAGIC 3. Serverless Client for Pushing SQL statements down to DBSQL Serverless Warehouse 13 | # MAGIC 4. Multi Statement Transaction Manager for SCD2 Multi statement upserts pushed to DBSQL Serverless 14 | # MAGIC 15 | # MAGIC 16 | # MAGIC ## Steps: 17 | # MAGIC 18 | # MAGIC 0. Initialize Logger 19 | # MAGIC 1. Create DDLS 20 | # MAGIC 2. COPY INTO Bronze Tables 21 | # MAGIC 3. MERGE Upserts (Multi Statement Transaction) 22 | # MAGIC 4. Operational / Historical Snapshots Gold Tables 23 | # MAGIC 5. Clean up staging tables 24 | # MAGIC 6. Complete / Fail Runs in Logger 25 | # MAGIC 26 | 27 | # COMMAND ---------- 28 | 29 | # DBTITLE 1,Medallion Architecture 30 | # MAGIC %md 31 | # MAGIC 32 | # MAGIC 33 | 34 | # COMMAND ---------- 35 | 36 | # DBTITLE 1,Optional, can build these libraries into wheel 37 | # MAGIC %pip install sqlglot 38 | 39 | # COMMAND ---------- 40 | 41 | # DBTITLE 1,Available Libraries for Procedural Management 42 | from helperfunctions.deltalogger import DeltaLogger ## Easy logging OOTB 43 | from helperfunctions.dbsqlclient import ServerlessClient ## Push Statement down to DBSQL from anyhere spark.sql() ==> serverless_client.sql() 44 | from helperfunctions.dbsqltransactions import DBSQLTransactionManager ## OOTB Multi-statement transactions to serverless SQL / DBSQL 45 | from helperfunctions.deltahelpers import DeltaHelpers, DeltaMergeHelpers ## For Temp Tables and Concurrent Merge statements 46 | 47 | # COMMAND ---------- 48 | 49 | # DBTITLE 1,Scope Session 50 | # MAGIC %sql 51 | # MAGIC CREATE DATABASE IF NOT EXISTS main.iot_dashboard; 52 | # MAGIC USE CATALOG main; 53 | # MAGIC USE SCHEMA iot_dashboard; 54 | 55 | # COMMAND ---------- 56 | 57 | # DBTITLE 1,Step 0: Initialize Logger and Serverless Client 58 | WAREHOUSE_ID = "475b94ddc7cd5211" 59 | HOST_NAME = "e2-demo-field-eng.cloud.databricks.com" 60 | #TOKEN = 61 | LOGGER_TABLE = 'main.iot_dashboard.logger' 62 | PIPELINE_PROCESS_NAME = 'iot_dashboard_scd2_end_to_end' 63 | 64 | ## Create Serverless Client 65 | serverless_client = ServerlessClient(warehouse_id=WAREHOUSE_ID, host_name=HOST_NAME) #token=TOKEN 66 | 67 | ## Create Delta Logger 68 | delta_logger = DeltaLogger(logger_table_name=LOGGER_TABLE, session_process_name=PIPELINE_PROCESS_NAME) # partition_cols=['start_date'], session_batch_id="12309821345" 69 | 70 | ## Optionally create transaction manager for multi statement transaction requirements (like SCD2 upserts) 71 | serverless_transaction_manager = DBSQLTransactionManager(warehouse_id=WAREHOUSE_ID, host_name=HOST_NAME) 72 | 73 | # COMMAND ---------- 74 | 75 | print(delta_logger.active_process_name) 76 | print(delta_logger.active_run_id) 77 | print(delta_logger.active_batch_id) 78 | 79 | # COMMAND ---------- 80 | 81 | # DBTITLE 1,Start Run with Delta Logger 82 | delta_logger.start_run(process_name='copy_into_command', batch_id="custom_batch_id") 83 | 84 | # COMMAND ---------- 85 | 86 | print(delta_logger.active_run_start_ts) 87 | print(delta_logger.active_run_status) 88 | 89 | # COMMAND ---------- 90 | 91 | # DBTITLE 1,Step 1: Create DDLs 92 | ddl_sql = """CREATE TABLE IF NOT EXISTS main.iot_dashboard.bronze_sensors_scd_2 93 | ( 94 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 95 | device_id INT, 96 | user_id INT, 97 | calories_burnt DECIMAL(10,2), 98 | miles_walked DECIMAL(10,2), 99 | num_steps DECIMAL(10,2), 100 | timestamp TIMESTAMP, 101 | value STRING, 102 | ingest_timestamp TIMESTAMP 103 | ) 104 | USING DELTA 105 | ; 106 | 107 | CREATE TABLE IF NOT 
EXISTS main.iot_dashboard.silver_sensors_scd_2 108 | ( 109 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 110 | device_id INT, 111 | user_id INT, 112 | calories_burnt DECIMAL(10,2), 113 | miles_walked DECIMAL(10,2), 114 | num_steps DECIMAL(10,2), 115 | timestamp TIMESTAMP, 116 | value STRING, 117 | ingest_timestamp TIMESTAMP, 118 | -- Processing Columns 119 | _start_timestamp TIMESTAMP, 120 | _end_timestamp TIMESTAMP, 121 | _batch_run_id STRING, 122 | _is_current BOOLEAN 123 | ) 124 | USING DELTA 125 | PARTITIONED BY (_is_current, user_id) 126 | TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported', 'delta.columnMapping.mode' = 'name') 127 | ; 128 | """ 129 | 130 | ## Simple Control Flow with Python, if/else, try/catch, for/while 131 | try: 132 | 133 | 134 | serverless_client.submit_multiple_sql_commands(ddl_sql) 135 | delta_logger.log_run_info(log_level='INFO', msg= f'DDLQuery runtime: 1 seconds') 136 | 137 | 138 | except Exception as e: 139 | 140 | delta_logger.log_run_info(log_level='CRITICAL', msg='Failed to create DDLS with error') 141 | 142 | raise(e) 143 | 144 | 145 | 146 | # COMMAND ---------- 147 | 148 | # DBTITLE 1,Step 2: Incrementally Ingest Source Data from Raw Files 149 | 150 | copy_into_sql = """ 151 | COPY INTO main.iot_dashboard.bronze_sensors_scd_2 152 | FROM (SELECT 153 | id::bigint AS Id, 154 | device_id::integer AS device_id, 155 | user_id::integer AS user_id, 156 | calories_burnt::decimal(10,2) AS calories_burnt, 157 | miles_walked::decimal(10,2) AS miles_walked, 158 | num_steps::decimal(10,2) AS num_steps, 159 | timestamp::timestamp AS timestamp, 160 | value AS value, -- This is a JSON object, 161 | now() AS ingest_timestamp 162 | FROM "/databricks-datasets/iot-stream/data-device/") 163 | FILEFORMAT = json -- csv, xml, txt, parquet, binary, etc. 164 | COPY_OPTIONS('force'='true') --'true' always loads all data it sees. 
option to be incremental or always load all files 165 | """ 166 | 167 | ## Simple Control Flow with Python, if/else, try/catch, for/while 168 | try: 169 | 170 | serverless_client.sql(copy_into_sql) 171 | 172 | batch_row_count = serverless_client.sql("SELECT COUNT(0) FROM main.iot_dashboard.bronze_sensors_scd_2").collect()[0][0] 173 | 174 | ## Log customer queryable metrics 175 | delta_logger.log_run_metric(run_metrics_dict={"Batch_Rows": batch_row_count}) 176 | 177 | delta_logger.log_run_info(msg = 'COPY INTO complete') 178 | 179 | except Exception as e: 180 | 181 | delta_logger.log_run_info(log_level='CRITICAL', msg='Failed to COPY INTO with error') 182 | raise(e) 183 | 184 | # COMMAND ---------- 185 | 186 | # DBTITLE 1,Step 3: Multi Statement Transaction: Perform SCD2 INSERT ONLY Upserts - Device Data 187 | mst_scd_transaction_sql = """ 188 | 189 | CREATE OR REPLACE TABLE main.iot_dashboard.temp_batch_to_insert 190 | AS 191 | WITH de_dup ( 192 | SELECT Id::integer, 193 | device_id::integer, 194 | user_id::integer, 195 | calories_burnt::decimal, 196 | miles_walked::decimal, 197 | num_steps::decimal, 198 | timestamp::timestamp, 199 | value::string, 200 | ingest_timestamp, 201 | ROW_NUMBER() OVER(PARTITION BY device_id, user_id, timestamp ORDER BY ingest_timestamp DESC) AS DupRank 202 | FROM main.iot_dashboard.bronze_sensors_scd_2 203 | ) 204 | 205 | SELECT Id, device_id, user_id, calories_burnt, miles_walked, num_steps, timestamp, value, ingest_timestamp, 206 | now() AS _start_timestamp, 207 | true AS _is_current, 208 | 1 AS _batch_run_id -- example batch run id 209 | FROM de_dup 210 | WHERE DupRank = 1 211 | ; 212 | 213 | MERGE INTO main.iot_dashboard.silver_sensors_scd_2 AS target 214 | USING ( 215 | 216 | SELECT updates.Id AS merge_key_id, 217 | updates.user_id AS merge_key_user_id, 218 | updates.device_id AS merge_key_device_id, 219 | updates.* --merge key can be built in whatever way makes sense to get unique rows 220 | FROM main.iot_dashboard.temp_batch_to_insert AS updates 221 | 222 | UNION ALL 223 | 224 | -- These rows will INSERT updated rows of existing records and new rows 225 | -- Setting the merge_key to NULL forces these rows to NOT MATCH and be INSERTed. 
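-- (Why the two source branches: the keyed rows above can only MATCH, which lets WHEN MATCHED expire
--  the old current row; the NULL-key rows that follow can only NOT MATCH, so each changed record is
--  re-inserted as its new current version.)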
226 | SELECT 227 | NULL AS merge_key_id, 228 | NULL AS merge_key_user_id, 229 | NULL AS merge_key_device_id, 230 | updates.* 231 | FROM main.iot_dashboard.temp_batch_to_insert AS updates 232 | INNER JOIN main.iot_dashboard.silver_sensors_scd_2 as target_table 233 | ON updates.Id = target_table.Id 234 | AND updates.user_id = target_table.user_id 235 | AND updates.device_id = target_table.device_id -- What makes the key unique 236 | -- This needs to be accounted for when deciding to expire existing rows 237 | WHERE updates.value <> target_table.value -- Only update if any of the data has changed 238 | 239 | ) AS source 240 | 241 | ON target.Id = source.merge_key_id 242 | AND target.user_id = source.merge_key_user_id 243 | AND target.device_id = source.merge_key_device_id 244 | 245 | WHEN MATCHED AND (target._is_current = true AND target.value <> source.value) THEN 246 | UPDATE SET 247 | target._end_timestamp = source._start_timestamp, -- start of new record is end of old record 248 | target._is_current = false 249 | 250 | WHEN NOT MATCHED THEN 251 | INSERT (id, device_id, user_id, calories_burnt, miles_walked, num_steps, value, timestamp, ingest_timestamp, _start_timestamp, _end_timestamp, _is_current, _batch_run_id) 252 | VALUES ( 253 | source.id, source.device_id, source.user_id, source.calories_burnt, source.miles_walked, source.num_steps, source.value, source.timestamp, 254 | source.ingest_timestamp, 255 | source._start_timestamp, -- start timestamp -- new records 256 | NULL ,-- end_timestamp 257 | source._is_current, -- is current record 258 | source._batch_run_id --example batch run id 259 | ) 260 | ; 261 | """ 262 | 263 | 264 | ## Simple Control Flow with Python, if/else, try/catch, for/while 265 | 266 | try: 267 | 268 | #serverless_client.submit_multiple_sql_commands(mst_scd_transaction_sql) 269 | serverless_transaction_manager = DBSQLTransactionManager(warehouse_id=WAREHOUSE_ID) 270 | serverless_transaction_manager.execute_dbsql_transaction(sql_string=str(mst_scd_transaction_sql), tables_to_manage=['main.iot_dashboard.temp_batch_to_insert', 'main.iot_dashboard.silver_sensors_scd_2']) 271 | 272 | except Exception as e: 273 | 274 | delta_logger.fail_run() 275 | 276 | raise(e) 277 | 278 | # COMMAND ---------- 279 | 280 | # DBTITLE 1,Step 4: Clean up and Optimize Tables 281 | 282 | ## Simple Control Flow with Python, if/else, try/catch, for/while 283 | try: 284 | 285 | serverless_client.sql("TRUNCATE TABLE main.iot_dashboard.temp_batch_to_insert") 286 | delta_logger.log_run_info(msg='Batch cleared!') 287 | 288 | except Exception as e: 289 | 290 | delta_logger.log_run_info(log_level='INFO', msg='couldnt find table, was already deleted') 291 | 292 | 293 | ## Optimize command 294 | try: 295 | 296 | serverless_client.sql("OPTIMIZE main.iot_dashboard.silver_sensors_scd_2 ZORDER BY (timestamp, device_id)") 297 | 298 | delta_logger.log_run_info(msg='Target tables optimized!') 299 | 300 | except Exception as e: 301 | 302 | ## For these operations, they are not critical to the pipeline successs, so just log the event and keep going 303 | delta_logger.log_run_info(log_level='WARN', msg='couldnt find table, this should exist or a conflect happens') 304 | raise(e) 305 | 306 | 307 | # COMMAND ---------- 308 | 309 | # DBTITLE 1,Create "Current" View 310 | gold_views_sql = """ 311 | CREATE OR REPLACE VIEW main.iot_dashboard.silver_sensors_current 312 | AS 313 | SELECT * FROM main.iot_dashboard.silver_sensors_scd_2 314 | WHERE _is_current = true; 315 | 316 | CREATE OR REPLACE VIEW 
main.iot_dashboard.silver_sensors_snapshot_as_of_2023_10_10_19_30_00 317 | AS 318 | -- Get more recent record for each record as of a specific version 319 | WITH de_dup ( 320 | SELECT Id::integer, 321 | device_id::integer, 322 | user_id::integer, 323 | calories_burnt::decimal, 324 | miles_walked::decimal, 325 | num_steps::decimal, 326 | timestamp::timestamp, 327 | value::string, 328 | ingest_timestamp, 329 | _start_timestamp, 330 | _is_current, 331 | _end_timestamp, 332 | ROW_NUMBER() OVER(PARTITION BY id ORDER BY _start_timestamp DESC) AS DupRank -- Get most recent record as of a specific point in time 333 | FROM main.iot_dashboard.silver_sensors_scd_2 334 | -- Point in time snapshot timestamp such as end of month 335 | WHERE _start_timestamp <= '2023-10-10T19:30:00'::timestamp 336 | ) 337 | 338 | SELECT * 339 | FROM de_dup 340 | WHERE DupRank = 1 341 | ; 342 | """ 343 | 344 | 345 | ## Optimize command 346 | try: 347 | 348 | serverless_client.submit_multiple_sql_commands(gold_views_sql) 349 | 350 | delta_logger.log_run_info(msg='Operational View Created!') 351 | 352 | except Exception as e: 353 | 354 | delta_logger.log_run_info(log_level='CRITICAL', msg='couldnt find table, all should exist') 355 | raise(e) 356 | 357 | 358 | # COMMAND ---------- 359 | 360 | # DBTITLE 1,Complete Run! 361 | delta_logger.complete_run() 362 | 363 | # COMMAND ---------- 364 | 365 | delta_logger.full_table_name 366 | 367 | # COMMAND ---------- 368 | 369 | # MAGIC %sql 370 | # MAGIC 371 | # MAGIC SELECT *, 372 | # MAGIC run_metadata:Batch_Rows -- our custom metrics we logged 373 | # MAGIC FROM main.iot_dashboard.logger 374 | # MAGIC ORDER BY run_id DESC 375 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Advanced Notebooks/Multi-plexing with Autoloader/Option 1: Actually Multi-plexing tables on write/Child Job Template.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Controller notebook 5 | # MAGIC 6 | # MAGIC Identifies and Orcestrates the sub jobs 7 | 8 | # COMMAND ---------- 9 | 10 | from pyspark.sql.functions import * 11 | from pyspark.sql.types import * 12 | 13 | # COMMAND ---------- 14 | 15 | # DBTITLE 1,Step 1: Logic to get unique list of events/sub directories that separate the different streams 16 | # Design considerations 17 | # Ideally the writer of the raw data will separate out event types by folder so you can use globPathFilters to create separate streams 18 | # If ALL events are in one data source, all streams will stream from 1 table and then will be filtered for that event in the stream. 
To avoid many file listings of the same file, enable useNotifications = true in autoloader 19 | 20 | # COMMAND ---------- 21 | 22 | # DBTITLE 1,Define Params 23 | dbutils.widgets.text("Input Root Path", "") 24 | dbutils.widgets.text("Parent Job Name", "") 25 | dbutils.widgets.text("Child Task Name", "") 26 | 27 | # COMMAND ---------- 28 | 29 | # DBTITLE 1,Get Params 30 | root_input_path = dbutils.widgets.get("Input Root Path") 31 | parent_job_name = dbutils.widgets.get("Parent Job Name") 32 | child_task_name = dbutils.widgets.get("Child Task Name") 33 | 34 | print(f"Root input path: {root_input_path}") 35 | print(f"Parent Job Name: {parent_job_name}") 36 | print(f"Event Task Name: {child_task_name}") 37 | 38 | # COMMAND ---------- 39 | 40 | # DBTITLE 1,Define Dynamic Checkpoint Path 41 | ## Eeach stream needs its own checkpoint, we can dynamically define that for each event/table we want to create / teast out 42 | 43 | checkpoint_path = f"dbfs:/checkpoints//{parent_job_name}/{child_task_name}/" 44 | 45 | # COMMAND ---------- 46 | 47 | # DBTITLE 1,Target Location Definitions 48 | spark.sql("""CREATE DATABASE IF NOT EXISTS iot_multiplexing_demo""") 49 | 50 | # COMMAND ---------- 51 | 52 | # DBTITLE 1,Use Whatever custom event filtering logic is needed 53 | filter_regex_string = "part-" + child_task_name + "*.json*" 54 | 55 | print(filter_regex_string) 56 | 57 | # COMMAND ---------- 58 | 59 | # DBTITLE 1,Read Stream 60 | input_df = (spark 61 | .readStream 62 | .format("text") 63 | .option("multiLine", "true") 64 | .option("pathGlobFilter", filter_regex_string) 65 | .load(root_input_path) 66 | .withColumn("inputFileName", input_file_name()) ## you can filter using .option("globPathFilter") as well here 67 | ) 68 | 69 | # COMMAND ---------- 70 | 71 | # DBTITLE 1,Transformation Logic on any events (can be conditional on event) 72 | transformed_df = (input_df 73 | .withColumn("EventName", lit(child_task_name)) 74 | .selectExpr("value:id::integer AS Id", 75 | "EventName", 76 | "value:user_id::integer AS UserId", 77 | "value:device_id::integer AS DeviceId", 78 | "value:num_steps::decimal AS NumberOfSteps", 79 | "value:miles_walked::decimal AS MilesWalked", 80 | "value:calories_burnt::decimal AS Calories", 81 | "value:timestamp::timestamp AS EventTimestamp", 82 | "current_timestamp() AS IngestionTimestamp", 83 | "inputFileName") 84 | 85 | ) 86 | 87 | # COMMAND ---------- 88 | 89 | # DBTITLE 1,Truncate this child stream and reload from all data 90 | 91 | dbutils.fs.rm(checkpoint_path, recurse=True) 92 | 93 | # COMMAND ---------- 94 | 95 | # DBTITLE 1,Dynamic Write Stream 96 | (transformed_df 97 | .writeStream 98 | .trigger(once=True) 99 | .option("checkpointLocation", checkpoint_path) 100 | .toTable(f"iot_multiplexing_demo.iot_stream_event_{child_task_name}") 101 | ) 102 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Advanced Notebooks/Parallel Custom Named File Exports/Parallel File Exports - Python Version.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Python Version 5 | # MAGIC ### Author: Cody Austin Davis 6 | # MAGIC ### Date: 2/22/2023 7 | # MAGIC 8 | # MAGIC This notebook shows users how to rename/move files from one s3 location/file name to another in parallel using spark. 
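# MAGIC
# MAGIC As a point of reference before the parallel version: for a handful of files the same move can be done serially on the driver with `dbutils.fs.mv` (a minimal sketch below, reusing the demo bucket path written later in this notebook); the UDF approach that follows is what scales once the file count grows.

# COMMAND ----------

# DBTITLE 1,Serial baseline: dbutils.fs.mv for a small number of files
## Minimal driver-only baseline (sketch). Uses the same demo bucket path as the rest of this notebook;
## adjust to your own location.
source_root = "s3://oetrta/codyaustindavis/parallelfile_source/"

for f in dbutils.fs.ls(source_root):
    ## skip directories (their names end with "/") and commit/metadata files
    if f.name.endswith("/") or f.name.startswith("_"):
        continue
    ## same rename convention as getRenamedFilePath below: drop the file into a /renamed/ subfolder
    dbutils.fs.mv(f.path, source_root + "renamed/" + f.name)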
9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %pip install boto 13 | 14 | # COMMAND ---------- 15 | 16 | # DBTITLE 1,Define Function To Dynamically Rename File Paths 17 | @udf("string") 18 | def getRenamedFilePath(source_path): 19 | 20 | source_path_root = "/".join(source_path.split("/")[:-1]) 21 | source_path_file_name = "".join(source_path.split("/")[-1]) 22 | 23 | ## insert any arbitrary file renaming logic here 24 | new_path_file_name = "/renamed/" + source_path_file_name 25 | 26 | new_path = source_path_root + new_path_file_name 27 | 28 | return new_path 29 | 30 | # COMMAND ---------- 31 | 32 | # DBTITLE 1,Create test data set not in dbutils 33 | (spark.read.json("dbfs:/databricks-datasets/iot-stream/data-device/") 34 | .write.format("json").mode("overwrite").save('s3://oetrta/codyaustindavis/parallelfile_source/') 35 | ) 36 | 37 | # COMMAND ---------- 38 | 39 | # DBTITLE 1,Define python udf that renames / copies files in parallel with boto or HTTP request 40 | from pyspark.sql.functions import * 41 | import boto3 42 | 43 | ## This UDF can be adjusted to accept access keys as another parameter 44 | 45 | @udf("string") 46 | def mv_s3_object(source_path, target_path): 47 | 48 | ## Get SOURCE bucket name and source path separately for boto 49 | source_bucket_name = "/".join(source_path.split("/")[0:3]).split("//")[1] 50 | source_file_path = "/".join(source_path.split("/")[3:]) 51 | 52 | ## Get TARGET bucket name and source path separately for boto 53 | target_bucket_name = "/".join(target_path.split("/")[0:3]).split("//")[1] 54 | target_file_path = "/".join(target_path.split("/")[3:]) 55 | 56 | ## Prep boto request copy params 57 | source_dict = {'Bucket': source_bucket_name, 'Key': source_file_path} 58 | 59 | ## Try copying the file over, return SUCCESS or error message in pyspark data frame 60 | s3 = boto3.resource('s3') 61 | msg = 'NOOP' 62 | try: 63 | s3.Object(target_bucket_name, target_file_path).copy_from(CopySource=source_dict) 64 | ## This delete is optional, you might want to separate this out into another job. 
This just represents the 2 commands to simulate a "move" 65 | s3.Object(source_bucket_name, source_file_path).delete() 66 | 67 | msg = 'SUCCESS' 68 | 69 | except Exception as e: 70 | msg = f'FAIL: {str(e)} \n BUCKET: {source_bucket_name}, SOURCE: {source_file_path}, TARGET: {target_file_path}' 71 | 72 | return msg 73 | 74 | # COMMAND ---------- 75 | 76 | # DBTITLE 1,Chose a source path (either dynamically or manually) and move / rename files with the udfs in parallel 77 | input_path_to_move = 's3://oetrta/codyaustindavis/parallelfile_source/' 78 | 79 | filesDf = (spark.createDataFrame(dbutils.fs.ls(input_path_to_move)) 80 | .filter(~col("name").startswith("_")) ## exclude out-of-scope files 81 | .withColumn("target_path", getRenamedFilePath(col("path"))) ## Python udf to create the new file path with any logic inside function 82 | .selectExpr("path AS source_path", "target_path") ## select 2 paths needed 83 | .withColumn("WasMoved", mv_s3_object(col("source_path"), col("target_path"))) ## Push source and target paths to udf to execute in parallel and return msg 84 | ) 85 | 86 | display(filesDf) 87 | 88 | # COMMAND ---------- 89 | 90 | # DBTITLE 1,Confirm rename 91 | dbutils.fs.ls("s3://oetrta/codyaustindavis/parallelfile_source/renamed/" 92 | ) 93 | 94 | # COMMAND ---------- 95 | 96 | # MAGIC %md 97 | # MAGIC 98 | # MAGIC # Scala Version 99 | 100 | # COMMAND ---------- 101 | 102 | # DBTITLE 1,Define scala file renaming function 103 | # MAGIC %scala 104 | # MAGIC 105 | # MAGIC 106 | # MAGIC def getNewFilePath(sourcePath: String): String = { 107 | # MAGIC val source_path = sourcePath; 108 | # MAGIC 109 | # MAGIC val slice_len = source_path.split("/").length - 1; 110 | # MAGIC val source_path_root = source_path.split("/").slice(0, slice_len); 111 | # MAGIC val source_path_file_name = source_path.split("/").last; 112 | # MAGIC 113 | # MAGIC // any arbitrary file rename logic 114 | # MAGIC val new_path_file_name = "renamed/"+source_path_file_name; 115 | # MAGIC 116 | # MAGIC 117 | # MAGIC val new_path = source_path_root.mkString("/") + "/" + new_path_file_name; 118 | # MAGIC 119 | # MAGIC return new_path 120 | # MAGIC } 121 | 122 | # COMMAND ---------- 123 | 124 | # DBTITLE 1,Test Scala File Renaming function 125 | # MAGIC %scala 126 | # MAGIC 127 | # MAGIC val test_new_path = getNewFilePath("dbfs:/databricks-datasets/iot-stream/data-device/part-00003.json.gz") 128 | # MAGIC 129 | # MAGIC println(test_new_path) 130 | 131 | # COMMAND ---------- 132 | 133 | # DBTITLE 1,Broadcast Configs to Executors 134 | # MAGIC %scala 135 | # MAGIC import org.apache.hadoop.fs 136 | # MAGIC 137 | # MAGIC // maybe we need to register access keys here? not sure yet. 
Still dealing with Auth issues 138 | # MAGIC val conf = new org.apache.spark.util.SerializableConfiguration(sc.hadoopConfiguration) 139 | # MAGIC 140 | # MAGIC val broadcastConf = sc.broadcast(conf) 141 | # MAGIC 142 | # MAGIC print(conf.value) 143 | 144 | # COMMAND ---------- 145 | 146 | # DBTITLE 1,Run file renaming and moving for each row (need to add AUTH) 147 | # MAGIC %scala 148 | # MAGIC 149 | # MAGIC import org.apache.hadoop.fs._ 150 | # MAGIC 151 | # MAGIC // root bucket of where original files were dropped 152 | # MAGIC val filesToCopy = dbutils.fs.ls("dbfs:/databricks-datasets/iot-stream/data-device/").map(_.path) 153 | # MAGIC 154 | # MAGIC spark.sparkContext.parallelize(filesToCopy).foreachPartition(rows => rows.foreach { 155 | # MAGIC 156 | # MAGIC file => 157 | # MAGIC 158 | # MAGIC println(file) 159 | # MAGIC val fromPath = new Path(file) 160 | # MAGIC 161 | # MAGIC val tempNewPath = getNewFilePath(file) 162 | # MAGIC 163 | # MAGIC val toPath = new Path(tempNewPath) 164 | # MAGIC 165 | # MAGIC val fromFs = toPath.getFileSystem(conf.value) 166 | # MAGIC 167 | # MAGIC val toFs = toPath.getFileSystem(conf.value) 168 | # MAGIC 169 | # MAGIC FileUtil.copy(fromFs, fromPath, toFs, toPath, false, conf.value) 170 | # MAGIC 171 | # MAGIC }) 172 | 173 | # COMMAND ---------- 174 | 175 | # DBTITLE 1,Look at files to Copy 176 | # MAGIC %scala 177 | # MAGIC 178 | # MAGIC val filesToCopy = dbutils.fs.ls("dbfs:/databricks-datasets/iot-stream/data-device/").map(_.path) 179 | # MAGIC 180 | # MAGIC 181 | # MAGIC val filesDf = spark.sparkContext.parallelize(filesToCopy).toDF() 182 | # MAGIC 183 | # MAGIC display(filesDf) 184 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Advanced Notebooks/Parallel Custom Named File Exports/Parallel File Exports.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # DBTITLE 1,helper function to dynamically build target path for each file 3 | # MAGIC %scala 4 | # MAGIC 5 | # MAGIC 6 | # MAGIC def getNewFilePath(sourcePath: String): String = { 7 | # MAGIC val source_path = sourcePath; 8 | # MAGIC 9 | # MAGIC val slice_len = source_path.split("/").length - 1; 10 | # MAGIC val source_path_root = source_path.split("/").slice(0, slice_len); 11 | # MAGIC val source_path_file_name = source_path.split("/").last; 12 | # MAGIC 13 | # MAGIC // any arbitrary file rename logic 14 | # MAGIC val new_path_file_name = "renamed/"+source_path_file_name; 15 | # MAGIC val new_path = source_path_root.mkString("/") + "/" + new_path_file_name; 16 | # MAGIC 17 | # MAGIC return new_path 18 | # MAGIC } 19 | 20 | # COMMAND ---------- 21 | 22 | # DBTITLE 1,Test New Function to dynamically build target path for each row (file) 23 | # MAGIC %scala 24 | # MAGIC 25 | # MAGIC val test_new_path = getNewFilePath("dbfs:/databricks-datasets/iot-stream/data-device/part-00003.json.gz") 26 | # MAGIC 27 | # MAGIC println(test_new_path) 28 | 29 | # COMMAND ---------- 30 | 31 | # MAGIC %scala 32 | # MAGIC import org.apache.hadoop.fs 33 | # MAGIC 34 | # MAGIC // maybe we need to register access keys here? not sure yet. 
Still dealing with Auth issues 35 | # MAGIC val conf = new org.apache.spark.util.SerializableConfiguration(sc.hadoopConfiguration) 36 | # MAGIC 37 | # MAGIC val broadcastConf = sc.broadcast(conf) 38 | # MAGIC 39 | # MAGIC print(conf.value) 40 | 41 | # COMMAND ---------- 42 | 43 | # MAGIC %scala 44 | # MAGIC 45 | # MAGIC import org.apache.hadoop.fs._ 46 | # MAGIC 47 | # MAGIC // root bucket of where original files were dropped 48 | # MAGIC val filesToCopy = dbutils.fs.ls("dbfs:/databricks-datasets/iot-stream/data-device/").map(_.path) 49 | # MAGIC 50 | # MAGIC spark.sparkContext.parallelize(filesToCopy).foreachPartition(rows => rows.foreach { 51 | # MAGIC 52 | # MAGIC file => 53 | # MAGIC 54 | # MAGIC println(file) 55 | # MAGIC val fromPath = new Path(file) 56 | # MAGIC 57 | # MAGIC val tempNewPath = getNewFilePath(file) 58 | # MAGIC 59 | # MAGIC val toPath = new Path(tempNewPath) 60 | # MAGIC 61 | # MAGIC val fromFs = toPath.getFileSystem(conf.value) 62 | # MAGIC 63 | # MAGIC val toFs = toPath.getFileSystem(conf.value) 64 | # MAGIC 65 | # MAGIC FileUtil.copy(fromFs, fromPath, toFs, toPath, false, conf.value) 66 | # MAGIC 67 | # MAGIC }) 68 | 69 | # COMMAND ---------- 70 | 71 | # MAGIC %scala 72 | # MAGIC 73 | # MAGIC val filesToCopy = dbutils.fs.ls("dbfs:/databricks-datasets/iot-stream/data-device/").map(_.path) 74 | # MAGIC 75 | # MAGIC 76 | # MAGIC val filesDf = spark.sparkContext.parallelize(filesToCopy).toDF() 77 | # MAGIC 78 | # MAGIC display(filesDf) 79 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 1 - SQL EDW Pipeline.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC # This notebook generates a full data pipeline from databricks dataset - iot-stream 5 | -- MAGIC 6 | -- MAGIC ## This creates 2 tables: 7 | -- MAGIC 8 | -- MAGIC Database: iot_dashboard 9 | -- MAGIC 10 | -- MAGIC Tables: silver_sensors, silver_users 11 | -- MAGIC 12 | -- MAGIC Params: StartOver (Yes/No) - allows user to truncate and reload pipeline 13 | 14 | -- COMMAND ---------- 15 | 16 | -- DBTITLE 1,Medallion Architecture 17 | -- MAGIC %md 18 | -- MAGIC 19 | -- MAGIC 20 | 21 | -- COMMAND ---------- 22 | 23 | DROP DATABASE IF EXISTS iot_dashboard CASCADE; 24 | CREATE DATABASE IF NOT EXISTS iot_dashboard; 25 | USE iot_dashboard; 26 | 27 | -- COMMAND ---------- 28 | 29 | -- MAGIC %md 30 | -- MAGIC 31 | -- MAGIC # DDL Documentation: 32 | -- MAGIC 33 | -- MAGIC https://docs.databricks.com/spark/latest/spark-sql/language-manual/sql-ref-syntax-ddl-alter-table.html 34 | 35 | -- COMMAND ---------- 36 | 37 | CREATE TABLE IF NOT EXISTS iot_dashboard.bronze_sensors 38 | ( 39 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 40 | device_id INT, 41 | user_id INT, 42 | calories_burnt DECIMAL(10,2), 43 | miles_walked DECIMAL(10,2), 44 | num_steps DECIMAL(10,2), 45 | timestamp TIMESTAMP, 46 | value STRING 47 | ) 48 | USING DELTA 49 | TBLPROPERTIES("delta.targetFileSize"="128mb") 50 | -- Other helpful properties 51 | -- delta.dataSkippingNumIndexedCols -- decides how many columns are automatically tracked with statistics kepts (defaults to first 32) 52 | -- LOCATION "s3://bucket-name/data_lakehouse/tables/data/bronze/bronze_senors/" 53 | ; 54 | 55 | -- COMMAND ---------- 56 | 57 | -- DBTITLE 1,Look at Table Details 58 | DESCRIBE TABLE EXTENDED iot_dashboard.bronze_sensors 59 | 60 | -- COMMAND ---------- 61 | 62 | CREATE TABLE IF NOT EXISTS 
iot_dashboard.bronze_users 63 | ( 64 | userid BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1), 65 | gender STRING, 66 | age INT, 67 | height DECIMAL(10,2), 68 | weight DECIMAL(10,2), 69 | smoker STRING, 70 | familyhistory STRING, 71 | cholestlevs STRING, 72 | bp STRING, 73 | risk DECIMAL(10,2), 74 | update_timestamp TIMESTAMP 75 | ) 76 | USING DELTA 77 | TBLPROPERTIES("delta.targetFileSize"="128mb") 78 | --LOCATION s3:/// 79 | ; 80 | 81 | -- COMMAND ---------- 82 | 83 | -- MAGIC %md 84 | -- MAGIC ## Exhaustive list of all COPY INTO Options 85 | -- MAGIC https://docs.databricks.com/sql/language-manual/delta-copy-into.html#format-options-1 86 | 87 | -- COMMAND ---------- 88 | 89 | -- MAGIC %md 90 | -- MAGIC 91 | -- MAGIC ## New FEATURES IN DBR 11! 92 | -- MAGIC 93 | -- MAGIC 1. COPY INTO GENERIC TABLE 94 | -- MAGIC 2. DROP COLUMN STATEMENT 95 | -- MAGIC 3. Select all except: SELECT * EXCEPT (col1,...) FROM table 96 | -- MAGIC 97 | -- MAGIC https://docs.databricks.com/release-notes/runtime/11.0.html 98 | 99 | -- COMMAND ---------- 100 | 101 | 102 | --With DBR 11, we dont need to specify DDL first 103 | --CREATE TABLE IF NOT EXISTS iot_dashboard.bronze_sensors 104 | 105 | --COPY INTO iot_dashboard.bronze_sensors 106 | --FROM (SELECT 107 | -- id::bigint AS Id, 108 | -- device_id::integer AS device_id, 109 | -- user_id::integer AS user_id, 110 | -- calories_burnt::decimal(10,2) AS calories_burnt, 111 | -- miles_walked::decimal(10,2) AS miles_walked, 112 | -- num_steps::decimal(10,2) AS num_steps, 113 | -- timestamp::timestamp AS timestamp, 114 | -- value AS value -- This is a JSON object 115 | --FROM "/databricks-datasets/iot-stream/data-device/") 116 | --FILEFORMAT = json 117 | --COPY_OPTIONS('force'='true') -- 'false' -- process incrementally 118 | --option to be incremental or always load all files 119 | 120 | 121 | 122 | -- COMMAND ---------- 123 | 124 | -- DBTITLE 1,Incrementally Ingest Source Data from Raw Files 125 | COPY INTO iot_dashboard.bronze_sensors 126 | FROM (SELECT 127 | id::bigint AS Id, 128 | device_id::integer AS device_id, 129 | user_id::integer AS user_id, 130 | calories_burnt::decimal(10,2) AS calories_burnt, 131 | miles_walked::decimal(10,2) AS miles_walked, 132 | num_steps::decimal(10,2) AS num_steps, 133 | timestamp::timestamp AS timestamp, 134 | value AS value -- This is a JSON object 135 | FROM "/databricks-datasets/iot-stream/data-device/") 136 | FILEFORMAT = json -- csv, xml, txt, parquet, binary, etc. 137 | COPY_OPTIONS('force'='false') --'true' always loads all data it sees. option to be incremental or always load all files 138 | 139 | 140 | --Other Helpful copy options: 141 | /* 142 | PATTERN('[A-Za-z0-9].json') 143 | FORMAT_OPTIONS ('ignoreCorruptFiles' = 'true') -- skips bad files for more robust incremental loads 144 | COPY_OPTIONS ('mergeSchema' = 'true') 145 | 'ignoreChanges' = 'true' - ENSURE DOWNSTREAM PIPELINE CAN HANDLE DUPLICATE ALREADY PROCESSED RECORDS WITH MERGE/INSERT WHERE NOT EXISTS/Etc. 
146 | 'ignoreDeletes' = 'true' 147 | */; 148 | 149 | -- COMMAND ---------- 150 | 151 | -- DBTITLE 1,Create Silver Table for upserting updates 152 | CREATE TABLE IF NOT EXISTS iot_dashboard.silver_sensors 153 | ( 154 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 155 | device_id INT, 156 | user_id INT, 157 | calories_burnt DECIMAL(10,2), 158 | miles_walked DECIMAL(10,2), 159 | num_steps DECIMAL(10,2), 160 | timestamp TIMESTAMP, 161 | value STRING 162 | ) 163 | USING DELTA 164 | PARTITIONED BY (user_id) 165 | TBLPROPERTIES("delta.targetFileSize"="128mb") -- if update heavy, file sizes are great between 64-128 mbs. The more update heavy, the smaller the files (32-256mb) 166 | --LOCATION s3:/// -- Always specify location for production tables so you control where it lives in S3/ADLS/GCS 167 | -- Not specifying location parth will put table in DBFS, a managed bucket that cannot be accessed by apps outside of databricks 168 | ; 169 | 170 | -- COMMAND ---------- 171 | 172 | -- DBTITLE 1,Perform Upserts - Device Data 173 | MERGE INTO iot_dashboard.silver_sensors AS target 174 | USING ( 175 | WITH de_dup ( 176 | SELECT Id::integer, 177 | device_id::integer, 178 | user_id::integer, 179 | calories_burnt::decimal, 180 | miles_walked::decimal, 181 | num_steps::decimal, 182 | timestamp::timestamp, 183 | value::string, 184 | ROW_NUMBER() OVER(PARTITION BY device_id, user_id, timestamp ORDER BY timestamp DESC) AS DupRank 185 | FROM iot_dashboard.bronze_sensors 186 | ) 187 | 188 | SELECT Id, device_id, user_id, calories_burnt, miles_walked, num_steps, timestamp, value 189 | FROM de_dup 190 | WHERE DupRank = 1 191 | ) AS source 192 | ON source.Id = target.Id 193 | AND source.user_id = target.user_id 194 | AND source.device_id = target.device_id 195 | WHEN MATCHED THEN UPDATE SET 196 | target.calories_burnt = source.calories_burnt, 197 | target.miles_walked = source.miles_walked, 198 | target.num_steps = source.num_steps, 199 | target.timestamp = source.timestamp 200 | WHEN NOT MATCHED THEN INSERT *; 201 | 202 | -- This calculate table stats for all columns to ensure the optimizer can build the best plan 203 | -- THIS IS NOT INCREMENTAL 204 | ANALYZE TABLE iot_dashboard.silver_sensors COMPUTE STATISTICS FOR ALL COLUMNS; 205 | 206 | 207 | /* 208 | -- INCREMENTAL 209 | Two things will happen: 210 | 211 | 1. Files written into the table will be compacted into larger files - up to targetFileSize 212 | 2. Co-locate files by the ZORDER keys 213 | 214 | 215 | Choice Factors: 216 | 1. Use on column often utilized in joins, filters, etc. 217 | 2. 
High cardinality columns 218 | 219 | Recommended 1-3 columns, can do 5+ 220 | Order ZORDER cols in order of cardinality ascending 221 | 222 | */ 223 | 224 | OPTIMIZE iot_dashboard.silver_sensors ZORDER BY (timestamp); 225 | 226 | 227 | -- Truncate bronze batch once successfully loaded 228 | 229 | -- This is the classical batch design pattern - but we can also now use streaming tables 230 | 231 | TRUNCATE TABLE iot_dashboard.bronze_sensors; 232 | 233 | -- COMMAND ---------- 234 | 235 | DESCRIBE HISTORY iot_dashboard.silver_sensors 236 | 237 | -- COMMAND ---------- 238 | 239 | SELECT * FROM iot_dashboard.silver_sensors VERSION AS OF 1; 240 | 241 | -- COMMAND ---------- 242 | 243 | -- MAGIC %md 244 | -- MAGIC 245 | -- MAGIC ## Exhaustive list of optimizations on Delta Tables 246 | -- MAGIC https://docs.databricks.com/delta/optimizations/file-mgmt.html#set-a-target-size 247 | 248 | -- COMMAND ---------- 249 | 250 | -- MAGIC %md 251 | -- MAGIC 252 | -- MAGIC # Levels of optimization on Databricks 253 | -- MAGIC 254 | -- MAGIC ## Partitions - Do not over partition - usually ZORDERING covers what you need - even in big tables 255 | -- MAGIC ### File Sizes - smaller for BI heavy and update heavy tables 64mb to 128mb 256 | -- MAGIC #### Order of files -- ZORDER(col,col) / CLUSTER BY -- ZORDER on most used filtering/join columns, in order of cardinality like a funnel 257 | -- MAGIC ##### Indexes -- For highly selective queries - need to create index first then fill with data "needle in a haystack" 258 | 259 | -- COMMAND ---------- 260 | 261 | -- MAGIC %md 262 | -- MAGIC 263 | -- MAGIC ##### For partitions, make sure each partitions is at LEAST 10s of GB, otherwise, your partitions are too small 264 | 265 | -- COMMAND ---------- 266 | 267 | -- DBTITLE 1,Change Size of Files - will be changed when files are optimized 268 | ALTER TABLE iot_dashboard.silver_sensors SET TBLPROPERTIES ('delta.targetFileSize'='64mb'); 269 | 270 | -- COMMAND ---------- 271 | 272 | -- DBTITLE 1,Table Optimizations 273 | -- You want to optimize by high cardinality columns like ids, timestamps, strings 274 | OPTIMIZE iot_dashboard.silver_sensors ZORDER BY (device_id, timestamp); 275 | 276 | -- COMMAND ---------- 277 | 278 | -- MAGIC %md 279 | -- MAGIC 280 | -- MAGIC ## Details on Bloom Indexs Here: 281 | -- MAGIC https://docs.databricks.com/delta/optimizations/bloom-filters.html 282 | 283 | -- COMMAND ---------- 284 | 285 | --Bloom filters need to exist first, so if you add an index later you need to reprocess the files (an optimize, insert, etc.) 286 | --Ideally a column that is highly selective but not used in z-order (text, other timestamps, etc.) 287 | 288 | CREATE BLOOMFILTER INDEX 289 | ON TABLE iot_dashboard.silver_sensors 290 | FOR COLUMNS(device_id OPTIONS (fpp=0.1, numItems=50000000)) 291 | 292 | -- COMMAND ---------- 293 | 294 | -- DBTITLE 1,Select Semi Structured/Unstructred Data with JSON dot notation 295 | SELECT 296 | *, 297 | value:user_id::integer AS parsed_user, 298 | value:time_stamp::timestamp AS parsed_time -- Pro tip: You can do the same thing if reading in json via the text reader. 
Makes for highly flexible data ingestion 299 | FROM iot_dashboard.silver_sensors; 300 | 301 | -- COMMAND ---------- 302 | 303 | -- MAGIC %md 304 | -- MAGIC 305 | -- MAGIC ## Ingest User Data As Well 306 | 307 | -- COMMAND ---------- 308 | 309 | -- DBTITLE 1,Incrementally Ingest Raw User Data 310 | COPY INTO iot_dashboard.bronze_users 311 | FROM (SELECT 312 | userid::bigint AS userid, 313 | gender AS gender, 314 | age::integer AS age, 315 | height::decimal(10,2) AS height, 316 | weight::decimal(10,2) AS weight, 317 | smoker AS smoker, 318 | familyhistory AS familyhistory, 319 | cholestlevs AS cholestlevs, 320 | bp AS bp, 321 | risk::decimal(10,2) AS risk, 322 | current_timestamp() AS update_timestamp 323 | FROM "/databricks-datasets/iot-stream/data-user/") 324 | FILEFORMAT = CSV 325 | FORMAT_OPTIONS('header'='true') 326 | COPY_OPTIONS('force'='true') --option to be incremental or always load all files 327 | ; 328 | 329 | -- COMMAND ---------- 330 | 331 | CREATE TABLE IF NOT EXISTS iot_dashboard.silver_users 332 | ( 333 | userid BIGINT GENERATED BY DEFAULT AS IDENTITY, 334 | gender STRING, 335 | age INT, 336 | height DECIMAL(10,2), 337 | weight DECIMAL(10,2), 338 | smoker STRING, 339 | familyhistory STRING, 340 | cholestlevs STRING, 341 | bp STRING, 342 | risk DECIMAL(10,2), 343 | update_timestamp TIMESTAMP 344 | ) 345 | USING DELTA 346 | TBLPROPERTIES("delta.targetFileSize"="128mb") 347 | --LOCATION s3:/// -- Always specify path for production tables. 348 | ; 349 | 350 | -- COMMAND ---------- 351 | 352 | MERGE INTO iot_dashboard.silver_users AS target 353 | USING (SELECT 354 | userid::int, 355 | gender::string, 356 | age::int, 357 | height::decimal, 358 | weight::decimal, 359 | smoker, 360 | familyhistory, 361 | cholestlevs, 362 | bp, 363 | risk, 364 | update_timestamp 365 | FROM iot_dashboard.bronze_users) AS source 366 | ON source.userid = target.userid 367 | WHEN MATCHED THEN UPDATE SET 368 | target.gender = source.gender, 369 | target.age = source.age, 370 | target.height = source.height, 371 | target.weight = source.weight, 372 | target.smoker = source.smoker, 373 | target.familyhistory = source.familyhistory, 374 | target.cholestlevs = source.cholestlevs, 375 | target.bp = source.bp, 376 | target.risk = source.risk, 377 | target.update_timestamp = source.update_timestamp 378 | WHEN NOT MATCHED THEN INSERT *; 379 | 380 | --Truncate bronze batch once successfully loaded 381 | TRUNCATE TABLE iot_dashboard.bronze_users; 382 | 383 | -- COMMAND ---------- 384 | 385 | OPTIMIZE iot_dashboard.silver_users ZORDER BY (userid); 386 | 387 | -- COMMAND ---------- 388 | 389 | SELECT * FROM iot_dashboard.silver_users; 390 | 391 | -- COMMAND ---------- 392 | 393 | SELECT * FROM iot_dashboard.silver_sensors; 394 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 10 - Lakehouse Federation.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # Using Lakehouse Federation for a single Pane of Glass 3 | 4 | ## Topics 5 | 6 | 1. How to use Lakehouse Federation 7 | 2. Setting up new database 8 | 3. Performance management / considerations 9 | 4. 
Limitations 10 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 11 - SQL Orchestration in Production.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | ## Orchestrating SQL Pipelines in Production 3 | 4 | 1. SQL Tasks Types 5 | 2. Airflow Operator 6 | 3. DBSQL REST API / Pushdown Client 7 | 4. Single Node Jobs pattern 8 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 12 - SCD2 - SQL EDW Pipeline.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC # This notebook generates a full data pipeline from databricks dataset - iot-stream using INSERT ONLY SCD-2 Architecture 5 | -- MAGIC 6 | -- MAGIC ## This creates 2 tables: 7 | -- MAGIC 8 | -- MAGIC Database: iot_dashboard 9 | -- MAGIC 10 | -- MAGIC Tables: silver_sensors_silver, silver_sensors_bronze (raw updates) 11 | -- MAGIC 12 | -- MAGIC Params: StartOver (Yes/No) - allows user to truncate and reload pipeline 13 | 14 | -- COMMAND ---------- 15 | 16 | -- DBTITLE 1,Medallion Architecture 17 | -- MAGIC %md 18 | -- MAGIC 19 | -- MAGIC 20 | 21 | -- COMMAND ---------- 22 | 23 | DROP DATABASE IF EXISTS iot_dashboard CASCADE; 24 | CREATE DATABASE IF NOT EXISTS iot_dashboard; 25 | USE iot_dashboard; 26 | 27 | -- COMMAND ---------- 28 | 29 | -- MAGIC %md 30 | -- MAGIC 31 | -- MAGIC # DDL Documentation: 32 | -- MAGIC 33 | -- MAGIC https://docs.databricks.com/spark/latest/spark-sql/language-manual/sql-ref-syntax-ddl-alter-table.html 34 | 35 | -- COMMAND ---------- 36 | 37 | CREATE TABLE IF NOT EXISTS iot_dashboard.bronze_sensors_scd_2 38 | ( 39 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 40 | device_id INT, 41 | user_id INT, 42 | calories_burnt DECIMAL(10,2), 43 | miles_walked DECIMAL(10,2), 44 | num_steps DECIMAL(10,2), 45 | timestamp TIMESTAMP, 46 | value STRING, 47 | ingest_timestamp TIMESTAMP 48 | ) 49 | USING DELTA 50 | TBLPROPERTIES("delta.targetFileSize"="128mb") 51 | -- Other helpful properties 52 | -- delta.dataSkippingNumIndexedCols -- decides how many columns are automatically tracked with statistics kepts (defaults to first 32) 53 | -- LOCATION "s3://bucket-name/data_lakehouse/tables/data/bronze/bronze_senors/" 54 | ; 55 | 56 | -- COMMAND ---------- 57 | 58 | -- DBTITLE 1,Look at Table Details 59 | DESCRIBE TABLE EXTENDED iot_dashboard.bronze_sensors_scd_2 60 | 61 | -- COMMAND ---------- 62 | 63 | -- MAGIC %md 64 | -- MAGIC 65 | -- MAGIC ## New FEATURES IN DBR 11! 66 | -- MAGIC 67 | -- MAGIC 1. COPY INTO GENERIC TABLE 68 | -- MAGIC 2. DROP COLUMN STATEMENT 69 | -- MAGIC 3. Select all except: SELECT * EXCEPT (col1,...) 
FROM table 70 | -- MAGIC 71 | -- MAGIC https://docs.databricks.com/release-notes/runtime/11.0.html 72 | 73 | -- COMMAND ---------- 74 | 75 | 76 | --With DBR 11, we dont need to specify DDL first 77 | --CREATE TABLE IF NOT EXISTS iot_dashboard.bronze_sensors 78 | 79 | --COPY INTO iot_dashboard.bronze_sensors 80 | --FROM (SELECT 81 | -- id::bigint AS Id, 82 | -- device_id::integer AS device_id, 83 | -- user_id::integer AS user_id, 84 | -- calories_burnt::decimal(10,2) AS calories_burnt, 85 | -- miles_walked::decimal(10,2) AS miles_walked, 86 | -- num_steps::decimal(10,2) AS num_steps, 87 | -- timestamp::timestamp AS timestamp, 88 | -- value AS value -- This is a JSON object 89 | --FROM "/databricks-datasets/iot-stream/data-device/") 90 | --FILEFORMAT = json 91 | --COPY_OPTIONS('force'='true') -- 'false' -- process incrementally 92 | --option to be incremental or always load all files 93 | 94 | 95 | 96 | -- COMMAND ---------- 97 | 98 | -- DBTITLE 1,Incrementally Ingest Source Data from Raw Files 99 | COPY INTO iot_dashboard.bronze_sensors_scd_2 100 | FROM (SELECT 101 | id::bigint AS Id, 102 | device_id::integer AS device_id, 103 | user_id::integer AS user_id, 104 | calories_burnt::decimal(10,2) AS calories_burnt, 105 | miles_walked::decimal(10,2) AS miles_walked, 106 | num_steps::decimal(10,2) AS num_steps, 107 | timestamp::timestamp AS timestamp, 108 | value AS value, -- This is a JSON object, 109 | now() AS ingest_timestamp 110 | FROM "/databricks-datasets/iot-stream/data-device/") 111 | FILEFORMAT = json -- csv, xml, txt, parquet, binary, etc. 112 | COPY_OPTIONS('force'='true') --'true' always loads all data it sees. option to be incremental or always load all files 113 | 114 | 115 | --Other Helpful copy options: 116 | /* 117 | PATTERN('[A-Za-z0-9].json') 118 | FORMAT_OPTIONS ('ignoreCorruptFiles' = 'true') -- skips bad files for more robust incremental loads 119 | COPY_OPTIONS ('mergeSchema' = 'true') 120 | 'ignoreChanges' = 'true' - ENSURE DOWNSTREAM PIPELINE CAN HANDLE DUPLICATE ALREADY PROCESSED RECORDS WITH MERGE/INSERT WHERE NOT EXISTS/Etc. 121 | 'ignoreDeletes' = 'true' 122 | */; 123 | 124 | -- COMMAND ---------- 125 | 126 | -- DBTITLE 1,Create Silver Table for upserting updates 127 | CREATE TABLE IF NOT EXISTS iot_dashboard.silver_sensors_scd_2 128 | ( 129 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 130 | device_id INT, 131 | user_id INT, 132 | calories_burnt DECIMAL(10,2), 133 | miles_walked DECIMAL(10,2), 134 | num_steps DECIMAL(10,2), 135 | timestamp TIMESTAMP, 136 | value STRING, 137 | ingest_timestamp TIMESTAMP, 138 | -- Processing Columns 139 | _start_timestamp TIMESTAMP, 140 | _end_timestamp TIMESTAMP, 141 | _batch_run_id STRING, 142 | _is_current BOOLEAN 143 | ) 144 | USING DELTA 145 | PARTITIONED BY (_is_current) 146 | TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported', 'delta.columnMapping.mode' = 'name') -- if update heavy, file sizes are great between 64-128 mbs. 
The more update heavy, the smaller the files (32-256mb) 147 | --LOCATION s3:/// -- Always specify location for production tables so you control where it lives in S3/ADLS/GCS 148 | -- Not specifying location parth will put table in DBFS, a managed bucket that cannot be accessed by apps outside of databricks 149 | ; 150 | 151 | -- COMMAND ---------- 152 | 153 | -- DBTITLE 1,Check incoming batch for data 154 | SELECT * FROM iot_dashboard.bronze_sensors_scd_2 155 | 156 | -- COMMAND ---------- 157 | 158 | -- DBTITLE 1,Perform SCD2 INSERT ONLY Upserts - Device Data 159 | -- Step 1 - get state of the active batch 160 | 161 | --DECLARE OR REPLACE VARIABLE var_batch_id STRING = uuid(); 162 | 163 | -- Optional intra-batch pre insert/merge de-cup 164 | CREATE OR REPLACE TABLE iot_dashboard.temp_batch_to_insert 165 | AS 166 | WITH de_dup ( 167 | SELECT Id::integer, 168 | device_id::integer, 169 | user_id::integer, 170 | calories_burnt::decimal, 171 | miles_walked::decimal, 172 | num_steps::decimal, 173 | timestamp::timestamp, 174 | value::string, 175 | ingest_timestamp, 176 | ROW_NUMBER() OVER(PARTITION BY device_id, user_id, timestamp ORDER BY ingest_timestamp DESC) AS DupRank 177 | FROM iot_dashboard.bronze_sensors_scd_2 178 | ) 179 | 180 | SELECT Id, device_id, user_id, calories_burnt, miles_walked, num_steps, timestamp, value, ingest_timestamp, 181 | now() AS _start_timestamp, 182 | true AS _is_current, 183 | 1001 AS _batch_run_id -- example batch run id 184 | FROM de_dup 185 | WHERE DupRank = 1 186 | ; 187 | 188 | MERGE INTO iot_dashboard.silver_sensors_scd_2 AS target 189 | USING ( 190 | 191 | SELECT updates.Id AS merge_key_id, 192 | updates.user_id AS merge_key_user_id, 193 | updates.device_id AS merge_key_device_id, 194 | updates.* --merge key can be built in whatever way makes sense to get unique rows 195 | FROM iot_dashboard.temp_batch_to_insert AS updates 196 | 197 | UNION ALL 198 | 199 | -- These rows will INSERT updated rows of existing records and new rows 200 | -- Setting the merge_key to NULL forces these rows to NOT MATCH and be INSERTed. 
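-- In short: a changed record appears twice in this source - once with its real key, so WHEN MATCHED
-- can expire the old row, and once below with a NULL key, so it falls through to WHEN NOT MATCHED
-- and is re-inserted as the new current row.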
201 | SELECT 202 | NULL AS merge_key_id, 203 | NULL AS merge_key_user_id, 204 | NULL AS merge_key_device_id, 205 | updates.* 206 | FROM iot_dashboard.temp_batch_to_insert AS updates 207 | INNER JOIN iot_dashboard.silver_sensors_scd_2 as target_table 208 | ON updates.Id = target_table.Id 209 | AND updates.user_id = target_table.user_id 210 | AND updates.device_id = target_table.device_id -- What makes the key unique 211 | -- This needs to be accounted for when deciding to expire existing rows 212 | WHERE updates.value <> target_table.value -- Only update if any of the data has changed 213 | 214 | ) AS source 215 | 216 | ON target.Id = source.merge_key_id 217 | AND target.user_id = source.merge_key_user_id 218 | AND target.device_id = source.merge_key_device_id 219 | 220 | WHEN MATCHED AND (target._is_current = true AND target.value <> source.value) THEN 221 | UPDATE SET 222 | target._end_timestamp = source._start_timestamp, -- start of new record is end of old record 223 | target._is_current = false 224 | 225 | WHEN NOT MATCHED THEN 226 | INSERT (id, device_id, user_id, calories_burnt, miles_walked, num_steps, value, timestamp, ingest_timestamp, _start_timestamp, _end_timestamp, _is_current, _batch_run_id) 227 | VALUES ( 228 | source.id, source.device_id, source.user_id, source.calories_burnt, source.miles_walked, source.num_steps, source.value, source.timestamp, 229 | source.ingest_timestamp, 230 | source._start_timestamp, -- start timestamp -- new records 231 | NULL ,-- end_timestamp 232 | source._is_current, -- is current record 233 | source._batch_run_id --example batch run id 234 | ) 235 | ; 236 | 237 | -- This calculate table stats for all columns to ensure the optimizer can build the best plan 238 | -- THIS IS NOT INCREMENTAL 239 | ANALYZE TABLE iot_dashboard.silver_sensors_scd_2 COMPUTE STATISTICS FOR ALL COLUMNS; 240 | 241 | -- THIS IS INCREMENTAL 242 | OPTIMIZE iot_dashboard.silver_sensors_scd_2 ZORDER BY (timestamp, device_id); 243 | 244 | -- Truncate bronze batch once successfully loaded 245 | -- If succeeds remove temp table 246 | TRUNCATE TABLE iot_dashboard.temp_batch_to_insert; 247 | 248 | -- COMMAND ---------- 249 | 250 | DESCRIBE HISTORY iot_dashboard.silver_sensors_scd_2 251 | 252 | -- COMMAND ---------- 253 | 254 | -- DBTITLE 1,Select Raw Table 255 | SELECT * FROM iot_dashboard.silver_sensors_scd_2 256 | 257 | -- COMMAND ---------- 258 | 259 | -- DBTITLE 1,Get Amount of Expired Records 260 | SELECT 261 | _is_current AS ActiveRecord, 262 | COUNT(0) 263 | FROM iot_dashboard.silver_sensors_scd_2 264 | GROUP BY _is_current 265 | 266 | -- COMMAND ---------- 267 | 268 | -- DBTITLE 1,Look at various batch timelines over time 269 | SELECT 270 | `_start_timestamp` AS active_timestamp, 271 | COUNT(0) 272 | FROM iot_dashboard.silver_sensors_scd_2 273 | GROUP BY `_start_timestamp` 274 | ORDER BY active_timestamp 275 | 276 | -- COMMAND ---------- 277 | 278 | -- DBTITLE 1,Create "Current" View 279 | CREATE OR REPLACE VIEW iot_dashboard.silver_sensors_current 280 | AS 281 | SELECT * FROM iot_dashboard.silver_sensors_scd_2 282 | WHERE _is_current = true 283 | 284 | -- COMMAND ---------- 285 | 286 | -- DBTITLE 1,Create "Snapshotted Views" 287 | CREATE OR REPLACE VIEW iot_dashboard.silver_sensors_snapshot_as_of_2023_10_10_19_30_00 288 | AS 289 | -- Get more recent record for each record as of a specific version 290 | WITH de_dup ( 291 | SELECT Id::integer, 292 | device_id::integer, 293 | user_id::integer, 294 | calories_burnt::decimal, 295 | miles_walked::decimal, 296 | 
num_steps::decimal, 297 | timestamp::timestamp, 298 | value::string, 299 | ingest_timestamp, 300 | _start_timestamp, 301 | _is_current, 302 | _end_timestamp, 303 | ROW_NUMBER() OVER(PARTITION BY id ORDER BY _start_timestamp DESC) AS DupRank -- Get most recent record as of a specific point in time 304 | FROM iot_dashboard.silver_sensors_scd_2 305 | -- Point in time snapshot timestamp such as end of month 306 | WHERE _start_timestamp <= '2023-10-10T19:30:00'::timestamp 307 | ) 308 | 309 | SELECT * 310 | FROM de_dup 311 | WHERE DupRank = 1 312 | ; 313 | 314 | -- COMMAND ---------- 315 | 316 | -- DBTITLE 1,Look at snapshot of most recent version of each record at a point in time 317 | SELECT * 318 | FROM iot_dashboard.silver_sensors_snapshot_as_of_2023_10_10_19_30_00 319 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 13 - Migrating Identity Columns.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC ## How to migrate IDENTITY columns from a Data Warehouse to DBSQL / Delta Lakehouse 5 | -- MAGIC 6 | -- MAGIC ## Summary 7 | -- MAGIC Quick notebook showing how to properly migrate tables from a data warehouse to a Delta table where you want to retain the values of existing IDENTITY key values and ensure that the IDENTITY generation picks up from the most recent IDENTITY column value 8 | 9 | -- COMMAND ---------- 10 | 11 | -- MAGIC %md 12 | -- MAGIC 13 | -- MAGIC 14 | -- MAGIC ### Steps to migrate key properly 15 | -- MAGIC 16 | -- MAGIC 1. Create a table with id columns such as: GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1) 17 | -- MAGIC 2. Backfill existing data warehouse tables with an INSERT INTO / MERGE from a snapshot of the datawarehouse table 18 | -- MAGIC 3. Run command: ALTER TABLE main.default.identity_test ALTER COLUMN id SYNC IDENTITY; to ensure that the newly inserted values pick up where the data warehouse left off on key generation 19 | -- MAGIC 4. Insert new identity values with new pipelines (or leave out column and let it auto-generate) 20 | 21 | -- COMMAND ---------- 22 | 23 | -- DBTITLE 1,Simple End to End Example 24 | 25 | CREATE OR REPLACE TABLE main.default.identity_test ( 26 | id BIGINT GENERATED BY DEFAULT AS IDENTITY (START WITH 1 INCREMENT BY 1), 27 | name STRING DEFAULT 'cody' 28 | ) 29 | TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported', 'delta.columnMapping.mode' = 'name') 30 | ; 31 | 32 | -- Simulate EDW migration load with existing keys 33 | INSERT INTO main.default.identity_test (id,name) 34 | VALUES (5, 'cody'), (6, 'davis'); 35 | 36 | 37 | SELECT * FROM main.default.identity_test; 38 | 39 | 40 | -- Simulate new load incrmentally 41 | 42 | INSERT INTO main.default.identity_test (name) 43 | VALUES ('cody_new'), ('davis_new'); 44 | 45 | -- BAD! 
ID keys get messed up 46 | SELECT * FROM main.default.identity_test; 47 | 48 | -- FIX 49 | ALTER TABLE main.default.identity_test ALTER COLUMN id SYNC IDENTITY; 50 | 51 | -- try again 52 | INSERT INTO main.default.identity_test (name) 53 | VALUES ('cody_fix'), ('davis_fix'); 54 | 55 | SELECT * FROM main.default.identity_test; 56 | 57 | 58 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 14 - Using the Query Profile.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- QUERY PROFILE DEMO 3 | -- SHOWS HOW TO SPOT AND RESOLVE BOTTLENECKS AND PERFORMANCE CHALLNEGES IN DBSQL WITH QUERY PROFILE 4 | 5 | /* DEMO FLOW 6 | 7 | 1. Show 4 Unoptimized Queries and Assess the query profile 8 | 2. Optimize the 4 queries for different needs with PARTITION, ZORDER, CLUSTER BY (optional) 9 | 3. Show Updated Query Profiles and improved bottlenecks 10 | 11 | 12 | Query Profile Things to Look for at the top level in the profile: 13 | 14 | 1. % Time in Photon - top level 15 | 2. % execution time vs optimizing/pruning files 16 | 3. Spilling - If ANY, than query is very bad or cluster is too small 17 | 18 | Node-level things to look for: 19 | 20 | 1. Dark colored nodes -- indicate where time/effort is going 21 | 2. Large arrows -- indicate high data transfer across nodes 22 | 3. File / Partition Pruning metrics 23 | 4. Runtime for each node (correlated to darkness of color) 24 | 25 | 26 | */ 27 | USE CATALOG main; 28 | USE DATABASE tpc_edw_demo; 29 | 30 | 31 | --============================================================================-- 32 | /* 33 | Step 1: Start with unoptimized data model 34 | 35 | Look at 4 queries: 36 | 1. Single point lookups -- Specific trade id -- File pruning bottleneck 37 | 2. Big Joins and Large Selects -- Showing different types of bottleneck -- Shuffle 38 | 2. Range lookups -- Analytics for a specific date range -- Complex file pruning and shuffle! 39 | 3. Aggregates -- Aggregates on a specific date range - more complex question -- More complex query plans 40 | 41 | */ 42 | 43 | 44 | --===== QUERY 1: Point selection -- look for specific trade id 45 | 46 | SELECT 47 | h.currenttradeid, 48 | h.currentprice, 49 | h.currentholding, 50 | h.currentholding*h.currentprice AS CurrentMarketValue, 51 | c.lastname, 52 | c.firstname, 53 | c.status 54 | FROM main.tpc_edw_demo.factholdings h 55 | INNER JOIN main.tpc_edw_demo.dimcustomer c ON c.sk_customerid = h.sk_customerid 56 | INNER JOIN main.tpc_edw_demo.dimcompany comp ON comp.sk_companyid = h.sk_companyid 57 | WHERE h.currenttradeid = 527764963 58 | AND c.status = 'Active' 59 | 60 | -- Look at unoptimized query profile - No file pruning! 
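-- A quick way to sanity-check this from SQL before opening the profile (a sketch; EXPLAIN only shows
-- whether the currenttradeid predicate is pushed down to the scan - the actual files pruned /
-- files read counts still come from the query profile linked below):
EXPLAIN FORMATTED
SELECT h.currenttradeid, h.currentprice, h.currentholding
FROM main.tpc_edw_demo.factholdings h
WHERE h.currenttradeid = 527764963;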
61 | 62 | -- Query 1 Profile Output: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiMi00MjZhLTEzNGYtODE3OC1hNGNmNjhmOGU1MzIQiLKmnLYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 63 | 64 | 65 | -- Look at Row and time spent output in the DAG in the query profile -- Fatter arrows mean more data movement and bottlenecks 66 | -- Darker nodes in profile show bottlenecks AUTOMATICALLY 67 | 68 | /* Scan Node for factholding 69 | 70 | Files pruned 0 71 | Files read 448 72 | Number of output batches 449 73 | Number of output rows 1 74 | Peak memory usage 3.50 GB 75 | Size of files pruned 0 76 | Size of files read 49.03 GB 77 | */ 78 | 79 | /* Scan Node for dimcompany 80 | Files pruned 0 81 | Files read 16 82 | Number of output batches 1,232 83 | Number of output rows 5,000,000 84 | Peak memory usage 288.37 MB 85 | Size of files pruned 0 86 | Size of files read 709.25 MB 87 | */ 88 | 89 | 90 | 91 | --===== QUERY 2: Big Joins and Select 92 | 93 | SELECT 94 | h.tradeid, 95 | h.currentprice, 96 | h.currentholding, 97 | h.currentholding*h.currentprice AS CurrentMarketValue, 98 | c.lastname, 99 | c.firstname, 100 | c.status, 101 | * 102 | FROM main.tpc_edw_demo.factholdings h 103 | INNER JOIN main.tpc_edw_demo.dimcustomer c ON c.sk_customerid = h.sk_customerid 104 | INNER JOIN main.tpc_edw_demo.dimcompany comp ON comp.sk_companyid = h.sk_companyid 105 | 106 | -- What is the bottleneck here? SHUFFLE 107 | -- Not only lots of rows, but lots of data to shuffle around 108 | 109 | -- Query 2 Profile Output: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiMy00ZmRjLTE2NjUtYmYwZC05ZWY5NGVlNzJkZjIQ4f3BnLYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 110 | 111 | 112 | 113 | --===== QUERY 3: Range Timestamp/Date Filters 114 | 115 | SELECT 116 | to_date(sk_dateid::string, "yyyyMMdd") AS Date, 117 | AVG(h.currentholding*h.currentprice) AS CurrentMarketValue, 118 | MAX(h.currentholding*h.currentprice) AS MaxHoldingValue 119 | FROM main.tpc_edw_demo.factholdings h 120 | INNER JOIN main.tpc_edw_demo.dimcustomer c ON c.sk_customerid = h.sk_customerid 121 | INNER JOIN main.tpc_edw_demo.dimcompany comp ON comp.sk_companyid = h.sk_companyid 122 | WHERE sk_dateid BETWEEN 20130101 AND 20131201 123 | GROUP BY to_date(sk_dateid::string, "yyyyMMdd") 124 | ORDER BY Date 125 | 126 | 127 | -- What is the bottleneck here? File Pruning on a range 128 | -- Lots of downstream aggregations, we want to minimize records BEFORE those transformations 129 | -- Look at time spent nodes and Rows node 130 | 131 | -- Query 3 Profile Output: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiNy0xYWU4LTExMjUtOTU3YS1mNjgyMGNhZGEwZmMQuLWlnbYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 132 | 133 | 134 | --===== QUERY 4: Complex Query Aggregates 135 | 136 | -- For a given year, who were the top 10 holding customers? 
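-- The query below answers this in three steps:
--   1. year_selected_holding: restrict factholdings to the year via the sk_dateid range and join the dims
--   2. holding_customer_agg:  SUM the market value per customer name
--   3. customer_rank:         DENSE_RANK on that total and keep the top 10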
137 | 138 | WITH year_selected_holding AS ( 139 | 140 | SELECT 141 | h.tradeid, 142 | h.currentprice, 143 | h.currentholding, 144 | h.currentholding*h.currentprice AS CurrentMarketValue, 145 | c.lastname, 146 | c.firstname, 147 | c.status, 148 | comp.name AS company_name, 149 | to_date(sk_dateid::string, "yyyyMMdd") AS Date 150 | FROM main.tpc_edw_demo.factholdings h 151 | INNER JOIN main.tpc_edw_demo.dimcustomer c ON c.sk_customerid = h.sk_customerid 152 | INNER JOIN main.tpc_edw_demo.dimcompany comp ON comp.sk_companyid = h.sk_companyid 153 | WHERE h.sk_dateid BETWEEN 20150101 AND 20151201 154 | ) 155 | , 156 | holding_customer_agg AS ( 157 | 158 | SELECT 159 | CONCAT(lastname, ', ', firstname) AS CustomerName, 160 | SUM(CurrentMarketValue) AS TotalHoldingsValue 161 | FROM year_selected_holding 162 | GROUP BY CONCAT(lastname, ', ', firstname) 163 | ), 164 | customer_rank AS ( 165 | 166 | SELECT 167 | *, 168 | DENSE_RANK() OVER (ORDER BY TotalHoldingsValue DESC) AS CustomerRank 169 | FROM holding_customer_agg 170 | ) 171 | SELECT * FROM customer_rank ORDER BY CustomerRank LIMIT 10 172 | 173 | -- What is bottleneck here? -- SHUFFLE 174 | -- No file pruning happening still 175 | 176 | -- Query 4 Profile Output: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiOS1jYjU2LTE3YTItYmFiMy0xNjY3YWRlMmRlOTcQ6%2FTrnbYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 177 | 178 | 179 | 180 | --============================================================================-- 181 | /* 182 | Step 2: OPTIMIZE / ZORDER the core source tables 183 | 184 | Questions to ask: 185 | 186 | 1. What did we filter on above? 187 | 2. What is reused in filters? 188 | 3. How can we do smarter joins to reduce those shuffle bottlenecks? 189 | 190 | */ 191 | 192 | -- Table: main.tpc_edw_demo.factholdings 193 | -- Columns Used Often in Filters: sk_dateid, currenttradeid 194 | 195 | OPTIMIZE main.tpc_edw_demo.factholdings ZORDER BY (sk_dateid, currenttradeid); 196 | -- Large fact table so be careful here, be selective 197 | ANALYZE TABLE main.tpc_edw_demo.factholdings COMPUTE STATISTICS FOR COLUMNS sk_dateid, currenttradeid, sk_customerid, sk_companyid; 198 | 199 | 200 | -- Table: main.tpc_edw_demo.dimcustomer 201 | -- Columns Used Often in Joins as dim tables: sk_customerid 202 | 203 | OPTIMIZE main.tpc_edw_demo.dimcustomer ZORDER BY (sk_customerid); 204 | -- Dim table so not really expensive to calculate 205 | ANALYZE TABLE main.tpc_edw_demo.dimcustomer COMPUTE STATISTICS FOR ALL COLUMNS; 206 | 207 | 208 | -- Table: main.tpc_edw_demo.dimcompany 209 | -- Columns Used Often in Joins as dim tables: sk_companyid 210 | 211 | OPTIMIZE main.tpc_edw_demo.dimcompany ZORDER BY (sk_companyid); 212 | -- Dim table so not really expensive to calculate 213 | ANALYZE TABLE main.tpc_edw_demo.dimcompany COMPUTE STATISTICS FOR ALL COLUMNS; 214 | 215 | 216 | 217 | 218 | --============================================================================-- 219 | /* 220 | Step 3: Look at updated query profiles! 221 | 222 | Updated Query Profiles re-run from above queries: 223 | 224 | 225 | 226 | Query 1 Profile: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiYi00MWVmLTExNDAtYWIwMi0yZDQwMjNmNWU0MTQQ4KKSnrYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 227 | 228 | Original Runtime: 3 seconds 229 | Optimize Runtime: 1.3 seconds 230 | 231 | WHAT IS DIFFERENT? 
-- Pruned MUCH more -- no more scanning all of the table 232 | 233 | Files pruned 1,018 234 | Files read 6 235 | Number of output batches 2 236 | Number of output rows 1 237 | Peak memory usage 98.89 MB 238 | Size of files pruned 47.88 GB 239 | Size of files read 293.59 MB 240 | 241 | 242 | ---===== 243 | 244 | Query 2 Profile: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiMy00ZmRjLTE2NjUtYmYwZC05ZWY5NGVlNzJkZjIQ4f3BnLYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 245 | 246 | Original Runtime: 5 min 39 seconds 247 | Optimized Runtime: 4 min 36 seconds 248 | 249 | WHAT IS DIFFERENT? 250 | Selecting everything from large tables is just wasteful :/ 251 | 252 | Shuffle node went down a little bit from 3.7 hours in total runtime to 3.4, but selecting that much data with no filters requires more tuning 253 | 254 | 255 | ---===== 256 | Query 3 Profile: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiYy1jOThlLTFiYjAtOWMwNS04NDIwYjA1MTE0M2QQ66%2B6nrYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 257 | 258 | Original Runtime: 9.2 seconds, 0 files pruned 259 | Optimized Runtime: 9.3 seconds, many files pruned, but now more work 260 | 261 | 262 | 263 | WHAT IS DIFFERENT? 264 | Files pruned 822 265 | Files read 202 266 | Number of output batches 57,690 267 | Number of output rows 224,054,102 268 | 269 | Look at shuffle 270 | 271 | 272 | 273 | ---===== 274 | Query 4 Profile: https://e2-demo-field-eng.cloud.databricks.com/sql/history?lookupKey=CiQwMWVlNzJiZC0wZmZlLTE1ZjgtOWNmNy0xNWU1OTVlYmM1NmQQh8vBnrYxGhA0NzViOTRkZGM3Y2Q1MjExILyFvZC14AQ%3D&o=1444828305810485&uiQueryProfileVisible=true&userId=20904982561468 275 | 276 | Original Runtime: 8.5 seconds, 0 files pruned 277 | Optimized Runtime: 12.3 seconds, Many files pruned 278 | 279 | NEW BOTTLENECK: SCAN instead of shuffle. 280 | 281 | WHAT IS DIFFERENT? 282 | 283 | Files pruned 826 284 | Files read 198 285 | Number of output batches 56,824 286 | Number of output rows 224,077,524 287 | Peak memory usage 5.76 GB 288 | Size of files pruned 38.86 GB 289 | Size of files read 9.30 GB 290 | 291 | LESSON: When doing longer term queries, make sure the file sizing is in proportion to the queries that are run on it. 292 | There are trade offs for optimizing for a few longer "historical" queries vs many "current" queries. 293 | Typically current queries are prioritized since you can spin up a serverless backfill cluster. 
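(Added illustrative note, not part of the original demo) One way to act on this trade-off is to set an
explicit target file size on the fact table and re-run OPTIMIZE, for example:

  ALTER TABLE main.tpc_edw_demo.factholdings SET TBLPROPERTIES ('delta.targetFileSize' = '64mb');
  OPTIMIZE main.tpc_edw_demo.factholdings ZORDER BY (sk_dateid, currenttradeid);

Smaller target files tend to favor selective "current" lookups (more pruning), while larger files favor
big historical scans (fewer files to open). The 64mb value above is only an example to tune per workload.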
294 | 295 | */ 296 | 297 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 2 - Optimize your Delta Tables.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC # Delta Table Optimization Methods Tutorial 5 | # MAGIC 6 | # MAGIC This notebook walks through the various methods and consideration when tuning / optimizing Delta tables in SQL 7 | 8 | # COMMAND ---------- 9 | 10 | # MAGIC %md 11 | # MAGIC 12 | # MAGIC ## Delta Tables Optimization Knobs 13 | # MAGIC 14 | # MAGIC ### File Sizes 15 | # MAGIC 16 | # MAGIC #### COMPACTION - OPTIMIZE 17 | # MAGIC 18 | # MAGIC ##### ZORDER / CLUSTER BY (liquid tables) 19 | # MAGIC 20 | # MAGIC ###### Bloom Filter 21 | 22 | # COMMAND ---------- 23 | 24 | # MAGIC %md 25 | # MAGIC 26 | # MAGIC ### Optimizing for UPSERTS 27 | # MAGIC 28 | # MAGIC Commands 4-8 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %sql 33 | # MAGIC DROP TABLE IF EXISTS iot_dashboard.bronze_sensors_optimization; 34 | # MAGIC CREATE OR REPLACE TABLE iot_dashboard.bronze_sensors_optimization 35 | # MAGIC USING DELTA 36 | # MAGIC TBLPROPERTIES("delta.targetFileSize"="2mb") --2-128 mb for tables with heavy updates or if used for BI 37 | # MAGIC AS 38 | # MAGIC (SELECT * FROM iot_dashboard.silver_sensors LIMIT 10000) --Only load a subset for sample MERGE; 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %sql 43 | # MAGIC DROP TABLE IF EXISTS iot_dashboard.silver_sensors_optimization; 44 | # MAGIC CREATE OR REPLACE TABLE iot_dashboard.silver_sensors_optimization 45 | # MAGIC USING DELTA 46 | # MAGIC TBLPROPERTIES("delta.targetFileSize"="2mb") 47 | # MAGIC AS 48 | # MAGIC SELECT * fROM iot_dashboard.silver_sensors; 49 | 50 | # COMMAND ---------- 51 | 52 | # MAGIC %sql 53 | # MAGIC 54 | # MAGIC MERGE INTO iot_dashboard.silver_sensors_optimization AS target 55 | # MAGIC USING (SELECT Id::integer, 56 | # MAGIC device_id::integer, 57 | # MAGIC user_id::integer, 58 | # MAGIC calories_burnt::decimal, 59 | # MAGIC miles_walked::decimal, 60 | # MAGIC num_steps::decimal, 61 | # MAGIC timestamp::timestamp, 62 | # MAGIC value::string 63 | # MAGIC FROM iot_dashboard.bronze_sensors_optimization) AS source 64 | # MAGIC ON source.Id = target.Id 65 | # MAGIC AND source.user_id = target.user_id 66 | # MAGIC AND source.device_id = target.device_id 67 | # MAGIC AND target.timestamp > now() - INTERVAL 2 hours 68 | # MAGIC WHEN MATCHED THEN UPDATE SET 69 | # MAGIC target.calories_burnt = source.calories_burnt, 70 | # MAGIC target.miles_walked = source.miles_walked, 71 | # MAGIC target.num_steps = source.num_steps, 72 | # MAGIC target.timestamp = source.timestamp 73 | # MAGIC WHEN NOT MATCHED THEN INSERT *; 74 | # MAGIC 75 | # MAGIC -- Without optimizing tables 8.82 seconds 76 | # MAGIC -- After optimizing by merge columns 19 seconds 77 | 78 | # COMMAND ---------- 79 | 80 | # MAGIC %md 81 | # MAGIC 82 | # MAGIC ## Run CREATE/REPLACE and MERGE statements, track runtime, then run OPTIMIZE statement and run all create/merge statements again to look at spark plan differences 83 | 84 | # COMMAND ---------- 85 | 86 | # MAGIC %sql 87 | # MAGIC 88 | # MAGIC -- You want to optimize by high cardinality columns like ids, timestamps, strings 89 | # MAGIC -- ON MERGE COLUMNS, then timeseries columns, then commonly used columns in queries 90 | # MAGIC 91 | # MAGIC --This operation is incremental 92 | # MAGIC --OPTIMIZE iot_dashboard.bronze_sensors_test1 
ZORDER BY (Id, user_id, device_id); 93 | # MAGIC OPTIMIZE iot_dashboard.silver_sensors_optimization ZORDER BY (user_id, device_id, Id); 94 | 95 | # COMMAND ---------- 96 | 97 | # MAGIC %md 98 | # MAGIC 99 | # MAGIC ## What about queries on this table? 100 | # MAGIC 101 | # MAGIC 1. ZORDER by commonly joined columns 102 | # MAGIC 2. Partition by larger chunks only if needed 103 | # MAGIC 3. Keep important columns in front of tables 104 | # MAGIC 4. For highly selective queries, use bloom indexes 105 | 106 | # COMMAND ---------- 107 | 108 | # MAGIC %md 109 | # MAGIC 110 | # MAGIC ## Exercise 1: Change optimization strategies for single point filters 111 | 112 | # COMMAND ---------- 113 | 114 | # MAGIC %sql 115 | # MAGIC OPTIMIZE iot_dashboard.silver_sensors_optimization ZORDER BY (user_id); 116 | # MAGIC 117 | # MAGIC -- by user_id, timestamp -- 8 files pruned 118 | # MAGIC -- by just user id selecting on user_id -- 34 files pruned (1 read) all but one 119 | # MAGIC -- by just timestamp -- no files pruned when selecting on user_id 120 | 121 | # COMMAND ---------- 122 | 123 | # DBTITLE 1,Create gold aggregate VIEW 124 | # MAGIC %sql 125 | # MAGIC 126 | # MAGIC CREATE OR REPLACE VIEW iot_dashboard.hourly_summary_statistics 127 | # MAGIC AS 128 | # MAGIC SELECT user_id, 129 | # MAGIC date_trunc('hour', timestamp) AS HourBucket, 130 | # MAGIC AVG(num_steps) AS AvgNumStepsAcrossDevices, 131 | # MAGIC AVG(calories_burnt) AS AvgCaloriesBurnedAcrossDevices, 132 | # MAGIC AVG(miles_walked) AS AvgMilesWalkedAcrossDevices 133 | # MAGIC FROM iot_dashboard.silver_sensors_optimization 134 | # MAGIC GROUP BY user_id,date_trunc('hour', timestamp) -- wrapping a function around a column 135 | # MAGIC ORDER BY HourBucket 136 | 137 | # COMMAND ---------- 138 | 139 | # DBTITLE 1,Exercise 1: Tuning for single column queries 140 | # MAGIC %sql 141 | # MAGIC 142 | # MAGIC -- LOOK AT BEFORE AND AFTER QUERIES for OPTIMIZE PRE/POST 143 | # MAGIC 144 | # MAGIC -- After optimize look at user_id files pruned 145 | # MAGIC -- by user_id, timestamp -- 8 files pruned 146 | # MAGIC -- by just user id selecting on user_id -- 34 files pruned (1 read) all but one 147 | # MAGIC -- by just timestamp -- no files pruned when selecting on user_is 148 | # MAGIC 149 | # MAGIC -- POST OPTIMIZE SCAN METRICS 150 | # MAGIC --number of files pruned 33 151 | # MAGIC -- number of files read 1 152 | # MAGIC 153 | # MAGIC SELECT * FROM iot_dashboard.hourly_summary_statistics WHERe user_id = 1 154 | 155 | # COMMAND ---------- 156 | 157 | # MAGIC %md 158 | # MAGIC 159 | # MAGIC ## Exercise 2: Multi-dimensional filters and optimzation 160 | 161 | # COMMAND ---------- 162 | 163 | # MAGIC %sql 164 | # MAGIC 165 | # MAGIC 166 | # MAGIC SELECT MIN(HourBucket), MAX(HourBucket) 167 | # MAGIC FROM iot_dashboard.hourly_summary_statistics 168 | 169 | # COMMAND ---------- 170 | 171 | # MAGIC %sql 172 | # MAGIC OPTIMIZE iot_dashboard.silver_sensors_optimization ZORDER BY (user_id, timestamp); 173 | # MAGIC 174 | # MAGIC -- by user_id, timestamp -- 2 files pruned, 29 scanned 175 | # MAGIC -- by timestamp, user_id -- does order matter? 2 files pruned, 29 scanned, - not really 176 | # MAGIC -- How to make this more selective? 
-- Hour bucket is abstracting the filter pushdown, lets try just the raw table 177 | 178 | # COMMAND ---------- 179 | 180 | # DBTITLE 1,Exercise 2: Optimizing Multi-dimensional queries 181 | # MAGIC %sql 182 | # MAGIC 183 | # MAGIC SELECT * 184 | # MAGIC FROM iot_dashboard.hourly_summary_statistics 185 | # MAGIC WHERE user_id = 1 186 | # MAGIC AND HourBucket BETWEEN "2018-07-22T00:00:00.000+0000" AND "2018-07-22T01:00:00.000+0000" 187 | 188 | # COMMAND ---------- 189 | 190 | # DBTITLE 1,Lesson learned -- let Delta do the filtering first, then group and aggregate -- subqueries are actually better 191 | # MAGIC %sql 192 | # MAGIC 193 | # MAGIC -- Look at SPARK QUERY PLAN SCAN node 194 | # MAGIC -- How many files are pruned/read? 195 | # MAGIC -- Try optimizing the table on different columns (1,2,3) -- see what happens! 196 | # MAGIC --28 pruned, 3 files read 197 | # MAGIC 198 | # MAGIC SELECT * 199 | # MAGIC FROM iot_dashboard.silver_sensors_optimization 200 | # MAGIC WHERE user_id = 1 201 | # MAGIC AND timestamp BETWEEN "2018-07-22T00:00:00.000+0000"::timestamp AND "2018-07-22T01:00:00.000+0000"::timestamp 202 | 203 | # COMMAND ---------- 204 | 205 | # DBTITLE 1,Automate Certain Pushdown Filter Rules in VIEWs 206 | # MAGIC %sql 207 | # MAGIC 208 | # MAGIC CREATE OR REPLACE VIEW iot_dashboard.test_filter_pushdown 209 | # MAGIC AS 210 | # MAGIC WITH raw_pushdown AS 211 | # MAGIC ( 212 | # MAGIC SELECT * 213 | # MAGIC FROM iot_dashboard.silver_sensors_optimization 214 | # MAGIC WHERE user_id = 1 215 | # MAGIC AND timestamp BETWEEN "2018-07-22T00:00:00.000+0000"::timestamp AND "2018-07-22T01:00:00.000+0000"::timestamp 216 | # MAGIC ) 217 | # MAGIC SELECT user_id, 218 | # MAGIC date_trunc('hour', timestamp) AS HourBucket, 219 | # MAGIC AVG(num_steps) AS AvgNumStepsAcrossDevices, 220 | # MAGIC AVG(calories_burnt) AS AvgCaloriesBurnedAcrossDevices, 221 | # MAGIC AVG(miles_walked) AS AvgMilesWalkedAcrossDevices 222 | # MAGIC FROM raw_pushdown 223 | # MAGIC GROUP BY user_id,date_trunc('hour', timestamp) 224 | # MAGIC ORDER BY HourBucket 225 | 226 | # COMMAND ---------- 227 | 228 | # MAGIC %sql 229 | # MAGIC 230 | # MAGIC -- Now pruning is automatically done and manual users do not have to remember each time for common views 231 | # MAGIC SELECT * FROM iot_dashboard.test_filter_pushdown 232 | 233 | # COMMAND ---------- 234 | 235 | # DBTITLE 1,Efficacy on More Complex VIEWs 236 | # MAGIC %sql 237 | # MAGIC 238 | # MAGIC CREATE OR REPLACE VIEW iot_dashboard.smoothed_hourly_statistics 239 | # MAGIC AS 240 | # MAGIC SELECT *, 241 | # MAGIC -- Number of Steps 242 | # MAGIC (avg(`AvgNumStepsAcrossDevices`) OVER ( 243 | # MAGIC ORDER BY `HourBucket` 244 | # MAGIC ROWS BETWEEN 245 | # MAGIC 4 PRECEDING AND 246 | # MAGIC CURRENT ROW 247 | # MAGIC )) ::float AS SmoothedNumSteps4HourMA, -- 4 hour moving average 248 | # MAGIC 249 | # MAGIC (avg(`AvgNumStepsAcrossDevices`) OVER ( 250 | # MAGIC ORDER BY `HourBucket` 251 | # MAGIC ROWS BETWEEN 252 | # MAGIC 24 PRECEDING AND 253 | # MAGIC CURRENT ROW 254 | # MAGIC ))::float AS SmoothedNumSteps12HourMA --24 hour moving average 255 | # MAGIC , 256 | # MAGIC -- Calories Burned 257 | # MAGIC (avg(`AvgCaloriesBurnedAcrossDevices`) OVER ( 258 | # MAGIC ORDER BY `HourBucket` 259 | # MAGIC ROWS BETWEEN 260 | # MAGIC 4 PRECEDING AND 261 | # MAGIC CURRENT ROW 262 | # MAGIC ))::float AS SmoothedCalsBurned4HourMA, -- 4 hour moving average 263 | # MAGIC 264 | # MAGIC (avg(`AvgCaloriesBurnedAcrossDevices`) OVER ( 265 | # MAGIC ORDER BY `HourBucket` 266 | # MAGIC ROWS BETWEEN 267 
| # MAGIC 24 PRECEDING AND 268 | # MAGIC CURRENT ROW 269 | # MAGIC ))::float AS SmoothedCalsBurned12HourMA --24 hour moving average, 270 | # MAGIC , 271 | # MAGIC -- Miles Walked 272 | # MAGIC (avg(`AvgMilesWalkedAcrossDevices`) OVER ( 273 | # MAGIC ORDER BY `HourBucket` 274 | # MAGIC ROWS BETWEEN 275 | # MAGIC 4 PRECEDING AND 276 | # MAGIC CURRENT ROW 277 | # MAGIC ))::float AS SmoothedMilesWalked4HourMA, -- 4 hour moving average 278 | # MAGIC 279 | # MAGIC (avg(`AvgMilesWalkedAcrossDevices`) OVER ( 280 | # MAGIC ORDER BY `HourBucket` 281 | # MAGIC ROWS BETWEEN 282 | # MAGIC 24 PRECEDING AND 283 | # MAGIC CURRENT ROW 284 | # MAGIC ))::float AS SmoothedMilesWalked12HourMA --24 hour moving average 285 | # MAGIC FROM iot_dashboard.hourly_summary_statistics 286 | 287 | # COMMAND ---------- 288 | 289 | # DBTITLE 1,File Pruning on Complex VIEWs 290 | # MAGIC %sql 291 | # MAGIC 292 | # MAGIC -- How are files being pruned in the SCAN node? 293 | # MAGIC SELECt * FROM iot_dashboard.smoothed_hourly_statistics WHERE user_id = 1 294 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 3 - DLT Version Simple SQL EDW Pipeline.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC # This notebook generates a full data pipeline from databricks dataset - iot-stream 5 | -- MAGIC 6 | -- MAGIC #### Define the SQL - Add as a library to a DLT pipeline, and run the pipeline! 7 | -- MAGIC 8 | -- MAGIC ## This creates 2 tables: 9 | -- MAGIC 10 | -- MAGIC Database: iot_dashboard 11 | -- MAGIC 12 | -- MAGIC Tables: silver_sensors, silver_users 13 | -- MAGIC 14 | -- MAGIC Params: StartOver (Yes/No) - allows user to truncate and reload pipeline 15 | 16 | -- COMMAND ---------- 17 | 18 | -- MAGIC %md 19 | -- MAGIC 20 | -- MAGIC ## This is built as a library for a Delta Live Tables pipeline 21 | 22 | -- COMMAND ---------- 23 | 24 | -- MAGIC %md 25 | -- MAGIC ## Exhaustive list of all cloud_files STREAMING LIVE TABLE options 26 | -- MAGIC https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-incremental-data.html#language-sql 27 | 28 | -- COMMAND ---------- 29 | 30 | -- DBTITLE 1,Incrementally Ingest Source Data from Raw Files 31 | --No longer need a separate copy into statement, you can use the Databricks Autoloader directly in SQL by using the cloud_files function 32 | -- OPTIONALLY defined DDL in the table definition 33 | CREATE OR REFRESH STREAMING LIVE TABLE bronze_sensors 34 | ( 35 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 36 | device_id INT, 37 | user_id INT, 38 | calories_burnt DECIMAL(10,2), 39 | miles_walked DECIMAL(10,2), 40 | num_steps DECIMAL(10,2), 41 | timestamp TIMESTAMP, 42 | value STRING, 43 | CONSTRAINT has_device EXPECT (device_id IS NOT NULL) ON VIOLATION DROP ROW , 44 | CONSTRAINT has_user EXPECT(user_id IS NOT NULL) ON VIOLATION DROP ROW, 45 | CONSTRAINT has_data EXPECT(num_steps IS NOT NULL) -- with no violation rule, nothing happens, we just track quality in DLT 46 | ) 47 | TBLPROPERTIES("delta.targetFileSize"="128mb", 48 | "pipelines.autoOptimize.managed"="true", 49 | "pipelines.autoOptimize.zOrderCols"="create_timestamp,device_id,user_id", 50 | "pipelines.trigger.interval"="1 hour") 51 | AS 52 | SELECT 53 | id::bigint AS Id, 54 | device_id::integer AS device_id, 55 | user_id::integer AS user_id, 56 | calories_burnt::decimal(10,2) AS calories_burnt, 57 | miles_walked::decimal(10,2) AS miles_walked, 
58 | num_steps::decimal(10,2) AS num_steps, 59 | timestamp::timestamp AS timestamp, 60 | value AS value 61 | FROM cloud_files("/databricks-datasets/iot-stream/data-device/", "json") 62 | -- First 2 params of cloud_files are always input file path and format, then rest are map object of optional params 63 | -- To make incremental - Add STREAMING keyword before LIVE TABLE 64 | ; 65 | 66 | 67 | 68 | -- COMMAND ---------- 69 | 70 | -- MAGIC %md 71 | -- MAGIC 72 | -- MAGIC ## Process Change data with updates or deletes 73 | -- MAGIC API Docs: https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html 74 | -- MAGIC 75 | -- MAGIC 76 | -- MAGIC ### Automatically store change as SCD 1 or SCD 2 Type changes 77 | -- MAGIC 78 | -- MAGIC SCD 1/2 Docs: https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#language-sql 79 | 80 | -- COMMAND ---------- 81 | 82 | -- DBTITLE 1,Incremental upsert data into target silver layer 83 | -- Create and populate the target table. 84 | CREATE OR REFRESH STREAMING LIVE TABLE silver_sensors 85 | ( 86 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 87 | device_id INT, 88 | user_id INT, 89 | calories_burnt DECIMAL(10,2), 90 | miles_walked DECIMAL(10,2), 91 | num_steps DECIMAL(10,2), 92 | timestamp TIMESTAMP, 93 | value STRING) 94 | TBLPROPERTIES("delta.targetFileSize"="128mb", 95 | "quality"="silver", 96 | "pipelines.autoOptimize.managed"="true", 97 | "pipelines.autoOptimize.zOrderCols"="create_timestamp,device_id,user_id", 98 | "pipelines.trigger.interval"="1 hour" 99 | ); 100 | 101 | -- COMMAND ---------- 102 | 103 | -- DBTITLE 1,Actually run CDC Transformation Operation 104 | APPLY CHANGES INTO 105 | LIVE.silver_sensors 106 | FROM 107 | STREAM(LIVE.bronze_sensors) -- use STREAM to get change feed, use LIVE to get DAG source table 108 | KEYS 109 | (user_id, device_id) -- Identical to the ON statement in MERGE, can be 1 of many keys 110 | --APPLY AS DELETE WHEN 111 | -- operation = "DELETE" --Need if you have a operation columnd that specifies "APPEND"/"UPDATE"/"DELETE" like true CDC data 112 | SEQUENCE BY 113 | timestamp 114 | COLUMNS * EXCEPT 115 | (Id) --For auto increment keys, exclude the updates cause you dont want to replace Ids of auto_id columns 116 | -- Optionally exclude columns like metadata or operation types, by default, UPDATE * is the operation 117 | STORED AS 118 | SCD TYPE 1 -- [SCD TYPE 2] will expire updated originals 119 | 120 | -- COMMAND ---------- 121 | 122 | -- MAGIC %md 123 | -- MAGIC 124 | -- MAGIC ## FULL REFRESH EXAMPLE - Ingest Full User Data Set Each Load 125 | 126 | -- COMMAND ---------- 127 | 128 | -- DBTITLE 1,FulltIngest Raw User Data 129 | CREATE OR REPLACE STREAMING LIVE TABLE silver_users 130 | ( -- REPLACE truncates the checkpoint each time and loads from scratch every time 131 | userid BIGINT GENERATED BY DEFAULT AS IDENTITY, 132 | gender STRING, 133 | age INT, 134 | height DECIMAL(10,2), 135 | weight DECIMAL(10,2), 136 | smoker STRING, 137 | familyhistory STRING, 138 | cholestlevs STRING, 139 | bp STRING, 140 | risk DECIMAL(10,2), 141 | update_timestamp TIMESTAMP, 142 | CONSTRAINT has_user EXPECT (userid IS NOT NULL) ON VIOLATION DROP ROW 143 | ) 144 | TBLPROPERTIES("delta.targetFileSize"="128mb", 145 | "quality"="silver", 146 | "pipelines.autoOptimize.managed"="true", 147 | "pipelines.autoOptimize.zOrderCols"="userid", 148 | "pipelines.trigger.interval"="1 day" 149 | ) 150 | AS (SELECT 151 | userid::bigint AS userid, 152 | gender AS gender, 153 | age::integer AS age, 
154 | height::decimal(10,2) AS height, 155 | weight::decimal(10,2) AS weight, 156 | smoker AS smoker, 157 | familyhistory AS familyhistory, 158 | cholestlevs AS cholestlevs, 159 | bp AS bp, 160 | risk::decimal(10,2) AS risk, 161 | current_timestamp() AS update_timestamp 162 | FROM cloud_files("/databricks-datasets/iot-stream/data-user/","csv", map( 'header', 'true')) 163 | ) 164 | ; 165 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 4 - Create Gold Layer Analytics Tables.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC ## Create Gold Layer Tables that aggregate and clean up the data for BI / ML 5 | 6 | -- COMMAND ---------- 7 | 8 | CREATE OR REPLACE TABLE iot_dashboard.hourly_summary_statistics 9 | AS 10 | SELECT user_id, 11 | date_trunc('hour', timestamp) AS HourBucket, 12 | AVG(num_steps)::float AS AvgNumStepsAcrossDevices, 13 | AVG(calories_burnt)::float AS AvgCaloriesBurnedAcrossDevices, 14 | AVG(miles_walked)::float AS AvgMilesWalkedAcrossDevices 15 | FROM iot_dashboard.silver_sensors 16 | GROUP BY user_id,date_trunc('hour', timestamp) 17 | ORDER BY HourBucket; 18 | 19 | 20 | CREATE OR REPLACE TABLE iot_dashboard.smoothed_hourly_statistics 21 | AS 22 | SELECT *, 23 | -- Number of Steps 24 | (avg(`AvgNumStepsAcrossDevices`) OVER ( 25 | ORDER BY `HourBucket` 26 | ROWS BETWEEN 27 | 4 PRECEDING AND 28 | CURRENT ROW 29 | )) ::float AS SmoothedNumSteps4HourMA, -- 4 hour moving average 30 | 31 | (avg(`AvgNumStepsAcrossDevices`) OVER ( 32 | ORDER BY `HourBucket` 33 | ROWS BETWEEN 34 | 24 PRECEDING AND 35 | CURRENT ROW 36 | ))::float AS SmoothedNumSteps12HourMA --24 hour moving average 37 | , 38 | -- Calories Burned 39 | (avg(`AvgCaloriesBurnedAcrossDevices`) OVER ( 40 | ORDER BY `HourBucket` 41 | ROWS BETWEEN 42 | 4 PRECEDING AND 43 | CURRENT ROW 44 | ))::float AS SmoothedCalsBurned4HourMA, -- 4 hour moving average 45 | 46 | (avg(`AvgCaloriesBurnedAcrossDevices`) OVER ( 47 | ORDER BY `HourBucket` 48 | ROWS BETWEEN 49 | 24 PRECEDING AND 50 | CURRENT ROW 51 | ))::float AS SmoothedCalsBurned12HourMA --24 hour moving average, 52 | , 53 | -- Miles Walked 54 | (avg(`AvgMilesWalkedAcrossDevices`) OVER ( 55 | ORDER BY `HourBucket` 56 | ROWS BETWEEN 57 | 4 PRECEDING AND 58 | CURRENT ROW 59 | ))::float AS SmoothedMilesWalked4HourMA, -- 4 hour moving average 60 | 61 | (avg(`AvgMilesWalkedAcrossDevices`) OVER ( 62 | ORDER BY `HourBucket` 63 | ROWS BETWEEN 64 | 24 PRECEDING AND 65 | CURRENT ROW 66 | ))::float AS SmoothedMilesWalked12HourMA --24 hour moving average 67 | FROM iot_dashboard.hourly_summary_statistics 68 | 69 | -- COMMAND ---------- 70 | 71 | -- DBTITLE 1,Build Visuals in DBSQL, Directly in Notebook, or in any BI tool! 72 | SELECT * FROM iot_dashboard.smoothed_hourly_statistics WHERE user_id = 1 73 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 7 - COPY INTO Loading Patterns.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Materlized Views 5 | # MAGIC 6 | # MAGIC Patterns and Best Practices 7 | # MAGIC 8 | # MAGIC 9 | # MAGIC 1. Create Materialized View 10 | # MAGIC 2. Optimize Materialized View 11 | # MAGIC 3. Check / Monitor Performance of MV 12 | # MAGIC 4. 
When to NOT use MVs 13 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 8 - Liquid Clustering Delta Tables.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Deep Dive on Liquid Clustering Delta Tables 5 | # MAGIC 6 | # MAGIC ### Topics 7 | # MAGIC 8 | # MAGIC 1. How to create and optimize liquid tables 9 | # MAGIC 2. How to merge/update/delete data from liquid tables 10 | # MAGIC 3. VACUUM/PURGE/REORG on Liqiud tables 11 | # MAGIC 4. Performance Measurement 12 | # MAGIC 5. When to use ZORDER/Partitions vs Liquid 13 | # MAGIC 6. Liquid Limitations 14 | -------------------------------------------------------------------------------- /Design Patterns Notebooks/Step 9 - Using SQL Functions.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # SQL Functions Topic Deep Dive 3 | 4 | ## Topics 5 | 6 | 1. How to use SQL functions 7 | 2. Different languages - Python/SQL 8 | 3. Variables, etc. 9 | 4. Using Models in SQL functions 10 | 5. AI Functions 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Cody Austin Davis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # edw-best-practices 2 | ## Git Repo for EDW Best Practice Assets on the Lakehouse 3 | 4 | This Git Project Provides a framework of example notebooks that aims to show any typical data warehousing SQL users how to built pipelines and analytics on the Lakeshouse. Broken out in 4 steps, the notebooks will walk the user through a single use case that they can run in their own Databricks environment leading them through the data maturity curve as follows: 5 | 6 |
        • 1. Step 1 - Build a classical batch-oriented SQL pipeline with best practices on the Lakehouse 7 | 8 |
        • 2. Step 2 - Build the above in Delta Live Tables and automate all orchestration 9 | 10 |
        • 3. Step 3 - Build and analyze summary analytics tables 11 | 12 |
        • 4. Step 4 - Create gold views 13 | 14 |
        • 5. Step 5 - Convert an old batch pipeline to a Streaming pipeline 15 | 16 | 17 | This Git repo also provides some examples of more advanced use cases like using the Delta Change Data Feed. 18 | 19 | This Git repo also provides some helper functions that make ETL easier in production pipelines. 20 | 21 | -------------------------------------------------------------------------------- /Realtime Data Apps Workshop/Step 0 - Real Time Data Generator Simulator.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ### This notebook simulates a real-time feed from an IoT Device 5 | # MAGIC 6 | # MAGIC Notes: 7 | # MAGIC 8 | # MAGIC
        • 1. Starts with an initial batch of the earliest data from the databricks-datasets/iot-stream 9 | # MAGIC
        • 2. Allows user to truncate and reload simulated streaming data 10 | # MAGIC
        • 3. Allows user to decide how often to drop files to simulate different update frequencies 11 | 12 | # COMMAND ---------- 13 | 14 | spark.conf.set("spark.sql.shuffle.partitions", "32") 15 | 16 | # COMMAND ---------- 17 | 18 | from pyspark.sql.functions import * 19 | 20 | # COMMAND ---------- 21 | 22 | # DBTITLE 1,Define Source and Sink Paths 23 | source_data_path = "/databricks-datasets/iot-stream/data-device/" 24 | target_data_path = "dbfs:/Filestore/real-time-data-demo/iot_dashboard/" 25 | 26 | # COMMAND ---------- 27 | 28 | # DBTITLE 1,Get all records, order by timestamp, and drop 1 at time 29 | df = (spark.read.json(source_data_path).orderBy("timestamp") 30 | .withColumn("second", date_trunc("second", col("timestamp"))) 31 | ) 32 | 33 | # COMMAND ---------- 34 | 35 | dbutils.widgets.text("Second Frequency (Integer)", "1") 36 | dbutils.widgets.text("Starting Record Batch Size", "1000") 37 | dbutils.widgets.dropdown("Start Over Each Run", "Yes", ["Yes", "No"]) 38 | dbutils.widgets.text("Records Per Trigger (Integer):", "1000") 39 | dbutils.widgets.dropdown("Run Mode", "Real Time", ["Real Time", "Historical Stream"]) 40 | 41 | run_mode = dbutils.widgets.get("Run Mode") 42 | start_over = dbutils.widgets.get("Start Over Each Run") 43 | drop_periodicity = int(dbutils.widgets.get("Second Frequency (Integer)")) 44 | start_batch_size = int(dbutils.widgets.get("Starting Record Batch Size")) 45 | records_per_trigger = int(dbutils.widgets.get("Records Per Trigger (Integer):")) 46 | 47 | print(f"Run Mode: {run_mode}... \n Generating {records_per_trigger} records every {drop_periodicity} seconds starting with {start_batch_size} records. \n Start over each run?: {start_over}") 48 | 49 | # COMMAND ---------- 50 | 51 | from pyspark.sql import Window 52 | from pyspark.sql.functions import * 53 | import time 54 | 55 | # COMMAND ---------- 56 | 57 | # DBTITLE 1,Sort Data to Drop files in order of timeframe to simulate real-time 58 | historical_overSpec = Window.orderBy("timestamp") 59 | realtime_overSpec = Window.orderBy("second") 60 | 61 | prepped_df = (df.withColumn("row_num", row_number().over(historical_overSpec)) ## For 62 | .withColumn("sec_rank", dense_rank().over(realtime_overSpec)) 63 | ) 64 | 65 | # COMMAND ---------- 66 | 67 | # DBTITLE 1,Write Starting Batch to get initial state 68 | ## Start over each time 69 | 70 | if start_over == "Yes": 71 | print("Truncating and reloading source data...") 72 | dbutils.fs.rm(target_data_path, recurse=True) 73 | 74 | 75 | ## Write initial batch size 76 | 77 | if run_mode == "Historical Stream": 78 | 79 | ## This separates data in batches by #rows 80 | initial_batch = prepped_df.filter(col("row_num") <= lit(start_batch_size)).select("value").coalesce(1) 81 | initial_batch.write.text(f"{target_data_path}initial_batch_0_{start_batch_size}.json") 82 | 83 | elif run_mode == "Real Time": 84 | 85 | # This separates data in batches by seconds 86 | initial_batch = prepped_df.filter(col("sec_rank") <= lit(start_batch_size)).select("value").coalesce(1) 87 | initial_batch.write.text(f"{target_data_path}initial_batch_0_{start_batch_size}.json") 88 | 89 | # COMMAND ---------- 90 | 91 | # DBTITLE 1,Load Incremental Records in order of timestamp after initial batch 92 | if run_mode == "Historical Stream": 93 | 94 | max_val = prepped_df.agg(max("row_num")).collect()[0][0] 95 | batches = list(range(start_batch_size, max_val, records_per_trigger)) 96 | 97 | 98 | coalesced_prepped_df = prepped_df.coalesce(1) 99 | 100 | for i, j in enumerate(batches): 101 | 102 | 
print(i) 103 | print(f"Dropping batch {i} from records {j} --> {batches[i+1]}") 104 | 105 | start_rec = j 106 | end_rec = batches[i+1] 107 | 108 | incremental_df = (coalesced_prepped_df 109 | .filter((col("row_num") > lit(start_rec)) & (col("row_num") <= lit(end_rec))) 110 | .coalesce(1) 111 | .orderBy("row_num").select("value") 112 | ) 113 | incremental_df.write.text(f"{target_data_path}batch_{i}_from_{start_rec}_to_{end_rec}.json") 114 | 115 | time.sleep(drop_periodicity) 116 | 117 | 118 | elif run_mode == "Real Time": 119 | 120 | max_val = prepped_df.agg(max("sec_rank")).collect()[0][0] 121 | 122 | ## Dropping X seconds of data at a time proportional to the real drop rate 123 | batches = list(range(start_batch_size, max_val, drop_periodicity)) 124 | 125 | 126 | coalesced_prepped_df = prepped_df.coalesce(1) 127 | 128 | for i, j in enumerate(batches): 129 | 130 | print(i) 131 | print(f"Dropping batch {i} from records {j} --> {batches[i+1]}") 132 | 133 | start_rec = j 134 | end_rec = batches[i+1] 135 | 136 | incremental_df = (coalesced_prepped_df 137 | .filter((col("sec_rank") > lit(start_rec)) & (col("sec_rank") <= lit(end_rec))) 138 | .coalesce(1) 139 | .orderBy("sec_rank").select("value") 140 | ) 141 | incremental_df.write.text(f"{target_data_path}batch_{i}_from_{start_rec}_to_{end_rec}.json") 142 | 143 | time.sleep(drop_periodicity) 144 | -------------------------------------------------------------------------------- /Realtime Data Apps Workshop/Step 2 - Create Gold Views for App Layer.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md 3 | -- MAGIC 4 | -- MAGIC ## Building Production Data Apps - Last Mile BI on Databricks and Dash 5 | -- MAGIC 6 | -- MAGIC Dash apps: https://dash.gallery/Portal/ 7 | -- MAGIC 8 | -- MAGIC 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %md 13 | -- MAGIC 14 | -- MAGIC 15 | -- MAGIC 16 | -- MAGIC 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md 21 | -- MAGIC 22 | -- MAGIC ## Dashboard Recommendations 23 | -- MAGIC 24 | -- MAGIC 1. Pushdown timestamp filters as much as possible (especially now that insert order is preserved) 25 | -- MAGIC 2. Bring back as little data as necessary 26 | -- MAGIC 3. 
Make the Lakehouse do all the work 27 | 28 | -- COMMAND ---------- 29 | 30 | -- DBTITLE 1,Generate View with Heavy Logic 31 | -- We can decide to build directly on bronze or on silver for higher quality data 32 | 33 | CREATE OR REPLACE VIEW real_time_iot_dashboard.gold_sensors 34 | AS 35 | ( 36 | WITH water_mark AS ((SELECT MAX(timestamp) FROM real_time_iot_dashboard.bronze_sensors)) 37 | 38 | SELECT timestamp, 39 | -- Number of Steps 40 | (avg(`num_steps`) OVER ( 41 | ORDER BY timestamp 42 | ROWS BETWEEN 43 | 15 PRECEDING AND 44 | CURRENT ROW 45 | )) ::float AS SmoothedNumSteps30SecondMA, -- 30 second moving average 46 | 47 | (avg(`num_steps`) OVER ( 48 | ORDER BY timestamp 49 | ROWS BETWEEN 50 | 60 PRECEDING AND 51 | CURRENT ROW 52 | ))::float AS SmoothedNumSteps120SecondMA,--120 second moving average, 53 | -- Calories Burnt 54 | (avg(`calories_burnt`) OVER ( 55 | ORDER BY timestamp 56 | ROWS BETWEEN 57 | 15 PRECEDING AND 58 | CURRENT ROW 59 | )) ::float AS SmoothedCaloriesBurnt30SecondMA, -- 30 second moving average 60 | 61 | (avg(`calories_burnt`) OVER ( 62 | ORDER BY timestamp 63 | ROWS BETWEEN 64 | 60 PRECEDING AND 65 | CURRENT ROW 66 | ))::float AS SmoothedCaloriesBurnt120SecondMA --120 second moving average 67 | FROM real_time_iot_dashboard.bronze_sensors 68 | WHERE timestamp >= ((SELECT * FROM water_mark) - INTERVAL '15 MINUTES') -- In real time, you would use current_timestamp, but this is synthetic old data 69 | ORDER BY timestamp DESC 70 | ) 71 | 72 | -- COMMAND ---------- 73 | 74 | CREATE OR REPLACE VIEW real_time_iot_dashboard.gold_sensors_stateful 75 | AS 76 | SELECT EventStart as timestamp, 77 | num_steps AS SmoothedNumSteps30SecondMA, -- 30 second moving average 78 | 79 | (avg(`num_steps`) OVER ( 80 | ORDER BY EventStart 81 | ROWS BETWEEN 82 | 30 PRECEDING AND 83 | CURRENT ROW 84 | ))::float AS SmoothedNumSteps120SecondMA,--120 second moving average, 85 | -- Calories Burnt 86 | calories_burnt AS SmoothedCaloriesBurnt30SecondMA, -- 30 second moving average 87 | 88 | (avg(`calories_burnt`) OVER ( 89 | ORDER BY EventStart 90 | ROWS BETWEEN 91 | 30 PRECEDING AND 92 | CURRENT ROW 93 | ))::float AS SmoothedCaloriesBurnt120SecondMA --120 second moving average 94 | FROM real_time_iot_dashboard.silver_sensors_stateful ss 95 | WHERE 96 | --Use partition pruning to ignore data as it ages 97 | ss.Date = ((SELECT MAX(Date) FROM real_time_iot_dashboard.silver_sensors_stateful)) 98 | AND ss.EventStart >= ((SELECT MAX(EventStart) FROM real_time_iot_dashboard.silver_sensors_stateful) - INTERVAL '15 MINUTES') 99 | ORDER BY timestamp DESC 100 | LIMIT 200 101 | 102 | -- COMMAND ---------- 103 | 104 | -- MAGIC %sql 105 | -- MAGIC 106 | -- MAGIC SELECT * FROM real_time_iot_dashboard.gold_sensors_stateful 107 | 108 | -- COMMAND ---------- 109 | 110 | -- DBTITLE 1,Example of Dashboard Client Side Query 111 | SELECT * 112 | FROM real_time_iot_dashboard.gold_sensors 113 | LIMIT 1000 114 | 115 | -- COMMAND ---------- 116 | 117 | -- DBTITLE 1,Embed this into a Dash Callback to create automatically refreshing tables that trigger when the table updates 118 | WITH log AS 119 | (DESCRIBE HISTORY real_time_iot_dashboard.bronze_sensors 120 | ), 121 | state AS ( 122 | SELECT 123 | version, 124 | timestamp, 125 | operation 126 | FROM log 127 | WHERE (timestamp >= current_timestamp() - INTERVAL '24 hours') 128 | AND operation IN ('MERGE', 'WRITE', 'DELETE', 'STREAMING UPDATE') 129 | ORDER By version DESC 130 | ), 131 | comparison AS ( 132 | SELECT DISTINCT 133 | s1.version, 134 | s1.timestamp, 135 | 
s1.operation, 136 | LAG(version) OVER (ORDER BY version) AS Previous_Version, 137 | LAG(timestamp) OVER (ORDER BY timestamp) AS Previous_Timestamp 138 | FROM state AS s1 139 | ORDER BY version DESC) 140 | 141 | SELECT 142 | date_trunc('hour', timestamp) AS HourBlock, 143 | AVG(timestamp::double - Previous_Timestamp::double) AS AvgUpdateFrequencyInSeconds 144 | FROM comparison 145 | GROUP BY date_trunc('hour', timestamp) 146 | ORDER BY HourBlock 147 | -------------------------------------------------------------------------------- /RedshiftDDLMigrator/Redshift DDL Migrator.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Redshift --> Databricks DDL Migrator 5 | # MAGIC 6 | # MAGIC ### v000 (PROTOTYPE) 7 | # MAGIC ### Author: Cody Austin Davis 8 | # MAGIC ### Date: 8/13/2022 9 | # MAGIC 10 | # MAGIC 11 | # MAGIC #### DEPENDENCIES: 12 | # MAGIC 13 | # MAGIC
        • 1. Must first create a table/view in Redshift that contains all historical DDL statements. This statement can be found from AWS here. You can name the table whatever you would like and supply the table name in this notebook. 14 | # MAGIC
        • 2. Must install the Redshift <> Databricks Jar file to the cluster on Databricks found here: Amazon Redshift Connector 15 | # MAGIC 16 | # MAGIC 17 | # MAGIC #### ROADMAP: 18 | # MAGIC 19 | # MAGIC
        • CALL OUT EDGE CASES: SUPER data type, timezones 20 | # MAGIC
        • Parse external tables - from Redshift 21 | # MAGIC
        • Edge data types (timezone, encoding, etc.) 22 | # MAGIC
        • Translate primary key generation object and automatically run the sync command (stretch goal) 23 | # MAGIC
        • Translate default values in DDL statements 24 | # MAGIC
        • Make Identity column generation more robust (translate increments, etc.) 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %pip install sqlparse 29 | # MAGIC %pip install sql-metadata 30 | 31 | # COMMAND ---------- 32 | 33 | import json 34 | import sqlparse 35 | from sql_metadata import Parser 36 | from pyspark.sql.functions import * 37 | 38 | # COMMAND ---------- 39 | 40 | redshift_user = dbutils.secrets.get(scope='rm_redshift', key = 'username') ## Supply your own secret values or raw keys here for username and password 41 | redshift_password = dbutils.secrets.get(scope='rm_redshift', key = 'password') 42 | 43 | # COMMAND ---------- 44 | 45 | hostname_redshift = '' 46 | port_redshift = '5439' 47 | tempdir_redshift_unloads = '' 48 | iam_role_redshift = '' 49 | database = "" 50 | print(f"Running testing off: {hostname_redshift}") 51 | 52 | # COMMAND ---------- 53 | 54 | # DBTITLE 1,Get DDL Admin View if not exists 55 | dbutils.widgets.text("Redshift DDL Table Name", "") 56 | redshift_table_name = dbutils.widgets.get("Redshift DDL Table Name") 57 | 58 | 59 | dbutils.widgets.text("Redshift Schemas to migrate(csv)", "") 60 | schemas_to_migrate = [i.strip() for i in dbutils.widgets.get("Redshift Schemas to migrate(csv)").split(",") if len(i) > 0] 61 | 62 | if len(schemas_to_migrate) == 0: 63 | schemas_to_migrate = "All" 64 | 65 | print(f"Extracting DDL from the following table: {redshift_table_name}") 66 | print(f"Migrating the following schemas: {schemas_to_migrate}") 67 | 68 | 69 | # COMMAND ---------- 70 | 71 | redshift_url = f"jdbc:redshift://{hostname_redshift}:{port_redshift}/{database}?user={redshift_user}&password={redshift_password}&ssl=true&sslfactory=org.postgresql.ssl.NonValidatingFactory" 72 | 73 | # COMMAND ---------- 74 | 75 | ## Pull and Aggregate mode recent DDL statement for all tables, and optionally filter for a set of schemas 76 | 77 | rsh_query = f"""SELECT LISTAGG(CASE WHEN LEN(RTRIM(ddl)) = 0 THEN ddl ELSE RTRIM(ddl) END) WITHIN GROUP (ORDER BY seq) as query_statement, schemaname, tablename 78 | FROM {redshift_table_name} GROUP BY schemaname, tablename""" 79 | 80 | 81 | 82 | if schemas_to_migrate == "All": 83 | view_create = (spark.read.format("com.databricks.spark.redshift") 84 | .option("url", redshift_url) 85 | .option("query", rsh_query) 86 | .option("tempdir", tempdir_redshift_unloads) 87 | .option("aws_iam_role", iam_role_redshift) 88 | .load() 89 | ) 90 | else: 91 | view_create = (spark.read.format("com.databricks.spark.redshift") 92 | .option("url", redshift_url) 93 | .option("query", rsh_query) 94 | .option("tempdir", tempdir_redshift_unloads) 95 | .option("aws_iam_role", iam_role_redshift) 96 | .load() 97 | .filter(col("schemaname").isin(*schemas_to_migrate)) 98 | ) 99 | 100 | # COMMAND ---------- 101 | 102 | spark.sql("""CREATE DATABASE IF NOT EXISTS redshift_migration;""") 103 | 104 | # COMMAND ---------- 105 | 106 | # MAGIC %md 107 | # MAGIC 108 | # MAGIC 109 | # MAGIC Output: 110 | # MAGIC 111 | # MAGIC {"", "optimize_command": ""},...} 112 | # MAGIC Query text, command Id, rawSql String, run timestamp, recency rank, table_name, clean DDL, clean OPTIMIZE command 113 | 114 | # COMMAND ---------- 115 | 116 | @udf("string") 117 | def getCreateStatementOnly(sqlString): 118 | try: 119 | resultStr = sqlString.partition("CREATE")[1] + sqlString.partition("CREATE")[2].partition(";")[0] 120 | return resultStr 121 | except: 122 | resultStr = '' 123 | return resultStr 124 | 125 | 126 | 127 | def getCreateStatementOnlyPython(sqlString): 128 | try: 129 | 
resultStr = sqlString.partition("CREATE")[1] + sqlString.partition("CREATE")[2].partition(";")[0] 130 | return resultStr 131 | except: 132 | resultStr = '' 133 | return resultStr 134 | 135 | # COMMAND ---------- 136 | 137 | # DBTITLE 1,Parsing Functions 138 | import re 139 | 140 | def get_table_name(tokens): 141 | for token in reversed(tokens): 142 | if token.ttype is None: 143 | return token.value 144 | return "" 145 | 146 | ## Get zorder cols from DIST and SORT KEYS 147 | 148 | ## Allow ZORDER cols to be empty (no ZORDER, just optimize) 149 | def get_zorder_cols(tokens): 150 | 151 | zorder_keys = [] 152 | dist_cols = [] 153 | sort_cols = [] 154 | for i, t in enumerate(tokens): 155 | 156 | if re.search('distkey', str(t).lower()): 157 | dc = str(tokens[i+1]) 158 | dist_cols = [i.strip() for i in re.sub("[\t\n]", "", dc[dc.find("(")+1:dc.find(")")]).split(",")] 159 | #print(f"found dist key! {dist_cols}") 160 | 161 | if re.search('sortkey', str(t).lower()): 162 | sc = str(tokens[i+1]) 163 | sort_cols = [i.strip() for i in re.sub("[\t\n]", "", sc[sc.find("(")+1:sc.find(")")]).split(",")] 164 | #print(f"found sort key! {sort_cols}") 165 | 166 | ## TO DO: Make need to automate the ordering of these cols since they will go into a Z ORDER 167 | 168 | zorder_keys = list(set(dist_cols + sort_cols)) 169 | 170 | return zorder_keys or [] 171 | 172 | ### See if columns is an identity column or not 173 | 174 | def is_identity_column(token): 175 | has_id_cols = False 176 | 177 | if re.search('identity', str(token).lower()): 178 | dc = str(token) 179 | has_id_cols = True 180 | return has_id_cols 181 | 182 | return has_id_cols 183 | 184 | 185 | 186 | ## Spark UDF function 187 | @udf("string") 188 | def getDDLFromSQLString(sqlString): 189 | 190 | cleanSqlString = getCreateStatementOnlyPython(sqlString) 191 | parse = sqlparse.parse(cleanSqlString) 192 | 193 | ## For each statement in the sql string (can be thousands, parse SQL String and built DDL expression and optimize statement) 194 | final_ddl_json = {} 195 | 196 | try: 197 | for stmt in parse: 198 | # Get all the tokens except whitespaces 199 | tokens = [t for t in sqlparse.sql.TokenList(stmt.tokens) if t.ttype != sqlparse.tokens.Whitespace] 200 | is_create_stmt = False 201 | 202 | zorder_cols = get_zorder_cols(tokens) 203 | 204 | for i, token in enumerate(tokens): 205 | # Check if create statement 206 | if token.match(sqlparse.tokens.DDL, 'CREATE'): 207 | is_create_stmt = True 208 | continue 209 | 210 | 211 | # If it was a create statement and the current token starts with "(" 212 | if is_create_stmt and token.value.startswith("("): 213 | # Get the table name by looking at the tokens in reverse order till you find 214 | # a token with None type 215 | 216 | ## Get Table Info 217 | table_name = get_table_name(tokens[:i]) 218 | #print (f"table: {table_name}") 219 | 220 | ### Get Column Info 221 | txt = token.value 222 | 223 | ## Split on comma but only if not in parentheses (eg. 
NUMERIC(10,2)) 224 | s = txt[1:txt.rfind(")")].replace("\n","") 225 | #columns = re.split(r',\s*(?![^()]*\))', s) 226 | columns = re.split(r"(?<=[^\d+()]),(?![^()]*\))", s) 227 | 228 | ## Prep for rebuilding SQL String 229 | target_ddl_array = [] 230 | 231 | for column in columns: 232 | c = ' '.join(column.split()).split() 233 | c_name = c[0].replace('\"',"") 234 | c_type = c[1] # For condensed type information 235 | 236 | c_type_full = " ".join(c[1:]) # For detailed type information ## Do not do this for stage 1 of migrator 237 | ## Check for identity generation column 238 | is_id = is_identity_column(c_type_full) 239 | 240 | ## Make identity column if id col found in Redshift 241 | ## !!! USER MUST RUN ID SYNC WHEN MOVING ACTUAL EXISTING IDS ON FIRST BACKFILL FROM REDSHIFT!!! 242 | if is_id is True: 243 | c_type = "BIGINT" + " GENERATED BY DEFAULT AS IDENTITY" 244 | 245 | #print (f"column: {c_name}") 246 | #print (f"date type: {c_type}") 247 | 248 | ## Rebuild String for DBX 249 | clean_col = c_name + " " + c_type 250 | 251 | if clean_col.lower() == 'primary key': 252 | pass 253 | else: 254 | target_ddl_array.append(clean_col) 255 | 256 | #print(f"Table columns: {target_ddl_array}") 257 | #print(f"Z ORDER Columns: {zorder_cols}") 258 | 259 | ## Build entire statement 260 | full_ddl_string = f"CREATE TABLE IF NOT EXISTS {table_name} ({','.join(target_ddl_array)});" 261 | 262 | if len(zorder_cols) >= 1: 263 | full_optimize_string = f"OPTIMIZE {table_name} ZORDER BY ({','.join(zorder_cols)});" 264 | else: 265 | full_optimize_string = f"OPTIMIZE {table_name};" 266 | 267 | #print(full_ddl_string) 268 | #print(full_optimize_string) 269 | #print ("---"*20) 270 | 271 | final_ddl_json = {"table_name": table_name, "ddl": full_ddl_string, "optimize_command": full_optimize_string} 272 | 273 | break 274 | except: 275 | pass 276 | 277 | return json.dumps(final_ddl_json) 278 | 279 | # COMMAND ---------- 280 | 281 | # MAGIC %md 282 | # MAGIC 283 | # MAGIC ## TO DO: 284 | # MAGIC 285 | # MAGIC 1. Pull out database and table from results 286 | # MAGIC 2. Get most recent DDL statement for each table 287 | # MAGIC 3. Write command to auto run commands and migrate entire DDL in 1 command 288 | # MAGIC 4. 
Make incremental and merge results into target table 289 | 290 | # COMMAND ---------- 291 | 292 | (view_create.withColumn("ParsedDDL", getDDLFromSQLString(col("query_statement"))) 293 | ## Get most recent table ddl command 294 | ## merge into target table (just truncating and reloading right now) 295 | ## Add separate command to run all statements 296 | .write 297 | .format("delta") 298 | .option("overwriteSchema", "true") 299 | .mode("overwrite") 300 | .saveAsTable("redshift_migration.redshift_ddl_to_databricks") 301 | ) 302 | 303 | # COMMAND ---------- 304 | 305 | spark.sql("""SELECT query_statement, ParsedDDL:ddl, ParsedDDL:optimize_command FROM redshift_migration.redshift_ddl_to_databricks""") 306 | -------------------------------------------------------------------------------- /Using DBSQL Serverless Client Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install -r helperfunctions/requirements.txt 3 | 4 | # COMMAND ---------- 5 | 6 | from helperfunctions.dbsqlclient import ServerlessClient 7 | 8 | # COMMAND ---------- 9 | 10 | # DBTITLE 1,Example Inputs For Client 11 | 12 | 13 | token = None ## optional 14 | host_name = None ## optional 15 | warehouse_id = "475b94ddc7cd5211" 16 | 17 | ## Single Query Example 18 | sql_statement = "SELECT concat_ws('-', M.id, N.id, random()) as ID FROM range(1000) AS M, range(1000) AS N LIMIT 10000000" 19 | 20 | ## Multi Query Example 21 | multi_statement = "SELECT 1; SELECT 2; SELECT concat_ws('-', M.id, N.id, random()) as ID FROM range(1000) AS M, range(1000) AS N LIMIT 10000000" 22 | 23 | # COMMAND ---------- 24 | 25 | serverless_client = ServerlessClient(warehouse_id = warehouse_id, token=token, host_name=host_name) ## token=, host_name=verbose=True for print statements and other debugging messages 26 | 27 | # COMMAND ---------- 28 | 29 | # DBTITLE 1,Basic sql drop-in command 30 | """ 31 | Optional Params: 32 | 1. full_results 33 | 2. use_catalog = - this is a command specific USE CATALOG statement for the single SQL command 34 | 3. use_schema = - this is a command specific USE SCHEMA 35 | 36 | """ 37 | 38 | result_df = serverless_client.sql(sql_statement = sql_statement) ## OPTIONAL: use_catalog="hive_metastore", use_schema="default" 39 | 40 | # COMMAND ---------- 41 | 42 | # DBTITLE 1,Multi Statement Command - No Results just Status - Recommended for production 43 | """ 44 | Optional Params: 45 | 1. full_results 46 | 2. use_catalog = - this is a command specific USE CATALOG statement for the single SQL command 47 | 3. use_schema = - this is a command specific USE SCHEMA 48 | 49 | """ 50 | 51 | result = serverless_client.submit_multiple_sql_commands(sql_statements = multi_statement, full_results=False) #session_catalog, session_schema are also optional parameters that will simulate a USE statement. 
True full_results just returns the whole API response for each query 52 | 53 | # COMMAND ---------- 54 | 55 | # DBTITLE 1,Multi Statement Command Returning Results of Last Command - Best for simple processes 56 | result_multi_df = serverless_client.submit_multiple_sql_commands_last_results(sql_statements = multi_statement) 57 | 58 | # COMMAND ---------- 59 | 60 | display(result_multi_df) 61 | 62 | # COMMAND ---------- 63 | 64 | # DBTITLE 1,If Multi Statement Fails, this is how to access the result chain 65 | ## The function save the state of each command in the chain, even if it fails to return results for troubleshooting 66 | 67 | last_saved_multi_statement_state = serverless_client.multi_statement_result_state 68 | print(last_saved_multi_statement_state) 69 | -------------------------------------------------------------------------------- /Using DBSQL Serverless Transaction Manager Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %pip install -r helperfunctions/requirements.txt 3 | 4 | # COMMAND ---------- 5 | 6 | from helperfunctions.dbsqltransactions import DBSQLTransactionManager 7 | 8 | # COMMAND ---------- 9 | 10 | # DBTITLE 1,Example Inputs For Client 11 | token = None ## optional 12 | host_name = None ## optional 13 | warehouse_id = "475b94ddc7cd5211" 14 | 15 | # COMMAND ---------- 16 | 17 | # DBTITLE 1,Example Multi Statement Transaction 18 | sqlString = """ 19 | USE CATALOG hive_metastore; 20 | 21 | CREATE SCHEMA IF NOT EXISTS iot_dashboard; 22 | 23 | USE SCHEMA iot_dashboard; 24 | 25 | -- Create Tables 26 | CREATE OR REPLACE TABLE iot_dashboard.bronze_sensors 27 | ( 28 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 29 | device_id INT, 30 | user_id INT, 31 | calories_burnt DECIMAL(10,2), 32 | miles_walked DECIMAL(10,2), 33 | num_steps DECIMAL(10,2), 34 | timestamp TIMESTAMP, 35 | value STRING 36 | ) 37 | USING DELTA 38 | TBLPROPERTIES("delta.targetFileSize"="128mb"); 39 | 40 | CREATE OR REPLACE TABLE iot_dashboard.silver_sensors 41 | ( 42 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 43 | device_id INT, 44 | user_id INT, 45 | calories_burnt DECIMAL(10,2), 46 | miles_walked DECIMAL(10,2), 47 | num_steps DECIMAL(10,2), 48 | timestamp TIMESTAMP, 49 | value STRING 50 | ) 51 | USING DELTA 52 | PARTITIONED BY (user_id) 53 | TBLPROPERTIES("delta.targetFileSize"="128mb"); 54 | 55 | -- Statement 1 -- the load 56 | COPY INTO iot_dashboard.bronze_sensors 57 | FROM (SELECT 58 | id::bigint AS Id, 59 | device_id::integer AS device_id, 60 | user_id::integer AS user_id, 61 | calories_burnt::decimal(10,2) AS calories_burnt, 62 | miles_walked::decimal(10,2) AS miles_walked, 63 | num_steps::decimal(10,2) AS num_steps, 64 | timestamp::timestamp AS timestamp, 65 | value AS value -- This is a JSON object 66 | FROM "/databricks-datasets/iot-stream/data-device/") 67 | FILEFORMAT = json 68 | COPY_OPTIONS('force'='true') -- 'false' -- process incrementally 69 | --option to be incremental or always load all files 70 | ; 71 | 72 | -- Statement 2 73 | MERGE INTO iot_dashboard.silver_sensors AS target 74 | USING (SELECT Id::integer, 75 | device_id::integer, 76 | user_id::integer, 77 | calories_burnt::decimal, 78 | miles_walked::decimal, 79 | num_steps::decimal, 80 | timestamp::timestamp, 81 | value::string 82 | FROM iot_dashboard.bronze_sensors) AS source 83 | ON source.Id = target.Id 84 | AND source.user_id = target.user_id 85 | AND source.device_id = target.device_id 86 | WHEN MATCHED THEN UPDATE SET 87 | 
target.calories_burnt = source.calories_burnt, 88 | target.miles_walked = source.miles_walked, 89 | target.num_steps = source.num_steps, 90 | target.timestamp = source.timestamp 91 | WHEN NOT MATCHED THEN INSERT *; 92 | 93 | OPTIMIZE iot_dashboard.silver_sensors ZORDER BY (timestamp); 94 | 95 | -- This calculate table stats for all columns to ensure the optimizer can build the best plan 96 | -- Statement 3 97 | 98 | ANALYZE TABLE iot_dashboard.silver_sensors COMPUTE STATISTICS FOR ALL COLUMNS; 99 | 100 | CREATE OR REPLACE TABLE hourly_summary_statistics 101 | AS 102 | SELECT user_id, 103 | date_trunc('hour', timestamp) AS HourBucket, 104 | AVG(num_steps)::float AS AvgNumStepsAcrossDevices, 105 | AVG(calories_burnt)::float AS AvgCaloriesBurnedAcrossDevices, 106 | AVG(miles_walked)::float AS AvgMilesWalkedAcrossDevices 107 | FROM silver_sensors 108 | GROUP BY user_id,date_trunc('hour', timestamp) 109 | ORDER BY HourBucket; 110 | 111 | -- Statement 4 112 | -- Truncate bronze batch once successfully loaded 113 | TRUNCATE TABLE bronze_sensors; 114 | """ 115 | 116 | # COMMAND ---------- 117 | 118 | serverless_client_t = DBSQLTransactionManager(warehouse_id = warehouse_id, mode="inferred_altered_tables") ## token=, host_name=verbose=True for print statements and other debugging messages 119 | 120 | # COMMAND ---------- 121 | 122 | # DBTITLE 1,Submitting the Multi Statement Transaction to Serverless SQL Warehouse 123 | """ 124 | PARAMS: 125 | warehouse_id --> Required, the SQL warehouse to submit statements 126 | mode -> selected_tables, inferred_altered_tables 127 | token --> optional, will try to get one for the user 128 | host_name --> optional, will try to infer same workspace url 129 | 130 | 131 | execute_sql_transaction params: 132 | return_type --> "message", "last_results". "message" will return status of query chain. "last_result" will run all statements and return the last results of the final query in the chain 133 | 134 | """ 135 | 136 | result_df = serverless_client_t.execute_dbsql_transaction(sql_string = sqlString) 137 | -------------------------------------------------------------------------------- /Using Delta Helpers Notebook Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Using Delta Helpers Materialization Class. 5 | # MAGIC 6 | # MAGIC This class is for the purpose of materializing tables with delta onto cloud storage. This is often helpful for debugging and for simplifying longer, more complex query pipelines that would otherwise require highly nested CTE statements. Often times, the plan is simplified and performane is improved by removing the lazy evaluation and creating "checkpoint" steps with a materialized temp_db. Currently spark temp tables are NOT materialized, and thus not evaluated until called which is identical to a subquery. 7 | # MAGIC 8 | # MAGIC #### Initialization 9 | # MAGIC 10 | # MAGIC
        • deltaHelpers = DeltaHelpers(temp_root_path= "dbfs:/delta_temp_db", db_name="delta_temp") - The parameters shown are the defaults and can be changed to a custom db name or S3 path 11 | # MAGIC 12 | # MAGIC #### There are 4 methods: 13 | # MAGIC 14 | # MAGIC
        • createOrReplaceTempDeltaTable(df: DataFrame, table_name: String) - This creates or replaces a materialized Delta table in the default DBFS location or in your provided S3 path 15 | # MAGIC
        • appendToTempDeltaTable(df: DataFrame, table_name: String) - This appends to an existing Delta table, or creates a new one if it does not exist, in DBFS or your provided S3 path 16 | # MAGIC
        • removeTempDeltaTable(table_name) - This removes the delta table from your delta_temp database session 17 | # MAGIC
        • removeAllTempTablesForSession() - This truncates the initialized temp_db session. It does NOT run a DROP DATABASE command because the database can be global. It only removes the session path it creates. 18 | 19 | # COMMAND ---------- 20 | 21 | # MAGIC %pip install -r helperfunctions/requirements.txt 22 | 23 | # COMMAND ---------- 24 | 25 | # DBTITLE 1,Import 26 | from helperfunctions.deltahelpers import DeltaHelpers 27 | 28 | # COMMAND ---------- 29 | 30 | # DBTITLE 1,Initialize 31 | ## 2 Params [Optional - db_name, temp_root_path] 32 | deltaHelpers = DeltaHelpers() 33 | 34 | # COMMAND ---------- 35 | 36 | # DBTITLE 1,Create or Replace Temp Delta Table 37 | df = spark.read.format("json").load("/databricks-datasets/iot-stream/data-device/") 38 | 39 | ## Methods return the cached dataframe so you can continue on as needed without reloading source each time AND you can reference in SQL (better for foreachBatch) 40 | ## No longer lazy -- this calls an action 41 | df = deltaHelpers.createOrReplaceTempDeltaTable(df, "iot_data") 42 | 43 | ## Build ML Models 44 | 45 | display(df) 46 | 47 | # COMMAND ---------- 48 | 49 | # DBTITLE 1,Read cached table quickly in python or SQL 50 | # MAGIC %sql 51 | # MAGIC -- Read cahced table quickly in python or SQL 52 | # MAGIC SELECT * FROM delta_temp.iot_data 53 | 54 | # COMMAND ---------- 55 | 56 | df.count() 57 | 58 | # COMMAND ---------- 59 | 60 | # DBTITLE 1,Append to Temp Delta Table 61 | ## Data is 1,000,000 rows 62 | df_doubled = deltaHelpers.appendToTempDeltaTable(df, "iot_data") 63 | 64 | ## Be CAREFUL HERE! Since the function calls an action, it is NOT lazily evaluated. So running it multiple times can append the same data 65 | df_doubled.count() 66 | 67 | # COMMAND ---------- 68 | 69 | # MAGIC %sql 70 | # MAGIC 71 | # MAGIC DESCRIBE HISTORY delta_temp.iot_data 72 | 73 | # COMMAND ---------- 74 | 75 | # DBTITLE 1,Remove Temp Delta Table 76 | deltaHelpers.removeTempDeltaTable("iot_data") 77 | 78 | # COMMAND ---------- 79 | 80 | # MAGIC %sql 81 | # MAGIC 82 | # MAGIC SELECT * FROM delta_temp.iot_data 83 | 84 | # COMMAND ---------- 85 | 86 | # DBTITLE 1,Truncate Session 87 | ## Deletes all tables in session path but does not drop that delta_temp database 88 | deltaHelpers.removeAllTempTablesForSession() 89 | -------------------------------------------------------------------------------- /Using Delta Logger Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Delta Logger - How to use 5 | # MAGIC 6 | # MAGIC Purpose: This notebook utilizes the delta logger library to automatically and easiy log general pipeline information all in one place for any data pipeline. 
7 | # MAGIC 8 | # MAGIC All logger tables have a standard default schema DDL: 9 | # MAGIC 10 | # MAGIC CREATE TABLE IF NOT EXISTS {full_table_name} ( 11 | # MAGIC run_id BIGINT GENERATED BY DEFAULT AS IDENTITY, 12 | # MAGIC batch_id STRING, 13 | # MAGIC session_process_name STRING NOT NULL, 14 | # MAGIC process_name STRING NOT NULL, 15 | # MAGIC status STRING NOT NULL, -- RUNNING, FAIL, SUCCESS, STALE 16 | # MAGIC start_timestamp TIMESTAMP NOT NULL, 17 | # MAGIC end_timestamp TIMESTAMP, 18 | # MAGIC duration_seconds DECIMAL, 19 | # MAGIC duration_ms DECIMAL, 20 | # MAGIC run_metadata STRING, -- String formatted like JSON 21 | # MAGIC update_timestamp TIMESTAMP, 22 | # MAGIC update_date DATE GENERATED ALWAYS AS (update_timestamp::date), 23 | # MAGIC start_date DATE GENERATED ALWAYS AS (start_timestamp::date), 24 | # MAGIC end_date DATE GENERATED ALWAYS AS (end_timestamp::date) 25 | # MAGIC ) 26 | # MAGIC USING DELTA 27 | # MAGIC 28 | # MAGIC ## Overivew 29 | # MAGIC The Delta logger is organized into Sessions, Processes, and Runs. 30 | # MAGIC A session is just like a Spark Session. It is an attempt at running a particular job/task. It is scoped like an active session each time the delta_logger is initialized. 31 | # MAGIC 32 | # MAGIC A session can then have one or many proesses running inside it. This is to allow for nested tracking of specific actions/processes within a Databricks job/notebook. By default, if a custom process name is not provided when starting a run, the session_process_name = active_process_name. 33 | # MAGIC 34 | # MAGIC Then, each process (can be 1 or many for each session/batch) can perform a run. A run is the smallest atomic unit. It is an ever-incrementing attempt at running a process. Runs then have start times, status, end times, metadata, etc. 35 | # MAGIC 36 | # MAGIC 37 | # MAGIC ## Initialize 38 | # MAGIC delta_logger = DeltaLogger(logger_table="main.iot_dashboard.pipeline_logs", 39 | # MAGIC session_process_name="iot_pipeline", 40 | # MAGIC batch_id = None ## Optional - allows user to pass in custom session batch id, by default a uuid is created for measuring a session id. 41 | # MAGIC logger_location=None, ## Optional location of the underlying table. S3/ADLS/GCS/dbfs path. 42 | # MAGIC partition_col:[str] = None ## Optional list of custom partition columns for the table. Allows user to customerize their logger to their query and management needs. 43 | # MAGIC ) 44 | # MAGIC 45 | # MAGIC - logger_table is the logging table you want to store and reference. You can create and manage as many logger tables as you would like. If you initilize a DeltaLogger and that table does not exist, it will create it for you. 46 | # MAGIC - session_process_name OPTIONAL - Users can log events/runs and pass the process_name into each event, or they can simply define it at the session level this way. This will default to using the session_process_name passed in here for the whole session. It can be overridden anytime. You can also use this to log child processes within a session by starting/completing runs with additional process names while using the session process name as the parent. And example is provided in this notebook. 47 | # MAGIC - logger_location OPTIONAL - default = None. This is an override for specifying a specific object storage location for where the user wants the table to live. If not provided, it will be a managed table by default (recommended). 
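As a quick illustration of how initialization, watermarking, and run tracking fit together, here is a minimal, hedged sketch of an incremental MERGE wrapped in a logged run. It follows the parameter and method names used in this notebook and in the method list below (exact signatures may vary), and the bronze/silver sensor tables are just the example tables used elsewhere in this repo.

from helperfunctions.deltalogger import DeltaLogger

delta_logger = DeltaLogger(logger_table_name="main.iot_dashboard.logger")

delta_logger.start_run(process_name="incremental_merge")

try:
    ## Watermark: timestamp of the last successful run for this process
    ## (defaults to '1900-01-01 00:00:00' if the process has never succeeded)
    watermark_ts = delta_logger.get_last_successful_run_timestamp(process_name="incremental_merge")

    new_rows = spark.sql(f"SELECT * FROM main.iot_dashboard.bronze_sensors WHERE timestamp > '{watermark_ts}'")
    new_rows.createOrReplaceTempView("incremental_batch")

    spark.sql("""
      MERGE INTO main.iot_dashboard.silver_sensors AS target
      USING incremental_batch AS source
      ON source.Id = target.Id AND source.user_id = target.user_id AND source.device_id = target.device_id
      WHEN MATCHED THEN UPDATE SET *
      WHEN NOT MATCHED THEN INSERT *
    """)

    delta_logger.log_run_metric(run_metrics_dict={"Rows_Affected": new_rows.count()})
    delta_logger.complete_run(process_name="incremental_merge")

except Exception as e:
    delta_logger.fail_run(process_name="incremental_merge", msg=str(e))
    raise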
48 | # MAGIC 49 | # MAGIC ## Methods: 50 | # MAGIC 51 | # MAGIC For most methods: -- if process_name not provided, will use session. If cannot find process_name, will error. 52 | # MAGIC 53 | # MAGIC - create_logger() -- creates a logger table if not exists. This also optimizes the table since it is used in initlialization. 54 | # MAGIC - drop_logger() -- drops the logger table attached to the session 55 | # MAGIC - truncate_logger() -- clears an existing logger table 56 | # MAGIC - start_run(process_name: Optional, msg: Optional) 57 | # MAGIC - fail_run(process_name: Optional, msg: Optional) 58 | # MAGIC - complete_run(process_name: Optional, msg: Optional) 59 | # MAGIC - log_run_metric(process_name: Optional, run_metric_dic:dict[str]) 60 | # MAGIC - log_run_info(log_level = 'INFO', msg = None) 61 | # MAGIC - get_last_successful_run_id(proces_name: Optional) -- If no previous successful run, return -1 62 | # MAGIC - get_last_successful_run_timestamp(process_name: Optional) -- If no previous successful run for the process, defaults to "1900-01-01 00:00:00" 63 | # MAGIC - get_last_run_id(process_name: Optional) -- Get last run id regardless of status, if none return -1 64 | # MAGIC - get_last_run_timestamp(process_name: Optional) -- Get last run timestamp , If no previous run for the process, defaults to "1900-01-01 00:00:00" 65 | # MAGIC - get_last_failed_run_id(process_name: Optional) 66 | # MAGIC - get_last_failed_run_timestamp(prcoess_name: Optional) 67 | # MAGIC - clean_stale_runs(process_name: Optional) -- Will mark any runs without and end timestamp in the running state to "STALE" and give them an end timestamp. This ONLY happens when a new run is created and the runs are < the max existing RUNNING run id 68 | # MAGIC - optimize_log(process_name:Optional, zorderCols=["end_timestamp", "start_timestamp", "run_id"]) -- Optimizes the underlying log table for a particular process name a ZORDERs by input col list 69 | # MAGIC ### Limitations / Considerations 70 | # MAGIC 1. Currently supports 1 concurrent run per process_name for a given delta table. If you want to run concurrent pipelines, you need to create separate process names for them. This is meant to be a simple run and logging tracking solution for EDW pipelines. 71 | # MAGIC 72 | # MAGIC 2. User can pass in the fully qualified table name, use the spark session defaults, or pass in catalog and database overrides to the parameters. Pick one. 73 | # MAGIC 74 | 75 | # COMMAND ---------- 76 | 77 | # MAGIC %md 78 | # MAGIC 79 | # MAGIC ## Design Patterns In this Example 80 | # MAGIC 81 | # MAGIC 1. Use for Basic error handling, tracking of runs of various processes 82 | # MAGIC 2. Use for watermarking loading patterns. i.e. Creating a new run automatically pulls the most recent previous successful run and provide a "watermark" variable you can utilize for incremental loading. Use delta_logger.get_last_succes 83 | # MAGIC 3. Use with DBSQL Client and Transaction Manager Together for end to end 84 | 85 | # COMMAND ---------- 86 | 87 | # DBTITLE 1,Import Logger 88 | from helperfunctions.deltalogger import DeltaLogger 89 | 90 | # COMMAND ---------- 91 | 92 | # DBTITLE 1,Initialize a Delta Logger (creates logger table referenced in not exists) 93 | ## Session_process_name - Name for the session of a notebook. By default it is the notebook path. 94 | ## Session_batch_id - Id for the session. By default it is a generated uuid for each delta_logger initialization (session). Can customize to any string. 
95 | ## partition_cols - Customer partition columns for a Delta table. By default the partition columns are: ['start_date', 'session_process_name', 'process_name'] 96 | 97 | ## All date colummns are auto-generated columns that are based on the timestamp columns in the table. 98 | 99 | delta_logger = DeltaLogger(logger_table_name="main.iot_dashboard.logger") 100 | 101 | # COMMAND ---------- 102 | 103 | # DBTITLE 1,Start A Run 104 | ## process_name - Optional additionl / sub process name within session. By default process_name is the same as the session process name 105 | ## batch_id - Optional Batch Id 106 | 107 | delta_logger.start_run() 108 | 109 | 110 | # COMMAND ---------- 111 | 112 | # DBTITLE 1,Get Active Run Info 113 | print(delta_logger.session_process_name) 114 | print(delta_logger.active_process_name) 115 | print(delta_logger.active_run_id) 116 | print(delta_logger.active_run_end_ts) 117 | print(delta_logger.active_run_start_ts) 118 | print(delta_logger.active_run_status) 119 | print(delta_logger.active_run_metadata) 120 | 121 | # COMMAND ---------- 122 | 123 | # DBTITLE 1,Log a Custom Named Metrics to Reference in Queries 124 | ## Seems to cancel out metrics when a manual process id is defined 125 | 126 | delta_logger.log_run_metric(run_metrics_dict={"Rows_Affected": 10000, "Percent_Success": 1}) 127 | 128 | # COMMAND ---------- 129 | 130 | # DBTITLE 1,Watermarking Example Baked into Logger with Process Run Start Times 131 | watermark_ts = delta_logger.get_most_recent_success_run_start_time() 132 | 133 | print(watermark_ts) 134 | 135 | # COMMAND ---------- 136 | 137 | # DBTITLE 1,Log Run Info/Messages 138 | delta_logger.log_run_info(log_level='INFO', msg = "This step did some weird stuff") 139 | 140 | # COMMAND ---------- 141 | 142 | delta_logger.log_run_info(log_level='WARN', msg = "This step did some weird stuff") 143 | 144 | # COMMAND ---------- 145 | 146 | # DBTITLE 1,Complete a Run 147 | delta_logger.complete_run() 148 | ## delta_logger.fail_run() 149 | 150 | # COMMAND ---------- 151 | 152 | # MAGIC %md 153 | # MAGIC 154 | # MAGIC ### Run a custom child/sub process name run within a session 155 | 156 | # COMMAND ---------- 157 | 158 | ## Start a customer process name run within a session 159 | 160 | ## This starts a run with this sub-process and registers the process_name as the active process 161 | delta_logger.start_run(process_name='MERGE STEP') 162 | 163 | 164 | # COMMAND ---------- 165 | 166 | delta_logger.log_run_metric(run_metrics_dict={"Rows_Affected": 40124, "Percent_Success": 0.5}) 167 | 168 | # COMMAND ---------- 169 | 170 | delta_logger.complete_run(process_name='MERGE STEP') 171 | 172 | # COMMAND ---------- 173 | 174 | # MAGIC %md 175 | # MAGIC 176 | # MAGIC ## Analyze and Use the Logs! 
177 | 178 | # COMMAND ---------- 179 | 180 | # DBTITLE 1,Select From Logger in order of events DESC 181 | # MAGIC %sql 182 | # MAGIC 183 | # MAGIC SELECT * 184 | # MAGIC FROM main.iot_dashboard.logger 185 | # MAGIC ORDER BY run_id DESC 186 | 187 | # COMMAND ---------- 188 | 189 | # DBTITLE 1,Analyze Custom Logged Metrics 190 | # MAGIC %sql 191 | # MAGIC 192 | # MAGIC SELECT 193 | # MAGIC session_process_name, 194 | # MAGIC process_name, 195 | # MAGIC date_trunc('HOUR', start_timestamp) AS EventHour, 196 | # MAGIC AVG(run_metadata:Rows_Affected) AS AvgRowsProcessed -- We can use our custom metrics in SQL Queries and Dashboards 197 | # MAGIC FROM main.iot_dashboard.logger 198 | # MAGIC GROUP BY 199 | # MAGIC session_process_name, 200 | # MAGIC process_name, 201 | # MAGIC date_trunc('HOUR', start_timestamp) 202 | # MAGIC ORDER BY EventHour 203 | 204 | # COMMAND ---------- 205 | 206 | # DBTITLE 1,Check the partition columns of this logger 207 | delta_logger.logger_partition_cols 208 | 209 | # COMMAND ---------- 210 | 211 | # DBTITLE 1,Check the ZORDER columns of this logger 212 | delta_logger.logger_zorder_cols 213 | 214 | # COMMAND ---------- 215 | 216 | # DBTITLE 1,Use the Delta Partitions/ZORDER cols To Easily Query Large Logger Table 217 | # MAGIC %sql 218 | # MAGIC 219 | # MAGIC -- Using Partition Pruning 220 | # MAGIC SELECT 221 | # MAGIC * 222 | # MAGIC FROM main.iot_dashboard.logger 223 | # MAGIC WHERE start_date = '2023-11-02'::DATE 224 | # MAGIC AND session_process_name = '/Repos/cody.davis@databricks.com/edw-best-practices/Using Delta Logger Example' 225 | # MAGIC 226 | 227 | # COMMAND ---------- 228 | 229 | # DBTITLE 1,Using the ZORDER cols to do analysis over time 230 | # MAGIC %sql 231 | # MAGIC 232 | # MAGIC 233 | # MAGIC -- Using Partition Pruning 234 | # MAGIC SELECT 235 | # MAGIC * 236 | # MAGIC FROM main.iot_dashboard.logger 237 | # MAGIC WHERE start_date = '2023-11-02'::DATE 238 | # MAGIC AND session_process_name = '/Repos/cody.davis@databricks.com/edw-best-practices/Using Delta Logger Example' 239 | # MAGIC AND start_timestamp BETWEEN (now() - INTERVAL 2 DAYS) AND now(); 240 | # MAGIC 241 | # MAGIC 242 | # MAGIC -- Using Partition Pruning 243 | # MAGIC SELECT 244 | # MAGIC * 245 | # MAGIC FROM main.iot_dashboard.logger 246 | # MAGIC WHERE start_date = '2023-11-02'::DATE 247 | # MAGIC AND session_process_name = '/Repos/cody.davis@databricks.com/edw-best-practices/Using Delta Logger Example' 248 | # MAGIC AND run_id BETWEEN 1 AND 5 249 | -------------------------------------------------------------------------------- /Using Delta Merge Helpers Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## Delta Merge Helpers: 5 | # MAGIC 6 | # MAGIC This is class with a set of static methods that help the user easily perform retry statements on operataions that may be cause a lot of conflicting transactions (usually in MERGE / UPDATE statements). 7 | # MAGIC 8 | # MAGIC
        • 1 Method: retrySqlStatement(spark: SparkSession, operation_name: String, sqlStatement: String) - the spark param is your existing Spark session, the operation name is simply an operation to identify your transaction, the sqlStatement parameter is the SQL statement you want to retry. 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %pip install -r helperfunctions/requirements.txt 13 | 14 | # COMMAND ---------- 15 | 16 | from helperfunctions.deltahelpers import DeltaMergeHelpers 17 | 18 | # COMMAND ---------- 19 | 20 | 21 | sql_statement = """ 22 | MERGE INTO iot_dashboard.silver_sensors AS target 23 | USING (SELECT Id::integer, 24 | device_id::integer, 25 | user_id::integer, 26 | calories_burnt::decimal, 27 | miles_walked::decimal, 28 | num_steps::decimal, 29 | timestamp::timestamp, 30 | value::string 31 | FROM iot_dashboard.bronze_sensors) AS source 32 | ON source.Id = target.Id 33 | AND source.user_id = target.user_id 34 | AND source.device_id = target.device_id 35 | WHEN MATCHED THEN UPDATE SET 36 | target.calories_burnt = source.calories_burnt, 37 | target.miles_walked = source.miles_walked, 38 | target.num_steps = source.num_steps, 39 | target.timestamp = source.timestamp 40 | WHEN NOT MATCHED THEN INSERT *; 41 | """ 42 | 43 | DeltaMergeHelpers.retrySqlStatement(spark, "merge_sensors", sqlStatement=sql_statement) 44 | -------------------------------------------------------------------------------- /Using Streaming Tables and MV Orchestrator.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## This library helps orchestrate Streaming tables in conjunction with other tables that may depend on synchronous updated from the streaming table for classical EDW loading patterns 5 | # MAGIC 6 | # MAGIC ## Assumptions / Best Practices 7 | # MAGIC 8 | # MAGIC 1. Assumes ST is NOT SCHEDULED in the CREATE STATEMENT (externally orchestrated) (that is a different loading pattern that is not as common in classical EDW) 9 | # MAGIC 10 | # MAGIC 2. Assumes that one or many pipelines are dependent upon the successful CREATe OR REFRESH of the streaming table, so this library will simply block the tasks from moving the job onto the rest of the DAG to ensure the downstream tasks actually read from the table when it finishes updated 11 | # MAGIC 12 | # MAGIC 3. This works best with a single node "Driver" notebook loading sql files from Git similar to how airflow would orchestrate locally. The single job node would then call spark.sql() to run the CREATE OR REFRESH and then you arent needing a warehouse and a DLT pipeline in the job for streaming refreshes. 13 | 14 | # COMMAND ---------- 15 | 16 | # MAGIC %md 17 | # MAGIC 18 | # MAGIC ## Library Steps 19 | # MAGIC 20 | # MAGIC ### This library only takes in 1 sql statement at a time, this is because if there are multiple and only some pass and others fail, then it would not be correct failing or passing the whole statement. Each ST/MV must be done separately. This can be done by simply calling the static methods multiple times. 21 | # MAGIC 22 | # MAGIC 1. Parse Streaming Table / MV Create / Refresh commmand 23 | # MAGIC 2. Identify ST / MV table(s) for that command 24 | # MAGIC 3. Run SQL command - CREATE / REFRESH ST/MV 25 | # MAGIC 4. DESCRIBE DETAIL to get pipelines.pipelineId metadata 26 | # MAGIC 5. Perform REST API Call to check for in-progress Refreshes 27 | # MAGIC 6. 
Poll and block statement chain from "finishing" until all pipelines identified are in either "PASS/FAIL" 28 | # MAGIC 7. If statement PASSES - then complete and return 29 | # MAGIC 8. If statement FAILS - then throw REFRESH FAIL exception 30 | 31 | # COMMAND ---------- 32 | 33 | from helperfunctions.stmvorchestrator import orchestrate_stmv_statement 34 | 35 | # COMMAND ---------- 36 | 37 | sql_statement = """ 38 | CREATE OR REFRESH STREAMING TABLE main.iot_dashboard.streaming_tables_raw_data 39 | AS SELECT 40 | id::bigint AS Id, 41 | device_id::integer AS device_id, 42 | user_id::integer AS user_id, 43 | calories_burnt::decimal(10,2) AS calories_burnt, 44 | miles_walked::decimal(10,2) AS miles_walked, 45 | num_steps::decimal(10,2) AS num_steps, 46 | timestamp::timestamp AS timestamp, 47 | value AS value -- This is a JSON object 48 | FROM STREAM read_files('dbfs:/databricks-datasets/iot-stream/data-device/*.json*', 49 | format => 'json', 50 | maxFilesPerTrigger => 12 -- what does this do when you 51 | ) 52 | """ 53 | 54 | # COMMAND ---------- 55 | 56 | orchestrate_stmv_statement(spark, dbutils, sql_statement=sql_statement) 57 | -------------------------------------------------------------------------------- /Using Transaction Manager Example.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md 3 | # MAGIC 4 | # MAGIC ## TO DO: 5 | # MAGIC 6 | # MAGIC 1. Continue to add edge cases on affected tables: RESTORE TABLE, OPTIMIZE 7 | # MAGIC 2. Ensure shapshot versions are created for tables that do not exists. if transaction fails and snapshot is -1, then run DROP TABLE IF EXISTS statement. 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %pip install -r helperfunctions/requirements.txt 12 | 13 | # COMMAND ---------- 14 | 15 | from helperfunctions.transactions import Transaction 16 | 17 | # COMMAND ---------- 18 | 19 | # DBTITLE 1,Example SQL Transaction Block 20 | sqlString = """ 21 | USE CATALOG hive_metastore; 22 | 23 | CREATE SCHEMA IF NOT EXISTS iot_dashboard; 24 | 25 | USE SCHEMA iot_dashboard; 26 | 27 | -- Create Tables 28 | CREATE OR REPLACE TABLE iot_dashboard.bronze_sensors 29 | ( 30 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 31 | device_id INT, 32 | user_id INT, 33 | calories_burnt DECIMAL(10,2), 34 | miles_walked DECIMAL(10,2), 35 | num_steps DECIMAL(10,2), 36 | timestamp TIMESTAMP, 37 | value STRING 38 | ) 39 | USING DELTA 40 | TBLPROPERTIES("delta.targetFileSize"="128mb"); 41 | 42 | CREATE OR REPLACE TABLE iot_dashboard.silver_sensors 43 | ( 44 | Id BIGINT GENERATED BY DEFAULT AS IDENTITY, 45 | device_id INT, 46 | user_id INT, 47 | calories_burnt DECIMAL(10,2), 48 | miles_walked DECIMAL(10,2), 49 | num_steps DECIMAL(10,2), 50 | timestamp TIMESTAMP, 51 | value STRING 52 | ) 53 | USING DELTA 54 | PARTITIONED BY (user_id) 55 | TBLPROPERTIES("delta.targetFileSize"="128mb"); 56 | 57 | -- Statement 1 -- the load 58 | COPY INTO iot_dashboard.bronze_sensors 59 | FROM (SELECT 60 | id::bigint AS Id, 61 | device_id::integer AS device_id, 62 | user_id::integer AS user_id, 63 | calories_burnt::decimal(10,2) AS calories_burnt, 64 | miles_walked::decimal(10,2) AS miles_walked, 65 | num_steps::decimal(10,2) AS num_steps, 66 | timestamp::timestamp AS timestamp, 67 | value AS value -- This is a JSON object 68 | FROM "/databricks-datasets/iot-stream/data-device/") 69 | FILEFORMAT = json 70 | COPY_OPTIONS('force'='true') -- 'false' -- process incrementally 71 | --option to be incremental or always load all files 72 | ; 73 
| 74 | -- Statement 2 75 | MERGE INTO iot_dashboard.silver_sensors AS target 76 | USING (SELECT Id::integer, 77 | device_id::integer, 78 | user_id::integer, 79 | calories_burnt::decimal, 80 | miles_walked::decimal, 81 | num_steps::decimal, 82 | timestamp::timestamp, 83 | value::string 84 | FROM iot_dashboard.bronze_sensors) AS source 85 | ON source.Id = target.Id 86 | AND source.user_id = target.user_id 87 | AND source.device_id = target.device_id 88 | WHEN MATCHED THEN UPDATE SET 89 | target.calories_burnt = source.calories_burnt, 90 | target.miles_walked = source.miles_walked, 91 | target.num_steps = source.num_steps, 92 | target.timestamp = source.timestamp 93 | WHEN NOT MATCHED THEN INSERT *; 94 | 95 | OPTIMIZE iot_dashboard.silver_sensors ZORDER BY (timestamp); 96 | 97 | -- This calculate table stats for all columns to ensure the optimizer can build the best plan 98 | -- Statement 3 99 | 100 | ANALYZE TABLE iot_dashboard.silver_sensors COMPUTE STATISTICS FOR ALL COLUMNS; 101 | 102 | CREATE OR REPLACE TABLE hourly_summary_statistics 103 | AS 104 | SELECT user_id, 105 | date_trunc('hour', timestamp) AS HourBucket, 106 | AVG(num_steps)::float AS AvgNumStepsAcrossDevices, 107 | AVG(calories_burnt)::float AS AvgCaloriesBurnedAcrossDevices, 108 | AVG(miles_walked)::float AS AvgMilesWalkedAcrossDevices 109 | FROM silver_sensors 110 | GROUP BY user_id,date_trunc('hour', timestamp) 111 | ORDER BY HourBucket; 112 | 113 | -- Statement 4 114 | -- Truncate bronze batch once successfully loaded 115 | TRUNCATE TABLE bronze_sensors; 116 | """ 117 | 118 | # COMMAND ---------- 119 | 120 | # MAGIC %md 121 | # MAGIC 122 | # MAGIC ## 3 Primary Ways to Do a Transaction 123 | # MAGIC 124 | # MAGIC 1. SQL - selected_tables: This allows the user to explicitly control which exact tables get snapshotted and rolledback - good for production where lots of jobs are running. 125 | # MAGIC 126 | # MAGIC 2. SQL - inferred_selected_tables This uses SQL Glot to automatically find tables that would be altered from the SQL inside the transaction block, and will snapshot those tables. Great for simplicity but should be checked in a test before moving to production 127 | # MAGIC 128 | # MAGIC 3. Python - call .begin_transaction() and rollback_transaction() methods manually do manage a transaction state. This allows for more complex logic outside of a contiguous multi statement SQL block 129 | 130 | # COMMAND ---------- 131 | 132 | # MAGIC %md 133 | # MAGIC 134 | # MAGIC ## Method 1: SQL - selected_tables 135 | 136 | # COMMAND ---------- 137 | 138 | # DBTITLE 1,Initialize Transaction Class - Manually Define Selected Tables 139 | x = Transaction(mode="selected_tables", uc_default=False) 140 | 141 | # COMMAND ---------- 142 | 143 | # DBTITLE 1,Execute a multi statement SQL transaction from a SQL string - Manually Defining 144 | ## This method is great because to do not need to rollback manually, it is handled for you 145 | ## This statement auto-commmits on success. If you do not want that, you can write pyspark or regular SQL outside of this method and then manually rollback 146 | x.execute_sql_transaction(sqlString, tables_to_manage=["hive_metastore.iot_dashboard.silver_sensors"]) 147 | 148 | # COMMAND ---------- 149 | 150 | # MAGIC %md 151 | # MAGIC ## Method 2: SQL - inferred_altered_tables 152 | 153 | # COMMAND ---------- 154 | 155 | y = Transaction(mode="inferred_altered_tables", uc_default=False) ## uc_default=True if you want to infer schema with main as default instead of hive_metastore. 
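## A rough sketch of the idea behind mode="inferred_altered_tables": parse the SQL block with
## sqlglot (already listed in helperfunctions/requirements.txt) and collect the target table of each
## write statement. The actual AlteredTableParser in helperfunctions/transactions.py may handle more
## cases (USE statements, session defaults, snapshots), so treat this as an illustration only.

import sqlglot
from sqlglot import exp

def infer_altered_tables(sql_string, dialect="databricks"):
    """Return the (possibly partially qualified) names of tables a SQL block writes to."""
    altered = set()
    write_nodes = (exp.Create, exp.Insert, exp.Merge, exp.Update, exp.Delete, exp.Drop)

    for statement in sqlglot.parse(sql_string, read=dialect):
        if statement is not None and isinstance(statement, write_nodes):
            ## The first Table node under a write statement is its target
            target = statement.find(exp.Table)
            if target is not None:
                altered.add(".".join(part for part in (target.catalog, target.db, target.name) if part))
    return altered

## Hypothetical example:
## infer_altered_tables("MERGE INTO iot_dashboard.silver_sensors AS t USING b AS s ON t.Id = s.Id WHEN MATCHED THEN UPDATE SET t.value = s.value")
## returns the MERGE target, {'iot_dashboard.silver_sensors'}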
156 | 157 | # COMMAND ---------- 158 | 159 | ## This statement auto-commmits on success. If you do not want that, you can write pyspark or regular SQL outside of this method and then manually rollback 160 | 161 | y.execute_sql_transaction(sqlString) 162 | 163 | # COMMAND ---------- 164 | 165 | # MAGIC %md 166 | # MAGIC 167 | # MAGIC ## Method 3: Python 168 | # MAGIC 169 | # MAGIC Call transaction begin and rollback and do any logic in between 170 | 171 | # COMMAND ---------- 172 | 173 | # DBTITLE 1,Begin Transaction in Python 174 | x.begin_transaction(tables_to_snapshot=["hive_metastore.iot_dashbaord.silver_sensors"]) 175 | 176 | # COMMAND ---------- 177 | 178 | ##### Do a bunch of logic here, any logic at all 179 | ##### 180 | 181 | # COMMAND ---------- 182 | 183 | # DBTITLE 1,Get Transaction Snapshot Info 184 | x.get_transaction_snapshot() 185 | 186 | # COMMAND ---------- 187 | 188 | # DBTITLE 1,Manually rollback a transaction from most recent explicit snapshot for tables 189 | ### If you use the SQL execute method, it auto commits!! So you cannot roll back once it succeed. It will do it automatically. You can still use all the manual methods if you want to opt out of auto handling the rollback/committ process 190 | x.rollback_transaction() 191 | -------------------------------------------------------------------------------- /helperfunctions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodyAustinDavis/edw-best-practices/523f7cec7e37a156e275439dc83767b26a85685e/helperfunctions/__init__.py -------------------------------------------------------------------------------- /helperfunctions/dbsqltransactions.py: -------------------------------------------------------------------------------- 1 | from helperfunctions.dbsqlclient import ServerlessClient 2 | from helperfunctions.transactions import Transaction, TransactionException, AlteredTableParser 3 | import warnings 4 | 5 | class DBSQLTransactionManager(Transaction): 6 | 7 | def __init__(self, warehouse_id, mode="selected_tables", uc_default=False, host_name=None, token=None): 8 | 9 | super().__init__(mode=mode, uc_default=uc_default) 10 | self.host_name = host_name 11 | self.token = token 12 | self.warehouse_id = warehouse_id 13 | 14 | ## other state 15 | self.use_sessions = None 16 | 17 | return 18 | 19 | 20 | ### Execute multi statment SQL, now we can implement this easier for Serverless or not Serverless 21 | def execute_dbsql_transaction(self, sql_string, tables_to_manage=[], force=False, return_type="message"): 22 | 23 | ## return_type = message (returns status messages), last_result (returns the result of the last command in the sql chain) 24 | ## If force= True, then if transaction manager fails to find tables, then it runs the SQL anyways 25 | ## You do not NEED to run SQL this way to rollback a transaction, 26 | ## but it automatically breaks up multiple statements in one SQL file into a series of spark.sql() commands 27 | 28 | serverless_client = ServerlessClient(warehouse_id = self.warehouse_id, token=self.token, host_name=self.host_name) ## token=, host_name=verbose=True for print statements and other debugging messages 29 | 30 | current_catalog = serverless_client.spark.sql("SELECT current_catalog()").collect()[0][0] 31 | current_schema = serverless_client.spark.sql("SELECT current_schema()").collect()[0][0] 32 | 33 | ## Add default USE session scopes if USE statement were defined outside of the SQL string in the same spark session 34 | 35 | try: 
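        ## Build a command-scoped "USE <current_catalog>.<current_schema>" prefix from the caller's
        ## active Spark session, so that unqualified table names in the submitted SQL resolve on the
        ## SQL warehouse the same way they would in the notebook session that built the string.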
36 | 37 | default_use_session_scope = 'USE ' + current_catalog + '.' + current_schema + '; ' 38 | 39 | ## Default to defaults if for some reason session level fetching fails 40 | except: 41 | if self.uc_default: 42 | default_use_session_scope = 'USE main.default; ' 43 | elif not self.uc_default: 44 | default_use_session_scope = 'USE hive_metastore.uc_default; ' 45 | else: 46 | raise(ValueError("Unable to infer current session and uc_default is not True or False. True = main.default, False = hive_metastore.default as the default base session")) 47 | 48 | scoped_sql_string = default_use_session_scope + sql_string 49 | 50 | result_df = None 51 | 52 | stmts = [i.strip() for i in scoped_sql_string.split(";") if len(i.strip()) >0] 53 | 54 | ## Save to class state 55 | self.raw_sql_statement = scoped_sql_string 56 | self.sql_statement_list = stmts 57 | 58 | success_tables = False 59 | 60 | try: 61 | 62 | self.begin_dynamic_transaction(tables_to_manage=tables_to_manage) 63 | success_tables = True 64 | 65 | except Exception as e: 66 | print(f"FAILED: failed to acquire tables with errors: {str(e)}") 67 | 68 | 69 | ## If succeeded or force = True, then run the SQL 70 | if success_tables or force: 71 | if success_tables == False and force == True: 72 | warnings.warn("WARNING: Failed to acquire tables but force flag = True, so SQL statement will run anyways") 73 | 74 | ## Run the Transaction Logic with Serverless Client 75 | 76 | try: 77 | print(f"TRANSACTION IN PROGRESS ...Running multi statement SQL transaction now\n") 78 | 79 | ###!! Since the DBSQL execution API does not understand multiple statements, we need to submit the USE commands in the correct order manually. This is done with the AlteredTableParser() 80 | 81 | ### Get the USE session tree and submit SQL statements according to that tree 82 | parser = AlteredTableParser() 83 | parser.parse_sql_chain_for_altered_tables(self.sql_statement_list) 84 | self.use_sessions = parser.get_use_session_tree() 85 | 86 | for i in self.use_sessions: 87 | 88 | session_catalog = i.get("session_cat") 89 | session_db = i.get("session_db") 90 | use_session_statemnts = i.get("sql_statements") 91 | 92 | #print(use_session_statemnts) 93 | 94 | for s in use_session_statemnts: 95 | 96 | single_st = s.get("statement") 97 | 98 | print(f"\nRunning \n {single_st}") 99 | 100 | if single_st is not None: 101 | 102 | ## Submit the single command with the session USE scoped commands from the Parser Tree 103 | ## OPTION 1: return status message 104 | if return_type == "message": 105 | 106 | result_df = serverless_client.submit_multiple_sql_commands(sql_statements=single_st, use_catalog=session_catalog, use_schema=session_db) 107 | 108 | elif return_type == "last_result": 109 | 110 | result_df = serverless_client.submit_multiple_sql_commands_last_results(sql_statements=single_st, use_catalog=session_catalog, use_schema=session_db) 111 | 112 | else: 113 | result_df = None 114 | print("No run mode selected, select 'message' or 'last_results'") 115 | 116 | 117 | print(f"\n TRANSACTION SUCCEEDED: Multi Statement SQL Transaction Successfull! Updating Snapshot\n ") 118 | self.commit_transaction() 119 | 120 | 121 | ## Return results after committing sucesss outside of the for loop 122 | return result_df 123 | 124 | 125 | except Exception as e: 126 | print(f"\n TRANSACTION FAILED to run all statements... 
ROLLING BACK \n") 127 | self.rollback_transaction() 128 | print(f"Rollback successful!") 129 | 130 | raise(e) 131 | 132 | else: 133 | 134 | raise(TransactionException(message="Failed to acquire tables and force=False, not running process.", errors="Failed to acquire tables and force=False, not running process.")) 135 | -------------------------------------------------------------------------------- /helperfunctions/deltahelpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | import re 4 | import os 5 | from datetime import datetime, timedelta 6 | import uuid 7 | from pyspark.sql import SparkSession 8 | from pyspark.sql.functions import col, count, lit, max 9 | from pyspark.sql.types import * 10 | 11 | 12 | ### Helps Materialize temp tables during ETL pipelines 13 | class DeltaHelpers(): 14 | 15 | 16 | def __init__(self, db_name="delta_temp", temp_root_path="dbfs:/delta_temp_db"): 17 | 18 | self.spark = SparkSession.getActiveSession() 19 | self.db_name = db_name 20 | self.temp_root_path = temp_root_path 21 | 22 | self.dbutils = None 23 | 24 | #if self.spark.conf.get("spark.databricks.service.client.enabled") == "true": 25 | try: 26 | from pyspark.dbutils import DBUtils 27 | self.dbutils = DBUtils(self.spark) 28 | 29 | except: 30 | 31 | import IPython 32 | self.dbutils = IPython.get_ipython().user_ns["dbutils"] 33 | 34 | self.session_id =self.dbutils.notebook.entry_point.getDbutils().notebook().getContext().notebookPath().get() 35 | self.temp_env = self.temp_root_path + self.session_id 36 | self.spark.sql(f"""DROP DATABASE IF EXISTS {self.db_name} CASCADE;""") 37 | self.spark.sql(f"""CREATE DATABASE IF NOT EXISTS {self.db_name} LOCATION '{self.temp_env}'; """) 38 | print(f"Initializing Root Temp Environment: {self.db_name} at {self.temp_env}") 39 | 40 | return 41 | 42 | 43 | def createOrReplaceTempDeltaTable(self, df, table_name): 44 | 45 | tblObj = {} 46 | new_table_id = table_name 47 | write_path = self.temp_env + new_table_id 48 | 49 | self.spark.sql(f"DROP TABLE IF EXISTS {self.db_name}.{new_table_id}") 50 | self.dbutils.fs.rm(write_path, recurse=True) 51 | 52 | df.write.format("delta").mode("overwrite").option("path", write_path).saveAsTable(f"{self.db_name}.{new_table_id}") 53 | 54 | persisted_df = self.spark.read.format("delta").load(write_path) 55 | return persisted_df 56 | 57 | def appendToTempDeltaTable(self, df, table_name): 58 | 59 | tblObj = {} 60 | new_table_id = table_name 61 | write_path = self.temp_env + new_table_id 62 | 63 | df.write.format("delta").mode("append").option("path", write_path).saveAsTable(f"{self.db_name}.{new_table_id}") 64 | 65 | persisted_df = self.spark.read.format("delta").load(write_path) 66 | return persisted_df 67 | 68 | def removeTempDeltaTable(self, table_name): 69 | 70 | table_path = self.temp_env + table_name 71 | self.dbutils.fs.rm(table_path, recurse=True) 72 | self.spark.sql(f"""DROP TABLE IF EXISTS {self.db_name}.{table_name}""") 73 | 74 | print(f"Temp Table: {table_name} has been deleted.") 75 | return 76 | 77 | def removeAllTempTablesForSession(self): 78 | 79 | self.dbutils.fs.rm(self.temp_env, recurse=True) 80 | ##spark.sql(f"""DROP DATABASE IF EXISTS {self.db_name} CASCADE""") This temp db name COULD be global, never delete without separate method 81 | print(f"All temp tables in the session have been removed: {self.temp_env}") 82 | return 83 | 84 | 85 | 86 | class SchemaHelpers(): 87 | 88 | def __init__(): 89 | import json 90 | return 91 | 92 | @staticmethod 93 | 
def getDDLString(structObj): 94 | import json 95 | ddl = [] 96 | for c in json.loads(structObj.json()).get("fields"): 97 | 98 | name = c.get("name") 99 | dType = c.get("type") 100 | ddl.append(f"{name}::{dType} AS {name}") 101 | 102 | final_ddl = ", ".join(ddl) 103 | return final_ddl 104 | 105 | @staticmethod 106 | def getDDLList(structObj): 107 | import json 108 | ddl = [] 109 | for c in json.loads(structObj.json()).get("fields"): 110 | 111 | name = c.get("name") 112 | dType = c.get("type") 113 | ddl.append(f"{name}::{dType} AS {name}") 114 | 115 | return ddl 116 | 117 | @staticmethod 118 | def getFlattenedSqlExprFromValueColumn(structObj): 119 | import json 120 | ddl = [] 121 | for c in json.loads(structObj.json()).get("fields"): 122 | 123 | name = c.get("name") 124 | dType = c.get("type") 125 | ddl.append(f"value:{name}::{dType} AS {name}") 126 | 127 | return ddl 128 | 129 | 130 | 131 | 132 | class DeltaMergeHelpers(): 133 | 134 | def __init__(self): 135 | return 136 | 137 | @staticmethod 138 | def retrySqlStatement(spark, operationName, sqlStatement, maxRetries = 10, maxSecondsBetweenAttempts=60): 139 | 140 | import time 141 | maxRetries = maxRetries 142 | numRetries = 0 143 | maxWaitTime = maxSecondsBetweenAttempts 144 | ### Does not check for existence, ensure that happens before merge 145 | 146 | while numRetries <= maxRetries: 147 | 148 | try: 149 | 150 | print(f"SQL Statement Attempt for {operationName} #{numRetries + 1}...") 151 | 152 | spark.sql(sqlStatement) 153 | 154 | print(f"SQL Statement Attempt for {operationName} #{numRetries + 1} Successful!") 155 | break 156 | 157 | except Exception as e: 158 | error_msg = str(e) 159 | 160 | print(f"Failed SQL Statment Attmpet for {operationName} #{numRetries} with error: {error_msg}") 161 | 162 | numRetries += 1 163 | if numRetries > maxRetries: 164 | break 165 | 166 | waitTime = waitTime = 2**(numRetries-1) ## Wait longer up to max wait time for failed operations 167 | 168 | if waitTime > maxWaitTime: 169 | waitTime = maxWaitTime 170 | 171 | print(f"Waiting {waitTime} seconds before next attempt on {operationName}...") 172 | time.sleep(waitTime) -------------------------------------------------------------------------------- /helperfunctions/redshiftchecker.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import col, count, lit, date_trunc 3 | 4 | class RedshiftChecker(): 5 | 6 | 7 | """ 8 | 9 | This class reads a table with optional filters from a Redshift environment, validates schema, rows counts, data types, and returns a diff 10 | Dependencies: 11 | 1. Assumes Redshift Connector is installed on the running cluster 12 | 2. Assumes cluster has IAM Instance profile access to the requested Databricks tables 13 | 14 | ## TO DO: 15 | 1. Add Data Type Comparisons 16 | 2. 
Add Row-level comparisons 17 | """ 18 | 19 | def __init__(self, connectionString, iamRole, tempDir): 20 | 21 | print(f"Initialized Redshift Data Checker") 22 | self.spark = SparkSession.getActiveSession() 23 | self.connectionString = connectionString 24 | self.iamRole = iamRole 25 | self.tempDir = tempDir 26 | 27 | #### Build a Query and return the result 28 | 29 | def getSpark(self): 30 | return self.spark 31 | 32 | def getQuery(self, tableName, dateFilterColumn=None, startDateTime=None, endDateTime=None, limit=None): 33 | 34 | tableName = tableName 35 | 36 | sqlQuery = f"""SELECT * FROM {tableName}""" 37 | dateFilterColumn = dateFilterColumn 38 | startDateTime = startDateTime 39 | endDateTime = endDateTime 40 | 41 | try: 42 | 43 | if dateFilterColumn is not None: 44 | 45 | if (endDateTime is not None) and (startDateTime is not None): 46 | sqlFilter = f""" WHERE {dateFilterColumn} BETWEEN 47 | (CASE WHEN '{startDateTime}' = 'None' THEN now() ELSE '{startDateTime}'::timestamp END) 48 | AND 49 | (CASE WHEN '{endDateTime}' = 'None' THEN now() ELSE '{endDateTime}'::timestamp END)""" 50 | filteredQuery = sqlQuery + sqlFilter 51 | 52 | elif startDateTime is not None: 53 | sqlFilter = f""" WHERE {dateFilterColumn} BETWEEN 54 | (CASE WHEN '{startDateTime}' = 'None' THEN now() ELSE '{startDateTime}'::timestamp END) 55 | AND 56 | now()""" 57 | filteredQuery = sqlQuery + sqlFilter 58 | 59 | else: 60 | filteredQuery = sqlQuery 61 | 62 | else: 63 | filteredQuery = sqlQuery 64 | 65 | ## filteredQuery 66 | ## Limit query if supplied 67 | if isinstance(limit, int): 68 | limitStr = f""" LIMIT {limit}""" 69 | finalQuery = filteredQuery + limitStr 70 | 71 | elif limit is None: 72 | finalQuery = filteredQuery 73 | else: 74 | finalQuery = filteredQuery 75 | print("No valid limit provided... 
not limiting table...") 76 | 77 | except Exception as e: 78 | print(f"ERROR: Please provide a valid date filter or limit: {str(e)}") 79 | 80 | return finalQuery 81 | 82 | #### Get Redshift Table from a query 83 | def getRedshiftQueryResult(self, query): 84 | 85 | rsh_query = query 86 | redshift_df = ( self.spark.read 87 | .format("com.databricks.spark.redshift") 88 | .option("url", self.connectionString) 89 | .option("query", rsh_query) 90 | .option("tempdir", self.tempDir) 91 | .option("aws_iam_role", self.iamRole) 92 | .load() 93 | ) 94 | 95 | return redshift_df 96 | 97 | #### Get Databricks Table from a query 98 | def getDatabricksQueryResults(self, query): 99 | 100 | dbx_query = query 101 | databricks_df = self.spark.sql(dbx_query) 102 | 103 | return databricks_df 104 | 105 | #### Get Databricks Table 106 | def getDatabricksTable(self, tableName, dateFilterColumn=None, startDateTime=None, endDateTime=None, limit=None): 107 | 108 | finalQuery = self.getQuery(tableName, dateFilterColumn, startDateTime, endDateTime, limit) 109 | databricks_df = self.getDatabricksQueryResults(finalQuery) 110 | return databricks_df 111 | 112 | #### Get Redshift Table 113 | def getRedshiftTable(self, tableName, dateFilterColumn=None, startDateTime=None, endDateTime=None, limit=None): 114 | 115 | finalQuery = self.getQuery(tableName, dateFilterColumn, startDateTime, endDateTime, limit) 116 | redshift_df = self.getRedshiftQueryResult(finalQuery) 117 | return redshift_df 118 | 119 | 120 | def compareColumnsOfTable(self, redshiftTableName, databricksTableName): 121 | 122 | redshift_table = self.getRedshiftTable(redshiftTableName).columns 123 | dbx_table = self.getDatabricksTable(databricksTableName).columns 124 | 125 | int_cols = ','.join(list(set(redshift_table).intersection(set(dbx_table)))) 126 | in_dbx_not_redshift = ','.join([i for i in dbx_table if i not in int_cols]) 127 | in_redshift_not_dbx = ','.join([i for i in redshift_table if i not in int_cols]) 128 | 129 | cols_schema = ['in_both', 'in_redshift_not_databricks', 'in_databricks_not_redshift'] 130 | data = [[int_cols, in_redshift_not_dbx, in_dbx_not_redshift]] 131 | 132 | cols_comp_df = self.spark.createDataFrame(data, cols_schema) 133 | 134 | return cols_comp_df 135 | 136 | 137 | def compareRowCountOfTable(self, redsfhitTableName, databricksTableName, dateFilterColumn=None, startDateTime=None, endDateTime = None, limit=None, groupByAgg='all'): 138 | 139 | from pyspark.sql.functions import date_trunc 140 | ## Group by agg options 141 | #None -- All Rows will be counted and compared 142 | #all -- same as None, all rows will be counted 143 | #day -- All rows within the range will be counted and grouped by day 144 | #hour -- All rows within the range will be counted and grouped by hour 145 | #minute -- All rows within the range will be counted and grouped by minute 146 | 147 | ## If dateFilter column is None, just count whole table 148 | redshift_table = self.getRedshiftTable(redsfhitTableName, dateFilterColumn, startDateTime, endDateTime, limit) 149 | dbx_table = self.getDatabricksTable(databricksTableName, dateFilterColumn, startDateTime, endDateTime, limit) 150 | 151 | if (groupByAgg.lower() == 'all') or (groupByAgg is None) or (dateFilterColumn is None): 152 | 153 | red_times = (redshift_table 154 | .agg(count("*").alias("RedshiftRowCount")) 155 | .withColumn("condition", lit("Full Table Row Counts")) 156 | ) 157 | 158 | dbx_times = (dbx_table 159 | .agg(count("*").alias("DatabricksRowCount")) 160 | .withColumn("condition", lit("Full Table Row 
Counts")) 161 | ) 162 | 163 | final_df = red_times.join(dbx_times, on="condition", how="full_outer") 164 | return final_df 165 | 166 | elif groupByAgg.lower() in ['day', 'hour', 'minute', 'month', 'year']: 167 | 168 | red_times = (redshift_table 169 | .withColumn("date_col", date_trunc(groupByAgg, col(dateFilterColumn))) 170 | .groupBy("date_col") 171 | .agg(count(dateFilterColumn).alias("RedshiftRowCount")) 172 | .orderBy("date_col") 173 | ) 174 | 175 | dbx_times = (dbx_table 176 | .withColumn("date_col", date_trunc(groupByAgg, col(dateFilterColumn))) 177 | .groupBy("date_col") 178 | .agg(count(dateFilterColumn).alias("DatabricksRowCount")) 179 | .orderBy("date_col") 180 | ) 181 | 182 | 183 | final_df = red_times.join(dbx_times, on="date_col", how="full_outer") 184 | return final_df 185 | 186 | else: 187 | print("ERROR: please provide valid grouping, or dont provide one at all :)") 188 | return -------------------------------------------------------------------------------- /helperfunctions/requirements.txt: -------------------------------------------------------------------------------- 1 | sqlglot 2 | pyarrow -------------------------------------------------------------------------------- /helperfunctions/stmvorchestrator.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | import time 4 | 5 | 6 | ## Function to block Create or REFRESH of ST or MV statements to wait until it is finishing before moving to next task 7 | 8 | ## Similar to the awaitTermination() method in a streaming pipeline 9 | 10 | ## Only supports 1 sql statement at a time on purpose 11 | 12 | def orchestrate_stmv_statement(spark, dbutils, sql_statement, host_name=None, token=None): 13 | 14 | host_name = None 15 | token = None 16 | 17 | ## Infer hostname from same workspace 18 | if host_name is not None: 19 | host_name = host_name 20 | 21 | else: 22 | host_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiUrl().getOrElse(None).replace("https://", "") 23 | 24 | ## Automatically get user token if none provided 25 | if token is not None: 26 | token = token 27 | else: 28 | token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().getOrElse(None) 29 | 30 | 31 | ## Get current catalogs/schemas from outside USE commands 32 | current_schema = spark.sql("SELECT current_schema()").collect()[0][0] 33 | current_catalog = spark.sql("SELECT current_catalog()").collect()[0][0] 34 | 35 | if current_catalog == 'spark_catalog': 36 | current_catalog = 'hive_metastore' 37 | 38 | 39 | ## Check for multiple statements, if more than 1, than raise too many statement exception 40 | all_statements = re.split(";", sql_statement) 41 | 42 | if (len(all_statements) > 1): 43 | print("WARNING: There are more than one statements in this sql command, this function will just pick and try to run the first statement and ignore the rest.") 44 | 45 | 46 | sql_statement = all_statements[0] 47 | 48 | 49 | try: 50 | 51 | ## Get table/mv that is being refreshed 52 | table_match = re.split("CREATE OR REFRESH STREAMING TABLE\s|REFRESH STREAMING TABLE\s|CREATE OR REFRESH MATERIALIZED VIEW\s|REFRESH MATERIALIZED VIEW\s", sql_statement.upper())[1].split(" ")[0] 53 | 54 | except Exception as e: 55 | 56 | ## If it was not able to find a REFRESH statement, ignore and unblock the operation and move on (i.e. if its not an ST/MV or if its just a CREATE) 57 | 58 | print("WARNING: No ST / MV Refresh statements found. 
Moving on.") 59 | return 60 | 61 | ## If ST/MV refresh was found 62 | 63 | if (len(table_match.split(".")) == 3): 64 | ## fully qualified, dont change it 65 | pass 66 | elif (len(table_match.split(".")) == 2): 67 | table_match = current_catalog + "." + table_match 68 | 69 | elif(len(table_match.split(".")) == 1): 70 | table_match = current_catalog + "." + current_schema + "." + table_match 71 | 72 | 73 | ## Step 2 - Execute SQL Statement 74 | spark.sql(sql_statement) 75 | 76 | 77 | ## Step 3 - Get pipeline Id for table 78 | active_pipeline_id = (spark.sql(f"DESCRIBE DETAIL {table_match}") 79 | .selectExpr("properties").take(1)[0][0] 80 | .get("pipelines.pipelineId") 81 | ) 82 | 83 | ## Poll for pipeline status 84 | 85 | 86 | current_state = "UNKNOWN" 87 | 88 | ## Pipeline is active 89 | while current_state not in ("FAILED", "IDLE"): 90 | 91 | url = "https://" + host_name + "/api/2.0/pipelines/" 92 | headers_auth = {"Authorization":f"Bearer {token}"} 93 | 94 | check_status_resp = requests.get(url + active_pipeline_id , headers=headers_auth).json() 95 | 96 | current_state = check_status_resp.get("state") 97 | 98 | if current_state == "IDLE": 99 | print(f"STMV Pipeline {active_pipeline_id} completed! \n Moving on") 100 | return 101 | 102 | elif current_state == "FAILED": 103 | raise(BaseException(f"PIPELINE {active_pipeline_id} FAILED!")) 104 | 105 | 106 | else: 107 | ## Wait before polling again 108 | ## TODO: Do exponential backoff 109 | time.sleep(5) 110 | 111 | --------------------------------------------------------------------------------