├── SQL ├── ~Configs │ ├── SCD-0-SetupSCD3.py │ ├── SCD-0-SetupSCD1.py │ ├── SCD-0-SetupSCD2.py │ ├── SCD-0-SetupSCD4.py │ └── SCD-0-Init.py ├── SCD-Start.sql ├── SCD-Type1.sql ├── SCD-Type4.sql ├── SCD-Type3.sql └── SCD-Type2.sql └── Python ├── ~Configs ├── SCD-0-SetupSCD1.py ├── SCD-0-SetupSCD3.py ├── SCD-0-SetupSCD2.py ├── SCD-0-SetupSCD4.py └── SCD-0-Init.py ├── SCD-Start.py ├── SCD-Type1.py ├── SCD-Type3.py └── SCD-Type2.py /SQL/~Configs/SCD-0-SetupSCD3.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType3; 4 | # MAGIC CREATE TABLE scd.scdType3 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_country FROM scd.employees 6 | # MAGIC ORDER BY employee_id 7 | -------------------------------------------------------------------------------- /SQL/~Configs/SCD-0-SetupSCD1.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType1; 4 | # MAGIC CREATE TABLE scd.scdType1 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title FROM scd.employees 6 | -------------------------------------------------------------------------------- /Python/~Configs/SCD-0-SetupSCD1.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType1; 4 | # MAGIC CREATE TABLE scd.scdType1 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title FROM scd.employees 6 | -------------------------------------------------------------------------------- /Python/~Configs/SCD-0-SetupSCD3.py: 
-------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType3; 4 | # MAGIC CREATE TABLE scd.scdType3 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_country FROM scd.employees 6 | # MAGIC ORDER BY employee_id 7 | 8 | # COMMAND ---------- 9 | 10 | 11 | -------------------------------------------------------------------------------- /SQL/~Configs/SCD-0-SetupSCD2.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType2; 4 | # MAGIC CREATE TABLE scd.scdType2 USING delta AS 5 | # MAGIC SELECT (employee_id - 1) AS id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, start_date, end_date FROM scd.employees 6 | # MAGIC ORDER BY employee_id 7 | -------------------------------------------------------------------------------- /Python/~Configs/SCD-0-SetupSCD2.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType2; 4 | # MAGIC CREATE TABLE scd.scdType2 USING delta AS 5 | # MAGIC SELECT (employee_id - 1) AS id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, start_date, end_date FROM scd.employees 6 | # MAGIC ORDER BY employee_id 7 | -------------------------------------------------------------------------------- /SQL/~Configs/SCD-0-SetupSCD4.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType4; 4 | # MAGIC CREATE TABLE scd.scdType4 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_street, address_city, 
address_country, email, job_title FROM scd.employees 6 | # MAGIC 7 | # MAGIC ORDER BY employee_id 8 | # MAGIC LIMIT 5 9 | 10 | # COMMAND ---------- 11 | 12 | 13 | -------------------------------------------------------------------------------- /Python/~Configs/SCD-0-SetupSCD4.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType4; 4 | # MAGIC CREATE TABLE scd.scdType4 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title FROM scd.employees 6 | # MAGIC 7 | # MAGIC ORDER BY employee_id 8 | # MAGIC LIMIT 5 9 | 10 | # COMMAND ---------- 11 | 12 | 13 | -------------------------------------------------------------------------------- /SQL/SCD-Start.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Starting Notebook 3 | -- MAGIC Initiallise Datasets and Configs 4 | 5 | -- COMMAND ---------- 6 | 7 | -- MAGIC %md _C.Williams - 2021_ 8 | 9 | -- COMMAND ---------- 10 | 11 | -- MAGIC %md ### Setup Dataset 12 | 13 | -- COMMAND ---------- 14 | 15 | -- MAGIC %md Run settings notebook 16 | 17 | -- COMMAND ---------- 18 | 19 | -- MAGIC %run "./~Configs/SCD-0-Init" 20 | 21 | -- COMMAND ---------- 22 | 23 | -- MAGIC %md Get sample dataset 24 | 25 | -- COMMAND ---------- 26 | 27 | -- Preview Dataset 28 | SELECT * FROM EmployeeSample 29 | 30 | -- COMMAND ---------- 31 | 32 | -- MAGIC %md ### Write Dataset to DB 33 | -- MAGIC For use in other notebooks 34 | 35 | -- COMMAND ---------- 36 | 37 | -- Create DB if it doesn't already exist 38 | CREATE DATABASE IF NOT EXISTS scd; 39 | -- Drop tabl if it already exists 40 | DROP TABLE IF EXISTS scd.employees; 41 | -- Create new table (delta format) using the sample Employee dataset 42 | CREATE TABLE scd.employees USING delta 43 | AS SELECT * FROM 
EmployeeSample 44 | -------------------------------------------------------------------------------- /Python/SCD-Start.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Starting Notebook 3 | # MAGIC Initiallise Datasets and Configs 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md _C.Williams - 2021_ 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md ### Setup Dataset 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %md Run settings notebook 16 | 17 | # COMMAND ---------- 18 | 19 | # MAGIC %run "./~Configs/SCD-0-Init" 20 | 21 | # COMMAND ---------- 22 | 23 | # MAGIC %md Get sample dataset 24 | 25 | # COMMAND ---------- 26 | 27 | # Preview Dataset 28 | display(sql("SELECT * FROM EmployeeSample")) 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %md ### Write Dataset to DB 33 | # MAGIC For use in other notebooks 34 | 35 | # COMMAND ---------- 36 | 37 | # Create DB if it doesn't already exist 38 | sql("CREATE DATABASE IF NOT EXISTS scd") 39 | # Drop table if it already exists 40 | sql("DROP TABLE IF EXISTS scd.employees") 41 | # Create new table (delta format) using the sample Employee dataset 42 | sql("CREATE TABLE scd.employees USING delta AS SELECT * FROM EmployeeSample") 43 | -------------------------------------------------------------------------------- /SQL/~Configs/SCD-0-Init.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import urllib.request, json, requests 3 | from pyspark.sql.functions import * 4 | from pyspark.sql.types import * 5 | 6 | # COMMAND ---------- 7 | 8 | # Set GitHub URL 9 | url = 'https://raw.githubusercontent.com/cwilliams87/Blog/main/07-2021/sampleEmployees' 10 | 11 | # COMMAND ---------- 12 | 13 | def getSample(): 14 | 15 | # Get Url 16 | resp = requests.get(url) 17 | jsonData = json.loads(resp.text) 18 | data = sc.parallelize(jsonData) 19 | 20 | # Set Schema 21 | schema = 
StructType( 22 | [ 23 | StructField("id", LongType(), True), 24 | StructField("first_name", StringType(), True), 25 | StructField("last_name", StringType(), True), 26 | StructField("gender", StringType(), True), 27 | StructField("address_street", StringType(), True), 28 | StructField("address_city", StringType(), True), 29 | StructField("address_country", StringType(), True), 30 | StructField("email", StringType(), True), 31 | StructField("job_title", StringType(), True), 32 | StructField("start_date", StringType(), True), 33 | StructField("end_date", StringType(), True) 34 | ] 35 | ) 36 | 37 | df = spark.createDataFrame(data, schema) 38 | 39 | # Convert Date Columns 40 | df = df.withColumn("start_date", col("start_date").cast(DateType())) 41 | df = df.withColumn("end_date", col("end_date").cast(DateType())) 42 | 43 | # Rename Column (for later SCD's) 44 | df = df.withColumnRenamed('id', 'employee_id') 45 | 46 | # Additional column for SCD Type 3 47 | df = df.withColumn('previous_country', lit(None).cast(StringType())) 48 | 49 | df.registerTempTable("EmployeeSample") 50 | 51 | return 52 | 53 | 54 | # COMMAND ---------- 55 | 56 | getSample() 57 | -------------------------------------------------------------------------------- /Python/~Configs/SCD-0-Init.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import urllib.request, json, requests 3 | from pyspark.sql.functions import * 4 | from pyspark.sql.types import * 5 | 6 | # COMMAND ---------- 7 | 8 | # Set GitHub URL 9 | url = 'https://raw.githubusercontent.com/cwilliams87/Blog/main/07-2021/sampleEmployees' 10 | 11 | # COMMAND ---------- 12 | 13 | def getSample(): 14 | 15 | # Get Url 16 | resp = requests.get(url) 17 | jsonData = json.loads(resp.text) 18 | data = sc.parallelize(jsonData) 19 | 20 | # Set Schema 21 | schema = StructType( 22 | [ 23 | StructField("id", LongType(), True), 24 | StructField("first_name", StringType(), True), 25 | 
StructField("last_name", StringType(), True), 26 | StructField("gender", StringType(), True), 27 | StructField("address_street", StringType(), True), 28 | StructField("address_city", StringType(), True), 29 | StructField("address_country", StringType(), True), 30 | StructField("email", StringType(), True), 31 | StructField("job_title", StringType(), True), 32 | StructField("start_date", StringType(), True), 33 | StructField("end_date", StringType(), True) 34 | ] 35 | ) 36 | 37 | df = spark.createDataFrame(data, schema) 38 | 39 | # Convert Date Columns 40 | df = df.withColumn("start_date", col("start_date").cast(DateType())) 41 | df = df.withColumn("end_date", col("end_date").cast(DateType())) 42 | 43 | # Rename Column (for later SCD's) 44 | df = df.withColumnRenamed('id', 'employee_id') 45 | 46 | # Additional column for SCD Type 3 47 | df = df.withColumn('previous_country', lit(None).cast(StringType())) 48 | 49 | df.registerTempTable("EmployeeSample") 50 | 51 | return 52 | 53 | 54 | # COMMAND ---------- 55 | 56 | getSample() 57 | -------------------------------------------------------------------------------- /SQL/SCD-Type1.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md
https://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-1/
5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ### Setup SCD Type 1 example table 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %run "./~Configs/SCD-0-SetupSCD1" 13 | 14 | -- COMMAND ---------- 15 | 16 | USE scd 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md ### Load employee table 21 | 22 | -- COMMAND ---------- 23 | 24 | -- Select employees table (ordered by id column) 25 | SELECT * FROM scd.scdType1 26 | ORDER BY employee_id 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md ### Amend Job Title 31 | 32 | -- COMMAND ---------- 33 | 34 | -- MAGIC %mdThe employee Stu Sand (employee_id = 2) has changed their job title from Paralegal to Solicitor.
35 | -- MAGIC
We want to check if the row exists in the table, insert if new and overwrite if exists.
36 | -- MAGIC
Insert if new, Update if already exists
64 | 65 | -- COMMAND ---------- 66 | 67 | -- Merge scdType1NEW dataset into existing 68 | MERGE INTO scdType1 69 | USING scdType1NEW 70 | 71 | -- based on the following column(s) 72 | ON scdType1.employee_id = scdType1NEW.employee_id 73 | 74 | -- if there is a match do this... 75 | WHEN MATCHED THEN UPDATE SET * 76 | -- if there is no match insert new row 77 | WHEN NOT MATCHED THEN INSERT * 78 | 79 | -- COMMAND ---------- 80 | 81 | -- MAGIC %md ### Check Row 82 | -- MAGICCheck row and value(s) have been updated
83 | 84 | -- COMMAND ---------- 85 | 86 | -- Check table 87 | SELECT * FROM scdType1 88 | WHERE first_name = 'Stu' AND last_name = 'Sand' 89 | 90 | -- COMMAND ---------- 91 | 92 | -- Check Delta history 93 | DESCRIBE HISTORY scdType1 94 | 95 | -- COMMAND ---------- 96 | 97 | -- Clean up 98 | DROP VIEW IF EXISTS scdType1NEW 99 | -------------------------------------------------------------------------------- /Python/SCD-Type1.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-1/
5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ### Setup SCD Type 1 example table 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %run "./~Configs/SCD-0-SetupSCD1" 13 | 14 | # COMMAND ---------- 15 | 16 | # Import dependencies 17 | from delta.tables import * 18 | from pyspark.sql.functions import * 19 | from pyspark.sql.types import * 20 | 21 | # COMMAND ---------- 22 | 23 | # Set default database 24 | spark.catalog.setCurrentDatabase("scd") 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md ### Load employee table 29 | 30 | # COMMAND ---------- 31 | 32 | # Create dataframe from HIVE db scd 33 | scdType1DF = spark.table("scd.scdType1") 34 | 35 | # COMMAND ---------- 36 | 37 | # Display dataframe 38 | display(scdType1DF.orderBy("employee_id")) 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md ### Amend Job Title 43 | 44 | # COMMAND ---------- 45 | 46 | # MAGIC %mdThe employee Stu Sand (employee_id = 2) has changed their job title from Paralegal to Solicitor.
47 | # MAGIC
We want to check if the row exists in the table, insert if new and overwrite if exists.
48 | # MAGIC
Insert if new, Update if already exists
80 | 81 | # COMMAND ---------- 82 | 83 | # Convert table to Delta 84 | deltaTable = DeltaTable.forName(spark, "scdType1") 85 | 86 | # Merge Delta table with new dataset 87 | ( 88 | deltaTable 89 | .alias("original1") 90 | # Merge using the following conditions 91 | .merge( 92 | scd1Temp.alias("updates1"), 93 | "original1.employee_id = updates1.employee_id" 94 | ) 95 | # When matched UPDATE ALL values 96 | .whenMatchedUpdateAll() 97 | # When not matched INSERT ALL rows 98 | .whenNotMatchedInsertAll() 99 | # Execute 100 | .execute() 101 | ) 102 | 103 | # COMMAND ---------- 104 | 105 | # MAGIC %md ### Check Row 106 | # MAGICCheck row and value(s) have been updated
107 | 108 | # COMMAND ---------- 109 | 110 | # Check 111 | display( 112 | deltaTable.toDF().orderBy("employee_id").where("first_name = 'Stu' AND last_name = 'Sand'") 113 | ) 114 | 115 | # COMMAND ---------- 116 | 117 | # Check Delta History 118 | display(deltaTable.history()) 119 | -------------------------------------------------------------------------------- /SQL/SCD-Type4.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-4/
5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ### Setup SCD Type 4 example table 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %run "./~Configs/SCD-0-SetupSCD4" 13 | 14 | -- COMMAND ---------- 15 | 16 | USE scd 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md ### Load employee table 21 | 22 | -- COMMAND ---------- 23 | 24 | -- Select employees table (ordered by id column) 25 | SELECT * FROM scd.scdType4 26 | ORDER BY employee_id 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md ### Amend Email Address and Name 31 | 32 | -- COMMAND ---------- 33 | 34 | -- MAGIC %mdThe employees Rae Maith (employee_id = 1) and Stu Sand (employee_id = 2) have changes to be made.
35 | -- MAGIC
Rae needs a change of email address whereas Stu only shows a shortened version of his name which should be Stuart.</br>
36 | -- MAGIC
37 | -- MAGIC
We can make the changes using a similar approach to Type 1 SCD and use the Delta History to TIME TRAVEL to historical versions. This will create functionality similar to a traditional Type 4 Slowly Changing Dimension e.g. active dataset and history table.</br>
38 | -- MAGIC
Insert if new, Update if already exists
67 | 68 | -- COMMAND ---------- 69 | 70 | -- Merge scdType1NEW dataset into existing 71 | MERGE INTO scdType4 72 | USING scdType4NEW 73 | 74 | -- based on the following column(s) 75 | ON scdType4.employee_id = scdType4NEW.employee_id 76 | 77 | -- if there is a match do this... 78 | WHEN MATCHED THEN 79 | UPDATE SET * 80 | -- if there is no match insert new row 81 | WHEN NOT MATCHED THEN INSERT * 82 | 83 | -- COMMAND ---------- 84 | 85 | -- MAGIC %md ### Check Row 86 | -- MAGICCheck row and value(s) have been updated
87 | 88 | -- COMMAND ---------- 89 | 90 | -- Check table for changed rows 91 | SELECT * FROM scdType4 92 | 93 | -- COMMAND ---------- 94 | 95 | -- Check Delta history 96 | DESCRIBE HISTORY scdType4 97 | 98 | -- COMMAND ---------- 99 | 100 | -- MAGIC %md ### View previous version of table 101 | 102 | -- COMMAND ---------- 103 | 104 | -- View current and previous versions 105 | SELECT 0 AS version, * FROM scdType4 VERSION AS OF 0 106 | WHERE employee_id = 1 OR employee_id = 2 107 | UNION ALL 108 | SELECT 1 AS version, * FROM scdType4 VERSION AS OF 1 109 | WHERE employee_id = 1 OR employee_id = 2 110 | 111 | -- COMMAND ---------- 112 | 113 | -- MAGIC %md ### Rollback to previous versions 114 | 115 | -- COMMAND ---------- 116 | 117 | -- The table can be restored to a previous version 118 | RESTORE TABLE scdType4 TO VERSION AS OF 0 119 | 120 | -- COMMAND ---------- 121 | 122 | SELECT * FROM scdType4 123 | 124 | -- COMMAND ---------- 125 | 126 | -- Clean up 127 | DROP VIEW IF EXISTS scdType4NEW 128 | -------------------------------------------------------------------------------- /SQL/SCD-Type3.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-3/
5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ### Setup SCD Type 3 example table 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %run "./~Configs/SCD-0-SetupSCD3" 13 | 14 | -- COMMAND ---------- 15 | 16 | USE scd 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md ### Load employee table 21 | 22 | -- COMMAND ---------- 23 | 24 | -- Select employees table (ordered by id column) 25 | SELECT * FROM scdType3 26 | ORDER BY employee_id 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md ### Amend Country 31 | 32 | -- COMMAND ---------- 33 | 34 | -- MAGIC %md The employees **Maximo Moxon** (employee_id = 9), **Augueste Dimeloe** (employee_id = 10) and **Austina Wimbury** (employee_id = 11) have all relocated to an office in a different country. \ 35 | -- MAGIC We want to amend the country values and create an accompanying column to display the previous ones as well as inserting any rows that may be new. 36 | 37 | -- COMMAND ---------- 38 | 39 | -- Drop if already exists (for notebook continuity) 40 | DROP VIEW IF EXISTS scdType3NEW; 41 | 42 | -- Create View to merge 43 | CREATE VIEW scdType3NEW AS SELECT 44 | col1 AS employee_id, 45 | col2 AS first_name, 46 | col3 AS last_name, 47 | col4 AS gender, 48 | col5 AS address_country 49 | FROM VALUES 50 | (9, 'Maximo', 'Moxon', 'Male', 'Canada'), 51 | (10, 'Augueste', 'Dimeloe', 'Female', 'France'), 52 | (11, 'Austina', 'Wimbury', 'Male', 'Germany'), 53 | (501, 'Steven', 'Smithson', 'Male', 'France'); 54 | 55 | -- Preview results 56 | SELECT * FROM scdType3NEW 57 | 58 | -- COMMAND ---------- 59 | 60 | -- MAGIC %md ### Merge tables 61 | -- MAGICInsert if new, Update if already exists
62 | 63 | -- COMMAND ---------- 64 | 65 | -- MAGIC %md As the View containing new values has a different set of columns than the primary table, we use autoMerge for schema differences or evolution 66 | 67 | -- COMMAND ---------- 68 | 69 | -- Set autoMerge to True 70 | SET spark.databricks.delta.schema.autoMerge.enabled=true; 71 | 72 | -- COMMAND ---------- 73 | 74 | -- Create WIDGET to pass in column name variable and keep it dynamic 75 | CREATE WIDGET TEXT changingColumn DEFAULT ''; 76 | SET $changingColumn = 'address_country'; 77 | 78 | -- Create ChangeRows table (union of rows to amend and new rows to insert) 79 | CREATE OR REPLACE TEMP VIEW scd3ChangeRows AS 80 | SELECT scdType3New.*, scdType3.$changingColumn AS previous_$changingColumn FROM scdType3New 81 | INNER JOIN scdType3 82 | ON scdType3.employee_id = scdType3New.employee_id 83 | AND scdType3.$changingColumn <> scdType3New.$changingColumn 84 | UNION 85 | -- Union join any new rows to be inserted 86 | SELECT scdType3New.*, null AS previous_$changingColumn FROM scdType3New 87 | LEFT JOIN scdType3 88 | ON scdType3.employee_id = scdType3New.employee_id 89 | WHERE scdType3.employee_id IS NULL; 90 | 91 | -- COMMAND ---------- 92 | 93 | -- Merge scdType3NEW dataset into existing 94 | MERGE INTO scdType3 95 | USING scd3ChangeRows 96 | 97 | -- based on the following column(s) 98 | ON scdType3.employee_id = scd3ChangeRows.employee_id 99 | 100 | -- if there is a match do this... 101 | WHEN MATCHED THEN 102 | UPDATE SET * 103 | WHEN NOT MATCHED THEN 104 | INSERT * 105 | 106 | -- COMMAND ---------- 107 | 108 | -- MAGIC %md ### Check Rows 109 | -- MAGICCheck row and value(s) have been updated
110 | 111 | -- COMMAND ---------- 112 | 113 | -- Check table for changed rows 114 | SELECT * FROM scdType3 115 | WHERE employee_id IN (9,10,11,501) 116 | 117 | -- COMMAND ---------- 118 | 119 | -- Check Delta history 120 | DESCRIBE HISTORY scdType3 121 | 122 | -- COMMAND ---------- 123 | 124 | -- Clean up 125 | DROP VIEW IF EXISTS scdType3NEW; 126 | DROP VIEW IF EXISTS scd3ChangeRows; 127 | -------------------------------------------------------------------------------- /Python/SCD-Type3.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-3/
5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ### Setup SCD Type 3 example table 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %run "./~Configs/SCD-0-SetupSCD3" 13 | 14 | # COMMAND ---------- 15 | 16 | # Import dependencies 17 | from delta.tables import * 18 | from pyspark.sql.functions import * 19 | from pyspark.sql.types import * 20 | 21 | # COMMAND ---------- 22 | 23 | # Set default database 24 | spark.catalog.setCurrentDatabase("scd") 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md ### Load employee table 29 | 30 | # COMMAND ---------- 31 | 32 | # Create dataframe from HIVE db scd 33 | scdType3DF = spark.table("scd.scdType3") 34 | 35 | # COMMAND ---------- 36 | 37 | # Display dataframe 38 | display(scdType3DF.orderBy("employee_id")) 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md ### Amend Country 43 | 44 | # COMMAND ---------- 45 | 46 | # MAGIC %md The employees **Maximo Moxon** (employee_id = 9), **Augueste Dimeloe** (employee_id = 10) and **Austina Wimbury** (employee_id = 11) have all relocated to an office in a different country. \ 47 | # MAGIC We want to amend the country values and create an accompanying column to display the previous ones as well as inserting any rows that may be new. 
48 | 49 | # COMMAND ---------- 50 | 51 | # Create dataset 52 | dataForDF = [ 53 | (9, 'Maximo', 'Moxon', 'Male', 'Canada'), 54 | (10, 'Augueste', 'Dimeloe', 'Female', 'France'), 55 | (11, 'Austina', 'Wimbury', 'Male', 'Germany'), 56 | (501, 'Steven', 'Smithson', 'Male', 'France') 57 | ] 58 | 59 | # Create Schema structure 60 | schema = StructType([ 61 | StructField("employee_id", IntegerType(), True), 62 | StructField("first_name", StringType(), True), 63 | StructField("last_name", StringType(), True), 64 | StructField("gender", StringType(), True), 65 | StructField("address_country", StringType(), True) 66 | ]) 67 | 68 | # Create as Dataframe 69 | scd3Temp = spark.createDataFrame(dataForDF, schema) 70 | 71 | # Preview dataset 72 | display(scd3Temp) 73 | 74 | # COMMAND ---------- 75 | 76 | # MAGIC %md ### Merge tables 77 | # MAGICInsert if new, Update if already exists
78 | 79 | # COMMAND ---------- 80 | 81 | # MAGIC %md As the View containing new values has a different set of columns than the primary table, we use autoMerge for schema differences or evolution 82 | 83 | # COMMAND ---------- 84 | 85 | # Set autoMerge to True 86 | spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", True) 87 | 88 | # COMMAND ---------- 89 | 90 | # Create WIDGET to pass in column name variable and keep it dynamic 91 | dbutils.widgets.text("changingColumn", "address_country") 92 | changingColumn = dbutils.widgets.get("changingColumn") 93 | 94 | # Create ChangeRows table (union of rows to amend and new rows to insert) 95 | changeRowsDF = sql(f""" 96 | SELECT scdType3New.*, scdType3.{changingColumn} AS previous_{changingColumn} FROM scdType3New 97 | INNER JOIN scdType3 98 | ON scdType3.employee_id = scdType3New.employee_id 99 | AND scdType3.{changingColumn} <> scdType3New.{changingColumn} 100 | UNION 101 | SELECT scdType3New.*, null AS previous_{changingColumn} FROM scdType3New 102 | LEFT JOIN scdType3 103 | ON scdType3.employee_id = scdType3New.employee_id 104 | WHERE scdType3.employee_id IS NULL 105 | """) 106 | 107 | display(changeRowsDF) 108 | 109 | # COMMAND ---------- 110 | 111 | # Convert table to Delta 112 | deltaTable = DeltaTable.forName(spark, "scdType3") 113 | 114 | # Merge Delta table with new dataset 115 | ( 116 | deltaTable 117 | .alias("original3") 118 | # Merge using the following conditions 119 | .merge( 120 | changeRowsDF.alias("updates3"), 121 | "original3.employee_id = updates3.employee_id" 122 | ) 123 | # When matched UPDATE these values 124 | .whenMatchedUpdateAll() 125 | # When not matched INSERT ALL rows 126 | .whenNotMatchedInsertAll() 127 | # Execute 128 | .execute() 129 | ) 130 | 131 | # COMMAND ---------- 132 | 133 | # MAGIC %md ### Check Rows 134 | # MAGICCheck row and value(s) have been updated
135 | 136 | # COMMAND ---------- 137 | 138 | # Check table for changed rows 139 | display( 140 | sql("SELECT * FROM scdType3 WHERE employee_id IN (9,10,11,501)") 141 | ) 142 | 143 | # COMMAND ---------- 144 | 145 | # Check Delta history 146 | display(sql("DESCRIBE HISTORY scdType3")) 147 | -------------------------------------------------------------------------------- /SQL/SCD-Type2.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-2/
5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ### Setup SCD Type 2 example table 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %run "./~Configs/SCD-0-SetupSCD2" 13 | 14 | -- COMMAND ---------- 15 | 16 | USE scd 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md ### Load employee table 21 | 22 | -- COMMAND ---------- 23 | 24 | -- Select employees table (ordered by id column) 25 | SELECT * FROM scd.scdType2 26 | ORDER BY employee_id 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md We can see that all employee records are active as they contain nulls for the end_date. 31 | 32 | -- COMMAND ---------- 33 | 34 | -- Check to see which rows are inactive (have an end date) 35 | SELECT * FROM scd.scdType2 36 | WHERE end_date <> null 37 | 38 | -- COMMAND ---------- 39 | 40 | -- MAGIC %md ### Amend Address and Last Name 41 | -- MAGIC (and end_date) 42 | 43 | -- COMMAND ---------- 44 | 45 | -- MAGIC %mdThe employees Fred Flintoff (employee_id = 6) and Hilary Casillis (employee_id = 21) have changes to be made.
46 | -- MAGIC
Fred needs a change of address whereas Hilary has recently got married and will be changing her last_name.
47 | -- MAGIC
48 | -- MAGIC
We want to create new rows for these updated records and add an end_date to signify that the old rows have now expired.</br>
49 | -- MAGIC
Note: An additional Active Flag column could also be created to show the currently active records in the table.
50 | -- MAGIC
Insert if new, Update if already exists
80 | 81 | -- COMMAND ---------- 82 | 83 | -- Example ChangeRows table 84 | SELECT 85 | null AS id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, current_date AS start_date, null AS end_date 86 | FROM scdType2NEW 87 | UNION ALL 88 | SELECT 89 | id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, start_date, end_date 90 | FROM scdType2 91 | WHERE employee_id IN 92 | (SELECT employee_id FROM scdType2NEW) 93 | 94 | -- COMMAND ---------- 95 | 96 | -- Merge scdType2NEW dataset into existing 97 | MERGE INTO scdType2 98 | USING 99 | 100 | -- Update table with rows to match (new and old referenced) 101 | ( 102 | SELECT 103 | null AS id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, current_date AS start_date, null AS end_date 104 | FROM scdType2NEW 105 | UNION ALL 106 | SELECT 107 | id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, start_date, end_date 108 | FROM scdType2 109 | WHERE employee_id IN 110 | (SELECT employee_id FROM scdType2NEW) 111 | ) scdChangeRows 112 | 113 | -- based on the following column(s) 114 | ON scdType2.id = scdChangeRows.id 115 | 116 | -- if there is a match do this... 
117 | WHEN MATCHED THEN 118 | UPDATE SET scdType2.end_date = current_date() 119 | -- if there is no match insert new row 120 | WHEN NOT MATCHED THEN INSERT * 121 | 122 | -- COMMAND ---------- 123 | 124 | -- Order nulls in DF to the end and recreate row numbers (as delta does not currently support auto incrementals) 125 | INSERT OVERWRITE scdType2 126 | SELECT 127 | ROW_NUMBER() OVER (ORDER BY id NULLS LAST) - 1 AS id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, start_date, end_date 128 | FROM scdType2 129 | 130 | -- COMMAND ---------- 131 | 132 | -- MAGIC %md ### Check Row 133 | -- MAGICCheck row and value(s) have been updated
134 | 135 | -- COMMAND ---------- 136 | 137 | -- Check table for changed rows 138 | SELECT * FROM scdType2 139 | WHERE employee_id = 21 OR employee_id =6 140 | 141 | -- COMMAND ---------- 142 | 143 | -- Check Delta history 144 | DESCRIBE HISTORY scdType2 145 | 146 | -- COMMAND ---------- 147 | 148 | -- Clean up 149 | DROP VIEW IF EXISTS scdType2NEW 150 | -------------------------------------------------------------------------------- /Python/SCD-Type2.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-2/
5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ### Setup SCD Type 2 example table 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %run "./~Configs/SCD-0-SetupSCD2" 13 | 14 | # COMMAND ---------- 15 | 16 | # Import dependencies 17 | from delta.tables import * 18 | from pyspark.sql.functions import * 19 | from pyspark.sql.types import * 20 | 21 | # COMMAND ---------- 22 | 23 | # Set default database 24 | spark.catalog.setCurrentDatabase("scd") 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md ### Load employee table 29 | 30 | # COMMAND ---------- 31 | 32 | # Create dataframe from HIVE db scd 33 | scdType2DF = spark.table("scd.scdType2") 34 | 35 | # COMMAND ---------- 36 | 37 | # Display dataframe 38 | display(scdType2DF.orderBy("employee_id")) 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md We can see that all employee records are active as they contain nulls for the end_date. 43 | 44 | # COMMAND ---------- 45 | 46 | # Check to see which rows are inactive (have an end date) 47 | display(scdType2DF.where("end_date <> null").orderBy("employee_id")) 48 | 49 | # COMMAND ---------- 50 | 51 | # MAGIC %md ### Amend Address and Last Name 52 | # MAGIC (and end_date) 53 | 54 | # COMMAND ---------- 55 | 56 | # MAGIC %mdThe employees Fred Flintoff (employee_id = 6) and Hilary Casillis (employee_id = 21) have changes to be made.
57 | # MAGIC
Fred needs a change of address whereas Hilary has recently got married and will be changing her last_name.
58 | # MAGIC
59 | # MAGIC
We want to create new rows for these updated records and add an end_date to signify that the old rows have now expired.</br>
60 | # MAGIC
Note: An additional Active Flag column could also be created to show the currently active records in the table.
61 | # MAGIC
Insert if new, Update if already exists
95 | 96 | # COMMAND ---------- 97 | 98 | # Create list of selected employee_id's 99 | empList = scd2Temp.select(collect_list(scd2Temp['employee_id'])).collect()[0][0] 100 | 101 | # Select columns in new dataframe to merge 102 | scdChangeRows = scd2Temp.selectExpr( 103 | "null AS id", "employee_id", "first_name", "last_name", "gender", "address_street", "address_city", "address_country", "email", "job_title", "current_date AS start_date", "null AS end_date" 104 | ) 105 | 106 | # Union join queries to match incoming rows with existing 107 | scdChangeRows = scdChangeRows.unionByName( 108 | scdType2DF 109 | .where(col("employee_id").isin(empList)), allowMissingColumns=True 110 | ) 111 | # Preview results 112 | display(scdChangeRows) 113 | 114 | # COMMAND ---------- 115 | 116 | # Convert table to Delta 117 | deltaTable = DeltaTable.forName(spark, "scdType2") 118 | 119 | # Merge Delta table with new dataset 120 | ( 121 | deltaTable 122 | .alias("original2") 123 | # Merge using the following conditions 124 | .merge( 125 | scdChangeRows.alias("updates2"), 126 | "original2.id = updates2.id" 127 | ) 128 | # When matched UPDATE ALL values 129 | .whenMatchedUpdate( 130 | set={ 131 | "original2.end_date" : current_date() 132 | } 133 | ) 134 | # When not matched INSERT ALL rows 135 | .whenNotMatchedInsertAll() 136 | # Execute 137 | .execute() 138 | ) 139 | 140 | # COMMAND ---------- 141 | 142 | scdType2DF.selectExpr( 143 | "ROW_NUMBER() OVER (ORDER BY id NULLS LAST) - 1 AS id", "employee_id", "first_name", "last_name", "gender", "address_street", "address_city", "address_country", 144 | "email", "job_title", "start_date", "end_date" 145 | ).write.insertInto(tableName="scdType2", overwrite=True) 146 | 147 | # COMMAND ---------- 148 | 149 | # MAGIC %md ### Check Row 150 | # MAGICCheck row and value(s) have been updated
151 | 152 | # COMMAND ---------- 153 | 154 | display( 155 | sql("SELECT * FROM scdType2 WHERE employee_id = 6 OR employee_id = 21") 156 | ) 157 | 158 | # COMMAND ---------- 159 | 160 | # Check Delta history 161 | sql("DESCRIBE HISTORY scdType2") 162 | --------------------------------------------------------------------------------