├── SQL ├── ~Configs │ ├── SCD-0-SetupSCD3.py │ ├── SCD-0-SetupSCD1.py │ ├── SCD-0-SetupSCD2.py │ ├── SCD-0-SetupSCD4.py │ └── SCD-0-Init.py ├── SCD-Start.sql ├── SCD-Type1.sql ├── SCD-Type4.sql ├── SCD-Type3.sql └── SCD-Type2.sql └── Python ├── ~Configs ├── SCD-0-SetupSCD1.py ├── SCD-0-SetupSCD3.py ├── SCD-0-SetupSCD2.py ├── SCD-0-SetupSCD4.py └── SCD-0-Init.py ├── SCD-Start.py ├── SCD-Type1.py ├── SCD-Type3.py └── SCD-Type2.py /SQL/~Configs/SCD-0-SetupSCD3.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType3; 4 | # MAGIC CREATE TABLE scd.scdType3 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_country FROM scd.employees 6 | # MAGIC ORDER BY employee_id 7 | -------------------------------------------------------------------------------- /SQL/~Configs/SCD-0-SetupSCD1.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType1; 4 | # MAGIC CREATE TABLE scd.scdType1 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title FROM scd.employees 6 | -------------------------------------------------------------------------------- /Python/~Configs/SCD-0-SetupSCD1.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType1; 4 | # MAGIC CREATE TABLE scd.scdType1 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title FROM scd.employees 6 | -------------------------------------------------------------------------------- /Python/~Configs/SCD-0-SetupSCD3.py: 
-------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType3; 4 | # MAGIC CREATE TABLE scd.scdType3 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_country FROM scd.employees 6 | # MAGIC ORDER BY employee_id 7 | 8 | # COMMAND ---------- 9 | 10 | 11 | -------------------------------------------------------------------------------- /SQL/~Configs/SCD-0-SetupSCD2.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType2; 4 | # MAGIC CREATE TABLE scd.scdType2 USING delta AS 5 | # MAGIC SELECT (employee_id - 1) AS id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, start_date, end_date FROM scd.employees 6 | # MAGIC ORDER BY employee_id 7 | -------------------------------------------------------------------------------- /Python/~Configs/SCD-0-SetupSCD2.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType2; 4 | # MAGIC CREATE TABLE scd.scdType2 USING delta AS 5 | # MAGIC SELECT (employee_id - 1) AS id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, start_date, end_date FROM scd.employees 6 | # MAGIC ORDER BY employee_id 7 | -------------------------------------------------------------------------------- /SQL/~Configs/SCD-0-SetupSCD4.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType4; 4 | # MAGIC CREATE TABLE scd.scdType4 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_street, address_city, 
address_country, email, job_title FROM scd.employees 6 | # MAGIC 7 | # MAGIC ORDER BY employee_id 8 | # MAGIC LIMIT 5 9 | 10 | # COMMAND ---------- 11 | 12 | 13 | -------------------------------------------------------------------------------- /Python/~Configs/SCD-0-SetupSCD4.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %sql 3 | # MAGIC DROP TABLE IF EXISTS scd.scdType4; 4 | # MAGIC CREATE TABLE scd.scdType4 USING delta AS 5 | # MAGIC SELECT employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title FROM scd.employees 6 | # MAGIC 7 | # MAGIC ORDER BY employee_id 8 | # MAGIC LIMIT 5 9 | 10 | # COMMAND ---------- 11 | 12 | 13 | -------------------------------------------------------------------------------- /SQL/SCD-Start.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md # Starting Notebook 3 | -- MAGIC Initiallise Datasets and Configs 4 | 5 | -- COMMAND ---------- 6 | 7 | -- MAGIC %md _C.Williams - 2021_ 8 | 9 | -- COMMAND ---------- 10 | 11 | -- MAGIC %md ### Setup Dataset 12 | 13 | -- COMMAND ---------- 14 | 15 | -- MAGIC %md Run settings notebook 16 | 17 | -- COMMAND ---------- 18 | 19 | -- MAGIC %run "./~Configs/SCD-0-Init" 20 | 21 | -- COMMAND ---------- 22 | 23 | -- MAGIC %md Get sample dataset 24 | 25 | -- COMMAND ---------- 26 | 27 | -- Preview Dataset 28 | SELECT * FROM EmployeeSample 29 | 30 | -- COMMAND ---------- 31 | 32 | -- MAGIC %md ### Write Dataset to DB 33 | -- MAGIC For use in other notebooks 34 | 35 | -- COMMAND ---------- 36 | 37 | -- Create DB if it doesn't already exist 38 | CREATE DATABASE IF NOT EXISTS scd; 39 | -- Drop tabl if it already exists 40 | DROP TABLE IF EXISTS scd.employees; 41 | -- Create new table (delta format) using the sample Employee dataset 42 | CREATE TABLE scd.employees USING delta 43 | AS SELECT * FROM 
EmployeeSample 44 | -------------------------------------------------------------------------------- /Python/SCD-Start.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %md # Starting Notebook 3 | # MAGIC Initiallise Datasets and Configs 4 | 5 | # COMMAND ---------- 6 | 7 | # MAGIC %md _C.Williams - 2021_ 8 | 9 | # COMMAND ---------- 10 | 11 | # MAGIC %md ### Setup Dataset 12 | 13 | # COMMAND ---------- 14 | 15 | # MAGIC %md Run settings notebook 16 | 17 | # COMMAND ---------- 18 | 19 | # MAGIC %run "./~Configs/SCD-0-Init" 20 | 21 | # COMMAND ---------- 22 | 23 | # MAGIC %md Get sample dataset 24 | 25 | # COMMAND ---------- 26 | 27 | # Preview Dataset 28 | display(sql("SELECT * FROM EmployeeSample")) 29 | 30 | # COMMAND ---------- 31 | 32 | # MAGIC %md ### Write Dataset to DB 33 | # MAGIC For use in other notebooks 34 | 35 | # COMMAND ---------- 36 | 37 | # Create DB if it doesn't already exist 38 | sql("CREATE DATABASE IF NOT EXISTS scd") 39 | # Drop table if it already exists 40 | sql("DROP TABLE IF EXISTS scd.employees") 41 | # Create new table (delta format) using the sample Employee dataset 42 | sql("CREATE TABLE scd.employees USING delta AS SELECT * FROM EmployeeSample") 43 | -------------------------------------------------------------------------------- /SQL/~Configs/SCD-0-Init.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import urllib.request, json, requests 3 | from pyspark.sql.functions import * 4 | from pyspark.sql.types import * 5 | 6 | # COMMAND ---------- 7 | 8 | # Set GitHub URL 9 | url = 'https://raw.githubusercontent.com/cwilliams87/Blog/main/07-2021/sampleEmployees' 10 | 11 | # COMMAND ---------- 12 | 13 | def getSample(): 14 | 15 | # Get Url 16 | resp = requests.get(url) 17 | jsonData = json.loads(resp.text) 18 | data = sc.parallelize(jsonData) 19 | 20 | # Set Schema 21 | schema = 
StructType( 22 | [ 23 | StructField("id", LongType(), True), 24 | StructField("first_name", StringType(), True), 25 | StructField("last_name", StringType(), True), 26 | StructField("gender", StringType(), True), 27 | StructField("address_street", StringType(), True), 28 | StructField("address_city", StringType(), True), 29 | StructField("address_country", StringType(), True), 30 | StructField("email", StringType(), True), 31 | StructField("job_title", StringType(), True), 32 | StructField("start_date", StringType(), True), 33 | StructField("end_date", StringType(), True) 34 | ] 35 | ) 36 | 37 | df = spark.createDataFrame(data, schema) 38 | 39 | # Convert Date Columns 40 | df = df.withColumn("start_date", col("start_date").cast(DateType())) 41 | df = df.withColumn("end_date", col("end_date").cast(DateType())) 42 | 43 | # Rename Column (for later SCD's) 44 | df = df.withColumnRenamed('id', 'employee_id') 45 | 46 | # Additional column for SCD Type 3 47 | df = df.withColumn('previous_country', lit(None).cast(StringType())) 48 | 49 | df.registerTempTable("EmployeeSample") 50 | 51 | return 52 | 53 | 54 | # COMMAND ---------- 55 | 56 | getSample() 57 | -------------------------------------------------------------------------------- /Python/~Configs/SCD-0-Init.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import urllib.request, json, requests 3 | from pyspark.sql.functions import * 4 | from pyspark.sql.types import * 5 | 6 | # COMMAND ---------- 7 | 8 | # Set GitHub URL 9 | url = 'https://raw.githubusercontent.com/cwilliams87/Blog/main/07-2021/sampleEmployees' 10 | 11 | # COMMAND ---------- 12 | 13 | def getSample(): 14 | 15 | # Get Url 16 | resp = requests.get(url) 17 | jsonData = json.loads(resp.text) 18 | data = sc.parallelize(jsonData) 19 | 20 | # Set Schema 21 | schema = StructType( 22 | [ 23 | StructField("id", LongType(), True), 24 | StructField("first_name", StringType(), True), 25 | 
StructField("last_name", StringType(), True), 26 | StructField("gender", StringType(), True), 27 | StructField("address_street", StringType(), True), 28 | StructField("address_city", StringType(), True), 29 | StructField("address_country", StringType(), True), 30 | StructField("email", StringType(), True), 31 | StructField("job_title", StringType(), True), 32 | StructField("start_date", StringType(), True), 33 | StructField("end_date", StringType(), True) 34 | ] 35 | ) 36 | 37 | df = spark.createDataFrame(data, schema) 38 | 39 | # Convert Date Columns 40 | df = df.withColumn("start_date", col("start_date").cast(DateType())) 41 | df = df.withColumn("end_date", col("end_date").cast(DateType())) 42 | 43 | # Rename Column (for later SCD's) 44 | df = df.withColumnRenamed('id', 'employee_id') 45 | 46 | # Additional column for SCD Type 3 47 | df = df.withColumn('previous_country', lit(None).cast(StringType())) 48 | 49 | df.registerTempTable("EmployeeSample") 50 | 51 | return 52 | 53 | 54 | # COMMAND ---------- 55 | 56 | getSample() 57 | -------------------------------------------------------------------------------- /SQL/SCD-Type1.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %md
https://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-1/
5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ### Setup SCD Type 1 example table 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %run "./~Configs/SCD-0-SetupSCD1" 13 | 14 | -- COMMAND ---------- 15 | 16 | USE scd 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md ### Load employee table 21 | 22 | -- COMMAND ---------- 23 | 24 | -- Select employees table (ordered by id column) 25 | SELECT * FROM scd.scdType1 26 | ORDER BY employee_id 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md ### Amend Job Title 31 | 32 | -- COMMAND ---------- 33 | 34 | -- MAGIC %mdThe employee Stu Sand (employee_id = 2) has changed their job title from Paralegal to Solicitor.
35 | -- MAGIC
We want to check if the row exists in the table, insert if new and overwrite if exists.
36 | -- MAGIC
Insert if new, Update if already exists
64 | 65 | -- COMMAND ---------- 66 | 67 | -- Merge scdType1NEW dataset into existing 68 | MERGE INTO scdType1 69 | USING scdType1NEW 70 | 71 | -- based on the following column(s) 72 | ON scdType1.employee_id = scdType1NEW.employee_id 73 | 74 | -- if there is a match do this... 75 | WHEN MATCHED THEN UPDATE SET * 76 | -- if there is no match insert new row 77 | WHEN NOT MATCHED THEN INSERT * 78 | 79 | -- COMMAND ---------- 80 | 81 | -- MAGIC %md ### Check Row 82 | -- MAGICCheck row and value(s) have been updated
83 | 84 | -- COMMAND ---------- 85 | 86 | -- Check table 87 | SELECT * FROM scdType1 88 | WHERE first_name = 'Stu' AND last_name = 'Sand' 89 | 90 | -- COMMAND ---------- 91 | 92 | -- Check Delta history 93 | DESCRIBE HISTORY scdType1 94 | 95 | -- COMMAND ---------- 96 | 97 | -- Clean up 98 | DROP VIEW IF EXISTS scdType1NEW 99 | -------------------------------------------------------------------------------- /Python/SCD-Type1.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-1/
5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ### Setup SCD Type 1 example table 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %run "./~Configs/SCD-0-SetupSCD1" 13 | 14 | # COMMAND ---------- 15 | 16 | # Import dependencies 17 | from delta.tables import * 18 | from pyspark.sql.functions import * 19 | from pyspark.sql.types import * 20 | 21 | # COMMAND ---------- 22 | 23 | # Set default database 24 | spark.catalog.setCurrentDatabase("scd") 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md ### Load employee table 29 | 30 | # COMMAND ---------- 31 | 32 | # Create dataframe from HIVE db scd 33 | scdType1DF = spark.table("scd.scdType1") 34 | 35 | # COMMAND ---------- 36 | 37 | # Display dataframe 38 | display(scdType1DF.orderBy("employee_id")) 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md ### Amend Job Title 43 | 44 | # COMMAND ---------- 45 | 46 | # MAGIC %mdThe employee Stu Sand (employee_id = 2) has changed their job title from Paralegal to Solicitor.
47 | # MAGIC
We want to check if the row exists in the table, insert if new and overwrite if exists.
48 | # MAGIC
Insert if new, Update if already exists
80 | 81 | # COMMAND ---------- 82 | 83 | # Convert table to Delta 84 | deltaTable = DeltaTable.forName(spark, "scdType1") 85 | 86 | # Merge Delta table with new dataset 87 | ( 88 | deltaTable 89 | .alias("original1") 90 | # Merge using the following conditions 91 | .merge( 92 | scd1Temp.alias("updates1"), 93 | "original1.employee_id = updates1.employee_id" 94 | ) 95 | # When matched UPDATE ALL values 96 | .whenMatchedUpdateAll() 97 | # When not matched INSERT ALL rows 98 | .whenNotMatchedInsertAll() 99 | # Execute 100 | .execute() 101 | ) 102 | 103 | # COMMAND ---------- 104 | 105 | # MAGIC %md ### Check Row 106 | # MAGICCheck row and value(s) have been updated
107 | 108 | # COMMAND ---------- 109 | 110 | # Check 111 | display( 112 | deltaTable.toDF().orderBy("employee_id").where("first_name = 'Stu' AND last_name = 'Sand'") 113 | ) 114 | 115 | # COMMAND ---------- 116 | 117 | # Check Delta History 118 | display(deltaTable.history()) 119 | -------------------------------------------------------------------------------- /SQL/SCD-Type4.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-4/
5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ### Setup SCD Type 4 example table 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %run "./~Configs/SCD-0-SetupSCD4" 13 | 14 | -- COMMAND ---------- 15 | 16 | USE scd 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md ### Load employee table 21 | 22 | -- COMMAND ---------- 23 | 24 | -- Select employees table (ordered by id column) 25 | SELECT * FROM scd.scdType4 26 | ORDER BY employee_id 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md ### Amend Email Address and Name 31 | 32 | -- COMMAND ---------- 33 | 34 | -- MAGIC %mdThe employees Rae Maith (employee_id = 1) and Stu Sand (employee_id = 2) have changes to be made.
35 | -- MAGIC
Rae needs a change of email address whereas Stu only shows a shortened version of his name which should be Stuart.</br>
36 | -- MAGIC
37 | -- MAGIC
We can make the changes using a similar approach to Type 1 SCD and use the Delta History to TIME TRAVEL to historical versions. This will create functionality similar to a traditional Type 4 Slowly Changing Dimension e.g. active dataset and history table.</br>
38 | -- MAGIC
Insert if new, Update if already exists
67 | 68 | -- COMMAND ---------- 69 | 70 | -- Merge scdType1NEW dataset into existing 71 | MERGE INTO scdType4 72 | USING scdType4NEW 73 | 74 | -- based on the following column(s) 75 | ON scdType4.employee_id = scdType4NEW.employee_id 76 | 77 | -- if there is a match do this... 78 | WHEN MATCHED THEN 79 | UPDATE SET * 80 | -- if there is no match insert new row 81 | WHEN NOT MATCHED THEN INSERT * 82 | 83 | -- COMMAND ---------- 84 | 85 | -- MAGIC %md ### Check Row 86 | -- MAGICCheck row and value(s) have been updated
87 | 88 | -- COMMAND ---------- 89 | 90 | -- Check table for changed rows 91 | SELECT * FROM scdType4 92 | 93 | -- COMMAND ---------- 94 | 95 | -- Check Delta history 96 | DESCRIBE HISTORY scdType4 97 | 98 | -- COMMAND ---------- 99 | 100 | -- MAGIC %md ### View previous version of table 101 | 102 | -- COMMAND ---------- 103 | 104 | -- View current and previous versions 105 | SELECT 0 AS version, * FROM scdType4 VERSION AS OF 0 106 | WHERE employee_id = 1 OR employee_id = 2 107 | UNION ALL 108 | SELECT 1 AS version, * FROM scdType4 VERSION AS OF 1 109 | WHERE employee_id = 1 OR employee_id = 2 110 | 111 | -- COMMAND ---------- 112 | 113 | -- MAGIC %md ### Rollback to previous versions 114 | 115 | -- COMMAND ---------- 116 | 117 | -- The table can be restored to a previous version 118 | RESTORE TABLE scdType4 TO VERSION AS OF 0 119 | 120 | -- COMMAND ---------- 121 | 122 | SELECT * FROM scdType4 123 | 124 | -- COMMAND ---------- 125 | 126 | -- Clean up 127 | DROP VIEW IF EXISTS scdType4NEW 128 | -------------------------------------------------------------------------------- /SQL/SCD-Type3.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-3/
5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ### Setup SCD Type 3 example table 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %run "./~Configs/SCD-0-SetupSCD3" 13 | 14 | -- COMMAND ---------- 15 | 16 | USE scd 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md ### Load employee table 21 | 22 | -- COMMAND ---------- 23 | 24 | -- Select employees table (ordered by id column) 25 | SELECT * FROM scdType3 26 | ORDER BY employee_id 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md ### Amend Country 31 | 32 | -- COMMAND ---------- 33 | 34 | -- MAGIC %md The employees **Maximo Moxon** (employee_id = 9), **Augueste Dimeloe** (employee_id = 10) and **Austina Wimbury** (employee_id = 11) have all relocated to an office in a different country. \ 35 | -- MAGIC We want to amend the country values and create an accompanying column to display the previous ones as well as inserting any rows that may be new. 36 | 37 | -- COMMAND ---------- 38 | 39 | -- Drop if already exists (for notebook continuity) 40 | DROP VIEW IF EXISTS scdType3NEW; 41 | 42 | -- Create View to merge 43 | CREATE VIEW scdType3NEW AS SELECT 44 | col1 AS employee_id, 45 | col2 AS first_name, 46 | col3 AS last_name, 47 | col4 AS gender, 48 | col5 AS address_country 49 | FROM VALUES 50 | (9, 'Maximo', 'Moxon', 'Male', 'Canada'), 51 | (10, 'Augueste', 'Dimeloe', 'Female', 'France'), 52 | (11, 'Austina', 'Wimbury', 'Male', 'Germany'), 53 | (501, 'Steven', 'Smithson', 'Male', 'France'); 54 | 55 | -- Preview results 56 | SELECT * FROM scdType3NEW 57 | 58 | -- COMMAND ---------- 59 | 60 | -- MAGIC %md ### Merge tables 61 | -- MAGICInsert if new, Update if already exists
62 | 63 | -- COMMAND ---------- 64 | 65 | -- MAGIC %md As the View containing new values has a different set of columns than the primary table, we use autoMerge for schema differences or evolution 66 | 67 | -- COMMAND ---------- 68 | 69 | -- Set autoMerge to True 70 | SET spark.databricks.delta.schema.autoMerge.enabled=true; 71 | 72 | -- COMMAND ---------- 73 | 74 | -- Create WIDGET to pass in column name variable and keep it dynamic 75 | CREATE WIDGET TEXT changingColumn DEFAULT ''; 76 | SET $changingColumn = 'address_country'; 77 | 78 | -- Create ChangeRows table (union of rows to amend and new rows to insert) 79 | CREATE OR REPLACE TEMP VIEW scd3ChangeRows AS 80 | SELECT scdType3New.*, scdType3.$changingColumn AS previous_$changingColumn FROM scdType3New 81 | INNER JOIN scdType3 82 | ON scdType3.employee_id = scdType3New.employee_id 83 | AND scdType3.$changingColumn <> scdType3New.$changingColumn 84 | UNION 85 | -- Union join any new rows to be inserted 86 | SELECT scdType3New.*, null AS previous_$changingColumn FROM scdType3New 87 | LEFT JOIN scdType3 88 | ON scdType3.employee_id = scdType3New.employee_id 89 | WHERE scdType3.employee_id IS NULL; 90 | 91 | -- COMMAND ---------- 92 | 93 | -- Merge scdType3NEW dataset into existing 94 | MERGE INTO scdType3 95 | USING scd3ChangeRows 96 | 97 | -- based on the following column(s) 98 | ON scdType3.employee_id = scd3ChangeRows.employee_id 99 | 100 | -- if there is a match do this... 101 | WHEN MATCHED THEN 102 | UPDATE SET * 103 | WHEN NOT MATCHED THEN 104 | INSERT * 105 | 106 | -- COMMAND ---------- 107 | 108 | -- MAGIC %md ### Check Rows 109 | -- MAGICCheck row and value(s) have been updated
110 | 111 | -- COMMAND ---------- 112 | 113 | -- Check table for changed rows 114 | SELECT * FROM scdType3 115 | WHERE employee_id IN (9,10,11,501) 116 | 117 | -- COMMAND ---------- 118 | 119 | -- Check Delta history 120 | DESCRIBE HISTORY scdType3 121 | 122 | -- COMMAND ---------- 123 | 124 | -- Clean up 125 | DROP VIEW IF EXISTS scdType3NEW; 126 | DROP VIEW IF EXISTS scd3ChangeRows; 127 | -------------------------------------------------------------------------------- /Python/SCD-Type3.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-3/
5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ### Setup SCD Type 3 example table 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %run "./~Configs/SCD-0-SetupSCD3" 13 | 14 | # COMMAND ---------- 15 | 16 | # Import dependencies 17 | from delta.tables import * 18 | from pyspark.sql.functions import * 19 | from pyspark.sql.types import * 20 | 21 | # COMMAND ---------- 22 | 23 | # Set default database 24 | spark.catalog.setCurrentDatabase("scd") 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md ### Load employee table 29 | 30 | # COMMAND ---------- 31 | 32 | # Create dataframe from HIVE db scd 33 | scdType3DF = spark.table("scd.scdType3") 34 | 35 | # COMMAND ---------- 36 | 37 | # Display dataframe 38 | display(scdType3DF.orderBy("employee_id")) 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md ### Amend Country 43 | 44 | # COMMAND ---------- 45 | 46 | # MAGIC %md The employees **Maximo Moxon** (employee_id = 9), **Augueste Dimeloe** (employee_id = 10) and **Austina Wimbury** (employee_id = 11) have all relocated to an office in a different country. \ 47 | # MAGIC We want to amend the country values and create an accompanying column to display the previous ones as well as inserting any rows that may be new. 
48 | 49 | # COMMAND ---------- 50 | 51 | # Create dataset 52 | dataForDF = [ 53 | (9, 'Maximo', 'Moxon', 'Male', 'Canada'), 54 | (10, 'Augueste', 'Dimeloe', 'Female', 'France'), 55 | (11, 'Austina', 'Wimbury', 'Male', 'Germany'), 56 | (501, 'Steven', 'Smithson', 'Male', 'France') 57 | ] 58 | 59 | # Create Schema structure 60 | schema = StructType([ 61 | StructField("employee_id", IntegerType(), True), 62 | StructField("first_name", StringType(), True), 63 | StructField("last_name", StringType(), True), 64 | StructField("gender", StringType(), True), 65 | StructField("address_country", StringType(), True) 66 | ]) 67 | 68 | # Create as Dataframe 69 | scd3Temp = spark.createDataFrame(dataForDF, schema) 70 | 71 | # Preview dataset 72 | display(scd3Temp) 73 | 74 | # COMMAND ---------- 75 | 76 | # MAGIC %md ### Merge tables 77 | # MAGICInsert if new, Update if already exists
78 | 79 | # COMMAND ---------- 80 | 81 | # MAGIC %md As the View containing new values has a different set of columns than the primary table, we use autoMerge for schema differences or evolution 82 | 83 | # COMMAND ---------- 84 | 85 | # Set autoMerge to True 86 | spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", True) 87 | 88 | # COMMAND ---------- 89 | 90 | # Create WIDGET to pass in column name variable and keep it dynamic 91 | dbutils.widgets.text("changingColumn", "address_country") 92 | changingColumn = dbutils.widgets.get("changingColumn") 93 | 94 | # Create ChangeRows table (union of rows to amend and new rows to insert) 95 | changeRowsDF = sql(f""" 96 | SELECT scdType3New.*, scdType3.{changingColumn} AS previous_{changingColumn} FROM scdType3New 97 | INNER JOIN scdType3 98 | ON scdType3.employee_id = scdType3New.employee_id 99 | AND scdType3.{changingColumn} <> scdType3New.{changingColumn} 100 | UNION 101 | SELECT scdType3New.*, null AS previous_{changingColumn} FROM scdType3New 102 | LEFT JOIN scdType3 103 | ON scdType3.employee_id = scdType3New.employee_id 104 | WHERE scdType3.employee_id IS NULL 105 | """) 106 | 107 | display(changeRowsDF) 108 | 109 | # COMMAND ---------- 110 | 111 | # Convert table to Delta 112 | deltaTable = DeltaTable.forName(spark, "scdType3") 113 | 114 | # Merge Delta table with new dataset 115 | ( 116 | deltaTable 117 | .alias("original3") 118 | # Merge using the following conditions 119 | .merge( 120 | changeRowsDF.alias("updates3"), 121 | "original3.employee_id = updates3.employee_id" 122 | ) 123 | # When matched UPDATE these values 124 | .whenMatchedUpdateAll() 125 | # When not matched INSERT ALL rows 126 | .whenNotMatchedInsertAll() 127 | # Execute 128 | .execute() 129 | ) 130 | 131 | # COMMAND ---------- 132 | 133 | # MAGIC %md ### Check Rows 134 | # MAGICCheck row and value(s) have been updated
135 | 136 | # COMMAND ---------- 137 | 138 | # Check table for changed rows 139 | display( 140 | sql("SELECT * FROM scdType3 WHERE employee_id IN (9,10,11,501)") 141 | ) 142 | 143 | # COMMAND ---------- 144 | 145 | # Check Delta history 146 | display(sql("DESCRIBE HISTORY scdType3")) 147 | -------------------------------------------------------------------------------- /SQL/SCD-Type2.sql: -------------------------------------------------------------------------------- 1 | -- Databricks notebook source 2 | -- MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-2/
5 | 6 | -- COMMAND ---------- 7 | 8 | -- MAGIC %md ### Setup SCD Type 2 example table 9 | 10 | -- COMMAND ---------- 11 | 12 | -- MAGIC %run "./~Configs/SCD-0-SetupSCD2" 13 | 14 | -- COMMAND ---------- 15 | 16 | USE scd 17 | 18 | -- COMMAND ---------- 19 | 20 | -- MAGIC %md ### Load employee table 21 | 22 | -- COMMAND ---------- 23 | 24 | -- Select employees table (ordered by id column) 25 | SELECT * FROM scd.scdType2 26 | ORDER BY employee_id 27 | 28 | -- COMMAND ---------- 29 | 30 | -- MAGIC %md We can see that all employee records are active as they contain nulls for the end_date. 31 | 32 | -- COMMAND ---------- 33 | 34 | -- Check to see which rows are inactive (have an end date) 35 | SELECT * FROM scd.scdType2 36 | WHERE end_date <> null 37 | 38 | -- COMMAND ---------- 39 | 40 | -- MAGIC %md ### Amend Address and Last Name 41 | -- MAGIC (and end_date) 42 | 43 | -- COMMAND ---------- 44 | 45 | -- MAGIC %mdThe employees Fred Flintoff (employee_id = 6) and Hilary Casillis (employee_id = 21) have changes to be made.
46 | -- MAGIC
Fred needs a change of address whereas Hilary has recently got married and will be changing her last_name.
47 | -- MAGIC
48 | -- MAGIC
We want to create new rows for these updated records and add an end_date to signify that the old rows have now expired.</br>
49 | -- MAGIC
Note: An additional Active Flag column could also be created to show the currently active records in the table.
50 | -- MAGIC
Insert if new, Update if already exists
80 | 81 | -- COMMAND ---------- 82 | 83 | -- Example ChangeRows table 84 | SELECT 85 | null AS id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, current_date AS start_date, null AS end_date 86 | FROM scdType2NEW 87 | UNION ALL 88 | SELECT 89 | id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, start_date, end_date 90 | FROM scdType2 91 | WHERE employee_id IN 92 | (SELECT employee_id FROM scdType2NEW) 93 | 94 | -- COMMAND ---------- 95 | 96 | -- Merge scdType2NEW dataset into existing 97 | MERGE INTO scdType2 98 | USING 99 | 100 | -- Update table with rows to match (new and old referenced) 101 | ( 102 | SELECT 103 | null AS id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, current_date AS start_date, null AS end_date 104 | FROM scdType2NEW 105 | UNION ALL 106 | SELECT 107 | id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, start_date, end_date 108 | FROM scdType2 109 | WHERE employee_id IN 110 | (SELECT employee_id FROM scdType2NEW) 111 | ) scdChangeRows 112 | 113 | -- based on the following column(s) 114 | ON scdType2.id = scdChangeRows.id 115 | 116 | -- if there is a match do this... 
117 | WHEN MATCHED THEN 118 | UPDATE SET scdType2.end_date = current_date() 119 | -- if there is no match insert new row 120 | WHEN NOT MATCHED THEN INSERT * 121 | 122 | -- COMMAND ---------- 123 | 124 | -- Order nulls in DF to the end and recreate row numbers (as delta does not currently support auto incrementals) 125 | INSERT OVERWRITE scdType2 126 | SELECT 127 | ROW_NUMBER() OVER (ORDER BY id NULLS LAST) - 1 AS id, employee_id, first_name, last_name, gender, address_street, address_city, address_country, email, job_title, start_date, end_date 128 | FROM scdType2 129 | 130 | -- COMMAND ---------- 131 | 132 | -- MAGIC %md ### Check Row 133 | -- MAGICCheck row and value(s) have been updated
134 | 135 | -- COMMAND ---------- 136 | 137 | -- Check table for changed rows 138 | SELECT * FROM scdType2 139 | WHERE employee_id = 21 OR employee_id =6 140 | 141 | -- COMMAND ---------- 142 | 143 | -- Check Delta history 144 | DESCRIBE HISTORY scdType2 145 | 146 | -- COMMAND ---------- 147 | 148 | -- Clean up 149 | DROP VIEW IF EXISTS scdType2NEW 150 | -------------------------------------------------------------------------------- /Python/SCD-Type2.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # MAGIC %mdhttps://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/type-2/
5 | 6 | # COMMAND ---------- 7 | 8 | # MAGIC %md ### Setup SCD Type 2 example table 9 | 10 | # COMMAND ---------- 11 | 12 | # MAGIC %run "./~Configs/SCD-0-SetupSCD2" 13 | 14 | # COMMAND ---------- 15 | 16 | # Import dependencies 17 | from delta.tables import * 18 | from pyspark.sql.functions import * 19 | from pyspark.sql.types import * 20 | 21 | # COMMAND ---------- 22 | 23 | # Set default database 24 | spark.catalog.setCurrentDatabase("scd") 25 | 26 | # COMMAND ---------- 27 | 28 | # MAGIC %md ### Load employee table 29 | 30 | # COMMAND ---------- 31 | 32 | # Create dataframe from HIVE db scd 33 | scdType2DF = spark.table("scd.scdType2") 34 | 35 | # COMMAND ---------- 36 | 37 | # Display dataframe 38 | display(scdType2DF.orderBy("employee_id")) 39 | 40 | # COMMAND ---------- 41 | 42 | # MAGIC %md We can see that all employee records are active as they contain nulls for the end_date. 43 | 44 | # COMMAND ---------- 45 | 46 | # Check to see which rows are inactive (have an end date) 47 | display(scdType2DF.where("end_date <> null").orderBy("employee_id")) 48 | 49 | # COMMAND ---------- 50 | 51 | # MAGIC %md ### Amend Address and Last Name 52 | # MAGIC (and end_date) 53 | 54 | # COMMAND ---------- 55 | 56 | # MAGIC %mdThe employees Fred Flintoff (employee_id = 6) and Hilary Casillis (employee_id = 21) have changes to be made.
57 | # MAGIC
Fred needs a change of address whereas Hilary has recently got married and will be changing her last_name.
58 | # MAGIC
59 | # MAGIC
We want to create new rows for these updated records and add an end_date to signify that the old rows have now expired.</br>
60 | # MAGIC
Note: An additional Active Flag column could also be created to show the currently active records in the table.
61 | # MAGIC
Insert if new, Update if already exists
95 | 96 | # COMMAND ---------- 97 | 98 | # Create list of selected employee_id's 99 | empList = scd2Temp.select(collect_list(scd2Temp['employee_id'])).collect()[0][0] 100 | 101 | # Select columns in new dataframe to merge 102 | scdChangeRows = scd2Temp.selectExpr( 103 | "null AS id", "employee_id", "first_name", "last_name", "gender", "address_street", "address_city", "address_country", "email", "job_title", "current_date AS start_date", "null AS end_date" 104 | ) 105 | 106 | # Union join queries to match incoming rows with existing 107 | scdChangeRows = scdChangeRows.unionByName( 108 | scdType2DF 109 | .where(col("employee_id").isin(empList)), allowMissingColumns=True 110 | ) 111 | # Preview results 112 | display(scdChangeRows) 113 | 114 | # COMMAND ---------- 115 | 116 | # Convert table to Delta 117 | deltaTable = DeltaTable.forName(spark, "scdType2") 118 | 119 | # Merge Delta table with new dataset 120 | ( 121 | deltaTable 122 | .alias("original2") 123 | # Merge using the following conditions 124 | .merge( 125 | scdChangeRows.alias("updates2"), 126 | "original2.id = updates2.id" 127 | ) 128 | # When matched UPDATE ALL values 129 | .whenMatchedUpdate( 130 | set={ 131 | "original2.end_date" : current_date() 132 | } 133 | ) 134 | # When not matched INSERT ALL rows 135 | .whenNotMatchedInsertAll() 136 | # Execute 137 | .execute() 138 | ) 139 | 140 | # COMMAND ---------- 141 | 142 | scdType2DF.selectExpr( 143 | "ROW_NUMBER() OVER (ORDER BY id NULLS LAST) - 1 AS id", "employee_id", "first_name", "last_name", "gender", "address_street", "address_city", "address_country", 144 | "email", "job_title", "start_date", "end_date" 145 | ).write.insertInto(tableName="scdType2", overwrite=True) 146 | 147 | # COMMAND ---------- 148 | 149 | # MAGIC %md ### Check Row 150 | # MAGICCheck row and value(s) have been updated
151 | 152 | # COMMAND ---------- 153 | 154 | display( 155 | sql("SELECT * FROM scdType2 WHERE employee_id = 6 OR employee_id = 21") 156 | ) 157 | 158 | # COMMAND ---------- 159 | 160 | # Check Delta history 161 | sql("DESCRIBE HISTORY scdType2") 162 | --------------------------------------------------------------------------------