├── .gitignore
├── README.md
├── cdc-library
    └── scd_lib.py
├── data
    ├── CDC.drawio
    ├── CDC.png
    ├── day0.csv
    ├── day1.csv
    └── sun-earth.gif
├── hive
    └── tables.hql
└── scd_driver_code.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Change Data Capture (PySpark and Hive)
2 | Change Data Capture (CDC) using PySpark and Hive determines and tracks changes in data over time. It is a type-4 implementation of a Slowly Changing Dimension (SCD), maintaining a snapshot table alongside a history table. Identified records are marked with the following notation:
3 | 
4 | | Record Type | row_opern |
5 | | ----------- | --------- |
6 | | No Change | N |
7 | | Updated | U |
8 | | Inserted | I |
9 | | Deleted | D |
10 | 
11 | Full Description: https://medium.com/@akashmehta10/change-data-capture-cdc-using-spark-hive-big-data-beca5afd669f
12 | 
13 | **Snapshot Table**
14 | 
15 | |eid|name |address |phone_num |row_opern|rec_eff_dt|
16 | |---|----------|------------------------|----------|---------|----------|
17 | |1 |John D. |1200 Flagstaff Rd., TX |9984467826|U |2021-08-08|
18 | |3 |Linda S. |3323 Rivera Blvd., NY |5526631276|U |2021-08-08|
19 | |4 |Michael W.|2227 Ricks Rd., AR |2456548766|N |2021-08-07|
20 | |5 |Sean J. |1312 MacStreet Blvd., WA|2339794455|N |2021-08-07|
21 | |6 |Bethany S.|5354 Britain Rd., OR |5559875643|N |2021-08-07|
22 | |7 |Christy L.|3321 Fountain Blvd., OR |6642346545|I |2021-08-08|
23 | 
24 | 
25 | **History Table**
26 | 
27 | |eid|name |address |phone_num |row_opern|rec_eff_dt|
28 | |---|----------|------------------------|----------|---------|----------|
29 | |1 |John D. |1200 Flagstaff Rd., TX |9984467825|I |2021-08-07|
30 | |1 |John D. |1200 Flagstaff Rd., TX |9984467826|U |2021-08-08|
31 | |2 |Kristy P. |4432 Preston Rd., MA |3243454112|I |2021-08-07|
32 | |2 |Kristy P. |4432 Preston Rd., MA |3243454112|D |2021-08-08|
33 | |3 |Linda S. |1104 Olympus Blvd., NY |5526631276|I |2021-08-07|
34 | |3 |Linda S. |3323 Rivera Blvd., NY |5526631276|U |2021-08-08|
35 | |4 |Michael W.|2227 Ricks Rd., AR |2456548766|I |2021-08-07|
36 | |5 |Sean J. |1312 MacStreet Blvd., WA|2339794455|I |2021-08-07|
37 | |6 |Bethany S.|5354 Britain Rd., OR |5559875643|I |2021-08-07|
38 | |7 |Christy L.|3321 Fountain Blvd., OR |6642346545|I |2021-08-08|
39 | 
40 | ## Configuration Steps
41 | 
42 | ### Step 1
43 | Create the Hive tables. Sample DDL is provided under the `hive` directory.
44 | 
45 | ### Step 2
46 | Include the `scd_lib.py` library and adjust `scd_driver_code.py` to your requirements.
47 | 
48 | ### Step 3
49 | spark-submit --py-files cdc-library/scd_lib.py scd_driver_code.py
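50 | 
51 | ## Library Usage Sketch
52 | A minimal sketch of how the four `scd_lib` functions combine on a subsequent run, assuming `history` (the open snapshot rows) and `current` (the latest extract) are already loaded as DataFrames, as in `scd_driver_code.py`. The key column and effective date below are illustrative:
53 | 
54 | ```python
55 | from scd_lib import (get_no_change_records, get_updated_records,
56 |                      get_new_records, get_deleted_rows)
57 | 
58 | keys = ["eid"]          # natural key used for the joins
59 | run_dt = "2021-08-08"   # effective date stamped on this load
60 | 
61 | updated   = get_updated_records(history, current, keys, run_dt)
62 | deleted   = get_deleted_rows(history, current, keys, run_dt)
63 | inserted  = get_new_records(history, current, keys, run_dt)
64 | unchanged = get_no_change_records(history, current, keys)
65 | 
66 | # The history table receives every change; the snapshot keeps only live rows.
67 | changes  = updated.unionByName(deleted).unionByName(inserted)
68 | snapshot = changes.filter("row_opern != 'D'").unionByName(unchanged)
69 | ```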
--------------------------------------------------------------------------------
/cdc-library/scd_lib.py:
--------------------------------------------------------------------------------
1 | from typing import List, Union
2 | from datetime import datetime
3 | from pyspark.sql import DataFrame
4 | from pyspark.sql.functions import col, lit, md5, concat_ws
5 | 
6 | SCD_COLS = ["rec_eff_dt", "row_opern"]
7 | 
8 | def rename_columns(df: DataFrame, alias: str, keys_list: List, add_suffix: int = 1) -> DataFrame:
9 |     """ Rename columns to denote if they belong to history or current.
10 | 
11 |     Args:
12 |         df (DataFrame): dataframe to perform renaming on
13 |         alias (str): alias to add as suffix
14 |         keys_list (List): list of keys which are used for joins
15 |         add_suffix (int, optional): 0 - remove suffix, 1 - add suffix. Defaults to 1.
16 | 
17 |     Returns:
18 |         DataFrame: dataframe with the suffix added or removed
19 |     """
20 |     if add_suffix == 1:
21 |         for column in set(df.columns) - set(SCD_COLS) - set(keys_list):
22 |             df = df.withColumnRenamed(column, column + "_" + alias)
23 |     else:
24 |         for column in [cols for cols in df.columns if f"_{alias}" in cols]:
25 |             df = df.withColumnRenamed(column, column.replace("_" + alias, ""))
26 |     return df
27 | 
28 | 
29 | def get_hash_column(dataframe: DataFrame, keys_list: List, ignored_columns: List = []) -> DataFrame:
30 |     """ Add a "hash" column computed over the non-key columns. Algorithm used: MD5
31 | 
32 |     Args:
33 |         dataframe (DataFrame): dataframe to add "hash" column to
34 |         keys_list (List): keys which will be used for joins
35 |         ignored_columns (List): columns which will not be included in the hash, apart from SCD_COLS and keys_list
36 | 
37 |     Returns:
38 |         DataFrame: dataframe with added column "hash"
39 |     """
40 |     cols = sorted(list(set(dataframe.columns) - set(keys_list) - set(SCD_COLS) - set(ignored_columns)))
41 |     columns = [col(column) for column in cols]
42 |     if columns:
43 |         return dataframe.withColumn("hash", md5(concat_ws("|", *columns)))
44 |     else:
45 |         return dataframe.withColumn("hash", md5(lit(1)))
46 | 
47 | def get_no_change_records(history: DataFrame,
48 |                           current: DataFrame,
49 |                           keys_list: List, ignored_columns: List = []) -> DataFrame:
50 |     """ Identify rows that have not changed.
51 | 
52 |     Args:
53 |         history (DataFrame): open SCD rows
54 |         current (DataFrame): current status
55 |         keys_list (List): keys list used for joins
56 |         ignored_columns (List, optional): columns not to be considered for determining change.
57 |             Defaults to empty list.
58 | 
59 |     Returns:
60 |         DataFrame: Spark DF with the unchanged rows
61 |     """
62 | 
63 |     history_hash = rename_columns(get_hash_column(history, keys_list, ignored_columns), alias="history", keys_list=keys_list)
64 |     current_hash = rename_columns(get_hash_column(current, keys_list, ignored_columns), alias="current", keys_list=keys_list)
65 | 
66 |     not_changed = rename_columns(history_hash
67 |                                  .join(other=current_hash, on=keys_list, how="inner")
68 |                                  .where(history_hash["hash_history"] == current_hash["hash_current"])
69 |                                  .drop(*["hash_history", "hash_current"])
70 |                                  .drop(*[column for column in current_hash.columns if "_current" in column])
71 |                                  , alias="history", keys_list=keys_list, add_suffix=0).select(history.columns)
72 |     not_changed = not_changed.withColumn("row_opern", lit("N"))
73 | 
74 | 
75 |     return not_changed
76 | 
77 | def get_updated_records(history: DataFrame,
78 |                         current: DataFrame,
79 |                         keys_list: List, rec_eff_dt: str, ignored_columns: List = []) -> DataFrame:
80 |     """ Identify rows that have been updated.
81 | 
82 |     Args:
83 |         history (DataFrame): open SCD rows
84 |         current (DataFrame): current status
85 |         keys_list (List): keys list used for joins
86 |         rec_eff_dt (str): effective date to stamp on the updated records
87 |         ignored_columns (List, optional): columns not to be considered for determining change.
88 |             Defaults to empty list.
89 | 
90 |     Returns:
91 |         DataFrame: Spark DF with the updated rows
92 |     """
93 | 
94 |     history_hash = rename_columns(get_hash_column(history, keys_list, ignored_columns), alias="history", keys_list=keys_list)
95 |     current_hash = rename_columns(get_hash_column(current, keys_list, ignored_columns), alias="current", keys_list=keys_list)
96 | 
97 |     changed = (history_hash
98 |                .join(other=current_hash, on=keys_list, how="inner")
99 |                .where(history_hash["hash_history"] != current_hash["hash_current"])
100 |                )
101 | 
102 |     updated = (rename_columns((changed.withColumn("rec_eff_dt", lit(rec_eff_dt))
103 |                                .drop(*["hash_history", "hash_current"])
104 |                                .drop(*[column for column in changed.columns if "_history" in column])
105 |                                ), alias="current", keys_list=keys_list, add_suffix=0)
106 |                .withColumn("row_opern", lit("U"))
107 |                .select(history.columns))
108 | 
109 |     return updated
110 | 
111 | 
112 | def get_new_records(history: DataFrame,
113 |                     current: DataFrame,
114 |                     keys_list: List, rec_eff_dt: str) -> DataFrame:
115 |     """ Identify newly inserted rows.
116 | 
117 |     Args:
118 |         history (DataFrame): SCD open rows DF
119 |         current (DataFrame): current state
120 |         keys_list (List): keys list for joins
121 |         rec_eff_dt (str): effective date to stamp on the new records
122 | 
123 |     Returns:
124 |         DataFrame: Spark DF containing only the new rows
125 |     """
126 |     new = current.join(other=history, on=keys_list, how="left_anti")
127 |     new = new.withColumn("row_opern", lit("I"))
128 | 
129 |     return new.withColumn("rec_eff_dt", lit(rec_eff_dt))
130 | 
131 | 
132 | def get_deleted_rows(history: DataFrame,
133 |                      current: DataFrame,
134 |                      keys_list: List, rec_eff_dt: str) -> DataFrame:
135 |     """ Identify deleted rows.
136 | 
137 |     Args:
138 |         history (DataFrame): SCD open rows
139 |         current (DataFrame): current state DF
140 |         keys_list (List): keys list for joins
141 |         rec_eff_dt (str): effective date to stamp on the deleted records
142 | 
143 |     Returns:
144 |         DataFrame: Spark DF with the deleted rows
145 |     """
146 | 
147 |     deleted = history.join(other=current, on=keys_list, how="left_anti")
148 |     deleted = deleted.withColumn("row_opern", lit("D"))
149 |     return deleted.withColumn("rec_eff_dt", lit(rec_eff_dt))
--------------------------------------------------------------------------------
/data/CDC.drawio:
--------------------------------------------------------------------------------
1 | 
7VtbU9s4FP41noGHdGL5EuexIdDSbZndAkv71FFs2VZRrNRWIOmvX8mWr3Jz6eIkdPwAWEf3c/nOkY7QjIv56l0MF+En6iGigaG30oypBsDIsflvQVhnBN02RhkliLEnaSXhFv9EkjiU1CX2UFJryCglDC/qRJdGEXJZjQbjmD7Xm/mU1GddwAAphFsXEpX6gD0WZlQHjEr6e4SDMJ9Zt8dZzRzmjeVOkhB69LlCMi414yKmlGVf89UFIoJ5OV8ertcP5OOj/e7DP8kPeD/56+7m30E22NU+XYotxChivz30rbGMCQgnj9/ff/D//nI/M9Y/B4aZjf0EyVIyTG6WrXMOxnQZeUiMMtSMyXOIGbpdQFfUPnOd4bSQzQkv6fwTEhxE/Jsgny91suPK5Q6fUMzQqiI3uZN3iM4Ri9e8iaw1pEykVg5MWX4uRTzONTCsiNdxJBFKtQqKoUvW8Q/JvT04OVL4hjyuiLJIYxbSgEaQXJbUSZ2zZZuPlC4kP78jxtbSquCS0Tq30QqzL6L7G0uWvlZqpis5clpY54WIb7fSSRS/VuvKbmkp77efKBO6jF20oZ0hsQDGAdo0nsQfwcyNihEjAhl+qlv9/5HyplVXzOU6cukcRwGnTiGDG6xH3249Po2YFLbONzkJCEwSyf+ExfSxwDDRugAkUe1jQi4ooXE6reFB5Phu0a1SY7sOmvkd2qZu143TUW1TBy22aXZlms4xTVOvGGZppttMs2aYpZ0exjTtHU1zdCzTbJVyDvc9Au8n5pcTn+z6N8V8fQUcFPGThAMwbth5pmeyV0MJimX8PmTbCmRf4TgRC7zDc8T/fF7ygOVKAzacCziOZsmiYHYVyEM6ny2T7SBeh2WO6FdwjonY/XtEnhDDLmyB+jxwcrlwUdyO93xK4WmMqV2W7lJFHZiqC/B9H7itLsCzZ7Zld+kCRnWZF+WqDxi2xWdd+QCr9wF7gcNI9QGtfDVPKjwbKbb+EHNbFedXuBbnI8EghhmmUXqWjIX64YTRdI0MzghKlZTrKF9dNl8Z33lpfMcPhvEj14p0+WB4HSXcLPjH2fW5oEVeulv3G/L9bx7r40FgmE0wsFQwsFvAYNQVGBhGHyrshQZ6fnOzDQ70F48JdwsqjAMHFTlDKkhzAYm75JsSCBLRgRvCKEAFIiwXHD2QxAYaewI6BvxnBpOUmgLSp6klTAAmYQ8bQDet9rixChtWC2zYXcHGUJH55c1URRJC8CJBO0ipwWoLOZ7ZxmoHzAy7y3BtNLbrCO2oCO20cNrqitPqJWSPz2gT7KrwvOn66lRO7Io93d69/Xz3J1iU7jQuqMfHNSj7mAZ12sefTcearfHO8KQMSk3m5McfLhq+6wgukpCy/qjTldlbyr3HsY86upqx+IoSRVR8f0xrTeQV91GCC9iF5K2smGPPy3ACJfhnqk+ZOBYi6E73YU00ayrG4tCQSNl2yH0exzTCmJHCfbOF+aAz5qsGeUP/UN43MrJFFF7hvXFI3gOzjyH3iyF3vfLL3deJOD1dvfTTgE2ESc34RyA+yrO58Tav5JMV9UWHuEnx8FOTlCxglNPEET5Cz9VjfTONUMxW7Vcfq6Kk9o+leE+SusOBLxMHfM3DYmDuxwwz1dUZFW69QU6bXrR1gDGG5NftExglA+7gsS9IaUW6iAw8xBJ0c7Eq6mbQfQxSexm4mYcVTeJgdgYsqxi08X1ebrDgxCXQHFsbg504VdBqTBaPTb7BiGE+y3eOgCJEKYMZMTc3uITx0vmWWTi5InEhXQ8R1Li46UXXLoYdRLlVbLmgsnlzIZ6/otjyQC+gbPCmfj9mDnXV4+qW6nKLxNvL+9yjZtleoc8Fu96rA3BSPhf0T2q6EvTR3rttXHcluLqPsuxpGvYIkC5TG9mNQIvHzO4PykyruF7wYyieX7waXO8KyJt5Dst01DuDg+Y5wLi37v2sG+xq3af1ZM7Qjyno1+ivdxW0cVoXw/m694Hxag67AeSVi+QeyX+N5LYFjo3k6uPH6oOoSNvrQRSu3f/XdebsXpJK7382Pa+rTp8vaGqM2bg1tUxD1ZiD5gvAr5/Qbckh7aQdVVA5u+n1Y0/9sE01o3Hgp3N/7COYZsp+1ILeL5Sz58Xy3wazJ2nlP18al/8B -------------------------------------------------------------------------------- /data/CDC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akashmehta10/cdc_pyspark_hive/61f4b64fd189f66a5f5dae08a37178d6df09b277/data/CDC.png -------------------------------------------------------------------------------- /data/day0.csv: -------------------------------------------------------------------------------- 1 | 1,"John D.","1200 Flagstaff Rd., TX","9984467825" 2 | 2,"Kristy P.","4432 Preston Rd., MA","3243454112" 3 | 3,"Linda S.","1104 Olympus Blvd., NY","5526631276" 4 | 4,"Michael W.","2227 Ricks Rd., AR","2456548766" 5 | 5,"Sean J.","1312 MacStreet Blvd., WA","2339794455" 6 | 6,"Bethany S.","5354 Britain Rd., OR","5559875643" -------------------------------------------------------------------------------- /data/day1.csv: -------------------------------------------------------------------------------- 1 | 1,"John D.","1200 Flagstaff Rd., TX","9984467826" 2 | 3,"Linda S.","3323 Rivera Blvd., NY","5526631276" 3 | 4,"Michael W.","2227 Ricks Rd., AR","2456548766" 4 | 5,"Sean J.","1312 MacStreet Blvd., WA","2339794455" 5 | 6,"Bethany S.","5354 Britain Rd., OR","5559875643" 6 | 7,"Christy L.","3321 Fountain Blvd., OR","6642346545" -------------------------------------------------------------------------------- 
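The two CSV extracts above carry no header row. Below is a minimal sketch (the local file paths and app name are illustrative; the driver itself reads the files from /tmp/) of loading them with the schema the driver expects and diffing the keys between the two days:

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.appName("cdc_data_check").getOrCreate()

# Same column layout that scd_driver_code.py declares for the employee feed.
schema = StructType([
    StructField("eid", IntegerType(), False),
    StructField("name", StringType(), False),
    StructField("address", StringType(), False),
    StructField("phone_num", StringType(), False),
])

day0 = spark.read.csv("data/day0.csv", schema=schema, header=False)
day1 = spark.read.csv("data/day1.csv", schema=schema, header=False)

# Keys present on day 0 but missing on day 1 (candidates for row_opern = 'D'): eid 2
day0.join(day1, on="eid", how="left_anti").show()
# Keys new on day 1 (candidates for row_opern = 'I'): eid 7
day1.join(day0, on="eid", how="left_anti").show()
```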
/data/sun-earth.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akashmehta10/cdc_pyspark_hive/61f4b64fd189f66a5f5dae08a37178d6df09b277/data/sun-earth.gif
--------------------------------------------------------------------------------
/hive/tables.hql:
--------------------------------------------------------------------------------
1 | CREATE TABLE employee
2 | (
3 |   eid INT,
4 |   name STRING,
5 |   address STRING,
6 |   phone_num STRING,
7 |   row_opern STRING,
8 |   rec_eff_dt STRING
9 | )
10 | STORED AS PARQUET;
11 | 
12 | CREATE TABLE employee_history
13 | (
14 |   eid INT,
15 |   name STRING,
16 |   address STRING,
17 |   phone_num STRING,
18 |   row_opern STRING
19 | )
20 | PARTITIONED BY
21 | (
22 |   rec_eff_dt STRING
23 | )
24 | STORED AS PARQUET;
--------------------------------------------------------------------------------
/scd_driver_code.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import datetime, time
3 | from pyspark.sql import SparkSession
4 | from pyspark import SparkContext, SparkConf
5 | from pyspark.sql import functions as F
6 | from pyspark.sql.types import StructType, StructField, StringType, IntegerType
7 | from scd_lib import get_no_change_records, get_updated_records, get_new_records, get_deleted_rows
8 | 
9 | conf = SparkConf()
10 | conf.set('hive.vectorized.execution', 'true')
11 | conf.set('hive.vectorized.execution.enabled', 'true')
12 | conf.set('hive.cbo.enable', 'true')
13 | conf.set('hive.compute.query.using.stats', 'true')
14 | conf.set('hive.stats.fetch.column.stats', 'true')
15 | conf.set('hive.stats.fetch.partition.stats', 'true')
16 | conf.set('spark.cleaner.referenceTracking.cleanCheckpoints', 'true')
17 | 
18 | 
19 | spark = SparkSession.builder.appName("scd_driver_program").config(conf=conf).enableHiveSupport().getOrCreate()
20 | spark.sql('set hive.exec.dynamic.partition=True')
21 | spark.sql('set hive.exec.dynamic.partition.mode=nonstrict')
22 | 
23 | schema = StructType([
24 |     StructField(name='eid', dataType=IntegerType(), nullable=False),
25 |     StructField(name='name', dataType=StringType(), nullable=False),
26 |     StructField(name='address', dataType=StringType(), nullable=False),
27 |     StructField(name='phone_num', dataType=StringType(), nullable=False)
28 | ])
29 | 
30 | 
31 | #rec_eff_dt = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d')
32 | rec_eff_dt = '2021-08-07'
33 | # To simulate the second (day 1) run
34 | #rec_eff_dt = '2021-08-08'
35 | key_list = ['eid']
36 | ignored_columns = []
37 | 
38 | history = spark.sql(''' select * from scd_hive_db.employee ''')
39 | current = spark.read.format('csv').option("header", False).schema(schema).load("/tmp/day0.csv")
40 | # To simulate the second (day 1) run
41 | #current = spark.read.format('csv').option("header", False).schema(schema).load("/tmp/day1.csv")
42 | 
43 | if len(history.head(1)) == 0:
44 |     # First time run, mark everything as an INSERT
45 |     print("First time run started...")
46 |     new_rows_df = get_new_records(history, current, key_list, rec_eff_dt)
47 |     new_rows_df = new_rows_df.select(history.columns)
48 |     print("Writing to History...")
49 |     new_rows_df.write.mode("overwrite").insertInto("scd_hive_db.employee_history", overwrite=True)
50 |     print("Data saved to History...")
51 |     print("Writing to Snapshot...")
52 |     new_rows_df.write.mode("overwrite").saveAsTable("scd_hive_db.cdc_employee_temp")
53 |     new_rows_df_temp_table = spark.sql(''' select * from scd_hive_db.cdc_employee_temp ''')
54 |     new_rows_df_temp_table.write.mode("overwrite").insertInto("scd_hive_db.employee", overwrite=True)
55 |     print("Data saved to Snapshot...")
56 |     print("First time run end...")
57 | else:
58 |     print("Subsequent run started...")
59 |     updated_rows_df = get_updated_records(history, current, key_list, rec_eff_dt, ignored_columns)
60 |     deleted_rows_df = get_deleted_rows(history, current, key_list, rec_eff_dt)
61 |     new_rows_df = get_new_records(history, current, key_list, rec_eff_dt)
62 |     unchanged_rows_df = get_no_change_records(history, current, key_list, ignored_columns)
63 |     history_df = updated_rows_df.unionByName(deleted_rows_df).unionByName(new_rows_df)
64 |     history_df = history_df.select(history.columns)
65 |     print("Writing to History...")
66 |     history_df.write.mode("overwrite").insertInto("scd_hive_db.employee_history", overwrite=True)
67 |     print("Data saved to History...")
68 |     unchanged_rows_df = unchanged_rows_df.select(history.columns)
69 |     snapshot_df = history_df.filter("row_opern != 'D'").unionByName(unchanged_rows_df)
70 |     print("Writing to Snapshot...")
71 |     snapshot_df.write.mode("overwrite").saveAsTable("scd_hive_db.cdc_employee_temp")
72 |     new_rows_df_temp_table = spark.sql(''' select * from scd_hive_db.cdc_employee_temp ''')
73 |     new_rows_df_temp_table.write.mode("overwrite").insertInto("scd_hive_db.employee", overwrite=True)
74 |     print("Data saved to Snapshot...")
75 | 
76 | print("Finished CDC Processing!")
77 | 
--------------------------------------------------------------------------------
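The driver above assumes Hive tables in `scd_hive_db`. As a quick way to exercise the library without Hive, here is a minimal, self-contained sketch (table contents, app name, and the run date are illustrative) that builds small `history` and `current` DataFrames in memory and prints what each `scd_lib` function classifies:

```python
from pyspark.sql import SparkSession
from scd_lib import (get_no_change_records, get_updated_records,
                     get_new_records, get_deleted_rows)

spark = SparkSession.builder.appName("scd_lib_smoke_test").getOrCreate()

# "Open" snapshot rows, i.e. the current state of the employee snapshot table.
history = spark.createDataFrame(
    [(1, "John D.",    "1200 Flagstaff Rd., TX", "9984467825", "I", "2021-08-07"),
     (2, "Kristy P.",  "4432 Preston Rd., MA",   "3243454112", "I", "2021-08-07"),
     (4, "Michael W.", "2227 Ricks Rd., AR",     "2456548766", "I", "2021-08-07")],
    ["eid", "name", "address", "phone_num", "row_opern", "rec_eff_dt"])

# Today's extract: eid 1 changed phone, eid 2 disappeared, eid 3 is new, eid 4 is unchanged.
current = spark.createDataFrame(
    [(1, "John D.",    "1200 Flagstaff Rd., TX", "9984467826"),
     (3, "Linda S.",   "1104 Olympus Blvd., NY", "5526631276"),
     (4, "Michael W.", "2227 Ricks Rd., AR",     "2456548766")],
    ["eid", "name", "address", "phone_num"])

keys, run_dt = ["eid"], "2021-08-08"
get_updated_records(history, current, keys, run_dt).show()   # eid 1 -> U
get_new_records(history, current, keys, run_dt).show()       # eid 3 -> I
get_deleted_rows(history, current, keys, run_dt).show()      # eid 2 -> D
get_no_change_records(history, current, keys).show()         # eid 4 -> N
```

To run the real pipeline instead, submit the driver as described in the README: `spark-submit --py-files cdc-library/scd_lib.py scd_driver_code.py`.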