├── LICENSE ├── README.md ├── cdc_logical_replication_pgoutput.py ├── cdc_logical_replication_test_decoding.py └── utilities ├── __init__.py ├── message_decoder.py └── message_formatter.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Vinjit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cdc_postgres_logical_replication 2 | This is an implementation of CDC on a PostgreSQL database using the pgoutput and test_decoding output plugins, written in Python. 3 | 4 | The underlying logic for the pgoutput decoder is https://www.postgresql.org/docs/12/protocol-logicalrep-message-formats.html. 5 | 6 | There are 2 scripts: one for the test_decoding output plugin and the other for the pgoutput plugin. 
7 | 8 | The flow of code is as follows: 9 | 10 | 1. Pgoutput 11 | Postgresql DB -> postgresql WAL -> cdc_logical_replication_pgoutput.py will bring the encoded data from WAL -> Algorithm of utilities/message_decoder.py decodes the message to string format -> Algorithm of utilities/message_formatter.py converts the operation messages [Insert('I'), Update('U'), Delete('D'), Truncate('T')] to JSON format. 12 | 13 | Raw decoded messages are converted to JSON so that they are more readable, understandable, contain column names and column values together as key-value pairs, and can easily be used further. In raw decoded messages, column names are present only in the Relation('R') message, and it is generated by the CDC WAL only when a table appears for the first time in the current run or when the table schema is altered. On the other hand, operation messages (I, U, D, T) have only the column values. Hence, for better usability, the formatter is created. 14 | 15 | 2. Test_decoding 16 | Postgresql DB -> postgresql WAL -> cdc_logical_replication_test_decoding.py will bring the decoded data from WAL. 
17 | 18 | 19 | Keywords - cdc, WAL, postgresql, postgres, logical_replication, test_decoding, pgoutput, pgoutput_decoding, change_data_capture 20 | -------------------------------------------------------------------------------- /cdc_logical_replication_pgoutput.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 11 2022 3 | @author: Vinjit 4 | """ 5 | 6 | import psycopg2 7 | import time 8 | from psycopg2.extras import LogicalReplicationConnection 9 | from utilities import message_decoder, message_formatter 10 | 11 | def main(): # Main code of CDC pgoutput 12 | conn = psycopg2.connect( 13 | database="mydatabasename", 14 | user='myusername', 15 | password='mypassword', 16 | host='myhost', 17 | port='5432', 18 | connection_factory=LogicalReplicationConnection 19 | ) 20 | cur = conn.cursor() 21 | options = {'publication_names': 'MyPublicationName', 'proto_version': '1'} 22 | 23 | try: 24 | cur.start_replication(slot_name='MySlotName', decode=False, options=options) 25 | except psycopg2.ProgrammingError: 26 | cur.create_replication_slot('MySlotName',output_plugin='pgoutput') 27 | cur.start_replication(slot_name='MySlotName', decode=False, options=options) 28 | 29 | class DemoConsumer(object): 30 | def __call__(self, msg): 31 | print("Original Encoded Message:\n",msg.payload) # Original encoded message 32 | message = message_decoder.decode_message(msg.payload) 33 | print("\nOriginal Decoded Message:\n",message) # Original decoded message 34 | formatted_message = message_formatter.get_message(str(message)) 35 | print("\nFormatted Message:\n", formatted_message) # Decoded message formatted to JSON 36 | msg.cursor.send_feedback(flush_lsn=msg.data_start) 37 | print("\n#################### END OF MESSAGE #########################\n\n") 38 | 39 | democonsumer = DemoConsumer() 40 | 41 | def start_stream(): 42 | cur.consume_stream(democonsumer) 43 | try: 44 | start_stream() 45 | except: 46 | print('\nStopping 
Replication') 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /cdc_logical_replication_test_decoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 4 2022 3 | @author:Vinjit 4 | """ 5 | 6 | import psycopg2 7 | from psycopg2.extras import LogicalReplicationConnection 8 | 9 | conn = psycopg2.connect( 10 | host='myhost', 11 | dbname='mydatabasename', 12 | user='myusername', 13 | password='mypassword', 14 | port='5432', 15 | connection_factory=LogicalReplicationConnection 16 | ) 17 | 18 | cur = conn.cursor() 19 | class DemoConsumer(object): 20 | def __call__(self, msg): 21 | print("\nOriginal Decoded Message:\n",msg.payload) 22 | msg.cursor.send_feedback(flush_lsn=msg.data_start) 23 | print("\n#################### END OF MESSAGE #######################\n\n") 24 | 25 | try: 26 | cur.start_replication(slot_name='MyPublicationName', decode=True) 27 | except psycopg2.ProgrammingError: 28 | cur.create_replication_slot('MyPublicationName', output_plugin='test_decoding') 29 | cur.start_replication(slot_name='MyPublicationName', decode=True) 30 | 31 | democonsumer = DemoConsumer() 32 | cur.consume_stream(democonsumer) 33 | 34 | -------------------------------------------------------------------------------- /utilities/__init__.py: -------------------------------------------------------------------------------- 1 | from utilities.message_decoder import PgoutputMessage 2 | from utilities.message_decoder import Begin 3 | from utilities.message_decoder import Commit 4 | from utilities.message_decoder import Origin 5 | from utilities.message_decoder import Relation 6 | from utilities.message_decoder import TupleData 7 | from utilities.message_decoder import Insert 8 | from utilities.message_decoder import Update 9 | from utilities.message_decoder import Delete 10 | from utilities.message_decoder import Truncate 11 | from 
utilities.message_decoder import decode_message 12 | from utilities.message_decoder import ColumnData -------------------------------------------------------------------------------- /utilities/message_decoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 4 2022 3 | @author: made by dgea005 [https://github.com/panoplyio/py-pgoutput/tree/master/src/pypgoutput], modified by Vinjit 4 | """ 5 | 6 | from abc import ABC, abstractmethod 7 | from dataclasses import dataclass 8 | from datetime import datetime, timezone, timedelta 9 | import io 10 | from typing import Tuple, Union, Optional, List 11 | 12 | 13 | # integer byte lengths 14 | INT8 = 1 15 | INT16 = 2 16 | INT32 = 4 17 | INT64 = 8 18 | 19 | 20 | def convert_pg_ts(_ts_in_microseconds: int) -> datetime: 21 | ts = datetime(2000, 1, 1, 0, 0, 0, 0, tzinfo=timezone.utc) 22 | return ts + timedelta(microseconds=_ts_in_microseconds) 23 | 24 | def convert_bytes_to_int(_in_bytes: bytes) -> int: 25 | return int.from_bytes(_in_bytes, byteorder='big', signed=True) 26 | 27 | def convert_bytes_to_utf8(_in_bytes: Union[bytes, bytearray]) -> str: 28 | return (_in_bytes).decode('utf-8') 29 | 30 | 31 | @dataclass(frozen=True) 32 | class ColumnData: 33 | # col_data_category is NOT the type. 
it means null value/toasted(not sent)/text formatted 34 | col_data_category: Optional[str] 35 | col_data_length: Optional[int] = None 36 | col_data: Optional[str] = None 37 | 38 | 39 | @dataclass(frozen=True) 40 | class TupleData: 41 | n_columns: Optional[int] 42 | column_data: Optional[List[ColumnData]] 43 | 44 | def __repr__(self): 45 | return f"( n_columns : {self.n_columns}, data : {self.column_data})" 46 | 47 | 48 | class PgoutputMessage(ABC): 49 | def __init__(self, buffer: bytes): 50 | self.buffer: io.BytesIO = io.BytesIO(buffer) 51 | self.byte1: str = self.read_utf8(1) 52 | self.decode_buffer() 53 | 54 | @abstractmethod 55 | def decode_buffer(self): 56 | pass 57 | 58 | @abstractmethod 59 | def __repr__(self): 60 | pass 61 | 62 | def read_int8(self) -> int: 63 | return convert_bytes_to_int(self.buffer.read(INT8)) 64 | 65 | def read_int16(self) -> int: 66 | return convert_bytes_to_int(self.buffer.read(INT16)) 67 | 68 | def read_int32(self) -> int: 69 | return convert_bytes_to_int(self.buffer.read(INT32)) 70 | 71 | def read_int64(self) -> int: 72 | return convert_bytes_to_int(self.buffer.read(INT64)) 73 | 74 | def read_utf8(self, n: int = 1) -> str: 75 | return convert_bytes_to_utf8(self.buffer.read(n)) 76 | 77 | def read_timestamp(self) -> datetime: 78 | # 8 chars -> int64 -> timestamp 79 | return convert_pg_ts(_ts_in_microseconds=self.read_int64()) 80 | 81 | def read_string(self): 82 | output = bytearray() 83 | next_char = self.buffer.read(1) 84 | while (next_char != b'\x00'): 85 | output += next_char 86 | next_char = self.buffer.read(1) 87 | return convert_bytes_to_utf8(output) 88 | 89 | # def read_string(self): 90 | # output = bytearray() 91 | # while ((next_char := self.buffer.read(1)) != b'\x00'): 92 | # output += next_char 93 | # return convert_bytes_to_utf8(output) 94 | 95 | def read_tuple_data(self) -> TupleData: 96 | """ 97 | TupleData 98 | Int16 Number of columns. 
99 | Next, one of the following submessages appears for each column (except generated columns): 100 | Byte1('n') Identifies the data as NULL value. 101 | Or 102 | Byte1('u') Identifies unchanged TOASTed value (the actual value is not sent). 103 | Or 104 | Byte1('t') Identifies the data as text formatted value. 105 | Int32 Length of the column value. 106 | Byten The value of the column, in text format. (A future release might support additional formats.) n is the above length. 107 | """ 108 | column_data = list() 109 | n_columns = self.read_int16() 110 | for column in range(n_columns): 111 | col_data_category = self.read_utf8() 112 | if col_data_category in ("n", "u"): 113 | # "n"=NULL, "t"=TOASTed 114 | column_data.append(ColumnData(col_data_category=col_data_category)) 115 | elif col_data_category == "t": 116 | # t = tuple 117 | col_data_length = self.read_int32() 118 | col_data = self.read_utf8(col_data_length) 119 | column_data.append(ColumnData(col_data_category=col_data_category, col_data_length=col_data_length, col_data=col_data)) 120 | else: 121 | # what happens with the generated columns? should an empty ColumnData be returned? 122 | pass 123 | return TupleData(n_columns=n_columns, column_data=column_data) 124 | 125 | 126 | class Begin(PgoutputMessage): 127 | """ 128 | https://pgpedia.info/x/xlogrecptr.html 129 | https://www.postgresql.org/docs/14/datatype-pg-lsn.html 130 | 131 | byte1 Byte1('B') Identifies the message as a begin message. 132 | final_tx_lsn Int64 The final LSN of the transaction. 133 | commit_tx_ts Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). 134 | tx_xid Int32 Xid of the transaction. 
135 | """ 136 | byte1: str 137 | final_tx_lsn: int 138 | commit_tx_ts: datetime 139 | tx_xid: int 140 | 141 | def decode_buffer(self): 142 | if self.byte1 != 'B': 143 | raise Exception('first byte in buffer does not match Begin message') 144 | self.final_tx_lsn = self.read_int64() 145 | self.commit_tx_ts = self.read_timestamp() 146 | self.tx_xid = self.read_int64() 147 | return self 148 | 149 | def __repr__(self): 150 | return f"\tOperation : BEGIN, \n\tbyte1 : '{self.byte1}', \n\tfinal_tx_lsn : {self.final_tx_lsn}, " \ 151 | f"\n\tcommit_tx_ts : {self.commit_tx_ts}, \n\ttx_xid : {self.tx_xid}" 152 | 153 | 154 | class Commit(PgoutputMessage): 155 | """ 156 | byte1: Byte1('C') Identifies the message as a commit message. 157 | flags: Int8 Flags; currently unused (must be 0). 158 | lsn_commit: Int64 The LSN of the commit. 159 | final_tx_lsn: Int64 The end LSN of the transaction. 160 | Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). 161 | """ 162 | byte1: str 163 | flags: int 164 | lsn_commit: int 165 | final_tx_lsn: int 166 | commit_tx_ts: datetime 167 | 168 | def decode_buffer(self): 169 | if self.byte1 != 'C': 170 | raise Exception('first byte in buffer does not match Commit message') 171 | self.flags = self.read_utf8() 172 | self.lsn_commit = self.read_int64() 173 | self.final_tx_lsn = self.read_int64() 174 | self.commit_tx_ts = self.read_timestamp() 175 | return self 176 | 177 | def __repr__(self): 178 | return f"\tOperation : COMMIT, \n\tbyte1 : {self.byte1}, \n\tflags : {self.flags}, \n\tlsn_commit : {self.lsn_commit}, " \ 179 | f"\n\tfinal_tx_lsn : {self.final_tx_lsn}, \n\tcommit_tx_ts : {self.commit_tx_ts}" 180 | 181 | 182 | class Origin: 183 | """ 184 | Byte1('O') Identifies the message as an origin message. 185 | Int64 The LSN of the commit on the origin server. 186 | String Name of the origin. 187 | Note that there can be multiple Origin messages inside a single transaction. 
188 | This seems to be what origin means: https://www.postgresql.org/docs/12/replication-origins.html 189 | """ 190 | pass 191 | 192 | 193 | class Relation(PgoutputMessage): 194 | """ 195 | Byte1('R') Identifies the message as a relation message. 196 | Int32 ID of the relation. 197 | String Namespace (empty string for pg_catalog). 198 | String Relation name. 199 | Int8 Replica identity setting for the relation (same as relreplident in pg_class). 200 | # select relreplident from pg_class where relname = 'test_table'; 201 | # from reading the documentation and looking at the tables this is not int8 but a single character 202 | # background: https://www.postgresql.org/docs/10/sql-altertable.html#SQL-CREATETABLE-REPLICA-IDENTITY 203 | Int16 Number of columns. 204 | Next, the following message part appears for each column (except generated columns): 205 | Int8 Flags for the column. Currently can be either 0 for no flags or 1 which marks the column as part of the key. 206 | String Name of the column. 207 | Int32 ID of the column's data type. 208 | Int32 Type modifier of the column (atttypmod). 
209 | """ 210 | byte1: str 211 | relation_id: int 212 | namespace: str 213 | relation_name: str 214 | replica_identity_setting: str 215 | n_columns: int 216 | # TODO define column type, could eventually look this up from the DB 217 | columns: List[Tuple[int, str, int, int]] 218 | 219 | def decode_buffer(self): 220 | if self.byte1 != 'R': 221 | raise Exception('first byte in buffer does not match Relation message') 222 | self.relation_id = self.read_int32() 223 | self.namespace = self.read_string() 224 | self.relation_name = self.read_string() 225 | self.replica_identity_setting = self.read_utf8() 226 | self.n_columns = self.read_int16() 227 | self.columns = list() 228 | 229 | for column in range(self.n_columns): 230 | part_of_pkey = self.read_int8() 231 | col_name = self.read_string() 232 | data_type_id = self.read_int32() 233 | # TODO: check on use of signed / unsigned 234 | # check with select oid from pg_type where typname = ; timestamp == 1184, int4 = 23 235 | col_modifier = self.read_int32() 236 | self.columns.append((part_of_pkey, col_name, data_type_id, col_modifier)) 237 | 238 | def __repr__(self): 239 | return f"\tOperation : RELATION, \n\tbyte1 : '{self.byte1}', \n\trelation_id : {self.relation_id}" \ 240 | f",\n\tnamespace/schema : '{self.namespace}',\n\trelation_name : '{self.relation_name}'" \ 241 | f",\n\treplica_identity_setting : '{self.replica_identity_setting}',\n\tn_columns : {self.n_columns} " \ 242 | f",\n\tcolumns : {self.columns}" 243 | 244 | 245 | class PgType: 246 | """ 247 | Renamed to PgType not to collide with "type" 248 | 249 | Byte1('Y') Identifies the message as a type message. 250 | Int32 ID of the data type. 251 | String Namespace (empty string for pg_catalog). 252 | String Name of the data type. 253 | """ 254 | pass 255 | 256 | 257 | class Insert(PgoutputMessage): 258 | """ 259 | Byte1('I') Identifies the message as an insert message. 260 | Int32 ID of the relation corresponding to the ID in the relation message. 
261 | Byte1('N') Identifies the following TupleData message as a new tuple. 262 | TupleData TupleData message part representing the contents of new tuple. 263 | """ 264 | byte1: str 265 | relation_id: int 266 | new_tuple_byte: str 267 | new_tuple: TupleData 268 | 269 | def decode_buffer(self): 270 | if self.byte1 != 'I': 271 | raise Exception(f"first byte in buffer does not match Insert message (expected 'I', got '{self.byte1}'") 272 | self.relation_id = self.read_int32() 273 | self.new_tuple_byte = self.read_utf8() 274 | self.new_tuple = self.read_tuple_data() 275 | return self 276 | 277 | def __repr__(self): 278 | return f"\tOperation : INSERT, \n\tbyte1: '{self.byte1}', \n\trelation_id : {self.relation_id}, " \ 279 | f"\n\tnew tuple byte: '{self.new_tuple_byte}', \n\tnew_tuple : {self.new_tuple}" 280 | 281 | 282 | class Update(PgoutputMessage): 283 | """ 284 | Byte1('U') Identifies the message as an update message. 285 | Int32 ID of the relation corresponding to the ID in the relation message. 286 | Byte1('K') Identifies the following TupleData submessage as a key. This field is optional and is only present if the update changed data in any of the column(s) that are part of the REPLICA IDENTITY index. 287 | Byte1('O') Identifies the following TupleData submessage as an old tuple. This field is optional and is only present if table in which the update happened has REPLICA IDENTITY set to FULL. 288 | TupleData TupleData message part representing the contents of the old tuple or primary key. Only present if the previous 'O' or 'K' part is present. 289 | Byte1('N') Identifies the following TupleData message as a new tuple. 290 | TupleData TupleData message part representing the contents of a new tuple. 291 | 292 | The Update message may contain either a 'K' message part or an 'O' message part or neither of them, but never both of them. 
293 | """ 294 | byte1: str 295 | relation_id: int 296 | next_byte_identifier: Optional[str] 297 | optional_tuple_identifier: Optional[str] 298 | old_tuple: Optional[TupleData] 299 | new_tuple_byte: str 300 | new_tuple: TupleData 301 | 302 | def decode_buffer(self): 303 | self.optional_tuple_identifier = None 304 | self.old_tuple = None 305 | if self.byte1 != 'U': 306 | raise Exception(f"first byte in buffer does not match Update message (expected 'U', got '{self.byte1}'") 307 | self.relation_id = self.read_int32() 308 | # TODO test update to PK, test update with REPLICA IDENTITY = FULL 309 | self.next_byte_identifier = self.read_utf8() # one of K, O or N 310 | if self.next_byte_identifier == 'K' or self.next_byte_identifier == 'O': 311 | self.optional_tuple_identifier = self.next_byte_identifier 312 | self.old_tuple = self.read_tuple_data() 313 | self.new_tuple_byte = self.read_utf8() 314 | else: 315 | self.new_tuple_byte = self.next_byte_identifier 316 | if self.new_tuple_byte != 'N': 317 | raise Exception(f"did not find new_tuple_byte ('N') at position: {self.buffer.tell()}, found: '{self.new_tuple_byte}'") 318 | self.new_tuple = self.read_tuple_data() 319 | return self 320 | 321 | def __repr__(self): 322 | return f"\tOperation : UPDATE, \n\tbyte1 : '{self.byte1}', \n\trelation_id : {self.relation_id}, " \ 323 | f" \n\toptional_tuple_identifier : '{self.optional_tuple_identifier}', \n\toptional_old_tuple_data : {self.old_tuple}, " \ 324 | f" \n\tnew_tuple_byte : '{self.new_tuple_byte}', \n\tnew_tuple : {self.new_tuple}" 325 | 326 | 327 | class Delete(PgoutputMessage): 328 | """ 329 | Byte1('D') Identifies the message as a delete message. 330 | Int32 ID of the relation corresponding to the ID in the relation message. 331 | Byte1('K') Identifies the following TupleData submessage as a key. This field is present if the table in which the delete has happened uses an index as REPLICA IDENTITY. 332 | Byte1('O') Identifies the following TupleData message as a old tuple. 
This field is present if the table in which the delete has happened has REPLICA IDENTITY set to FULL. 333 | TupleData TupleData message part representing the contents of the old tuple or primary key, depending on the previous field. 334 | 335 | The Delete message may contain either a 'K' message part or an 'O' message part, but never both of them. 336 | """ 337 | byte1: str 338 | relation_id: int 339 | message_type: str 340 | old_tuple: TupleData 341 | 342 | def decode_buffer(self): 343 | if self.byte1 != 'D': 344 | raise Exception(f"first byte in buffer does not match Delete message (expected 'D', got '{self.byte1}'") 345 | self.relation_id = self.read_int32() 346 | self.message_type = self.read_utf8() 347 | if self.message_type not in ['K','O']: 348 | raise Exception(f"message type byte is not 'K' or 'O', got : '{self.message_type}'") 349 | self.old_tuple = self.read_tuple_data() 350 | return self 351 | 352 | def __repr__(self): 353 | return f"\tOperation : DELETE, \n\tbyte1 : {self.byte1}, \n\trelation_id : {self.relation_id}, " \ 354 | f"\n\tmessage_type : {self.message_type}, \n\told_tuple : {self.old_tuple}" 355 | 356 | 357 | class Truncate(PgoutputMessage): 358 | """ 359 | Byte1('T') Identifies the message as a truncate message. 360 | Int32 Number of relations 361 | Int8 Option bits for TRUNCATE: 1 for CASCADE, 2 for RESTART IDENTITY 362 | Int32 ID of the relation corresponding to the ID in the relation message. This field is repeated for each relation. 
363 | """ 364 | byte1: str 365 | number_of_relations: int 366 | option_bits: str 367 | relation_ids: List[int] 368 | 369 | def decode_buffer(self): 370 | if self.byte1 != 'T': 371 | raise Exception(f"first byte in buffer does not match Truncate message (expected 'T', got '{self.byte1}'") 372 | self.number_of_relations = self.read_int32() 373 | self.option_bits = self.read_int8() 374 | self.relation_ids = [] 375 | for relation in range(self.number_of_relations): 376 | self.relation_ids.append(self.read_int32()) 377 | 378 | def __repr__(self): 379 | return f"\tOperation : TRUNCATE, \n\tbyte1 : {self.byte1}, \n\tn_relations : {self.number_of_relations}, "\ 380 | f"\n\toption_bits : {self.option_bits}, relation_ids : {self.relation_ids}" 381 | 382 | 383 | def decode_message(_input_bytes: bytes) -> Optional[Union[Begin, Commit, Relation, Insert, Update, Delete, Truncate]]: 384 | """Peak first byte and initialise the appropriate message object""" 385 | first_byte = (_input_bytes[:1]).decode('utf-8') 386 | if first_byte == 'B': 387 | output = Begin(_input_bytes) 388 | elif first_byte == "C": 389 | output = Commit(_input_bytes) 390 | elif first_byte == "R": 391 | output = Relation(_input_bytes) 392 | elif first_byte == "I": 393 | output = Insert(_input_bytes) 394 | elif first_byte == "U": 395 | output = Update(_input_bytes) 396 | elif first_byte == 'D': 397 | output = Delete(_input_bytes) 398 | elif first_byte == 'T': 399 | output = Truncate(_input_bytes) 400 | else: 401 | print(f"warning unrecognised message {_input_bytes}") 402 | output = None 403 | return output 404 | -------------------------------------------------------------------------------- /utilities/message_formatter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 12 2022 3 | @author: Vinjit 4 | """ 5 | 6 | import re 7 | 8 | relation_pool={} 9 | trans_info = [None] * 3 10 | 11 | def update_timestamp(splitted_records): 12 | 
trans_info[0]=(splitted_records[3].split(' : ')[1]).strip() # timestamp 13 | trans_info[1]=(splitted_records[2].split(':')[1]).strip() # lsn 14 | trans_info[2]=(splitted_records[4].split(':')[1]).strip() # xid 15 | 16 | def replace_relation(splitted_records,relation_id,relationdata): 17 | new_schema_name = ((splitted_records[3].split(':')[1]).strip()).strip("'") 18 | # print (new_schema_name) 19 | 20 | new_table_name = ((splitted_records[4].split(':')[1]).strip()).strip("'") 21 | # print (new_table_name) 22 | 23 | new_col_list = [] 24 | new_col_record = relationdata.split(':')[-1] 25 | # print('col rec : ', new_col_record) 26 | new_col_list=re.findall(r'\'.*?\'', new_col_record) 27 | for i in range(len(new_col_list)): 28 | new_col_list[i]=new_col_list[i].strip("'") 29 | 30 | relation_pool[relation_id] = { 31 | "Namespace/schema" : new_schema_name, 32 | "Table_name" : new_table_name, 33 | "Columns" : new_col_list 34 | } 35 | 36 | # print("\nThe current relation pool is : ",relation_pool) 37 | 38 | 39 | def add_relation(splitted_records,relation_id,relationdata): 40 | schema_name = ((splitted_records[3].split(':')[1]).strip()).strip("'") 41 | # print (schema_name) 42 | 43 | table_name = ((splitted_records[4].split(':')[1]).strip()).strip("'") 44 | # print (table_name) 45 | 46 | col_list = [] 47 | col_record = relationdata.split(':')[-1] 48 | # print('col rec : ',col_record) 49 | col_list=re.findall(r'\'.*?\'',col_record) 50 | for i in range(len(col_list)): 51 | col_list[i]=col_list[i].strip("'") 52 | 53 | relation_dict = { 54 | "Namespace/schema" : schema_name, 55 | "Table_name" : table_name, 56 | "Columns" : col_list 57 | } 58 | # print(relation_dict) 59 | 60 | relation_pool.update({relation_id : relation_dict}) 61 | # print("\nThe current relation pool is : ",relation_pool) 62 | 63 | def update_relation_pool(splitted_records,relationdata): 64 | relation_id=(splitted_records[2].split(':')[1]).strip() 65 | # print("relation id is: ",relation_id) 66 | if relation_id 
in relation_pool: 67 | # print("rel found") 68 | replace_relation(splitted_records,relation_id,relationdata) 69 | # print(f"\nSUCCESS : Relation {relation_id} updated in the relation pool") 70 | 71 | else: 72 | # print("rel not found") 73 | add_relation(splitted_records,relation_id,relationdata) 74 | # print(f"\nSUCCESS : New Relation {relation_id} added to the relation pool") 75 | 76 | def create_insert_message_json(splitted_records,insertdata): 77 | relation_id = (splitted_records[2].split(':')[1]).strip() 78 | if relation_id in relation_pool: 79 | rels=relation_pool[relation_id] 80 | col_names=rels['Columns'] 81 | schema_name=rels['Namespace/schema'] 82 | table_name=rels['Table_name'] 83 | 84 | else: 85 | # print(f"\nERROR : Relation id {relation_id} not found in pool") 86 | raise Exception (f"\nERROR : Relation id {relation_id} not found in pool") 87 | 88 | raw_cols = (insertdata.split(':')[-1]).split(',') 89 | # print(raw_cols) 90 | col_data_string = "" 91 | for i in range(2,len(raw_cols),3): 92 | col_data_string+=raw_cols[i] 93 | col_data=re.findall(r'\'.*?\'', col_data_string) 94 | 95 | for i in range(len(col_data)): 96 | col_data[i]=col_data[i].strip("'") 97 | # print(col_data) 98 | 99 | if(len(col_data) != len(col_names)): 100 | # print(f"\nERROR : Number of columns inserted is not equal to number of columns in relation {relation_id}") 101 | raise Exception (f"\nERROR : Number of columns inserted is not equal to number of columns in relation {relation_id}") 102 | 103 | insert_message = { 104 | "Operation" : "INSERT", 105 | "LSN" : trans_info[1], 106 | "Transaction_Xid" : trans_info[2], 107 | "Commit_timestamp" : trans_info[0], 108 | "Schema" : schema_name, 109 | "Table_name" : table_name, 110 | "Relation_id" : relation_id 111 | } 112 | 113 | insert_message.update(zip(col_names, col_data)) 114 | return insert_message 115 | 116 | def create_update_message_json(splitted_records,updatedata): 117 | relation_id = (splitted_records[2].split(':')[1]).strip() 118 | 
if relation_id in relation_pool: 119 | rels=relation_pool[relation_id] 120 | col_names=rels['Columns'] 121 | schema_name=rels['Namespace/schema'] 122 | table_name=rels['Table_name'] 123 | 124 | else: 125 | # print(f"\nERROR : Relation id {relation_id} not found in pool") 126 | raise Exception (f"\nERROR : Relation id {relation_id} not found in pool") 127 | 128 | raw_cols = (updatedata.split(':')[-1]).split(',') 129 | # print(raw_cols) 130 | col_data_string = "" 131 | for i in range(2,len(raw_cols),3): 132 | col_data_string+=raw_cols[i] 133 | col_data=re.findall(r'\'.*?\'', col_data_string) 134 | 135 | for i in range(len(col_data)): 136 | col_data[i]=col_data[i].strip("'") 137 | # print(col_data) 138 | 139 | if(len(col_data) != len(col_names)): 140 | # print(f"\nERROR : Number of columns inserted is not equal to number of columns in relation {relation_id}") 141 | raise Exception (f"\nERROR : Number of columns inserted is not equal to number of columns in relation {relation_id}") 142 | 143 | update_message = { 144 | "Operation" : "UPDATE", 145 | "LSN" : trans_info[1], 146 | "Transaction_Xid" : trans_info[2], 147 | "Commit_timestamp" : trans_info[0], 148 | "Schema" : schema_name, 149 | "Table_name" : table_name, 150 | "Relation_id" : relation_id 151 | } 152 | 153 | update_message.update(zip(col_names, col_data)) 154 | return update_message 155 | 156 | def create_truncate_message_json(truncatedata): 157 | 158 | raw_rel_ids = ((truncatedata.split(':')[-1].strip()).lstrip('[')).rstrip(']') 159 | # print("new rec - ",raw_rel_ids) 160 | relation_ids=raw_rel_ids.split(',') 161 | relation_ids=[ x.strip() for x in relation_ids] 162 | table_list = [] 163 | schema_list = [] 164 | 165 | for ids in relation_ids: 166 | if ids in relation_pool: 167 | rels=relation_pool[ids] 168 | schema_list.append(rels['Namespace/schema']) 169 | table_list.append(rels['Table_name']) 170 | 171 | 172 | truncate_message = { 173 | "Operation" : "TRUNCATE", 174 | "LSN" : trans_info[1], 175 | 
"Transaction_Xid" : trans_info[2], 176 | "Commit_timestamp" : trans_info[0], 177 | "Schema_names" : schema_list, 178 | "Table_names" : table_list, 179 | "Relation_ids" : relation_ids 180 | } 181 | return truncate_message 182 | 183 | def create_delete_message_json(splitted_records,deletedata): 184 | relation_id = (splitted_records[2].split(':')[1]).strip() 185 | if relation_id in relation_pool: 186 | rels=relation_pool[relation_id] 187 | col_names=rels['Columns'] 188 | schema_name=rels['Namespace/schema'] 189 | table_name=rels['Table_name'] 190 | 191 | else: 192 | # print(f"\nERROR : Relation id {relation_id} not found in pool") 193 | raise Exception (f"\nERROR : Relation id {relation_id} not found in pool") 194 | 195 | raw_col = (deletedata.split(':')[-1]).split(',') 196 | result = re.search('col_data=(.*)', raw_col[2]) 197 | # print(result) 198 | result=result.group(1) 199 | col_data=(result.strip(')')).strip("'") 200 | 201 | delete_message = { 202 | "Operation" : "DELETE", 203 | "LSN" : trans_info[1], 204 | "Transaction_Xid" : trans_info[2], 205 | "Commit_timestamp" : trans_info[0], 206 | "Schema" : schema_name, 207 | "Table_name" : table_name, 208 | "Relation_id" : relation_id, 209 | col_names[0] : col_data 210 | } 211 | delete_message.update() 212 | return delete_message 213 | 214 | def get_message(data): 215 | try: 216 | splitted_records=data.split(',') 217 | operation=(splitted_records[0].split(':')[1]).strip() 218 | if(operation=='RELATION'): 219 | update_relation_pool(splitted_records,data) 220 | return '' 221 | elif(operation=='BEGIN'): 222 | update_timestamp(splitted_records) 223 | return '' 224 | elif(operation=='INSERT'): 225 | message=create_insert_message_json(splitted_records,data) 226 | return message 227 | elif(operation=='UPDATE'): 228 | message=create_update_message_json(splitted_records,data) 229 | return message 230 | elif(operation=='DELETE'): 231 | message=create_delete_message_json(splitted_records,data) 232 | return message 233 | 
elif(operation=='TRUNCATE'): 234 | message=create_truncate_message_json(data) 235 | return message 236 | else: 237 | return '' 238 | except Exception as error: 239 | # print(error) 240 | return error 241 | --------------------------------------------------------------------------------