├── LICENSE ├── README.md ├── cdc_logical_replication_pgoutput.py ├── cdc_logical_replication_test_decoding.py └── utilities ├── __init__.py ├── message_decoder.py └── message_formatter.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Vinjit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cdc_postgres_logical_replication 2 | This is an implementation of CDC on a PostgreSQL database using the pgoutput and test_decoding output plugins, written in Python. 3 | 4 | The underlying logic for the pgoutput decoder is https://www.postgresql.org/docs/12/protocol-logicalrep-message-formats.html. 5 | 6 | There are 2 scripts: one for the test_decoding output plugin and the other for the pgoutput plugin. 
7 | 8 | The flow of code is as follows: 9 | 10 | 1. Pgoutput 11 | Postgresql DB -> postgresql WAL -> cdc_logical_replication_pgoutput.py will bring the encoded data from WAL -> Algorithm of utilities/message_decoder.py decodes the message to string format -> Algorithm of utilities/message_formatter.py converts the operation messages [Insert('I'), Update('U'), Delete('D'), Truncate('T')] to JSON format. 12 | 13 | Raw decoded messages are converted to JSON so that they are more readable, understandable, contain column names and column values together as key-value pairs, and can easily be used further. In raw decoded messages, column names are present only in the Relation('R') message, and it is generated by the CDC WAL only when a table appears for the first time in the current run or when the table schema is altered. On the other hand, operation messages (I, U, D, T) have only the column values. Hence, for better usability, the formatter is created. 14 | 15 | 2. Test_decoding 16 | Postgresql DB -> postgresql WAL -> cdc_logical_replication_test_decoding.py will bring the decoded data from WAL. 
17 | 18 | 19 | Keywords - cdc, WAL, postgresql, postgres, logical_replication, test_decoding, pgoutput, pgoutput_decoding, change_data_capture 20 | -------------------------------------------------------------------------------- /cdc_logical_replication_pgoutput.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 11 2022 3 | @author: Vinjit 4 | """ 5 | 6 | import psycopg2 7 | import time 8 | from psycopg2.extras import LogicalReplicationConnection 9 | from utilities import message_decoder, message_formatter 10 | 11 | def main(): # Main code of CDC pgoutput 12 | conn = psycopg2.connect( 13 | database="mydatabasename", 14 | user='myusername', 15 | password='mypassword', 16 | host='myhost', 17 | port='5432', 18 | connection_factory=LogicalReplicationConnection 19 | ) 20 | cur = conn.cursor() 21 | options = {'publication_names': 'MyPublicationName', 'proto_version': '1'} 22 | 23 | try: 24 | cur.start_replication(slot_name='MySlotName', decode=False, options=options) 25 | except psycopg2.ProgrammingError: 26 | cur.create_replication_slot('MySlotName',output_plugin='pgoutput') 27 | cur.start_replication(slot_name='MySlotName', decode=False, options=options) 28 | 29 | class DemoConsumer(object): 30 | def __call__(self, msg): 31 | print("Original Encoded Message:\n",msg.payload) # Original encoded message 32 | message = message_decoder.decode_message(msg.payload) 33 | print("\nOriginal Decoded Message:\n",message) # Original decoded message 34 | formatted_message = message_formatter.get_message(str(message)) 35 | print("\nFormatted Message:\n", formatted_message) # Decoded message formatted to JSON 36 | msg.cursor.send_feedback(flush_lsn=msg.data_start) 37 | print("\n#################### END OF MESSAGE #########################\n\n") 38 | 39 | democonsumer = DemoConsumer() 40 | 41 | def start_stream(): 42 | cur.consume_stream(democonsumer) 43 | try: 44 | start_stream() 45 | except: 46 | print('\nStopping 
Replication') 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /cdc_logical_replication_test_decoding.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 4 2022 3 | @author:Vinjit 4 | """ 5 | 6 | import psycopg2 7 | from psycopg2.extras import LogicalReplicationConnection 8 | 9 | conn = psycopg2.connect( 10 | host='myhost', 11 | dbname='mydatabasename', 12 | user='myusername', 13 | password='mypassword', 14 | port='5432', 15 | connection_factory=LogicalReplicationConnection 16 | ) 17 | 18 | cur = conn.cursor() 19 | class DemoConsumer(object): 20 | def __call__(self, msg): 21 | print("\nOriginal Decoded Message:\n",msg.payload) 22 | msg.cursor.send_feedback(flush_lsn=msg.data_start) 23 | print("\n#################### END OF MESSAGE #######################\n\n") 24 | 25 | try: 26 | cur.start_replication(slot_name='MyPublicationName', decode=True) 27 | except psycopg2.ProgrammingError: 28 | cur.create_replication_slot('MyPublicationName', output_plugin='test_decoding') 29 | cur.start_replication(slot_name='MyPublicationName', decode=True) 30 | 31 | democonsumer = DemoConsumer() 32 | cur.consume_stream(democonsumer) 33 | 34 | -------------------------------------------------------------------------------- /utilities/__init__.py: -------------------------------------------------------------------------------- 1 | from utilities.message_decoder import PgoutputMessage 2 | from utilities.message_decoder import Begin 3 | from utilities.message_decoder import Commit 4 | from utilities.message_decoder import Origin 5 | from utilities.message_decoder import Relation 6 | from utilities.message_decoder import TupleData 7 | from utilities.message_decoder import Insert 8 | from utilities.message_decoder import Update 9 | from utilities.message_decoder import Delete 10 | from utilities.message_decoder import Truncate 11 | from 
utilities.message_decoder import decode_message 12 | from utilities.message_decoder import ColumnData -------------------------------------------------------------------------------- /utilities/message_decoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 4 2022 3 | @author: made by dgea005 [https://github.com/panoplyio/py-pgoutput/tree/master/src/pypgoutput], modified by Vinjit 4 | """ 5 | 6 | from abc import ABC, abstractmethod 7 | from dataclasses import dataclass 8 | from datetime import datetime, timezone, timedelta 9 | import io 10 | from typing import Tuple, Union, Optional, List 11 | 12 | 13 | # integer byte lengths 14 | INT8 = 1 15 | INT16 = 2 16 | INT32 = 4 17 | INT64 = 8 18 | 19 | 20 | def convert_pg_ts(_ts_in_microseconds: int) -> datetime: 21 | ts = datetime(2000, 1, 1, 0, 0, 0, 0, tzinfo=timezone.utc) 22 | return ts + timedelta(microseconds=_ts_in_microseconds) 23 | 24 | def convert_bytes_to_int(_in_bytes: bytes) -> int: 25 | return int.from_bytes(_in_bytes, byteorder='big', signed=True) 26 | 27 | def convert_bytes_to_utf8(_in_bytes: Union[bytes, bytearray]) -> str: 28 | return (_in_bytes).decode('utf-8') 29 | 30 | 31 | @dataclass(frozen=True) 32 | class ColumnData: 33 | # col_data_category is NOT the type. 
it means null value/toasted(not sent)/text formatted 34 | col_data_category: Optional[str] 35 | col_data_length: Optional[int] = None 36 | col_data: Optional[str] = None 37 | 38 | 39 | @dataclass(frozen=True) 40 | class TupleData: 41 | n_columns: Optional[int] 42 | column_data: Optional[List[ColumnData]] 43 | 44 | def __repr__(self): 45 | return f"( n_columns : {self.n_columns}, data : {self.column_data})" 46 | 47 | 48 | class PgoutputMessage(ABC): 49 | def __init__(self, buffer: bytes): 50 | self.buffer: io.BytesIO = io.BytesIO(buffer) 51 | self.byte1: str = self.read_utf8(1) 52 | self.decode_buffer() 53 | 54 | @abstractmethod 55 | def decode_buffer(self): 56 | pass 57 | 58 | @abstractmethod 59 | def __repr__(self): 60 | pass 61 | 62 | def read_int8(self) -> int: 63 | return convert_bytes_to_int(self.buffer.read(INT8)) 64 | 65 | def read_int16(self) -> int: 66 | return convert_bytes_to_int(self.buffer.read(INT16)) 67 | 68 | def read_int32(self) -> int: 69 | return convert_bytes_to_int(self.buffer.read(INT32)) 70 | 71 | def read_int64(self) -> int: 72 | return convert_bytes_to_int(self.buffer.read(INT64)) 73 | 74 | def read_utf8(self, n: int = 1) -> str: 75 | return convert_bytes_to_utf8(self.buffer.read(n)) 76 | 77 | def read_timestamp(self) -> datetime: 78 | # 8 chars -> int64 -> timestamp 79 | return convert_pg_ts(_ts_in_microseconds=self.read_int64()) 80 | 81 | def read_string(self): 82 | output = bytearray() 83 | next_char = self.buffer.read(1) 84 | while (next_char != b'\x00'): 85 | output += next_char 86 | next_char = self.buffer.read(1) 87 | return convert_bytes_to_utf8(output) 88 | 89 | # def read_string(self): 90 | # output = bytearray() 91 | # while ((next_char := self.buffer.read(1)) != b'\x00'): 92 | # output += next_char 93 | # return convert_bytes_to_utf8(output) 94 | 95 | def read_tuple_data(self) -> TupleData: 96 | """ 97 | TupleData 98 | Int16 Number of columns. 
99 | Next, one of the following submessages appears for each column (except generated columns): 100 | Byte1('n') Identifies the data as NULL value. 101 | Or 102 | Byte1('u') Identifies unchanged TOASTed value (the actual value is not sent). 103 | Or 104 | Byte1('t') Identifies the data as text formatted value. 105 | Int32 Length of the column value. 106 | Byten The value of the column, in text format. (A future release might support additional formats.) n is the above length. 107 | """ 108 | column_data = list() 109 | n_columns = self.read_int16() 110 | for column in range(n_columns): 111 | col_data_category = self.read_utf8() 112 | if col_data_category in ("n", "u"): 113 | # "n"=NULL, "t"=TOASTed 114 | column_data.append(ColumnData(col_data_category=col_data_category)) 115 | elif col_data_category == "t": 116 | # t = tuple 117 | col_data_length = self.read_int32() 118 | col_data = self.read_utf8(col_data_length) 119 | column_data.append(ColumnData(col_data_category=col_data_category, col_data_length=col_data_length, col_data=col_data)) 120 | else: 121 | # what happens with the generated columns? should an empty ColumnData be returned? 122 | pass 123 | return TupleData(n_columns=n_columns, column_data=column_data) 124 | 125 | 126 | class Begin(PgoutputMessage): 127 | """ 128 | https://pgpedia.info/x/xlogrecptr.html 129 | https://www.postgresql.org/docs/14/datatype-pg-lsn.html 130 | 131 | byte1 Byte1('B') Identifies the message as a begin message. 132 | final_tx_lsn Int64 The final LSN of the transaction. 133 | commit_tx_ts Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). 134 | tx_xid Int32 Xid of the transaction. 
135 | """ 136 | byte1: str 137 | final_tx_lsn: int 138 | commit_tx_ts: datetime 139 | tx_xid: int 140 | 141 | def decode_buffer(self): 142 | if self.byte1 != 'B': 143 | raise Exception('first byte in buffer does not match Begin message') 144 | self.final_tx_lsn = self.read_int64() 145 | self.commit_tx_ts = self.read_timestamp() 146 | self.tx_xid = self.read_int64() 147 | return self 148 | 149 | def __repr__(self): 150 | return f"\tOperation : BEGIN, \n\tbyte1 : '{self.byte1}', \n\tfinal_tx_lsn : {self.final_tx_lsn}, " \ 151 | f"\n\tcommit_tx_ts : {self.commit_tx_ts}, \n\ttx_xid : {self.tx_xid}" 152 | 153 | 154 | class Commit(PgoutputMessage): 155 | """ 156 | byte1: Byte1('C') Identifies the message as a commit message. 157 | flags: Int8 Flags; currently unused (must be 0). 158 | lsn_commit: Int64 The LSN of the commit. 159 | final_tx_lsn: Int64 The end LSN of the transaction. 160 | Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). 161 | """ 162 | byte1: str 163 | flags: int 164 | lsn_commit: int 165 | final_tx_lsn: int 166 | commit_tx_ts: datetime 167 | 168 | def decode_buffer(self): 169 | if self.byte1 != 'C': 170 | raise Exception('first byte in buffer does not match Commit message') 171 | self.flags = self.read_utf8() 172 | self.lsn_commit = self.read_int64() 173 | self.final_tx_lsn = self.read_int64() 174 | self.commit_tx_ts = self.read_timestamp() 175 | return self 176 | 177 | def __repr__(self): 178 | return f"\tOperation : COMMIT, \n\tbyte1 : {self.byte1}, \n\tflags : {self.flags}, \n\tlsn_commit : {self.lsn_commit}, " \ 179 | f"\n\tfinal_tx_lsn : {self.final_tx_lsn}, \n\tcommit_tx_ts : {self.commit_tx_ts}" 180 | 181 | 182 | class Origin: 183 | """ 184 | Byte1('O') Identifies the message as an origin message. 185 | Int64 The LSN of the commit on the origin server. 186 | String Name of the origin. 187 | Note that there can be multiple Origin messages inside a single transaction. 
188 | This seems to be what origin means: https://www.postgresql.org/docs/12/replication-origins.html 189 | """ 190 | pass 191 | 192 | 193 | class Relation(PgoutputMessage): 194 | """ 195 | Byte1('R') Identifies the message as a relation message. 196 | Int32 ID of the relation. 197 | String Namespace (empty string for pg_catalog). 198 | String Relation name. 199 | Int8 Replica identity setting for the relation (same as relreplident in pg_class). 200 | # select relreplident from pg_class where relname = 'test_table'; 201 | # from reading the documentation and looking at the tables this is not int8 but a single character 202 | # background: https://www.postgresql.org/docs/10/sql-altertable.html#SQL-CREATETABLE-REPLICA-IDENTITY 203 | Int16 Number of columns. 204 | Next, the following message part appears for each column (except generated columns): 205 | Int8 Flags for the column. Currently can be either 0 for no flags or 1 which marks the column as part of the key. 206 | String Name of the column. 207 | Int32 ID of the column's data type. 208 | Int32 Type modifier of the column (atttypmod). 
209 | """ 210 | byte1: str 211 | relation_id: int 212 | namespace: str 213 | relation_name: str 214 | replica_identity_setting: str 215 | n_columns: int 216 | # TODO define column type, could eventually look this up from the DB 217 | columns: List[Tuple[int, str, int, int]] 218 | 219 | def decode_buffer(self): 220 | if self.byte1 != 'R': 221 | raise Exception('first byte in buffer does not match Relation message') 222 | self.relation_id = self.read_int32() 223 | self.namespace = self.read_string() 224 | self.relation_name = self.read_string() 225 | self.replica_identity_setting = self.read_utf8() 226 | self.n_columns = self.read_int16() 227 | self.columns = list() 228 | 229 | for column in range(self.n_columns): 230 | part_of_pkey = self.read_int8() 231 | col_name = self.read_string() 232 | data_type_id = self.read_int32() 233 | # TODO: check on use of signed / unsigned 234 | # check with select oid from pg_type where typname = ; timestamp == 1184, int4 = 23 235 | col_modifier = self.read_int32() 236 | self.columns.append((part_of_pkey, col_name, data_type_id, col_modifier)) 237 | 238 | def __repr__(self): 239 | return f"\tOperation : RELATION, \n\tbyte1 : '{self.byte1}', \n\trelation_id : {self.relation_id}" \ 240 | f",\n\tnamespace/schema : '{self.namespace}',\n\trelation_name : '{self.relation_name}'" \ 241 | f",\n\treplica_identity_setting : '{self.replica_identity_setting}',\n\tn_columns : {self.n_columns} " \ 242 | f",\n\tcolumns : {self.columns}" 243 | 244 | 245 | class PgType: 246 | """ 247 | Renamed to PgType not to collide with "type" 248 | 249 | Byte1('Y') Identifies the message as a type message. 250 | Int32 ID of the data type. 251 | String Namespace (empty string for pg_catalog). 252 | String Name of the data type. 253 | """ 254 | pass 255 | 256 | 257 | class Insert(PgoutputMessage): 258 | """ 259 | Byte1('I') Identifies the message as an insert message. 260 | Int32 ID of the relation corresponding to the ID in the relation message. 
261 | Byte1('N') Identifies the following TupleData message as a new tuple. 262 | TupleData TupleData message part representing the contents of new tuple. 263 | """ 264 | byte1: str 265 | relation_id: int 266 | new_tuple_byte: str 267 | new_tuple: TupleData 268 | 269 | def decode_buffer(self): 270 | if self.byte1 != 'I': 271 | raise Exception(f"first byte in buffer does not match Insert message (expected 'I', got '{self.byte1}'") 272 | self.relation_id = self.read_int32() 273 | self.new_tuple_byte = self.read_utf8() 274 | self.new_tuple = self.read_tuple_data() 275 | return self 276 | 277 | def __repr__(self): 278 | return f"\tOperation : INSERT, \n\tbyte1: '{self.byte1}', \n\trelation_id : {self.relation_id}, " \ 279 | f"\n\tnew tuple byte: '{self.new_tuple_byte}', \n\tnew_tuple : {self.new_tuple}" 280 | 281 | 282 | class Update(PgoutputMessage): 283 | """ 284 | Byte1('U') Identifies the message as an update message. 285 | Int32 ID of the relation corresponding to the ID in the relation message. 286 | Byte1('K') Identifies the following TupleData submessage as a key. This field is optional and is only present if the update changed data in any of the column(s) that are part of the REPLICA IDENTITY index. 287 | Byte1('O') Identifies the following TupleData submessage as an old tuple. This field is optional and is only present if table in which the update happened has REPLICA IDENTITY set to FULL. 288 | TupleData TupleData message part representing the contents of the old tuple or primary key. Only present if the previous 'O' or 'K' part is present. 289 | Byte1('N') Identifies the following TupleData message as a new tuple. 290 | TupleData TupleData message part representing the contents of a new tuple. 291 | 292 | The Update message may contain either a 'K' message part or an 'O' message part or neither of them, but never both of them. 
293 | """ 294 | byte1: str 295 | relation_id: int 296 | next_byte_identifier: Optional[str] 297 | optional_tuple_identifier: Optional[str] 298 | old_tuple: Optional[TupleData] 299 | new_tuple_byte: str 300 | new_tuple: TupleData 301 | 302 | def decode_buffer(self): 303 | self.optional_tuple_identifier = None 304 | self.old_tuple = None 305 | if self.byte1 != 'U': 306 | raise Exception(f"first byte in buffer does not match Update message (expected 'U', got '{self.byte1}'") 307 | self.relation_id = self.read_int32() 308 | # TODO test update to PK, test update with REPLICA IDENTITY = FULL 309 | self.next_byte_identifier = self.read_utf8() # one of K, O or N 310 | if self.next_byte_identifier == 'K' or self.next_byte_identifier == 'O': 311 | self.optional_tuple_identifier = self.next_byte_identifier 312 | self.old_tuple = self.read_tuple_data() 313 | self.new_tuple_byte = self.read_utf8() 314 | else: 315 | self.new_tuple_byte = self.next_byte_identifier 316 | if self.new_tuple_byte != 'N': 317 | raise Exception(f"did not find new_tuple_byte ('N') at position: {self.buffer.tell()}, found: '{self.new_tuple_byte}'") 318 | self.new_tuple = self.read_tuple_data() 319 | return self 320 | 321 | def __repr__(self): 322 | return f"\tOperation : UPDATE, \n\tbyte1 : '{self.byte1}', \n\trelation_id : {self.relation_id}, " \ 323 | f" \n\toptional_tuple_identifier : '{self.optional_tuple_identifier}', \n\toptional_old_tuple_data : {self.old_tuple}, " \ 324 | f" \n\tnew_tuple_byte : '{self.new_tuple_byte}', \n\tnew_tuple : {self.new_tuple}" 325 | 326 | 327 | class Delete(PgoutputMessage): 328 | """ 329 | Byte1('D') Identifies the message as a delete message. 330 | Int32 ID of the relation corresponding to the ID in the relation message. 331 | Byte1('K') Identifies the following TupleData submessage as a key. This field is present if the table in which the delete has happened uses an index as REPLICA IDENTITY. 332 | Byte1('O') Identifies the following TupleData message as a old tuple. 
This field is present if the table in which the delete has happened has REPLICA IDENTITY set to FULL. 333 | TupleData TupleData message part representing the contents of the old tuple or primary key, depending on the previous field. 334 | 335 | The Delete message may contain either a 'K' message part or an 'O' message part, but never both of them. 336 | """ 337 | byte1: str 338 | relation_id: int 339 | message_type: str 340 | old_tuple: TupleData 341 | 342 | def decode_buffer(self): 343 | if self.byte1 != 'D': 344 | raise Exception(f"first byte in buffer does not match Delete message (expected 'D', got '{self.byte1}'") 345 | self.relation_id = self.read_int32() 346 | self.message_type = self.read_utf8() 347 | if self.message_type not in ['K','O']: 348 | raise Exception(f"message type byte is not 'K' or 'O', got : '{self.message_type}'") 349 | self.old_tuple = self.read_tuple_data() 350 | return self 351 | 352 | def __repr__(self): 353 | return f"\tOperation : DELETE, \n\tbyte1 : {self.byte1}, \n\trelation_id : {self.relation_id}, " \ 354 | f"\n\tmessage_type : {self.message_type}, \n\told_tuple : {self.old_tuple}" 355 | 356 | 357 | class Truncate(PgoutputMessage): 358 | """ 359 | Byte1('T') Identifies the message as a truncate message. 360 | Int32 Number of relations 361 | Int8 Option bits for TRUNCATE: 1 for CASCADE, 2 for RESTART IDENTITY 362 | Int32 ID of the relation corresponding to the ID in the relation message. This field is repeated for each relation. 
363 | """ 364 | byte1: str 365 | number_of_relations: int 366 | option_bits: str 367 | relation_ids: List[int] 368 | 369 | def decode_buffer(self): 370 | if self.byte1 != 'T': 371 | raise Exception(f"first byte in buffer does not match Truncate message (expected 'T', got '{self.byte1}'") 372 | self.number_of_relations = self.read_int32() 373 | self.option_bits = self.read_int8() 374 | self.relation_ids = [] 375 | for relation in range(self.number_of_relations): 376 | self.relation_ids.append(self.read_int32()) 377 | 378 | def __repr__(self): 379 | return f"\tOperation : TRUNCATE, \n\tbyte1 : {self.byte1}, \n\tn_relations : {self.number_of_relations}, "\ 380 | f"\n\toption_bits : {self.option_bits}, relation_ids : {self.relation_ids}" 381 | 382 | 383 | def decode_message(_input_bytes: bytes) -> Optional[Union[Begin, Commit, Relation, Insert, Update, Delete, Truncate]]: 384 | """Peak first byte and initialise the appropriate message object""" 385 | first_byte = (_input_bytes[:1]).decode('utf-8') 386 | if first_byte == 'B': 387 | output = Begin(_input_bytes) 388 | elif first_byte == "C": 389 | output = Commit(_input_bytes) 390 | elif first_byte == "R": 391 | output = Relation(_input_bytes) 392 | elif first_byte == "I": 393 | output = Insert(_input_bytes) 394 | elif first_byte == "U": 395 | output = Update(_input_bytes) 396 | elif first_byte == 'D': 397 | output = Delete(_input_bytes) 398 | elif first_byte == 'T': 399 | output = Truncate(_input_bytes) 400 | else: 401 | print(f"warning unrecognised message {_input_bytes}") 402 | output = None 403 | return output 404 | -------------------------------------------------------------------------------- /utilities/message_formatter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on July 12 2022 3 | @author: Vinjit 4 | """ 5 | 6 | import re 7 | 8 | relation_pool={} 9 | trans_info = [None] * 3 10 | 11 | def update_timestamp(splitted_records): 12 | 
trans_info[0]=(splitted_records[3].split(' : ')[1]).strip() # timestamp 13 | trans_info[1]=(splitted_records[2].split(':')[1]).strip() # lsn 14 | trans_info[2]=(splitted_records[4].split(':')[1]).strip() # xid 15 | 16 | def replace_relation(splitted_records,relation_id,relationdata): 17 | new_schema_name = ((splitted_records[3].split(':')[1]).strip()).strip("'") 18 | # print (new_schema_name) 19 | 20 | new_table_name = ((splitted_records[4].split(':')[1]).strip()).strip("'") 21 | # print (new_table_name) 22 | 23 | new_col_list = [] 24 | new_col_record = relationdata.split(':')[-1] 25 | # print('col rec : ', new_col_record) 26 | new_col_list=re.findall(r'\'.*?\'', new_col_record) 27 | for i in range(len(new_col_list)): 28 | new_col_list[i]=new_col_list[i].strip("'") 29 | 30 | relation_pool[relation_id] = { 31 | "Namespace/schema" : new_schema_name, 32 | "Table_name" : new_table_name, 33 | "Columns" : new_col_list 34 | } 35 | 36 | # print("\nThe current relation pool is : ",relation_pool) 37 | 38 | 39 | def add_relation(splitted_records,relation_id,relationdata): 40 | schema_name = ((splitted_records[3].split(':')[1]).strip()).strip("'") 41 | # print (schema_name) 42 | 43 | table_name = ((splitted_records[4].split(':')[1]).strip()).strip("'") 44 | # print (table_name) 45 | 46 | col_list = [] 47 | col_record = relationdata.split(':')[-1] 48 | # print('col rec : ',col_record) 49 | col_list=re.findall(r'\'.*?\'',col_record) 50 | for i in range(len(col_list)): 51 | col_list[i]=col_list[i].strip("'") 52 | 53 | relation_dict = { 54 | "Namespace/schema" : schema_name, 55 | "Table_name" : table_name, 56 | "Columns" : col_list 57 | } 58 | # print(relation_dict) 59 | 60 | relation_pool.update({relation_id : relation_dict}) 61 | # print("\nThe current relation pool is : ",relation_pool) 62 | 63 | def update_relation_pool(splitted_records,relationdata): 64 | relation_id=(splitted_records[2].split(':')[1]).strip() 65 | # print("relation id is: ",relation_id) 66 | if relation_id 
in relation_pool: 67 | # print("rel found") 68 | replace_relation(splitted_records,relation_id,relationdata) 69 | # print(f"\nSUCCESS : Relation {relation_id} updated in the relation pool") 70 | 71 | else: 72 | # print("rel not found") 73 | add_relation(splitted_records,relation_id,relationdata) 74 | # print(f"\nSUCCESS : New Relation {relation_id} added to the relation pool") 75 | 76 | def create_insert_message_json(splitted_records,insertdata): 77 | relation_id = (splitted_records[2].split(':')[1]).strip() 78 | if relation_id in relation_pool: 79 | rels=relation_pool[relation_id] 80 | col_names=rels['Columns'] 81 | schema_name=rels['Namespace/schema'] 82 | table_name=rels['Table_name'] 83 | 84 | else: 85 | # print(f"\nERROR : Relation id {relation_id} not found in pool") 86 | raise Exception (f"\nERROR : Relation id {relation_id} not found in pool") 87 | 88 | raw_cols = (insertdata.split(':')[-1]).split(',') 89 | # print(raw_cols) 90 | col_data_string = "" 91 | for i in range(2,len(raw_cols),3): 92 | col_data_string+=raw_cols[i] 93 | col_data=re.findall(r'\'.*?\'', col_data_string) 94 | 95 | for i in range(len(col_data)): 96 | col_data[i]=col_data[i].strip("'") 97 | # print(col_data) 98 | 99 | if(len(col_data) != len(col_names)): 100 | # print(f"\nERROR : Number of columns inserted is not equal to number of columns in relation {relation_id}") 101 | raise Exception (f"\nERROR : Number of columns inserted is not equal to number of columns in relation {relation_id}") 102 | 103 | insert_message = { 104 | "Operation" : "INSERT", 105 | "LSN" : trans_info[1], 106 | "Transaction_Xid" : trans_info[2], 107 | "Commit_timestamp" : trans_info[0], 108 | "Schema" : schema_name, 109 | "Table_name" : table_name, 110 | "Relation_id" : relation_id 111 | } 112 | 113 | insert_message.update(zip(col_names, col_data)) 114 | return insert_message 115 | 116 | def create_update_message_json(splitted_records,updatedata): 117 | relation_id = (splitted_records[2].split(':')[1]).strip() 118 | 
if relation_id in relation_pool: 119 | rels=relation_pool[relation_id] 120 | col_names=rels['Columns'] 121 | schema_name=rels['Namespace/schema'] 122 | table_name=rels['Table_name'] 123 | 124 | else: 125 | # print(f"\nERROR : Relation id {relation_id} not found in pool") 126 | raise Exception (f"\nERROR : Relation id {relation_id} not found in pool") 127 | 128 | raw_cols = (updatedata.split(':')[-1]).split(',') 129 | # print(raw_cols) 130 | col_data_string = "" 131 | for i in range(2,len(raw_cols),3): 132 | col_data_string+=raw_cols[i] 133 | col_data=re.findall(r'\'.*?\'', col_data_string) 134 | 135 | for i in range(len(col_data)): 136 | col_data[i]=col_data[i].strip("'") 137 | # print(col_data) 138 | 139 | if(len(col_data) != len(col_names)): 140 | # print(f"\nERROR : Number of columns inserted is not equal to number of columns in relation {relation_id}") 141 | raise Exception (f"\nERROR : Number of columns inserted is not equal to number of columns in relation {relation_id}") 142 | 143 | update_message = { 144 | "Operation" : "UPDATE", 145 | "LSN" : trans_info[1], 146 | "Transaction_Xid" : trans_info[2], 147 | "Commit_timestamp" : trans_info[0], 148 | "Schema" : schema_name, 149 | "Table_name" : table_name, 150 | "Relation_id" : relation_id 151 | } 152 | 153 | update_message.update(zip(col_names, col_data)) 154 | return update_message 155 | 156 | def create_truncate_message_json(truncatedata): 157 | 158 | raw_rel_ids = ((truncatedata.split(':')[-1].strip()).lstrip('[')).rstrip(']') 159 | # print("new rec - ",raw_rel_ids) 160 | relation_ids=raw_rel_ids.split(',') 161 | relation_ids=[ x.strip() for x in relation_ids] 162 | table_list = [] 163 | schema_list = [] 164 | 165 | for ids in relation_ids: 166 | if ids in relation_pool: 167 | rels=relation_pool[ids] 168 | schema_list.append(rels['Namespace/schema']) 169 | table_list.append(rels['Table_name']) 170 | 171 | 172 | truncate_message = { 173 | "Operation" : "TRUNCATE", 174 | "LSN" : trans_info[1], 175 | 
"Transaction_Xid" : trans_info[2], 176 | "Commit_timestamp" : trans_info[0], 177 | "Schema_names" : schema_list, 178 | "Table_names" : table_list, 179 | "Relation_ids" : relation_ids 180 | } 181 | return truncate_message 182 | 183 | def create_delete_message_json(splitted_records,deletedata): 184 | relation_id = (splitted_records[2].split(':')[1]).strip() 185 | if relation_id in relation_pool: 186 | rels=relation_pool[relation_id] 187 | col_names=rels['Columns'] 188 | schema_name=rels['Namespace/schema'] 189 | table_name=rels['Table_name'] 190 | 191 | else: 192 | # print(f"\nERROR : Relation id {relation_id} not found in pool") 193 | raise Exception (f"\nERROR : Relation id {relation_id} not found in pool") 194 | 195 | raw_col = (deletedata.split(':')[-1]).split(',') 196 | result = re.search('col_data=(.*)', raw_col[2]) 197 | # print(result) 198 | result=result.group(1) 199 | col_data=(result.strip(')')).strip("'") 200 | 201 | delete_message = { 202 | "Operation" : "DELETE", 203 | "LSN" : trans_info[1], 204 | "Transaction_Xid" : trans_info[2], 205 | "Commit_timestamp" : trans_info[0], 206 | "Schema" : schema_name, 207 | "Table_name" : table_name, 208 | "Relation_id" : relation_id, 209 | col_names[0] : col_data 210 | } 211 | delete_message.update() 212 | return delete_message 213 | 214 | def get_message(data): 215 | try: 216 | splitted_records=data.split(',') 217 | operation=(splitted_records[0].split(':')[1]).strip() 218 | if(operation=='RELATION'): 219 | update_relation_pool(splitted_records,data) 220 | return '' 221 | elif(operation=='BEGIN'): 222 | update_timestamp(splitted_records) 223 | return '' 224 | elif(operation=='INSERT'): 225 | message=create_insert_message_json(splitted_records,data) 226 | return message 227 | elif(operation=='UPDATE'): 228 | message=create_update_message_json(splitted_records,data) 229 | return message 230 | elif(operation=='DELETE'): 231 | message=create_delete_message_json(splitted_records,data) 232 | return message 233 | 
elif(operation=='TRUNCATE'): 234 | message=create_truncate_message_json(data) 235 | return message 236 | else: 237 | return '' 238 | except Exception as error: 239 | # print(error) 240 | return error 241 | --------------------------------------------------------------------------------