├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── cdc_kafka ├── __init__.py ├── __main__.py ├── build_startup_state.py ├── change_index.py ├── clock_sync.py ├── constants.py ├── helpers.py ├── kafka.py ├── kafka_oauth │ ├── __init__.py │ └── aws_msk.py ├── main.py ├── metric_reporting │ ├── __init__.py │ ├── accumulator.py │ ├── http_post_reporter.py │ ├── kafka_reporter.py │ ├── local_file_reporter.py │ ├── metrics.py │ ├── reporter_base.py │ └── stdout_reporter.py ├── options.py ├── parsed_row.py ├── progress_reset_tool.py ├── progress_topic_validator.py ├── progress_tracking.py ├── replayer.py ├── serializers │ ├── __init__.py │ └── avro.py ├── show_snapshot_history.py ├── sql_queries.py ├── sql_query_subprocess.py ├── tracked_tables.py └── validation.py ├── docker-compose.yml └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/ 2 | **/.idea/ 3 | venv 4 | **/*.lprof 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-bookworm 2 | 3 | WORKDIR /srv 4 | 5 | RUN curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add - \ 6 | && curl https://packages.microsoft.com/config/debian/11/prod.list > /etc/apt/sources.list.d/mssql-release.list \ 7 | && apt-get update \ 8 | && ACCEPT_EULA=Y apt-get install -y unixodbc-dev msodbcsql18 dumb-init \ 9 | && apt-get clean \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | COPY requirements.txt . 13 | RUN pip install -r requirements.txt 14 | COPY cdc_kafka cdc_kafka 15 | 16 | ENTRYPOINT ["dumb-init", "--rewrite", "15:2", "--"] 17 | CMD ["python", "-m", "cdc_kafka"] 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Marty Woodlee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sqlserver-cdc-to-kafka 2 | 3 | This is a utility for streaming rows from SQL Server Change Data Capture (CDC) to Kafka topics. 
It also optionally streams initial "snapshot state" of the tables into the same topics, allowing you to effectively create a read replica of your SQL Server tables in some other datastore by consuming and applying messages from the topics this process produces. 4 | 5 | If you've landed here, you should check out [Debezium](https://debezium.io/). It's a more mature project maintained by many talented folks! This project was initially developed because of some shortcomings that once existed in Debezium which may now be resolved. In particular, Debezium's SQL Server connector did not deal well with cases where a CDC instance does not capture all of a source table's columns. You might prefer this project if you wish to fork and customize such a tool but are more comfortable working with Python (Debezium, like many Kafka-ecosystem tools, is written in Java). Otherwise, check them out first. 6 | 7 | This is still a baby project and does not yet have much documentation. If you'd like to try it, check out the options in `cdc_kafka/options.py`. 8 | 9 | Integration with [Sentry](https://sentry.io/welcome/) is included. If you have an account with them and want to use it, set environment variable `SENTRY_DSN` (and optionally `SENTRY_ENVIRONMENT` and `SENTRY_RELEASE`) in accordance with their documentation. If you don't, the Sentry integration will be a no-op. 10 | 11 | ## Run it 12 | 13 | These scenarios presume you are using the provided `docker-compose` file, and that you have pre-provisioned and enabled CDC upon a DB called `MyTestDb` in the SQL Server DB. 14 | 15 | You will need to have [installed the Microsoft ODBC Driver for SQL Server](https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server?view=sql-server-ver15) for this to work. 16 | 17 | ### Locally 18 | 19 | ``` 20 | python -m cdc_kafka \ 21 | --db-conn-string 'DRIVER=ODBC Driver 18 for SQL Server; SERVER=localhost; TrustServerCertificate=yes; DATABASE=MyTestDb; UID=sa; PWD=TestLocalPassword123' \ 22 | --kafka-bootstrap-servers localhost:9092 \ 23 | --schema-registry-url http://localhost:8081 \ 24 | --kafka-transactional-id local-c2k-test 25 | ``` 26 | 27 | ### Via Docker 28 | 29 | ``` 30 | docker build -t cdc_kafka . 
31 | 32 | docker run --rm -it \ 33 | --net host \ 34 | -e DB_CONN_STRING='DRIVER=ODBC Driver 18 for SQL Server; SERVER=localhost; TrustServerCertificate=yes; DATABASE=MyTestDb; UID=sa; PWD=TestLocalPassword123' \ 35 | -e KAFKA_BOOTSTRAP_SERVERS=localhost:9092 \ 36 | -e SCHEMA_REGISTRY_URL=http://localhost:8081 \ 37 | cdc_kafka 38 | ``` 39 | -------------------------------------------------------------------------------- /cdc_kafka/__init__.py: -------------------------------------------------------------------------------- 1 | import logging.config 2 | import os 3 | 4 | import sentry_sdk 5 | 6 | sentry_sdk.init() 7 | 8 | log_level = os.getenv('LOG_LEVEL', 'INFO').upper() 9 | 10 | logging.config.dictConfig({ 11 | 'version': 1, 12 | 'disable_existing_loggers': False, 13 | 'loggers': { 14 | __name__: { 15 | 'handlers': ['console'], 16 | 'level': log_level, 17 | 'propagate': True, 18 | }, 19 | }, 20 | 'handlers': { 21 | 'console': { 22 | 'class': 'logging.StreamHandler', 23 | 'level': log_level, 24 | 'formatter': 'simple', 25 | }, 26 | }, 27 | 'formatters': { 28 | 'simple': { 29 | 'format': '%(asctime)s %(levelname)-8s (%(threadName)s) [%(name)s:%(lineno)s] %(message)s', 30 | }, 31 | }, 32 | }) 33 | -------------------------------------------------------------------------------- /cdc_kafka/__main__.py: -------------------------------------------------------------------------------- 1 | from cdc_kafka import main 2 | 3 | if __name__ == '__main__': 4 | main.run() 5 | -------------------------------------------------------------------------------- /cdc_kafka/build_startup_state.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import copy 3 | import datetime 4 | import logging 5 | import re 6 | from typing import Dict, List, Tuple, Iterable, Optional, Any, Set, NamedTuple, Mapping 7 | 8 | import pyodbc 9 | from tabulate import tabulate 10 | 11 | from . 
import sql_query_subprocess, tracked_tables, sql_queries, kafka, progress_tracking, change_index, \ 12 | constants, helpers, options 13 | from .serializers.avro import AvroSchemaGenerator 14 | from .metric_reporting import accumulator 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def build_tracked_tables_from_cdc_metadata( 20 | db_conn: pyodbc.Connection, metrics_accumulator: accumulator.Accumulator, topic_name_template: str, 21 | snapshot_table_include_config: str, snapshot_table_exclude_config: str, truncate_fields: Dict[str, int], 22 | capture_instance_names: List[str], db_row_batch_size: int, 23 | sql_query_processor: sql_query_subprocess.SQLQueryProcessor, progress_tracker: progress_tracking.ProgressTracker 24 | ) -> List[tracked_tables.TrackedTable]: 25 | result: List[tracked_tables.TrackedTable] = [] 26 | 27 | truncate_fields = {k.lower(): v for k, v in truncate_fields.items()} 28 | 29 | snapshot_table_include_regex = snapshot_table_include_config and re.compile( 30 | snapshot_table_include_config, re.IGNORECASE) 31 | snapshot_table_exclude_regex = snapshot_table_exclude_config and re.compile( 32 | snapshot_table_exclude_config, re.IGNORECASE) 33 | 34 | name_to_meta_fields: Dict[Tuple[Any, ...], List[Tuple[Any, ...]]] \ 35 | = collections.defaultdict(list) 36 | 37 | with db_conn.cursor() as cursor: 38 | q, _ = sql_queries.get_cdc_tracked_tables_metadata(capture_instance_names) 39 | cursor.execute(q) 40 | for row in cursor.fetchall(): 41 | # 0:4 gets schema name, table name, capture instance name, min captured LSN: 42 | name_to_meta_fields[tuple(row[0:4])].append(row[4:]) 43 | 44 | for (schema_name, table_name, capture_instance_name, min_lsn), fields in name_to_meta_fields.items(): 45 | fq_table_name = f'{schema_name}.{table_name}' 46 | 47 | can_snapshot = False 48 | 49 | if snapshot_table_include_regex and snapshot_table_include_regex.match(fq_table_name): 50 | logger.debug('Table %s matched snapshotting inclusion regex', fq_table_name) 51 | can_snapshot = True 52 | 53 | if snapshot_table_exclude_regex and snapshot_table_exclude_regex.match(fq_table_name): 54 | logger.debug('Table %s matched snapshotting exclusion regex and will NOT be snapshotted', fq_table_name) 55 | can_snapshot = False 56 | 57 | topic_name = topic_name_template.format( 58 | schema_name=schema_name, table_name=table_name, capture_instance_name=capture_instance_name) 59 | 60 | tracked_table = tracked_tables.TrackedTable( 61 | db_conn, metrics_accumulator, sql_query_processor, schema_name, table_name, capture_instance_name, 62 | topic_name, min_lsn, can_snapshot, db_row_batch_size, progress_tracker) 63 | 64 | for (change_table_ordinal, column_name, sql_type_name, _, primary_key_ordinal, decimal_precision, 65 | decimal_scale, _) in fields: 66 | truncate_after: int = truncate_fields.get(f'{schema_name}.{table_name}.{column_name}'.lower(), 0) 67 | tracked_table.append_field(tracked_tables.TrackedField( 68 | column_name, sql_type_name, change_table_ordinal, primary_key_ordinal, decimal_precision, 69 | decimal_scale, truncate_after)) 70 | 71 | result.append(tracked_table) 72 | 73 | return result 74 | 75 | 76 | def determine_start_points_and_finalize_tables( 77 | kafka_client: kafka.KafkaClient, db_conn: pyodbc.Connection, tables: Iterable[tracked_tables.TrackedTable], 78 | progress_tracker: progress_tracking.ProgressTracker, lsn_gap_handling: str, new_follow_start_point: str, 79 | partition_count: int, replication_factor: int, extra_topic_config: Dict[str, str | int], 80 | validation_mode: bool = False, 
redo_snapshot_for_new_instance: bool = False, 81 | publish_duplicate_changes_from_new_instance: bool = False, report_progress_only: bool = False 82 | ) -> None: 83 | if validation_mode: 84 | for table in tables: 85 | table.snapshot_allowed = False 86 | table.finalize_table(change_index.LOWEST_CHANGE_INDEX, None, {}, lsn_gap_handling) 87 | return 88 | 89 | prior_progress_log_table_data = [] 90 | 91 | progress_tracker.maybe_create_snapshot_logging_topic() 92 | prior_progress = progress_tracker.get_prior_progress_or_create_progress_topic() 93 | 94 | snapshot_progress: Optional[progress_tracking.ProgressEntry] 95 | changes_progress: Optional[progress_tracking.ProgressEntry] 96 | 97 | with db_conn.cursor() as cursor: 98 | q, _ = sql_queries.get_max_lsn() 99 | cursor.execute(q) 100 | db_max_lsn = cursor.fetchval() 101 | 102 | for table in tables: 103 | kafka_client.begin_transaction() 104 | snapshot_progress, changes_progress = None, None 105 | prior_change_table_max_index: Optional[change_index.ChangeIndex] = None 106 | 107 | if not report_progress_only and not kafka_client.get_topic_partition_count(table.topic_name): 108 | if partition_count: 109 | this_topic_partition_count = partition_count 110 | else: 111 | per_second = table.get_change_rows_per_second() 112 | # one partition for each 10 rows/sec on average in the change table: 113 | this_topic_partition_count = max(1, int(per_second / 10)) 114 | if this_topic_partition_count > 100: 115 | raise Exception( 116 | f'Automatic topic creation would create %{this_topic_partition_count} partitions for topic ' 117 | f'{table.topic_name} based on a change table rows per second rate of {per_second}. This ' 118 | f'seems excessive, so the program is exiting to prevent overwhelming your Kafka cluster. ' 119 | f'Look at setting PARTITION_COUNT to take manual control of this.') 120 | kafka_client.create_topic(table.topic_name, this_topic_partition_count, replication_factor, 121 | extra_topic_config) 122 | else: 123 | snapshot_progress = prior_progress.get((table.topic_name, constants.SNAPSHOT_ROWS_KIND)) 124 | changes_progress = prior_progress.get((table.topic_name, constants.CHANGE_ROWS_KIND)) 125 | 126 | fq_change_table_name = helpers.get_fq_change_table_name(table.capture_instance_name) 127 | if snapshot_progress and (snapshot_progress.change_table_name != fq_change_table_name): 128 | logger.info('Found prior snapshot progress into topic %s, but from an older capture instance ' 129 | '(prior progress instance: %s; current instance: %s)', table.topic_name, 130 | snapshot_progress.change_table_name, fq_change_table_name) 131 | if redo_snapshot_for_new_instance: 132 | old_capture_instance_name = helpers.get_capture_instance_name(snapshot_progress.change_table_name) 133 | new_capture_instance_name = helpers.get_capture_instance_name(fq_change_table_name) 134 | if ddl_change_requires_new_snapshot(db_conn, old_capture_instance_name, new_capture_instance_name, 135 | table.fq_name): 136 | logger.info('Will start new snapshot.') 137 | snapshot_progress = None 138 | else: 139 | progress_tracker.record_snapshot_progress(table.topic_name, 140 | constants.SNAPSHOT_COMPLETION_SENTINEL) 141 | else: 142 | progress_tracker.record_snapshot_progress(table.topic_name, 143 | constants.SNAPSHOT_COMPLETION_SENTINEL) 144 | logger.info('Will NOT start new snapshot.') 145 | 146 | if changes_progress and (changes_progress.change_table_name != fq_change_table_name): 147 | logger.info('Found prior change data progress into topic %s, but from an older capture instance ' 148 | 
'(prior progress instance: %s; current instance: %s)', table.topic_name, 149 | changes_progress.change_table_name, fq_change_table_name) 150 | with db_conn.cursor() as cursor: 151 | cursor.execute("SELECT 1 FROM sys.tables WHERE object_id = OBJECT_ID(?)", 152 | changes_progress.change_table_name) 153 | if cursor.fetchval() is not None: 154 | q, _ = sql_queries.get_max_lsn_for_change_table( 155 | helpers.quote_name(changes_progress.change_table_name)) 156 | cursor.execute(q) 157 | res = cursor.fetchone() 158 | if res: 159 | (lsn, _, seqval, operation) = res 160 | prior_change_table_max_index = change_index.ChangeIndex(lsn, seqval, operation) 161 | 162 | if publish_duplicate_changes_from_new_instance: 163 | logger.info('Will republish any change rows duplicated by the new capture instance.') 164 | changes_progress = None 165 | else: 166 | logger.info('Will NOT republish any change rows duplicated by the new capture instance.') 167 | 168 | new_table_starting_index = copy.copy(change_index.LOWEST_CHANGE_INDEX) 169 | if new_follow_start_point == options.NEW_FOLLOW_START_POINT_LATEST: 170 | new_table_starting_index.lsn = db_max_lsn 171 | if not (changes_progress and changes_progress.change_index): 172 | logger.info('Beginning follow of new table %s from LSN %s', table.fq_name, new_table_starting_index) 173 | starting_change_index: change_index.ChangeIndex = \ 174 | (changes_progress and changes_progress.change_index) or new_table_starting_index 175 | starting_snapshot_index: Optional[Mapping[str, str | int]] = None 176 | if snapshot_progress: 177 | starting_snapshot_index = snapshot_progress.snapshot_index 178 | 179 | if report_progress_only: # elide schema registration 180 | table.finalize_table(starting_change_index, prior_change_table_max_index, starting_snapshot_index, 181 | options.LSN_GAP_HANDLING_IGNORE) 182 | else: 183 | table.finalize_table(starting_change_index, prior_change_table_max_index, starting_snapshot_index, 184 | lsn_gap_handling, allow_progress_writes=True) 185 | 186 | if not table.snapshot_allowed: 187 | snapshot_state = '' 188 | elif table.snapshot_complete: 189 | snapshot_state = '' 190 | elif table.last_read_key_for_snapshot_display is None: 191 | snapshot_state = '' 192 | else: 193 | snapshot_state = f'From {table.last_read_key_for_snapshot_display}' 194 | 195 | prior_progress_log_table_data.append((table.capture_instance_name, table.fq_name, table.topic_name, 196 | starting_change_index or '', snapshot_state)) 197 | kafka_client.commit_transaction() 198 | 199 | headers = ('Capture instance name', 'Source table name', 'Topic name', 'From change table index', 'Snapshots') 200 | display_table = tabulate(sorted(prior_progress_log_table_data), headers, tablefmt='fancy_grid') 201 | 202 | logger.info('Processing will proceed from the following positions based on the last message from each topic ' 203 | 'and/or the snapshot progress committed in Kafka (NB: snapshot reads occur BACKWARDS from high to ' 204 | 'low key column values):\n%s\n%s tables total.', display_table, len(prior_progress_log_table_data)) 205 | 206 | 207 | def ddl_change_requires_new_snapshot(db_conn: pyodbc.Connection, old_capture_instance_name: str, 208 | new_capture_instance_name: str, source_table_fq_name: str, 209 | resnapshot_for_column_drops: bool = True) -> bool: 210 | with db_conn.cursor() as cursor: 211 | cursor.execute(f'SELECT TOP 1 1 FROM [{constants.CDC_DB_SCHEMA_NAME}].[change_tables] ' 212 | f'WHERE capture_instance = ?', old_capture_instance_name) 213 | if not cursor.fetchval(): 214 | 
logger.info('Requiring re-snapshot for %s because prior capture instance %s is no longer available as a ' 215 | 'basis for evaluating schema changes.', source_table_fq_name, old_capture_instance_name) 216 | return True 217 | 218 | q, _ = sql_queries.get_cdc_tracked_tables_metadata([old_capture_instance_name, new_capture_instance_name]) 219 | cursor.execute(q) 220 | old_cols: Dict[str, Dict[str, Any]] = {} 221 | new_cols: Dict[str, Dict[str, Any]] = {} 222 | for row in cursor.fetchall(): 223 | (_, _, capture_instance_name, _, _, column_name, sql_type_name, is_computed, _, decimal_precision, 224 | decimal_scale, is_nullable) = row 225 | col_info = {'sql_type_name': sql_type_name, 226 | 'decimal_precision': decimal_precision, 227 | 'decimal_scale': decimal_scale, 228 | 'is_computed': is_computed, 229 | 'is_nullable': is_nullable} 230 | if capture_instance_name == old_capture_instance_name: 231 | old_cols[column_name] = col_info 232 | elif capture_instance_name == new_capture_instance_name: 233 | new_cols[column_name] = col_info 234 | 235 | added_col_names = new_cols.keys() - old_cols.keys() 236 | removed_col_names = old_cols.keys() - new_cols.keys() 237 | changed_col_names = {k for k in new_cols.keys() 238 | if k in old_cols 239 | and old_cols[k] != new_cols[k]} 240 | logger.info('Evaluating need for new snapshot in change from capture instance %s to %s. Added cols: %s Removed ' 241 | 'cols: %s Cols with type changes: %s ...', old_capture_instance_name, new_capture_instance_name, 242 | added_col_names, removed_col_names, changed_col_names) 243 | 244 | if removed_col_names and resnapshot_for_column_drops: 245 | logger.info('Requiring re-snapshot for %s because the new capture instance removes column(s) %s.', 246 | source_table_fq_name, removed_col_names) 247 | return True 248 | 249 | for changed_col_name in changed_col_names: 250 | old_col = old_cols[changed_col_name] 251 | new_col = new_cols[changed_col_name] 252 | 253 | if old_col['is_computed'] != new_col['is_computed']: 254 | logger.info('Requiring re-snapshot for %s due to an is_computed change for column %s (type: %s, ' 255 | 'is_computed: %s --> type: %s, is_computed: %s).', source_table_fq_name, changed_col_name, 256 | old_col['sql_type_name'], old_col['is_computed'], new_col['sql_type_name'], 257 | new_col['is_computed']) 258 | return True 259 | 260 | # Even if the DB col type changed, a resnapshot is really only needed if the corresponding serialization 261 | # schema changes. 
An example where we can skip a re-snapshot would be a column "upgrading" from SMALLINT 262 | # to INT: 263 | 264 | # noinspection PyProtectedMember 265 | if AvroSchemaGenerator._instance: # Will only exist if process was configured for Avro serialization 266 | db_schema, db_table = source_table_fq_name.split('.') 267 | # noinspection PyProtectedMember 268 | old_avro_type = AvroSchemaGenerator._instance.get_record_field_schema( 269 | db_schema, db_table, changed_col_name, old_col['sql_type_name'], old_col['decimal_precision'], 270 | old_col['decimal_scale'], True) 271 | # noinspection PyProtectedMember 272 | new_avro_type = AvroSchemaGenerator._instance.get_record_field_schema( 273 | db_schema, db_table, changed_col_name, new_col['sql_type_name'], new_col['decimal_precision'], 274 | new_col['decimal_scale'], True) 275 | if old_avro_type != new_avro_type: 276 | logger.info('Requiring re-snapshot for %s due to an Avro schema change for column %s (type: %s, ' 277 | 'is_computed: %s --> type: %s, is_computed: %s).', source_table_fq_name, 278 | changed_col_name, old_col['sql_type_name'], old_col['is_computed'], 279 | new_col['sql_type_name'], new_col['is_computed']) 280 | return True 281 | else: 282 | # TODO - not yet supporting all the situations we could in this non-Avro case. Add nuance! 283 | if not (old_col['sql_type_name'].lower().endswith('int') and 284 | new_col['sql_type_name'].lower().endswith('int')): 285 | logger.info('Requiring re-snapshot for %s due to a data type change for column %s (type: %s, ' 286 | 'is_computed: %s --> type: %s, is_computed: %s).', source_table_fq_name, 287 | changed_col_name, old_col['sql_type_name'], old_col['is_computed'], 288 | new_col['sql_type_name'], new_col['is_computed']) 289 | return True 290 | 291 | for added_col_name in added_col_names: 292 | col_info = new_cols[added_col_name] 293 | if not col_info['is_nullable']: 294 | logger.info('Requiring re-snapshot for %s because newly-captured column %s is marked NOT NULL', 295 | source_table_fq_name, added_col_name) 296 | return True 297 | 298 | quoted_fq_name = helpers.quote_name(source_table_fq_name) 299 | q, _ = sql_queries.get_table_rowcount_bounded(quoted_fq_name, constants.SMALL_TABLE_THRESHOLD) 300 | cursor.execute(q) 301 | bounded_row_count = cursor.fetchval() 302 | logger.debug('Bounded row count for %s was: %s', source_table_fq_name, bounded_row_count) 303 | table_is_small = bounded_row_count < constants.SMALL_TABLE_THRESHOLD 304 | 305 | # Gets the names of columns that appear in the first position of one or more unfiltered, non-disabled indexes: 306 | q, p = sql_queries.get_indexed_cols() 307 | cursor.setinputsizes(p) # type: ignore[arg-type] 308 | cursor.execute(q, source_table_fq_name) 309 | indexed_cols: Set[str] = {row[0] for row in cursor.fetchall()} 310 | recently_added_cols: Optional[Set[str]] = None 311 | 312 | for added_col_name in added_col_names: 313 | if table_is_small or added_col_name in indexed_cols: 314 | cursor.execute(f"SELECT TOP 1 1 FROM {quoted_fq_name} WITH (NOLOCK) " 315 | f"WHERE [{added_col_name}] IS NOT NULL") 316 | if cursor.fetchval() is not None: 317 | logger.info('Requiring re-snapshot for %s because a direct scan of newly-tracked column %s ' 318 | 'detected non-null values.', source_table_fq_name, added_col_name) 319 | return True 320 | else: 321 | logger.info('New col %s on table %s contains only NULL values per direct check.', 322 | added_col_name, source_table_fq_name) 323 | else: 324 | # if we get here it means the table is large, the new column does not lead 
in an index, but 325 | # the new column is nullable. 326 | if recently_added_cols is None: 327 | cols_with_too_old_changes: Set[str] = set() 328 | cols_with_new_enough_changes: Set[str] = set() 329 | q, p = sql_queries.get_ddl_history_for_capture_table() 330 | cursor.setinputsizes(p) # type: ignore[arg-type] 331 | cursor.execute(q, helpers.get_fq_change_table_name(old_capture_instance_name)) 332 | alter_re = re.compile( 333 | r'\W*alter\s+table\s+(?P[\w\.\[\]]+)\s+add\s+(?P[\w\.\[\]]+)\s+(?P.*)', 334 | re.IGNORECASE) 335 | for (ddl_command, age_seconds) in cursor.fetchall(): 336 | match = alter_re.match(ddl_command) 337 | if match and match.groupdict().get('column'): 338 | col_name_lower = match.groupdict()['column'].lower().strip('[]') 339 | if age_seconds > constants.MAX_AGE_TO_PRESUME_ADDED_COL_IS_NULL_SECONDS: 340 | cols_with_too_old_changes.add(col_name_lower) 341 | else: 342 | cols_with_new_enough_changes.add(col_name_lower) 343 | recently_added_cols = cols_with_new_enough_changes - cols_with_too_old_changes 344 | 345 | if added_col_name.lower() not in recently_added_cols: 346 | logger.info('Requiring re-snapshot for %s because newly-tracked column %s appears to have been ' 347 | 'added more than %s seconds ago.', source_table_fq_name, added_col_name, 348 | constants.MAX_AGE_TO_PRESUME_ADDED_COL_IS_NULL_SECONDS) 349 | return True 350 | else: 351 | logger.info('New col %s on table %s is ASSUMED to contain only NULL values because of the recency ' 352 | 'of its addition.', added_col_name, source_table_fq_name) 353 | 354 | logger.info('Not requiring re-snapshot for table %s.', source_table_fq_name) 355 | return False 356 | 357 | 358 | class CaptureInstanceMetadata(NamedTuple): 359 | fq_name: str 360 | capture_instance_name: str 361 | start_lsn: bytes 362 | create_date: datetime.datetime 363 | types_checksum: int 364 | regex_matched_group: str 365 | 366 | 367 | # This pulls the "greatest" capture instance running for each source table, in the event there is more than one. 
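# Illustrative note (not part of the original module): given two capture instances for the same
# source table -- say the hypothetical names dbo_Orders_v1 and dbo_Orders_v2 -- the function below
# keeps exactly one of them per table. With the `create_date` strategy it keeps whichever instance
# SQL Server reports as most recently created; with the `regex` strategy and a
# capture_instance_version_regex such as r'dbo_Orders_v(\d+)', it sorts on the first captured
# group as a string ('1' < '2') and keeps dbo_Orders_v2. Because that comparison is lexicographic,
# zero-padding version numbers (v01, v02, ..., v10) avoids '10' sorting before '2'.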
368 | def get_latest_capture_instances_by_fq_name( 369 | db_conn: pyodbc.Connection, capture_instance_version_strategy: str, capture_instance_version_config: str, 370 | table_include_config: str, table_exclude_config: str 371 | ) -> Dict[str, CaptureInstanceMetadata]: 372 | if capture_instance_version_strategy == options.CAPTURE_INSTANCE_VERSION_STRATEGY_REGEX \ 373 | and not capture_instance_version_config: 374 | raise Exception('Please provide a capture_instance_version_regex when specifying the `regex` ' 375 | 'capture_instance_version_strategy.') 376 | result: Dict[str, CaptureInstanceMetadata] = {} 377 | fq_name_to_capture_instances: Dict[str, List[CaptureInstanceMetadata]] = collections.defaultdict(list) 378 | capture_instance_version_regex = capture_instance_version_config and re.compile(capture_instance_version_config) 379 | table_include_regex = table_include_config and re.compile(table_include_config, re.IGNORECASE) 380 | table_exclude_regex = table_exclude_config and re.compile(table_exclude_config, re.IGNORECASE) 381 | 382 | with db_conn.cursor() as cursor: 383 | q, _ = sql_queries.get_cdc_capture_instances_metadata() 384 | cursor.execute(q) 385 | for row in cursor.fetchall(): 386 | fq_table_name = f'{row[0]}.{row[1]}' 387 | 388 | if table_include_regex and not table_include_regex.match(fq_table_name): 389 | logger.debug('Table %s excluded; did not match inclusion regex', fq_table_name) 390 | continue 391 | 392 | if table_exclude_regex and table_exclude_regex.match(fq_table_name): 393 | logger.debug('Table %s excluded; matched exclusion regex', fq_table_name) 394 | continue 395 | 396 | if row[3] is None or row[4] is None: 397 | logger.debug('Capture instance for %s appears to be brand-new; will evaluate again on ' 398 | 'next pass', fq_table_name) 399 | continue 400 | 401 | regex_matched_group: str = '' 402 | if capture_instance_version_regex: 403 | match = capture_instance_version_regex.match(row[1]) 404 | regex_matched_group = match and match.group(1) or '' 405 | 406 | ci_meta = CaptureInstanceMetadata(fq_table_name, row[2], row[3], row[4], row[5], regex_matched_group) 407 | fq_name_to_capture_instances[ci_meta.fq_name].append(ci_meta) 408 | 409 | for fq_name, capture_instances in fq_name_to_capture_instances.items(): 410 | if capture_instance_version_strategy == options.CAPTURE_INSTANCE_VERSION_STRATEGY_CREATE_DATE: 411 | latest_instance = sorted(capture_instances, key=lambda x: x.create_date)[-1] 412 | elif capture_instance_version_strategy == options.CAPTURE_INSTANCE_VERSION_STRATEGY_REGEX: 413 | latest_instance = sorted(capture_instances, key=lambda x: x.regex_matched_group)[-1] 414 | else: 415 | raise Exception(f'Capture instance version strategy "{capture_instance_version_strategy}" not recognized.') 416 | result[fq_name] = latest_instance 417 | 418 | logger.debug('Latest capture instance names determined by "%s" strategy: %s', capture_instance_version_strategy, 419 | sorted([v.capture_instance_name for v in result.values()])) 420 | 421 | return result 422 | -------------------------------------------------------------------------------- /cdc_kafka/change_index.py: -------------------------------------------------------------------------------- 1 | from functools import total_ordering 2 | from typing import Dict, Any 3 | 4 | from . 
import constants 5 | 6 | 7 | @total_ordering 8 | class ChangeIndex(object): 9 | __slots__ = 'lsn', 'seqval', 'operation' 10 | 11 | def __init__(self, lsn: bytes, seqval: bytes, operation: int) -> None: 12 | self.lsn: bytes = lsn 13 | self.seqval: bytes = seqval 14 | self.operation: int 15 | if isinstance(operation, int): 16 | self.operation = operation 17 | elif isinstance(operation, str): 18 | self.operation = constants.CDC_OPERATION_NAME_TO_ID[operation] 19 | else: 20 | raise Exception(f'Unrecognized type for parameter `operation` (type: {type(operation)}, ' 21 | f'value: {operation}).') 22 | 23 | def __eq__(self, other: object) -> bool: 24 | if not isinstance(other, ChangeIndex): 25 | return NotImplemented 26 | if isinstance(other, ChangeIndex): 27 | # I know the below logic seems awkward, but it was the result of performance profiling. Short-circuiting 28 | # early when we can, since this will most often return False: 29 | return not ( 30 | self.lsn != other.lsn 31 | or self.seqval != other.seqval 32 | or self.operation != other.operation 33 | ) 34 | return False 35 | 36 | def __lt__(self, other: 'ChangeIndex') -> bool: 37 | if self.lsn != other.lsn: 38 | return self.lsn < other.lsn 39 | if self.seqval != other.seqval: 40 | return self.seqval < other.seqval 41 | if self.operation != other.operation: 42 | return self.operation < other.operation 43 | return False 44 | 45 | # For user-friendly display in logging etc.; not the format to be used for persistent data storage 46 | def __repr__(self) -> str: 47 | lsn = self.lsn.hex() 48 | seqval = self.seqval.hex() 49 | return f'0x{lsn[:8]} {lsn[8:16]} {lsn[16:]}:0x{seqval[:8]} {seqval[8:16]} {seqval[16:]}:{self.operation}' 50 | 51 | # Converts from binary LSN/seqval to a string representation that is more friendly to some things that may 52 | # consume this data. The stringified form is also "SQL query ready" for pasting into SQL Server queries. 53 | def as_dict(self) -> Dict[str, str]: 54 | return { 55 | constants.LSN_NAME: f'0x{self.lsn.hex()}', 56 | constants.SEQVAL_NAME: f'0x{self.seqval.hex()}', 57 | constants.OPERATION_NAME: constants.CDC_OPERATION_ID_TO_NAME[self.operation] 58 | } 59 | 60 | @staticmethod 61 | def from_dict(source_dict: Dict[str, Any]) -> 'ChangeIndex': 62 | return ChangeIndex( 63 | int(source_dict[constants.LSN_NAME][2:], 16).to_bytes(10, "big"), 64 | int(source_dict[constants.SEQVAL_NAME][2:], 16).to_bytes(10, "big"), 65 | constants.CDC_OPERATION_NAME_TO_ID[source_dict[constants.OPERATION_NAME]] 66 | ) 67 | 68 | @property 69 | def is_probably_heartbeat(self) -> bool: 70 | return self.seqval == HIGHEST_CHANGE_INDEX.seqval and self.operation == HIGHEST_CHANGE_INDEX.operation 71 | 72 | 73 | LOWEST_CHANGE_INDEX = ChangeIndex(b'\x00' * 10, b'\x00' * 10, 0) 74 | HIGHEST_CHANGE_INDEX = ChangeIndex(b'\xff' * 10, b'\xff' * 10, 4) 75 | -------------------------------------------------------------------------------- /cdc_kafka/clock_sync.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | 4 | import pyodbc 5 | 6 | from . 
import sql_queries, constants, helpers 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class ClockSync(object): 12 | _instance = None 13 | 14 | def __init__(self, db_conn: pyodbc.Connection) -> None: 15 | if ClockSync._instance is not None: 16 | raise Exception('ClockSync class should be used as a singleton.') 17 | 18 | self._last_sync_time: datetime.datetime = (helpers.naive_utcnow() - 2 * constants.DB_CLOCK_SYNC_INTERVAL) 19 | self._db_conn: pyodbc.Connection = db_conn 20 | self._clock_skew: datetime.timedelta = self._get_skew() 21 | 22 | ClockSync._instance = self 23 | 24 | def db_time_to_utc(self, db_time: datetime.datetime) -> datetime.datetime: 25 | now = helpers.naive_utcnow() 26 | if (now - self._last_sync_time) > constants.DB_CLOCK_SYNC_INTERVAL: 27 | self._clock_skew = self._get_skew() 28 | self._last_sync_time = now 29 | return db_time + self._clock_skew 30 | 31 | def _get_skew(self) -> datetime.timedelta: 32 | now = helpers.naive_utcnow() 33 | with self._db_conn.cursor() as cursor: 34 | q, _ = sql_queries.get_date() 35 | cursor.execute(q) 36 | db_now: datetime.datetime = cursor.fetchval() 37 | skew = now - db_now 38 | logger.debug('Current DB time: %s; local process UTC: %s; delta: %s', db_now, now, skew) 39 | return skew 40 | -------------------------------------------------------------------------------- /cdc_kafka/constants.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Literal 3 | 4 | # Timing intervals 5 | 6 | MIN_CDC_POLLING_INTERVAL = datetime.timedelta(seconds=3) 7 | MAX_CDC_POLLING_INTERVAL = datetime.timedelta(seconds=10) 8 | METRICS_REPORTING_INTERVAL = datetime.timedelta(seconds=20) 9 | CHANGED_CAPTURE_INSTANCES_CHECK_INTERVAL = datetime.timedelta(seconds=60) 10 | SLOW_TABLE_PROGRESS_HEARTBEAT_INTERVAL = datetime.timedelta(minutes=3) 11 | DB_CLOCK_SYNC_INTERVAL = datetime.timedelta(minutes=5) 12 | DB_FETCH_BATCH_SIZE = 1_000 13 | DB_QUERIER_CONCURRENT_THREADS = 2 14 | 15 | SMALL_TABLE_THRESHOLD = 5_000_000 16 | MAX_AGE_TO_PRESUME_ADDED_COL_IS_NULL_SECONDS = 3600 17 | 18 | SQL_QUERY_TIMEOUT_SECONDS = 30 19 | SQL_QUERY_INTER_RETRY_INTERVAL_SECONDS = 1 20 | SQL_QUERY_RETRIES = 2 21 | 22 | KAFKA_REQUEST_TIMEOUT_SECS = 15 23 | KAFKA_OAUTH_CB_POLL_TIMEOUT = 3 24 | KAFKA_FULL_FLUSH_TIMEOUT_SECS = 30 25 | KAFKA_CONFIG_RELOAD_DELAY_SECS = 1 26 | 27 | # General 28 | 29 | MESSAGE_KEY_FIELD_NAME_WHEN_PK_ABSENT = '_row_hash' 30 | DEFAULT_KEY_SCHEMA_COMPATIBILITY_LEVEL: Literal["NONE", "FULL", "FORWARD", "BACKWARD"] = 'FULL' 31 | DEFAULT_VALUE_SCHEMA_COMPATIBILITY_LEVEL: Literal["NONE", "FULL", "FORWARD", "BACKWARD"] = 'FORWARD' 32 | CDC_DB_SCHEMA_NAME = 'cdc' 33 | UNRECOGNIZED_COLUMN_DEFAULT_NAME = 'UNKNOWN_COL' 34 | VALIDATION_MAXIMUM_SAMPLE_SIZE_PER_TOPIC = 1_000_000 35 | SNAPSHOT_COMPLETION_SENTINEL = {'<< completed snapshot >>': '<< completed >>'} 36 | SQL_STRING_TYPES = ('char', 'nchar', 'varchar', 'ntext', 'nvarchar', 'text') 37 | 38 | CHANGE_ROWS_KIND = "change_rows" 39 | SNAPSHOT_ROWS_KIND = "snapshot_rows" 40 | ALL_PROGRESS_KINDS = "all_progress" 41 | 42 | SNAPSHOT_LOG_ACTION_STARTED = 'started' 43 | SNAPSHOT_LOG_ACTION_RESUMED = 'resumed' 44 | SNAPSHOT_LOG_ACTION_COMPLETED = 'completed' 45 | SNAPSHOT_LOG_ACTION_RESET_AUTO = 'progress-reset-auto' 46 | SNAPSHOT_LOG_ACTION_RESET_MANUAL = 'progress-reset-manual' 47 | 48 | # CDC operation types; IDs 1-4 here match what SQL Server provides; ID 0 is of our own creation: 49 | 50 | SNAPSHOT_OPERATION_ID = 0 51 | SNAPSHOT_OPERATION_NAME = 
'Snapshot' 52 | 53 | DELETE_OPERATION_ID = 1 54 | DELETE_OPERATION_NAME = 'Delete' 55 | 56 | INSERT_OPERATION_ID = 2 57 | INSERT_OPERATION_NAME = 'Insert' 58 | 59 | PRE_UPDATE_OPERATION_ID = 3 60 | PRE_UPDATE_OPERATION_NAME = 'PreUpdate' 61 | 62 | POST_UPDATE_OPERATION_ID = 4 63 | POST_UPDATE_OPERATION_NAME = 'PostUpdate' 64 | 65 | CDC_OPERATION_ID_TO_NAME = { 66 | SNAPSHOT_OPERATION_ID: SNAPSHOT_OPERATION_NAME, 67 | DELETE_OPERATION_ID: DELETE_OPERATION_NAME, 68 | INSERT_OPERATION_ID: INSERT_OPERATION_NAME, 69 | PRE_UPDATE_OPERATION_ID: PRE_UPDATE_OPERATION_NAME, 70 | POST_UPDATE_OPERATION_ID: POST_UPDATE_OPERATION_NAME, 71 | } 72 | 73 | CDC_OPERATION_NAME_TO_ID = { 74 | SNAPSHOT_OPERATION_NAME: SNAPSHOT_OPERATION_ID, 75 | DELETE_OPERATION_NAME: DELETE_OPERATION_ID, 76 | INSERT_OPERATION_NAME: INSERT_OPERATION_ID, 77 | PRE_UPDATE_OPERATION_NAME: PRE_UPDATE_OPERATION_ID, 78 | POST_UPDATE_OPERATION_NAME: POST_UPDATE_OPERATION_ID, 79 | } 80 | 81 | # Metadata column names and positions 82 | 83 | OPERATION_NAME = '__operation' 84 | EVENT_TIME_NAME = '__event_time' 85 | LSN_NAME = '__log_lsn' 86 | SEQVAL_NAME = '__log_seqval' 87 | UPDATED_FIELDS_NAME = '__updated_fields' 88 | 89 | DB_LSN_COL_NAME = '__$start_lsn' 90 | DB_SEQVAL_COL_NAME = '__$seqval' 91 | DB_OPERATION_COL_NAME = '__$operation' 92 | 93 | # Kafka message types 94 | 95 | SINGLE_TABLE_CHANGE_MESSAGE = 'table-change' 96 | UNIFIED_TOPIC_CHANGE_MESSAGE = 'unified-change' 97 | SINGLE_TABLE_SNAPSHOT_MESSAGE = 'table-snapshot' 98 | DELETION_CHANGE_TOMBSTONE_MESSAGE = 'deletion-tombstone' 99 | CHANGE_PROGRESS_MESSAGE = 'change-progress' 100 | SNAPSHOT_PROGRESS_MESSAGE = 'snapshot-progress' 101 | SNAPSHOT_LOGGING_MESSAGE = 'snapshot-logging' 102 | PROGRESS_DELETION_TOMBSTONE_MESSAGE = 'progress-deletion-tombstone' 103 | METRIC_REPORTING_MESSAGE = 'metric-reporting' 104 | -------------------------------------------------------------------------------- /cdc_kafka/helpers.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import confluent_kafka 4 | 5 | from . import constants 6 | 7 | 8 | # Helper function for loggers working with Kafka messages 9 | def format_coordinates(msg: confluent_kafka.Message) -> str: 10 | return f'{msg.topic()}:{msg.partition()}@{msg.offset()}, ' \ 11 | f'time {datetime.datetime.fromtimestamp(msg.timestamp()[1] / 1000, datetime.UTC)}' 12 | 13 | 14 | def get_fq_change_table_name(capture_instance_name: str) -> str: 15 | assert '.' 
not in capture_instance_name 16 | capture_instance_name = capture_instance_name.strip(' []') 17 | return f'{constants.CDC_DB_SCHEMA_NAME}.{capture_instance_name}_CT' 18 | 19 | 20 | def get_capture_instance_name(change_table_name: str) -> str: 21 | change_table_name = change_table_name.replace('[', '') 22 | change_table_name = change_table_name.replace(']', '') 23 | if change_table_name.startswith(constants.CDC_DB_SCHEMA_NAME + '.'): 24 | change_table_name = change_table_name.replace(constants.CDC_DB_SCHEMA_NAME + '.', '') 25 | assert change_table_name.endswith('_CT') 26 | return change_table_name[:-3] 27 | 28 | 29 | def quote_name(name: str) -> str: 30 | name = name.replace('[', '') 31 | name = name.replace(']', '') 32 | parts = name.split('.') 33 | return '.'.join([f"[{p}]" for p in parts]) 34 | 35 | 36 | def naive_utcnow() -> datetime.datetime: 37 | return datetime.datetime.now(datetime.UTC).replace(tzinfo=None) 38 | -------------------------------------------------------------------------------- /cdc_kafka/kafka.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import datetime 3 | import inspect 4 | import json 5 | import logging 6 | import socket 7 | import time 8 | from types import TracebackType 9 | from typing import List, Dict, Tuple, Any, Generator, Optional, Set, Type 10 | 11 | import confluent_kafka.admin 12 | 13 | from . import constants, kafka_oauth 14 | 15 | from typing import TYPE_CHECKING 16 | if TYPE_CHECKING: 17 | from .metric_reporting import accumulator 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | class KafkaClient(object): 23 | _instance = None 24 | 25 | def __init__(self, metrics_accumulator: 'accumulator.AccumulatorAbstract', bootstrap_servers: str, 26 | extra_kafka_consumer_config: Dict[str, str | int], extra_kafka_producer_config: Dict[str, str | int], 27 | disable_writing: bool = False, transactional_id: Optional[str] = None) -> None: 28 | if KafkaClient._instance is not None: 29 | raise Exception('KafkaClient class should be used as a singleton.') 30 | 31 | self._metrics_accumulator: 'accumulator.AccumulatorAbstract' = metrics_accumulator 32 | 33 | # Kafka consumer/producer librdkafka config defaults are here: 34 | self.consumer_config: Dict[str, Any] = {**{ 35 | 'bootstrap.servers': bootstrap_servers, 36 | 'group.id': f'cdc_to_kafka_{socket.getfqdn()}', 37 | 'enable.partition.eof': True, 38 | 'enable.auto.commit': False 39 | }, **extra_kafka_consumer_config} 40 | producer_config: Dict[str, Any] = {**{ 41 | 'bootstrap.servers': bootstrap_servers, 42 | 'linger.ms': '200', 43 | 'enable.idempotence': True, 44 | 'statistics.interval.ms': 30 * 60 * 1000, 45 | 'enable.gapless.guarantee': True, 46 | 'retry.backoff.ms': 250, 47 | 'compression.codec': 'snappy' 48 | }, **extra_kafka_producer_config} 49 | admin_config: Dict[str, Any] = { 50 | 'bootstrap.servers': bootstrap_servers 51 | } 52 | 53 | oauth_provider = kafka_oauth.get_kafka_oauth_provider() 54 | 55 | self._use_oauth: bool = False 56 | if oauth_provider is not None: 57 | self._use_oauth = True 58 | logger.debug('Using Kafka OAuth provider class %s', type(oauth_provider).__name__) 59 | for config_dict in (self.consumer_config, producer_config, admin_config): 60 | if not config_dict.get('security.protocol'): 61 | config_dict['security.protocol'] = 'SASL_SSL' 62 | if not config_dict.get('sasl.mechanisms'): 63 | config_dict['sasl.mechanisms'] = 'OAUTHBEARER' 64 | if not config_dict.get('client.id'): 65 | config_dict['client.id'] = 
socket.gethostname() 66 | 67 | logger.debug('Kafka consumer configuration: %s', json.dumps(self.consumer_config)) 68 | logger.debug('Kafka producer configuration: %s', json.dumps(producer_config)) 69 | logger.debug('Kafka admin client configuration: %s', json.dumps(admin_config)) 70 | 71 | self.consumer_config['error_cb'] = KafkaClient._raise_kafka_error 72 | self.consumer_config['throttle_cb'] = KafkaClient._log_kafka_throttle_event 73 | self.consumer_config['logger'] = logger 74 | producer_config['error_cb'] = KafkaClient._raise_kafka_error 75 | producer_config['stats_cb'] = KafkaClient._emit_producer_stats 76 | producer_config['throttle_cb'] = KafkaClient._log_kafka_throttle_event 77 | producer_config['logger'] = logger 78 | admin_config['error_cb'] = KafkaClient._raise_kafka_error 79 | admin_config['throttle_cb'] = KafkaClient._log_kafka_throttle_event 80 | admin_config['logger'] = logger 81 | 82 | if oauth_provider is not None: 83 | self.consumer_config['oauth_cb'] = oauth_provider.consumer_oauth_cb 84 | producer_config['oauth_cb'] = oauth_provider.producer_oauth_cb 85 | admin_config['oauth_cb'] = oauth_provider.admin_oauth_cb 86 | 87 | self._use_transactions: bool = False 88 | if transactional_id is not None: 89 | producer_config['transactional.id'] = transactional_id 90 | self._use_transactions = True 91 | 92 | self._producer: confluent_kafka.Producer = confluent_kafka.Producer(producer_config) 93 | self._admin: confluent_kafka.admin.AdminClient = confluent_kafka.admin.AdminClient(admin_config) 94 | self._disable_writing = disable_writing 95 | self._creation_warned_topic_names: Set[str] = set() 96 | 97 | if self._use_oauth: # trigger initial oauth_cb calls 98 | self._producer.poll(constants.KAFKA_OAUTH_CB_POLL_TIMEOUT) 99 | 100 | if self._use_transactions and not self._disable_writing: 101 | self._producer.init_transactions(constants.KAFKA_REQUEST_TIMEOUT_SECS) 102 | 103 | self._cluster_metadata: confluent_kafka.admin.ClusterMetadata = self._get_cluster_metadata() 104 | 105 | KafkaClient._instance = self 106 | 107 | @staticmethod 108 | def get_instance() -> 'KafkaClient': 109 | if not KafkaClient._instance: 110 | raise Exception('KafkaClient has not yet been instantiated.') 111 | return KafkaClient._instance 112 | 113 | def __enter__(self) -> 'KafkaClient': 114 | return self 115 | 116 | def __exit__(self, exc_type: Optional[Type[BaseException]], exc: Optional[BaseException], 117 | traceback: Optional[TracebackType]) -> None: 118 | logger.info("Cleaning up Kafka resources...") 119 | if not self._disable_writing: 120 | self._producer.flush(constants.KAFKA_FULL_FLUSH_TIMEOUT_SECS) 121 | del self._admin 122 | del self._producer 123 | time.sleep(1) # gives librdkafka threads more of a chance to exit properly before admin/producer are GC'd 124 | logger.info("Done.") 125 | 126 | # a return of None indicates the topic does not exist 127 | def get_topic_partition_count(self, topic_name: str) -> int: 128 | if self._cluster_metadata.topics is None: 129 | raise Exception('Unexpected state: no topic metadata') 130 | if topic_name not in self._cluster_metadata.topics: 131 | return 0 132 | return len(self._cluster_metadata.topics[topic_name].partitions or []) 133 | 134 | def begin_transaction(self) -> None: 135 | if not self._use_transactions: 136 | raise Exception('This instance of KafkaClient was not configured to use transactions.') 137 | if self._disable_writing: 138 | return 139 | if logger.isEnabledFor(logging.DEBUG): 140 | current_frame = inspect.currentframe() 141 | if current_frame and 
current_frame.f_back: 142 | previous_frame = inspect.getframeinfo(current_frame.f_back) 143 | logger.debug('Kafka transaction begin from %s', f'{previous_frame[0]}, line {previous_frame[1]}') 144 | self._producer.begin_transaction() 145 | 146 | def commit_transaction(self) -> None: 147 | if not self._use_transactions: 148 | raise Exception('This instance of KafkaClient was not configured to use transactions.') 149 | if self._disable_writing: 150 | return 151 | if logger.isEnabledFor(logging.DEBUG): 152 | current_frame = inspect.currentframe() 153 | if current_frame and current_frame.f_back: 154 | previous_frame = inspect.getframeinfo(current_frame.f_back) 155 | logger.debug('Kafka transaction commit from %s', f'{previous_frame[0]}, line {previous_frame[1]}') 156 | self._producer.commit_transaction() 157 | 158 | def produce(self, topic: str, key: Optional[bytes], value: Optional[bytes], message_type: str, 159 | copy_to_unified_topics: Optional[List[str]] = None, event_datetime: Optional[datetime.datetime] = None, 160 | change_lsn: Optional[bytes] = None, operation_id: Optional[int] = None, 161 | extra_headers: Optional[Dict[str, str | bytes]] = None) -> None: 162 | if self._disable_writing: 163 | return 164 | 165 | start_time = time.perf_counter() 166 | 167 | if event_datetime: 168 | delivery_cb = lambda _, msg: self._metrics_accumulator.kafka_delivery_callback(msg, event_datetime) 169 | else: 170 | delivery_cb = None 171 | 172 | while True: 173 | try: 174 | self._producer.produce( 175 | topic=topic, value=value, key=key, on_delivery=delivery_cb, 176 | headers={'cdc_to_kafka_message_type': message_type, **(extra_headers or {})} 177 | ) 178 | break 179 | except BufferError: 180 | logger.warning('Sleeping due to Kafka producer buffer being full...') 181 | self._producer.flush(3) # clear some space before retrying 182 | except Exception: 183 | logger.error('The following exception occurred producing to topic %s', topic) 184 | raise 185 | 186 | elapsed = time.perf_counter() - start_time 187 | self._metrics_accumulator.register_kafka_produce(elapsed, message_type, event_datetime, 188 | change_lsn, operation_id) 189 | 190 | if copy_to_unified_topics: 191 | for unified_topic in copy_to_unified_topics: 192 | start_time = time.perf_counter() 193 | 194 | while True: 195 | try: 196 | self._producer.produce( 197 | topic=unified_topic, value=value, key=key, on_delivery=delivery_cb, 198 | headers={'cdc_to_kafka_message_type': constants.UNIFIED_TOPIC_CHANGE_MESSAGE, 199 | 'cdc_to_kafka_original_topic': topic, **(extra_headers or {})} 200 | ) 201 | break 202 | except BufferError: 203 | logger.warning('Sleeping due to Kafka producer buffer being full...') 204 | self._producer.flush(3) # clear some space before retrying 205 | except Exception: 206 | logger.error('The following exception occurred producing to topic %s', unified_topic) 207 | raise 208 | 209 | elapsed = time.perf_counter() - start_time 210 | self._metrics_accumulator.register_kafka_produce(elapsed, constants.UNIFIED_TOPIC_CHANGE_MESSAGE, 211 | event_datetime, change_lsn, operation_id) 212 | 213 | def consume_all(self, topic_name: str) -> Generator[confluent_kafka.Message, None, None]: 214 | part_count = self.get_topic_partition_count(topic_name) 215 | 216 | if part_count is None: 217 | logger.warning( 218 | 'consume_all: Requested topic %s does not appear to exist. 
Returning nothing.', topic_name) 219 | return 220 | 221 | watermarks = self.get_topic_watermarks([topic_name])[topic_name] # will be list of (low, hi) mark tuples 222 | last_offset = sum(x[1] for x in watermarks) 223 | if not last_offset: 224 | logger.warning( 225 | 'consume_all: Requested topic %s contains no messages at present. Returning nothing.', topic_name) 226 | return 227 | logger.debug('Progress topic %s ends at offset %s', topic_name, last_offset) 228 | 229 | consumer: confluent_kafka.Consumer = confluent_kafka.Consumer(self.consumer_config) 230 | if self._use_oauth: 231 | consumer.poll(constants.KAFKA_OAUTH_CB_POLL_TIMEOUT) # Trigger initial oauth_cb call 232 | 233 | consumer.assign([confluent_kafka.TopicPartition(topic_name, part_id, confluent_kafka.OFFSET_BEGINNING) 234 | for part_id in range(part_count)]) 235 | 236 | finished_parts = [False] * part_count 237 | ctr = 0 238 | 239 | while True: 240 | msg = consumer.poll(constants.KAFKA_REQUEST_TIMEOUT_SECS) 241 | 242 | if msg is None: 243 | time.sleep(0.2) 244 | continue 245 | if msg.error(): 246 | # noinspection PyProtectedMember 247 | if (msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF # type: ignore[union-attr] 248 | and msg.partition() is not None): 249 | finished_parts[msg.partition()] = True # type: ignore[index] 250 | if all(finished_parts): 251 | break 252 | continue 253 | else: 254 | raise confluent_kafka.KafkaException(msg.error()) 255 | 256 | ctr += 1 257 | if ctr % 100000 == 0: 258 | logger.debug('consume_all has yielded %s messages so far from topic %s', ctr, topic_name) 259 | 260 | yield msg 261 | 262 | consumer.close() 263 | 264 | def consume_bounded(self, topic_name: str, approx_max_recs: int, 265 | boundary_watermarks: List[Tuple[int, int]]) -> Generator[confluent_kafka.Message, None, None]: 266 | part_count = self.get_topic_partition_count(topic_name) 267 | 268 | if part_count is None: 269 | logger.warning( 270 | 'consume_bounded: Requested topic %s does not appear to exist. 
Returning nothing.', topic_name) 271 | return 272 | 273 | if part_count != len(boundary_watermarks): 274 | raise ValueError('consume_bounded: The number of captured watermarks does not match the number of ' 275 | 'partitions for topic %s', topic_name) 276 | 277 | rewind_per_part = int(approx_max_recs / part_count) 278 | start_offsets = [max(lo, hi - rewind_per_part) for lo, hi in boundary_watermarks] 279 | 280 | consumer: confluent_kafka.Consumer = confluent_kafka.Consumer(self.consumer_config) 281 | if self._use_oauth: 282 | consumer.poll(constants.KAFKA_OAUTH_CB_POLL_TIMEOUT) # Trigger initial oauth_cb call 283 | 284 | consumer.assign([confluent_kafka.TopicPartition(topic_name, part_id, offset) 285 | for part_id, offset in enumerate(start_offsets)]) 286 | 287 | finished_parts = [False] * part_count 288 | ctr = 0 289 | 290 | while True: 291 | msg = consumer.poll(constants.KAFKA_REQUEST_TIMEOUT_SECS) 292 | 293 | if msg is None: 294 | time.sleep(0.2) 295 | continue 296 | if msg.error(): 297 | # noinspection PyProtectedMember 298 | if (msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF # type: ignore[union-attr] 299 | and msg.partition() is not None): 300 | finished_parts[msg.partition()] = True # type: ignore[index] 301 | if all(finished_parts): 302 | break 303 | continue 304 | else: 305 | raise confluent_kafka.KafkaException(msg.error()) 306 | if msg.offset() > boundary_watermarks[msg.partition()][1]: # type: ignore[index, operator] 307 | finished_parts[msg.partition()] = True # type: ignore[index] 308 | if all(finished_parts): 309 | break 310 | continue 311 | 312 | ctr += 1 313 | if ctr % 100000 == 0: 314 | logger.debug('consume_bounded has yielded %s messages so far from topic %s', ctr, topic_name) 315 | 316 | yield msg 317 | 318 | consumer.close() 319 | 320 | def create_topic(self, topic_name: str, partition_count: int, replication_factor: Optional[int] = None, 321 | extra_config: Optional[Dict[str, str | int]] = None) -> None: 322 | if self._disable_writing: 323 | return 324 | 325 | if not replication_factor: 326 | if self._cluster_metadata.brokers is None: 327 | raise Exception('Unexpected state: no brokers metadata') 328 | replication_factor = min(len(self._cluster_metadata.brokers), 3) 329 | 330 | extra_config = extra_config or {} 331 | topic_config = {**{'cleanup.policy': 'compact'}, **extra_config} 332 | topic_config_str = {k: str(v) for k, v in topic_config.items()} 333 | 334 | logger.info('Creating Kafka topic "%s" with %s partitions, replication factor %s, and config: %s', topic_name, 335 | partition_count, replication_factor, json.dumps(topic_config_str)) 336 | topic = confluent_kafka.admin.NewTopic(topic_name, partition_count, replication_factor, config=topic_config_str) 337 | self._admin.create_topics([topic])[topic_name].result() 338 | time.sleep(constants.KAFKA_CONFIG_RELOAD_DELAY_SECS) 339 | self._refresh_cluster_metadata() 340 | 341 | # Returns dict where key is topic name and value is ordered list of tuples of (low, high) watermarks per partition: 342 | def get_topic_watermarks(self, topic_names: List[str]) -> Dict[str, List[Tuple[int, int]]]: 343 | result = collections.defaultdict(list) 344 | 345 | consumer: confluent_kafka.Consumer = confluent_kafka.Consumer(self.consumer_config) 346 | if self._use_oauth: 347 | consumer.poll(constants.KAFKA_OAUTH_CB_POLL_TIMEOUT) # In case oauth token refresh is needed 348 | 349 | for topic_name in topic_names: 350 | part_count = self.get_topic_partition_count(topic_name) 351 | 352 | if part_count is None: 353 | if 
topic_name not in self._creation_warned_topic_names: 354 | logger.warning('Topic name %s was not found in Kafka. This process will create it if corresponding ' 355 | 'CDC entries are found.', topic_name) 356 | self._creation_warned_topic_names.add(topic_name) 357 | continue 358 | 359 | for part_id in range(part_count): 360 | watermarks = consumer.get_watermark_offsets( 361 | confluent_kafka.TopicPartition(topic_name, part_id), timeout=constants.KAFKA_REQUEST_TIMEOUT_SECS) 362 | if watermarks is None: 363 | raise Exception(f'Timeout requesting watermark offsets from Kafka for topic {topic_name}, ' 364 | f'partition {part_id}') 365 | result[topic_name].append(watermarks) 366 | 367 | consumer.close() 368 | return result 369 | 370 | def get_topic_config(self, topic_name: str) -> Any: 371 | resource = confluent_kafka.admin.ConfigResource( 372 | restype=confluent_kafka.admin.ConfigResource.Type.TOPIC, name=topic_name) # type: ignore[attr-defined] 373 | result = self._admin.describe_configs([resource]) 374 | return result[resource].result() 375 | 376 | def _refresh_cluster_metadata(self) -> None: 377 | self._cluster_metadata = self._get_cluster_metadata() 378 | 379 | def _get_cluster_metadata(self) -> confluent_kafka.admin.ClusterMetadata: 380 | if self._use_oauth: 381 | self._admin.poll(constants.KAFKA_OAUTH_CB_POLL_TIMEOUT) # In case oauth token refresh is needed 382 | metadata = self._admin.list_topics(timeout=constants.KAFKA_REQUEST_TIMEOUT_SECS) 383 | if metadata is None: 384 | raise Exception(f'Cluster metadata request to Kafka timed out') 385 | return metadata 386 | 387 | @staticmethod 388 | def _raise_kafka_error(err: confluent_kafka.KafkaError) -> None: 389 | if err.fatal(): 390 | raise confluent_kafka.KafkaException(err) 391 | else: 392 | logger.warning("librdkafka raised a non-fatal error: code - %s, name - %s, msg - %s", 393 | err.code(), err.name(), err.str()) 394 | 395 | @staticmethod 396 | def _log_kafka_throttle_event(evt: confluent_kafka.ThrottleEvent) -> None: 397 | logger.warning('Kafka throttle event: %s', evt) 398 | 399 | @staticmethod 400 | def _emit_producer_stats(stats_json: str) -> None: 401 | logger.info('Kafka producer statistics: %s', stats_json) 402 | -------------------------------------------------------------------------------- /cdc_kafka/kafka_oauth/__init__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib 3 | import os 4 | from abc import ABC, abstractmethod 5 | from typing import TypeVar, Type, Tuple, Optional 6 | 7 | KafkaOauthProviderAbstractType = TypeVar('KafkaOauthProviderAbstractType', bound='KafkaOauthProviderAbstract') 8 | 9 | 10 | class KafkaOauthProviderAbstract(ABC): 11 | @abstractmethod 12 | def consumer_oauth_cb(self, config_str: str) -> Tuple[str, float]: 13 | pass 14 | 15 | @abstractmethod 16 | def producer_oauth_cb(self, config_str: str) -> Tuple[str, float]: 17 | pass 18 | 19 | @abstractmethod 20 | def admin_oauth_cb(self, config_str: str) -> Tuple[str, float]: 21 | pass 22 | 23 | @staticmethod 24 | def add_arguments(parser: argparse.ArgumentParser) -> None: 25 | pass 26 | 27 | @classmethod 28 | @abstractmethod 29 | def construct_with_options(cls: Type[KafkaOauthProviderAbstractType], 30 | opts: argparse.Namespace) -> KafkaOauthProviderAbstractType: 31 | pass 32 | 33 | 34 | def add_kafka_oauth_arg(parser: argparse.ArgumentParser) -> None: 35 | parser.add_argument('--kafka-oauth-provider', 36 | default=os.environ.get('KAFKA_OAUTH_PROVIDER'), 37 | help="A string of form . 
indicating an implementation of " 38 | "kafka_oauth.KafkaOauthProviderAbstract that provides OAuth callback functions specified " 39 | "when instantiating Kafka consumers, producers, or admin clients.") 40 | 41 | 42 | def get_kafka_oauth_provider() -> Optional[KafkaOauthProviderAbstract]: 43 | parser = argparse.ArgumentParser() 44 | add_kafka_oauth_arg(parser) 45 | opts, _ = parser.parse_known_args() 46 | 47 | if not opts.kafka_oauth_provider: 48 | return None 49 | 50 | package_module, class_name = opts.kafka_oauth_provider.rsplit('.', 1) 51 | module = importlib.import_module(package_module) 52 | oauth_class: KafkaOauthProviderAbstract = getattr(module, class_name) 53 | oauth_class.add_arguments(parser) 54 | opts, _ = parser.parse_known_args() 55 | return oauth_class.construct_with_options(opts) 56 | -------------------------------------------------------------------------------- /cdc_kafka/kafka_oauth/aws_msk.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import logging 4 | import os 5 | from typing import Tuple, TypeVar, Type, Optional 6 | 7 | from aws_msk_iam_sasl_signer import MSKAuthTokenProvider # type: ignore[import-untyped] 8 | 9 | from . import KafkaOauthProviderAbstract 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | AwsMskOauthCallbackProviderType = TypeVar('AwsMskOauthCallbackProviderType', bound='AwsMskOauthCallbackProvider') 14 | 15 | 16 | class AwsMskOauthCallbackProvider(KafkaOauthProviderAbstract): 17 | def __init__(self, aws_region: str, role_arn: Optional[str] = None): 18 | self.aws_region: str = aws_region 19 | self.role_arn: Optional[str] = role_arn 20 | self._auth_token: str = '' 21 | self._expiry_ts: float = datetime.datetime.now(datetime.timezone.utc).timestamp() 22 | 23 | def consumer_oauth_cb(self, config_str: str) -> Tuple[str, float]: 24 | return self._common_cb() 25 | 26 | def producer_oauth_cb(self, config_str: str) -> Tuple[str, float]: 27 | return self._common_cb() 28 | 29 | def admin_oauth_cb(self, config_str: str) -> Tuple[str, float]: 30 | return self._common_cb() 31 | 32 | def _common_cb(self) -> Tuple[str, float]: 33 | if not self._auth_token or datetime.datetime.now(datetime.timezone.utc).timestamp() > self._expiry_ts: 34 | if self.role_arn: 35 | self._auth_token, expiry_ms = MSKAuthTokenProvider.generate_auth_token_from_role_arn( 36 | self.aws_region, self.role_arn) 37 | else: 38 | self._auth_token, expiry_ms = MSKAuthTokenProvider.generate_auth_token(self.aws_region) 39 | self._expiry_ts = expiry_ms / 1000 40 | logger.debug('AwsMskOauthCallbackProvider generated an auth token that expires at %s', 41 | datetime.datetime.fromtimestamp(self._expiry_ts, datetime.timezone.utc)) 42 | return self._auth_token, self._expiry_ts 43 | 44 | @staticmethod 45 | def add_arguments(parser: argparse.ArgumentParser) -> None: 46 | parser.add_argument('--msk-cluster-aws-region', default=os.environ.get('MSK_CLUSTER_AWS_REGION'), 47 | help='AWS region name to use for IAM-based authentication to an AWS MSK cluster.') 48 | parser.add_argument('--msk-cluster-access-role-arn', default=os.environ.get('MSK_CLUSTER_ACCESS_ROLE_ARN'), 49 | help='Optional name of an AWS IAM role to assume for authentication to an AWS MSK cluster.') 50 | parser.add_argument('--aws-role-session-name', default=os.environ.get('AWS_ROLE_SESSION_NAME'), 51 | help='A session name for the process to maintain principal-name stability when' 52 | 're-authenticating for AWS IAM/SASL') 53 | 54 | @classmethod 55 | def 
construct_with_options(cls: Type[AwsMskOauthCallbackProviderType], 56 | opts: argparse.Namespace) -> AwsMskOauthCallbackProviderType: 57 | if not opts.msk_cluster_aws_region: 58 | raise Exception('AwsMskOauthCallbackProvider cannot be used without specifying a value for ' 59 | 'MSK_CLUSTER_AWS_REGION') 60 | return cls(opts.msk_cluster_aws_region, opts.msk_cluster_access_role_arn) 61 | -------------------------------------------------------------------------------- /cdc_kafka/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import datetime 4 | import functools 5 | import heapq 6 | import json 7 | import logging 8 | import re 9 | import time 10 | from typing import Dict, Optional, List, Tuple 11 | 12 | import pyodbc 13 | 14 | from . import clock_sync, kafka, tracked_tables, constants, options, validation, change_index, progress_tracking, \ 15 | sql_query_subprocess, sql_queries, helpers 16 | from .build_startup_state import build_tracked_tables_from_cdc_metadata, determine_start_points_and_finalize_tables, \ 17 | get_latest_capture_instances_by_fq_name, CaptureInstanceMetadata 18 | from .metric_reporting import accumulator 19 | 20 | from typing import TYPE_CHECKING 21 | 22 | if TYPE_CHECKING: 23 | from . import parsed_row 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | def run() -> None: 29 | logger.info('Starting...') 30 | opts: argparse.Namespace 31 | opts, reporters, serializer = options.get_options_and_metrics_reporters() 32 | disable_writes: bool = opts.run_validations or opts.report_progress_only 33 | 34 | logger.debug('Parsed configuration: %s', json.dumps(vars(opts))) 35 | 36 | if not (opts.kafka_bootstrap_servers and opts.db_conn_string and opts.kafka_transactional_id): 37 | raise Exception('Arguments kafka_bootstrap_servers, db_conn_string, and ' 38 | 'kafka_transactional_id are all required.') 39 | 40 | redo_snapshot_for_new_instance: bool = \ 41 | opts.new_capture_instance_snapshot_handling == options.NEW_CAPTURE_INSTANCE_SNAPSHOT_HANDLING_BEGIN_NEW 42 | publish_duplicate_changes_from_new_instance: bool = \ 43 | opts.new_capture_instance_overlap_handling == options.NEW_CAPTURE_INSTANCE_OVERLAP_HANDLING_REPUBLISH 44 | 45 | try: 46 | with (sql_query_subprocess.get_db_conn( 47 | opts.db_conn_string 48 | ) as db_conn, sql_query_subprocess.SQLQueryProcessor( 49 | opts.db_conn_string 50 | ) as sql_query_processor): 51 | clock_syncer: clock_sync.ClockSync = clock_sync.ClockSync(db_conn) 52 | 53 | metrics_accumulator: accumulator.Accumulator = accumulator.Accumulator( 54 | db_conn, clock_syncer, opts.metrics_namespace, opts.process_hostname) 55 | 56 | capture_instances_by_fq_name: Dict[str, CaptureInstanceMetadata] = get_latest_capture_instances_by_fq_name( 57 | db_conn, opts.capture_instance_version_strategy, opts.capture_instance_version_regex, 58 | opts.table_include_regex, opts.table_exclude_regex) 59 | 60 | if not capture_instances_by_fq_name: 61 | logger.error('No capture instances could be found.') 62 | exit(1) 63 | 64 | capture_instance_names: List[str] = [ci.capture_instance_name 65 | for ci in capture_instances_by_fq_name.values()] 66 | 67 | with kafka.KafkaClient( 68 | metrics_accumulator, opts.kafka_bootstrap_servers, opts.extra_kafka_consumer_config, 69 | opts.extra_kafka_producer_config, disable_writing=disable_writes, 70 | transactional_id=opts.kafka_transactional_id 71 | ) as kafka_client: 72 | progress_tracker = progress_tracking.ProgressTracker( 73 | kafka_client, 
serializer, opts.progress_topic_name, opts.process_hostname, 74 | opts.snapshot_logging_topic_name 75 | ) 76 | 77 | tables: List[tracked_tables.TrackedTable] = build_tracked_tables_from_cdc_metadata( 78 | db_conn, metrics_accumulator, opts.topic_name_template, opts.snapshot_table_include_regex, 79 | opts.snapshot_table_exclude_regex, opts.truncate_fields, capture_instance_names, 80 | opts.db_row_batch_size, sql_query_processor, progress_tracker) 81 | 82 | capture_instance_to_topic_map: Dict[str, str] = {t.capture_instance_name: t.topic_name for t in tables} 83 | 84 | determine_start_points_and_finalize_tables( 85 | kafka_client, db_conn, tables, progress_tracker, opts.lsn_gap_handling, opts.new_follow_start_point, 86 | opts.partition_count, opts.replication_factor, opts.extra_topic_config, opts.run_validations, 87 | redo_snapshot_for_new_instance, publish_duplicate_changes_from_new_instance, 88 | opts.report_progress_only) 89 | 90 | if opts.report_progress_only: 91 | exit(0) 92 | 93 | for table in tables: 94 | serializer.register_table(table) 95 | 96 | table_to_unified_topics_map: Dict[str, List[str]] = collections.defaultdict(list) 97 | unified_topic_to_tables_map: Dict[str, List[tracked_tables.TrackedTable]] = collections.defaultdict(list) 98 | 99 | # "Unified" topics contain change messages from multiple capture instances in a globally-consistent LSN 100 | # order. They don't contain snapshot messages. 101 | if opts.unified_topics: 102 | for unified_topic_name, unified_topic_config in opts.unified_topics.items(): 103 | included_tables_regex = unified_topic_config['included_tables'] 104 | compiled_regex = re.compile(included_tables_regex, re.IGNORECASE) 105 | matched_tables = [table for table in tables if compiled_regex.match(table.fq_name)] 106 | if matched_tables: 107 | for matched_table in matched_tables: 108 | table_to_unified_topics_map[matched_table.topic_name].append(unified_topic_name) 109 | unified_topic_to_tables_map[unified_topic_name].append(matched_table) 110 | part_count = kafka_client.get_topic_partition_count(unified_topic_name) 111 | if part_count: 112 | logger.info('Existing unified topic %s found, with %s partition(s)', 113 | unified_topic_name, part_count) 114 | else: 115 | part_count = unified_topic_config.get('partition_count', 1) 116 | extra_config = unified_topic_config.get('extra_topic_config', {}) 117 | logger.info('Unified topic %s not found, creating with %s replicas, %s partition(s), and ' 118 | 'extra config %s', unified_topic_name, opts.replication_factor, part_count, 119 | extra_config) 120 | kafka_client.create_topic(unified_topic_name, part_count, opts.replication_factor, 121 | extra_config) 122 | 123 | if table_to_unified_topics_map: 124 | logger.debug('Unified topics being produced to, by table: %s', table_to_unified_topics_map) 125 | 126 | # Validations will go through all messages in all topics and try to warn of any inconsistencies between 127 | # those and the source DB data. It takes a while; probably don't run this on very large datasets! 
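For reference, a hypothetical `--unified-topics` / `UNIFIED_TOPICS` value in the shape consumed by the block above — the topic and table names here are invented, but the keys (`included_tables`, `partition_count`, `extra_topic_config`) are the ones this code reads:

```
import json

# Hypothetical unified-topic configuration; the regex is matched case-insensitively
# against dot-separated 'schema_name.table_name' values.
unified_topics = {
    "all_sales_changes": {
        "included_tables": r"dbo\.(orders|order_lines)",
        "partition_count": 1,                          # used only if the topic must be created
        "extra_topic_config": {"retention.ms": "-1"},  # passed through at topic creation time
    }
}
print(json.dumps(unified_topics))  # suitable as the env var / CLI argument value
```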
128 | if opts.run_validations: 129 | validator: validation.Validator = validation.Validator( 130 | kafka_client, tables, progress_tracker, serializer, unified_topic_to_tables_map) 131 | validator.run() 132 | exit(0) 133 | 134 | last_metrics_emission_time: datetime.datetime = helpers.naive_utcnow() 135 | last_capture_instance_check_time: datetime.datetime = helpers.naive_utcnow() 136 | last_slow_table_heartbeat_time: datetime.datetime = helpers.naive_utcnow() 137 | next_cdc_poll_allowed_time: datetime.datetime = helpers.naive_utcnow() 138 | next_cdc_poll_due_time: datetime.datetime = helpers.naive_utcnow() 139 | last_produced_row: Optional['parsed_row.ParsedRow'] = None 140 | last_topic_produces: Dict[str, datetime.datetime] = {} 141 | change_rows_queue: List[Tuple[change_index.ChangeIndex, 'parsed_row.ParsedRow']] = [] 142 | queued_change_row_counts: Dict[str, int] = {t.topic_name: 0 for t in tables} 143 | 144 | # Returned bool indicates whether the process should halt 145 | def poll_periodic_tasks() -> bool: 146 | nonlocal last_metrics_emission_time 147 | nonlocal last_slow_table_heartbeat_time 148 | nonlocal last_capture_instance_check_time 149 | 150 | do_metrics: bool = (helpers.naive_utcnow() - last_metrics_emission_time) > \ 151 | constants.METRICS_REPORTING_INTERVAL 152 | do_heartbeats: bool = (helpers.naive_utcnow() - last_slow_table_heartbeat_time) > \ 153 | constants.SLOW_TABLE_PROGRESS_HEARTBEAT_INTERVAL 154 | do_termination_check: bool = opts.terminate_on_capture_instance_change and \ 155 | (helpers.naive_utcnow() - last_capture_instance_check_time) > \ 156 | constants.CHANGED_CAPTURE_INSTANCES_CHECK_INTERVAL 157 | 158 | if do_metrics or do_heartbeats or do_termination_check: 159 | kafka_client.begin_transaction() 160 | 161 | if do_metrics: 162 | start_time = time.perf_counter() 163 | metrics = metrics_accumulator.end_and_get_values() 164 | for reporter in reporters: 165 | try: 166 | reporter.emit(metrics) 167 | except Exception as e: 168 | logger.exception('Caught exception while reporting metrics', exc_info=e) 169 | elapsed = (time.perf_counter() - start_time) 170 | logger.debug('Metrics reporting completed in %s ms', elapsed * 1000) 171 | metrics_accumulator.reset_and_start() 172 | last_metrics_emission_time = helpers.naive_utcnow() 173 | 174 | if do_heartbeats: 175 | for t in tables: 176 | if not queued_change_row_counts[t.topic_name]: 177 | last_topic_produce = last_topic_produces.get(t.topic_name) 178 | if not last_topic_produce or (helpers.naive_utcnow() - last_topic_produce) > \ 179 | 2 * constants.SLOW_TABLE_PROGRESS_HEARTBEAT_INTERVAL: 180 | logger.debug('Emitting heartbeat progress for slow table %s', t.fq_name) 181 | progress_tracker.record_changes_progress(t.topic_name, t.max_polled_change_index) 182 | last_slow_table_heartbeat_time = helpers.naive_utcnow() 183 | 184 | if do_termination_check: 185 | topic_to_max_polled_index_map: Dict[str, change_index.ChangeIndex] = { 186 | t.topic_name: t.max_polled_change_index for t in tables 187 | } 188 | if should_terminate_due_to_capture_instance_change( 189 | db_conn, progress_tracker, opts.capture_instance_version_strategy, 190 | opts.capture_instance_version_regex, capture_instance_to_topic_map, 191 | capture_instances_by_fq_name, opts.table_include_regex, 192 | opts.table_exclude_regex, topic_to_max_polled_index_map): 193 | kafka_client.commit_transaction() 194 | return True 195 | last_capture_instance_check_time = helpers.naive_utcnow() 196 | 197 | kafka_client.commit_transaction() 198 | return False 199 | 200 | 
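The hot loop that follows merges change rows from every tracked table through one heap keyed by change index, so rows are produced in globally ascending LSN order regardless of which table they came from. A minimal, self-contained sketch of that merge pattern, using plain tuples in place of `ChangeIndex`/`ParsedRow`:

```
import heapq

# Stand-ins for per-table change streams; each entry is (change_index, payload).
# The real loop pushes (row.change_idx, row), relying on ChangeIndex being orderable.
table_a = [(1, "a1"), (4, "a4"), (7, "a7")]
table_b = [(2, "b2"), (3, "b3"), (9, "b9")]

change_rows_queue = []
for entry in table_a + table_b:
    heapq.heappush(change_rows_queue, entry)

produced = []
while change_rows_queue:
    produced.append(heapq.heappop(change_rows_queue))

# Popped in globally ascending index order, no matter the source table:
assert [idx for idx, _ in produced] == [1, 2, 3, 4, 7, 9]
```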
logger.info('Beginning processing for %s tracked table(s).', len(tables)) 201 | metrics_accumulator.reset_and_start() 202 | 203 | # The above is all setup, now we come to the "hot loop": 204 | 205 | row: 'parsed_row.ParsedRow' 206 | lsn_limit: bytes = change_index.LOWEST_CHANGE_INDEX.lsn 207 | 208 | while True: 209 | snapshots_remain: bool = not all([t.snapshot_complete for t in tables]) 210 | lagging_change_tables: List[tracked_tables.TrackedTable] = [ 211 | t for t in tables if t.change_reads_are_lagging] 212 | # ----- Poll for and produce snapshot data while change row queries run ----- 213 | 214 | if snapshots_remain: 215 | while helpers.naive_utcnow() < next_cdc_poll_due_time and snapshots_remain: 216 | kafka_client.begin_transaction() 217 | snapshot_progress_by_topic: Dict[str, Dict[str, str | int]] = {} 218 | completions_to_log: List[functools.partial[None]] = [] 219 | 220 | for t in tables: 221 | if not t.snapshot_complete: 222 | last_row_retrieved: Optional[parsed_row.ParsedRow] = None 223 | for row in t.retrieve_snapshot_query_results(): 224 | key_ser, value_ser = serializer.serialize_table_data_message(row) 225 | kafka_client.produce(row.destination_topic, key_ser, value_ser, 226 | constants.SINGLE_TABLE_SNAPSHOT_MESSAGE, None, 227 | row.event_db_time, None, 228 | constants.SNAPSHOT_OPERATION_ID, row.extra_headers) 229 | last_row_retrieved = row 230 | if last_row_retrieved: 231 | key_as_dict = dict(zip(t.key_field_names, 232 | last_row_retrieved.ordered_key_field_values)) 233 | snapshot_progress_by_topic[last_row_retrieved.destination_topic] = key_as_dict 234 | if t.snapshot_complete: 235 | progress_tracker.record_snapshot_progress( 236 | t.topic_name, constants.SNAPSHOT_COMPLETION_SENTINEL) 237 | snapshot_progress_by_topic.pop(row.destination_topic, None) 238 | completions_to_log.append(functools.partial( 239 | progress_tracker.log_snapshot_completed, t.topic_name, t.fq_name, 240 | helpers.naive_utcnow(), key_as_dict 241 | )) 242 | snapshots_remain = not all([t.snapshot_complete for t in tables]) 243 | elif not lagging_change_tables: 244 | t.enqueue_snapshot_query() # NB: results may not be retrieved until next cycle 245 | 246 | if helpers.naive_utcnow() > next_cdc_poll_due_time: 247 | break 248 | 249 | for topic_name, snapshot_index in snapshot_progress_by_topic.items(): 250 | progress_tracker.record_snapshot_progress(topic_name, snapshot_index) 251 | 252 | kafka_client.commit_transaction() 253 | 254 | # This needs to happen outside the above Kafka transaction so that the topic 255 | # watermarks captured as part of the logged event reflect the topic's state after 256 | # the final snapshot row(s) have been produced: 257 | if completions_to_log: 258 | kafka_client.begin_transaction() 259 | for completion in completions_to_log: 260 | completion() 261 | kafka_client.commit_transaction() 262 | 263 | if lagging_change_tables: 264 | break 265 | 266 | if poll_periodic_tasks(): 267 | break 268 | 269 | # ----- Wait for next poll window (if needed) and get ceiling LSN for cycle ----- 270 | 271 | if not lagging_change_tables: 272 | wait_time = (next_cdc_poll_allowed_time - helpers.naive_utcnow()).total_seconds() 273 | if wait_time > 0: 274 | time.sleep(wait_time) 275 | metrics_accumulator.register_sleep(wait_time) 276 | 277 | if poll_periodic_tasks(): 278 | break 279 | 280 | if lagging_change_tables and lsn_limit > change_index.LOWEST_CHANGE_INDEX.lsn: 281 | tables_to_poll = lagging_change_tables 282 | # Leave existing LSN limit in place to get through the glut in lagged tables first 
283 | else: 284 | tables_to_poll = tables 285 | with db_conn.cursor() as cursor: 286 | q, _ = sql_queries.get_max_lsn() 287 | cursor.execute(q) 288 | lsn_limit = cursor.fetchval() 289 | 290 | next_cdc_poll_allowed_time = (helpers.naive_utcnow() + constants.MIN_CDC_POLLING_INTERVAL) 291 | next_cdc_poll_due_time = (helpers.naive_utcnow() + constants.MAX_CDC_POLLING_INTERVAL) 292 | 293 | # ----- Query for change rows ---- 294 | 295 | for t in tables_to_poll: 296 | if queued_change_row_counts[t.topic_name] < opts.db_row_batch_size + 1: 297 | t.enqueue_changes_query(lsn_limit) 298 | 299 | common_lsn_limit: change_index.ChangeIndex = change_index.HIGHEST_CHANGE_INDEX 300 | 301 | if poll_periodic_tasks(): 302 | break 303 | 304 | for t in tables_to_poll: 305 | for row in t.retrieve_changes_query_results(): 306 | queued_change_row_counts[t.topic_name] += 1 307 | heapq.heappush(change_rows_queue, (row.change_idx, row)) 308 | if t.max_polled_change_index < common_lsn_limit: 309 | common_lsn_limit = t.max_polled_change_index 310 | 311 | if poll_periodic_tasks(): 312 | break 313 | 314 | if not change_rows_queue: 315 | continue 316 | 317 | # ----- Produce change data to Kafka and commit progress ----- 318 | 319 | kafka_client.begin_transaction() 320 | progress_by_topic: Dict[str, change_index.ChangeIndex] = {} 321 | 322 | while change_rows_queue: 323 | row = heapq.heappop(change_rows_queue)[1] 324 | 325 | if row.change_idx > common_lsn_limit: 326 | heapq.heappush(change_rows_queue, (row.change_idx, row)) 327 | break 328 | 329 | if last_produced_row and row.change_idx < last_produced_row.change_idx: 330 | raise Exception(f'Change rows are being produced to Kafka out of LSN order. There is ' 331 | f'a bug. Fix it! Prior: {last_produced_row}, current: {row}') 332 | last_produced_row = row 333 | queued_change_row_counts[row.destination_topic] -= 1 334 | key_ser, value_ser = serializer.serialize_table_data_message(row) 335 | kafka_client.produce(row.destination_topic, key_ser, value_ser, 336 | constants.SINGLE_TABLE_CHANGE_MESSAGE, 337 | table_to_unified_topics_map.get(row.destination_topic, []), 338 | row.event_db_time, row.change_idx.lsn, 339 | row.operation_id, row.extra_headers) 340 | last_topic_produces[row.destination_topic] = helpers.naive_utcnow() 341 | 342 | if not opts.disable_deletion_tombstones and row.operation_id == \ 343 | constants.DELETE_OPERATION_ID: 344 | kafka_client.produce(row.destination_topic, key_ser, None, 345 | constants.DELETION_CHANGE_TOMBSTONE_MESSAGE) 346 | 347 | progress_by_topic[row.destination_topic] = row.change_idx 348 | for topic_name, progress_index in progress_by_topic.items(): 349 | progress_tracker.record_changes_progress(topic_name, progress_index) 350 | kafka_client.commit_transaction() 351 | 352 | if poll_periodic_tasks(): 353 | break 354 | except (KeyboardInterrupt, pyodbc.OperationalError): 355 | logger.info('Exiting due to external interrupt.') 356 | 357 | 358 | def should_terminate_due_to_capture_instance_change( 359 | db_conn: pyodbc.Connection, progress_tracker: progress_tracking.ProgressTracker, 360 | capture_instance_version_strategy: str, capture_instance_version_regex: str, 361 | capture_instance_to_topic_map: Dict[str, str], current_capture_instances: Dict[str, CaptureInstanceMetadata], 362 | table_include_regex: str, table_exclude_regex: str, 363 | topic_to_max_polled_index_map: Dict[str, change_index.ChangeIndex] 364 | ) -> bool: 365 | new_capture_instances: Dict[str, CaptureInstanceMetadata] = get_latest_capture_instances_by_fq_name( 366 | db_conn, 
capture_instance_version_strategy, capture_instance_version_regex, table_include_regex, 367 | table_exclude_regex) 368 | 369 | current = {k: (v.capture_instance_name, v.types_checksum) for k, v in current_capture_instances.items()} 370 | new = {k: (v.capture_instance_name, v.types_checksum) for k, v in new_capture_instances.items()} 371 | 372 | if new == current: 373 | logger.debug('Capture instances unchanged; continuing...') 374 | return False 375 | 376 | for fq_name, current_ci in current_capture_instances.items(): 377 | if fq_name in new_capture_instances: 378 | new_ci = new_capture_instances[fq_name] 379 | if (current_ci.capture_instance_name == new_ci.capture_instance_name and 380 | current_ci.types_checksum == new_ci.types_checksum): 381 | continue 382 | topic = capture_instance_to_topic_map[current_ci.capture_instance_name] 383 | if topic in topic_to_max_polled_index_map: 384 | progress_tracker.record_changes_progress(topic, topic_to_max_polled_index_map[topic]) 385 | last_recorded_progress = progress_tracker.get_last_recorded_progress_for_topic(topic) 386 | current_idx = last_recorded_progress and last_recorded_progress.change_index or \ 387 | change_index.LOWEST_CHANGE_INDEX 388 | logger.info('Change detected in capture instance for %s.\nCurrent: capture instance name "%s", column ' 389 | 'types checksum %s\nNew: capture instance name "%s", column types checksum %s', 390 | fq_name, current_ci.capture_instance_name, current_ci.types_checksum, 391 | new_ci.capture_instance_name, new_ci.types_checksum) 392 | new_ci_min_index = change_index.ChangeIndex(new_ci.start_lsn, b'\x00' * 10, 0) 393 | if current_idx < new_ci_min_index: 394 | with db_conn.cursor() as cursor: 395 | change_table_name = helpers.quote_name( 396 | helpers.get_fq_change_table_name(current_ci.capture_instance_name)) 397 | cursor.execute(f"SELECT TOP 1 1 FROM {change_table_name} WITH (NOLOCK)") 398 | has_rows = cursor.fetchval() is not None 399 | if has_rows: 400 | logger.info('Progress against existing capture instance ("%s") for table "%s" has reached index ' 401 | '%s, but the new capture instance ("%s") does not begin until index %s. Deferring ' 402 | 'termination to maintain data integrity and will try again on next capture instance ' 403 | 'evaluation iteration.', current_ci.capture_instance_name, fq_name, current_idx, 404 | new_ci.capture_instance_name, new_ci_min_index) 405 | return False 406 | 407 | logger.warning('Terminating process due to change in capture instances. This behavior can be controlled by ' 408 | 'changing option TERMINATE_ON_CAPTURE_INSTANCE_CHANGE.') 409 | return True 410 | -------------------------------------------------------------------------------- /cdc_kafka/metric_reporting/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/woodlee/sqlserver-cdc-to-kafka/ecde4db9f81f3876fcfe044979d8b1b910db8a63/cdc_kafka/metric_reporting/__init__.py -------------------------------------------------------------------------------- /cdc_kafka/metric_reporting/accumulator.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import datetime 3 | from typing import List, Optional 4 | 5 | import confluent_kafka 6 | import pyodbc 7 | import sortedcontainers 8 | 9 | from .. import constants, sql_queries, helpers 10 | from . import metrics 11 | 12 | from typing import TYPE_CHECKING 13 | if TYPE_CHECKING: 14 | from .. 
import clock_sync 15 | 16 | 17 | class AccumulatorAbstract(abc.ABC): 18 | @abc.abstractmethod 19 | def reset_and_start(self) -> None: pass 20 | 21 | @abc.abstractmethod 22 | def end_and_get_values(self) -> metrics.Metrics: pass 23 | 24 | @abc.abstractmethod 25 | def register_sleep(self, sleep_time_seconds: float) -> None: pass 26 | 27 | @abc.abstractmethod 28 | def register_db_query(self, seconds_elapsed: float, db_query_kind: str, retrieved_row_count: int) -> None: pass 29 | 30 | @abc.abstractmethod 31 | def register_kafka_produce(self, seconds_elapsed: float, message_type: str, 32 | event_datetime: Optional[datetime.datetime] = None, change_lsn: Optional[bytes] = None, 33 | operation_id: Optional[int] = None) -> None: pass 34 | 35 | @abc.abstractmethod 36 | def kafka_delivery_callback(self, message: confluent_kafka.Message, 37 | event_datetime: datetime.datetime) -> None: pass 38 | 39 | 40 | class NoopAccumulator(AccumulatorAbstract): 41 | def reset_and_start(self) -> None: pass 42 | def end_and_get_values(self) -> metrics.Metrics: return metrics.Metrics() 43 | def register_sleep(self, sleep_time_seconds: float) -> None: pass 44 | def register_db_query(self, seconds_elapsed: float, db_query_kind: str, retrieved_row_count: int) -> None: pass 45 | 46 | def register_kafka_produce(self, seconds_elapsed: float, message_type: str, 47 | event_datetime: Optional[datetime.datetime] = None, change_lsn: Optional[bytes] = None, 48 | operation_id: Optional[int] = None) -> None: pass 49 | def kafka_delivery_callback(self, message: confluent_kafka.Message, 50 | event_datetime: datetime.datetime) -> None: pass 51 | 52 | 53 | class Accumulator(AccumulatorAbstract): 54 | _instance = None 55 | 56 | def __init__(self, db_conn: pyodbc.Connection, clock_syncer: 'clock_sync.ClockSync', 57 | metrics_namespace: str, process_hostname: str) -> None: 58 | if Accumulator._instance is not None: 59 | raise Exception('metric_reporting.Accumulator class should be used as a singleton.') 60 | 61 | self._db_conn: pyodbc.Connection = db_conn 62 | self._clock_syncer: 'clock_sync.ClockSync' = clock_syncer 63 | self._metrics_namespace: str = metrics_namespace 64 | self._process_hostname: str = process_hostname 65 | 66 | self.reset_and_start() 67 | 68 | Accumulator._instance = self 69 | 70 | # noinspection PyAttributeOutsideInit 71 | def reset_and_start(self) -> None: 72 | self._interval_start_epoch_sec: float = datetime.datetime.now(datetime.UTC).timestamp() 73 | self._total_sleep_time_sec: float = 0 74 | self._db_change_data_queries_count: int = 0 75 | self._db_change_data_queries_total_time_sec: float = 0 76 | self._db_change_data_rows_retrieved_count: int = 0 77 | self._db_snapshot_queries_count: int = 0 78 | self._db_snapshot_queries_total_time_sec: float = 0 79 | self._db_snapshot_rows_retrieved_count: int = 0 80 | self._change_lsns_produced: sortedcontainers.SortedList[bytes] = sortedcontainers.SortedList() 81 | self._change_db_tran_end_times_produced: sortedcontainers.SortedList[datetime.datetime] = sortedcontainers.SortedList() 82 | self._e2e_latencies_sec: List[float] = [] 83 | self._kafka_produces_total_time_sec: float = 0 84 | self._kafka_delivery_acks_count: int = 0 85 | self._produced_delete_changes_count: int = 0 86 | self._produced_insert_changes_count: int = 0 87 | self._produced_metadata_records_count: int = 0 88 | self._produced_snapshot_records_count: int = 0 89 | self._produced_deletion_tombstones_count: int = 0 90 | self._messages_copied_to_unified_topics: int = 0 91 | self._produced_update_changes_count: 
int = 0 92 | 93 | def end_and_get_values(self) -> metrics.Metrics: 94 | end_epoch_sec = datetime.datetime.now(datetime.UTC).timestamp() 95 | interval_delta_sec = end_epoch_sec - self._interval_start_epoch_sec 96 | db_all_data_queries_total_time_sec = self._db_snapshot_queries_total_time_sec + \ 97 | self._db_change_data_queries_total_time_sec 98 | db_all_data_queries_count = self._db_snapshot_queries_count + self._db_change_data_queries_count 99 | kafka_produces_count = self._produced_delete_changes_count + self._produced_insert_changes_count + \ 100 | self._produced_metadata_records_count + self._produced_snapshot_records_count + \ 101 | self._produced_deletion_tombstones_count + self._produced_update_changes_count + \ 102 | self._messages_copied_to_unified_topics 103 | 104 | with self._db_conn.cursor() as cursor: 105 | q, _ = sql_queries.get_latest_cdc_entry_time() 106 | cursor.execute(q) 107 | cdc_lag = (helpers.naive_utcnow() - self._clock_syncer.db_time_to_utc(cursor.fetchval())) \ 108 | .total_seconds() 109 | 110 | m = metrics.Metrics() 111 | 112 | m.metrics_namespace = self._metrics_namespace 113 | m.process_hostname = self._process_hostname 114 | 115 | m.interval_start_epoch_sec = self._interval_start_epoch_sec 116 | m.interval_end_epoch_sec = end_epoch_sec 117 | m.interval_delta_sec = interval_delta_sec 118 | 119 | m.earliest_change_lsn_produced = \ 120 | (self._change_lsns_produced and f'0x{self._change_lsns_produced[0].hex()}') or None 121 | m.earliest_change_db_tran_end_time_produced = \ 122 | (self._change_db_tran_end_times_produced and self._change_db_tran_end_times_produced[0]) \ 123 | or None 124 | m.latest_change_lsn_produced = \ 125 | (self._change_lsns_produced and f'0x{self._change_lsns_produced[-1].hex()}') or None 126 | m.latest_change_db_tran_end_time_produced = \ 127 | (self._change_db_tran_end_times_produced and self._change_db_tran_end_times_produced[-1]) \ 128 | or None 129 | 130 | m.e2e_latency_avg_sec = \ 131 | (self._e2e_latencies_sec and sum(self._e2e_latencies_sec) / len(self._e2e_latencies_sec)) or None 132 | e2e_max: Optional[float] = None 133 | e2e_min: Optional[float] = None 134 | if self._e2e_latencies_sec: 135 | e2e_max = max(self._e2e_latencies_sec) 136 | e2e_min = min(self._e2e_latencies_sec) 137 | m.e2e_latency_max_sec = e2e_max 138 | m.e2e_latency_min_sec = e2e_min 139 | 140 | m.sql_server_cdc_process_lag_sec = cdc_lag 141 | 142 | m.db_all_data_queries_avg_time_per_query_ms = \ 143 | (db_all_data_queries_count and 144 | db_all_data_queries_total_time_sec / db_all_data_queries_count * 1000) or None 145 | m.db_all_data_queries_count = db_all_data_queries_count 146 | m.db_all_data_queries_total_time_sec = db_all_data_queries_total_time_sec 147 | m.db_all_data_rows_retrieved_count = self._db_snapshot_rows_retrieved_count + \ 148 | self._db_change_data_rows_retrieved_count 149 | 150 | m.db_change_data_queries_avg_time_per_query_ms = \ 151 | (self._db_change_data_queries_count and 152 | self._db_change_data_queries_total_time_sec / self._db_change_data_queries_count * 1000) or None 153 | m.db_change_data_queries_count = self._db_change_data_queries_count 154 | m.db_change_data_queries_total_time_sec = self._db_change_data_queries_total_time_sec 155 | m.db_change_data_rows_retrieved_count = self._db_change_data_rows_retrieved_count 156 | 157 | m.db_snapshot_queries_avg_time_per_query_ms = \ 158 | (self._db_snapshot_queries_count and 159 | self._db_snapshot_queries_total_time_sec / self._db_snapshot_queries_count * 1000) or None 160 | 
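(A brief aside on the `(count and total / count * 1000) or None` idiom used for the averages in this method — a sketch with made-up numbers; note that an average of exactly 0.0 also collapses to None, which the `["null", "float"]` metric fields accommodate:)

```
# Illustration of the null-safe average pattern above (hypothetical values):
queries_count, total_time_sec = 4, 0.02
avg_ms = (queries_count and total_time_sec / queries_count * 1000) or None
assert avg_ms == 5.0

# With no queries in the interval, the expression short-circuits to None:
queries_count = 0
avg_ms = (queries_count and total_time_sec / queries_count * 1000) or None
assert avg_ms is None
```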
m.db_snapshot_queries_count = self._db_snapshot_queries_count 161 | m.db_snapshot_queries_total_time_sec = self._db_snapshot_queries_total_time_sec 162 | m.db_snapshot_rows_retrieved_count = self._db_snapshot_rows_retrieved_count 163 | 164 | m.kafka_produces_count = kafka_produces_count 165 | m.kafka_produces_total_time_sec = self._kafka_produces_total_time_sec 166 | m.kafka_produces_avg_time_per_record_ms = \ 167 | (kafka_produces_count and 168 | self._kafka_produces_total_time_sec / kafka_produces_count * 1000) or None 169 | m.kafka_delivery_acks_count = self._kafka_delivery_acks_count 170 | 171 | m.produced_delete_changes_count = self._produced_delete_changes_count 172 | m.produced_insert_changes_count = self._produced_insert_changes_count 173 | m.produced_update_changes_count = self._produced_update_changes_count 174 | m.produced_snapshot_records_count = self._produced_snapshot_records_count 175 | m.produced_metadata_records_count = self._produced_metadata_records_count 176 | m.produced_deletion_tombstones_count = self._produced_deletion_tombstones_count 177 | m.messages_copied_to_unified_topics = self._messages_copied_to_unified_topics 178 | 179 | m.total_sleep_time_sec = self._total_sleep_time_sec 180 | 181 | return m 182 | 183 | def register_sleep(self, sleep_time_seconds: float) -> None: 184 | self._total_sleep_time_sec += sleep_time_seconds 185 | 186 | def register_db_query(self, seconds_elapsed: float, db_query_kind: str, retrieved_row_count: int) -> None: 187 | if db_query_kind == constants.SNAPSHOT_ROWS_KIND: 188 | self._db_snapshot_queries_count += 1 189 | self._db_snapshot_rows_retrieved_count += retrieved_row_count 190 | self._db_snapshot_queries_total_time_sec += seconds_elapsed 191 | elif db_query_kind == constants.CHANGE_ROWS_KIND: 192 | self._db_change_data_queries_count += 1 193 | self._db_change_data_rows_retrieved_count += retrieved_row_count 194 | self._db_change_data_queries_total_time_sec += seconds_elapsed 195 | else: 196 | raise Exception(f'Accumulator.register_db_query does not recognize db_query_kind "{db_query_kind}".') 197 | 198 | def register_kafka_produce(self, seconds_elapsed: float, message_type: str, 199 | event_datetime: Optional[datetime.datetime] = None, change_lsn: Optional[bytes] = None, 200 | operation_id: Optional[int] = None) -> None: 201 | self._kafka_produces_total_time_sec += seconds_elapsed 202 | 203 | if message_type in (constants.CHANGE_PROGRESS_MESSAGE, constants.SNAPSHOT_PROGRESS_MESSAGE, 204 | constants.METRIC_REPORTING_MESSAGE, constants.PROGRESS_DELETION_TOMBSTONE_MESSAGE): 205 | self._produced_metadata_records_count += 1 206 | elif message_type == constants.DELETION_CHANGE_TOMBSTONE_MESSAGE: 207 | self._produced_deletion_tombstones_count += 1 208 | elif message_type == constants.UNIFIED_TOPIC_CHANGE_MESSAGE: 209 | self._messages_copied_to_unified_topics += 1 210 | elif message_type == constants.SINGLE_TABLE_SNAPSHOT_MESSAGE: 211 | self._produced_snapshot_records_count += 1 212 | elif message_type == constants.SINGLE_TABLE_CHANGE_MESSAGE: 213 | if change_lsn: 214 | self._change_lsns_produced.add(change_lsn) 215 | if event_datetime: 216 | self._change_db_tran_end_times_produced.add(event_datetime) 217 | if operation_id == constants.DELETE_OPERATION_ID: 218 | self._produced_delete_changes_count += 1 219 | elif operation_id == constants.INSERT_OPERATION_ID: 220 | self._produced_insert_changes_count += 1 221 | elif operation_id == constants.POST_UPDATE_OPERATION_ID: 222 | self._produced_update_changes_count += 1 223 | else: 224 | raise 
Exception(f'Accumulator.register_kafka_produce does not recognize operation ID: ' 225 | f'"{operation_id}".') 226 | elif message_type == constants.SNAPSHOT_LOGGING_MESSAGE: 227 | pass 228 | else: 229 | raise Exception(f'Accumulator.register_kafka_produce does not recognize message type: "{message_type}".') 230 | 231 | def kafka_delivery_callback(self, message: confluent_kafka.Message, 232 | event_datetime: datetime.datetime) -> None: 233 | self._kafka_delivery_acks_count += 1 234 | 235 | timestamp_type, timestamp = message.timestamp() 236 | if timestamp_type != confluent_kafka.TIMESTAMP_CREATE_TIME: 237 | produce_datetime = helpers.naive_utcnow() 238 | else: 239 | produce_datetime = datetime.datetime.fromtimestamp(timestamp / 1000.0, datetime.UTC).replace(tzinfo=None) 240 | 241 | db_commit_time = self._clock_syncer.db_time_to_utc(event_datetime) 242 | e2e_latency = (produce_datetime - db_commit_time).total_seconds() 243 | self._e2e_latencies_sec.append(e2e_latency) 244 | -------------------------------------------------------------------------------- /cdc_kafka/metric_reporting/http_post_reporter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import threading 6 | 7 | from jinja2 import Template 8 | import requests 9 | 10 | from . import reporter_base 11 | 12 | from typing import TYPE_CHECKING, Optional, Dict, Any, TypeVar, Type 13 | 14 | if TYPE_CHECKING: 15 | from .metrics import Metrics 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | HttpPostReporterType = TypeVar('HttpPostReporterType', bound='HttpPostReporter') 21 | 22 | 23 | class HttpPostReporter(reporter_base.ReporterBase): 24 | def __init__(self, url: str, template: Optional[Template], headers: Dict[str, str]) -> None: 25 | self._url: str = url 26 | self._template: Optional[Template] = template 27 | self._headers: Dict[str, str] = headers 28 | 29 | def emit(self, metrics: 'Metrics') -> None: 30 | t = threading.Thread(target=self._post, args=(metrics.as_dict(),), name='HttpPostReporter') 31 | t.daemon = True 32 | t.start() 33 | 34 | def _post(self, metrics_dict: Dict[str, Any]) -> None: 35 | if self._template: 36 | body = self._template.render(metrics=metrics_dict) 37 | else: 38 | body = json.dumps(metrics_dict, default=HttpPostReporter.json_serialize_datetimes) 39 | 40 | try: 41 | resp = requests.post(self._url, data=body, headers=self._headers, timeout=10.0) 42 | resp.raise_for_status() 43 | logger.debug('Posted metrics to %s with code %s and response: %s', self._url, resp.status_code, resp.text) 44 | except requests.exceptions.RequestException as e: 45 | logger.warning('Failed to post metrics to %s: %s', self._url, e) 46 | 47 | @staticmethod 48 | def add_arguments(parser: argparse.ArgumentParser) -> None: 49 | parser.add_argument('--http-metrics-url', default=os.environ.get('HTTP_METRICS_URL'), 50 | help='URL to target when publishing process metrics metadata via the HttpPostReporter.') 51 | parser.add_argument('--http-metrics-headers', default=os.environ.get('HTTP_METRICS_HEADERS'), type=json.loads, 52 | help='Optional JSON object of HTTP headers k:v pairs to send along with the POST when ' 53 | 'publishing process metrics via the HttpPostReporter.') 54 | parser.add_argument('--http-metrics-template', default=os.environ.get('HTTP_METRICS_TEMPLATE'), 55 | help='An optional Jinja2 template used to create the HTTP POST body when publishing ' 56 | 'process metrics via the HttpPostReporter. 
It may reference the fields defined in ' 57 | 'the metric_reporting.metrics.Metrics class.') 58 | 59 | @classmethod 60 | def construct_with_options(cls: Type[HttpPostReporterType], opts: argparse.Namespace) -> HttpPostReporterType: 61 | if not opts.http_metrics_url: 62 | raise Exception('HttpPostReporter cannot be used without specifying a value for HTTP_METRICS_URL') 63 | template = None 64 | if opts.http_metrics_template: 65 | template = Template(opts.http_metrics_template) 66 | return cls(opts.http_metrics_url, template, opts.http_metrics_headers) 67 | 68 | -------------------------------------------------------------------------------- /cdc_kafka/metric_reporting/kafka_reporter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from typing import Type, TypeVar 4 | 5 | from . import reporter_base 6 | from .. import kafka, constants 7 | from .metrics import Metrics 8 | from ..serializers import SerializerAbstract 9 | from ..serializers.avro import AvroSerializer 10 | 11 | KafkaReporterType = TypeVar('KafkaReporterType', bound='KafkaReporter') 12 | 13 | 14 | class KafkaReporter(reporter_base.ReporterBase): 15 | DEFAULT_TOPIC = '_cdc_to_kafka_metrics' 16 | 17 | def __init__(self, metrics_topic: str, opts: argparse.Namespace) -> None: 18 | self._metrics_topic: str = metrics_topic 19 | self._serializer: SerializerAbstract = AvroSerializer( 20 | opts.schema_registry_url, opts.always_use_avro_longs, opts.progress_topic_name, 21 | opts.snapshot_logging_topic_name, opts.metrics_topic_name, opts.avro_type_spec_overrides, 22 | disable_writes=True) 23 | 24 | # noinspection PyProtectedMember 25 | def emit(self, metrics: 'Metrics') -> None: 26 | metrics_dict = metrics.as_dict() 27 | key, value = self._serializer.serialize_metrics_message(metrics_dict['metrics_namespace'], metrics_dict) 28 | kafka.KafkaClient.get_instance().produce(self._metrics_topic, key, value, constants.METRIC_REPORTING_MESSAGE) 29 | 30 | @staticmethod 31 | def add_arguments(parser: argparse.ArgumentParser) -> None: 32 | parser.add_argument('--kafka-metrics-topic', 33 | default=os.environ.get('KAFKA_METRICS_TOPIC', KafkaReporter.DEFAULT_TOPIC), 34 | help='Kafka topic to target when publishing process metrics metadata via the ' 35 | f'KafkaReporter. Defaults to `{KafkaReporter.DEFAULT_TOPIC}`') 36 | 37 | @classmethod 38 | def construct_with_options(cls: Type[KafkaReporterType], opts: argparse.Namespace) -> KafkaReporterType: 39 | metrics_topic: str = opts.kafka_metrics_topic or KafkaReporter.DEFAULT_TOPIC 40 | return cls(metrics_topic, opts) 41 | -------------------------------------------------------------------------------- /cdc_kafka/metric_reporting/local_file_reporter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import pathlib 6 | from typing import TYPE_CHECKING, TypeVar, Type 7 | 8 | from . 
import reporter_base 9 | 10 | if TYPE_CHECKING: 11 | from .metrics import Metrics 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | LocalFileReporterType = TypeVar('LocalFileReporterType', bound='LocalFileReporter') 16 | 17 | 18 | class LocalFileReporter(reporter_base.ReporterBase): 19 | def __init__(self, file_path: str) -> None: 20 | self._file_path: str = file_path 21 | pathlib.Path(file_path).touch() 22 | 23 | def emit(self, metrics: 'Metrics') -> None: 24 | with open(self._file_path, 'w') as target_file: 25 | json.dump(metrics.as_dict(), target_file, default=LocalFileReporter.json_serialize_datetimes) 26 | 27 | @staticmethod 28 | def add_arguments(parser: argparse.ArgumentParser) -> None: 29 | parser.add_argument('--metrics-file-path', default=os.environ.get('METRICS_FILE_PATH'), 30 | help='Path to a file to which you want the process to write its most recent metrics.') 31 | 32 | @classmethod 33 | def construct_with_options(cls: Type[LocalFileReporterType], opts: argparse.Namespace) -> LocalFileReporterType: 34 | if not opts.metrics_file_path: 35 | raise Exception('LocalFileReporter cannot be used without specifying a value for METRICS_FILE_PATH') 36 | return cls(opts.metrics_file_path) 37 | -------------------------------------------------------------------------------- /cdc_kafka/metric_reporting/metrics.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | 4 | class Metrics(object): 5 | FIELDS_AND_TYPES = [ 6 | ("metrics_namespace", "string"), 7 | ("process_hostname", "string"), 8 | 9 | ("interval_start_epoch_sec", "double"), 10 | ("interval_end_epoch_sec", "double"), 11 | ("interval_delta_sec", "float"), 12 | 13 | ("earliest_change_lsn_produced", ["null", "string"]), 14 | ("earliest_change_db_tran_end_time_produced", ["null", "string"]), 15 | ("latest_change_lsn_produced", ["null", "string"]), 16 | ("latest_change_db_tran_end_time_produced", ["null", "string"]), 17 | 18 | ("e2e_latency_avg_sec", ["null", "float"]), 19 | ("e2e_latency_max_sec", ["null", "float"]), 20 | ("e2e_latency_min_sec", ["null", "float"]), 21 | 22 | ("sql_server_cdc_process_lag_sec", ["null", "float"]), 23 | 24 | ("db_all_data_queries_avg_time_per_query_ms", ["null", "float"]), 25 | ("db_all_data_queries_count", "int"), 26 | ("db_all_data_queries_total_time_sec", "float"), 27 | ("db_all_data_rows_retrieved_count", "int"), 28 | 29 | ("db_change_data_queries_avg_time_per_query_ms", ["null", "float"]), 30 | ("db_change_data_queries_count", "int"), 31 | ("db_change_data_queries_total_time_sec", "float"), 32 | ("db_change_data_rows_retrieved_count", "int"), 33 | 34 | ("db_snapshot_queries_avg_time_per_query_ms", ["null", "float"]), 35 | ("db_snapshot_queries_count", "int"), 36 | ("db_snapshot_queries_total_time_sec", "float"), 37 | ("db_snapshot_rows_retrieved_count", "int"), 38 | 39 | ("kafka_produces_count", "int"), 40 | ("kafka_produces_total_time_sec", "float"), 41 | ("kafka_produces_avg_time_per_record_ms", ["null", "float"]), 42 | ("kafka_delivery_acks_count", "int"), 43 | 44 | ("produced_delete_changes_count", "int"), 45 | ("produced_insert_changes_count", "int"), 46 | ("produced_update_changes_count", "int"), 47 | ("produced_snapshot_records_count", "int"), 48 | ("produced_metadata_records_count", "int"), 49 | ("produced_deletion_tombstones_count", "int"), 50 | ("messages_copied_to_unified_topics", "int"), 51 | 52 | ("total_sleep_time_sec", "float"), 53 | ] 54 | 55 | FIELD_NAMES = {ft[0] for ft in FIELDS_AND_TYPES} 56 | 57 | def 
__setattr__(self, attr: str, value: Any) -> None: 58 | if attr not in Metrics.FIELD_NAMES: 59 | raise AttributeError(f'Metric name {attr} not recognized.') 60 | super(Metrics, self).__setattr__(attr, value) 61 | 62 | def as_dict(self) -> Dict[str, Any]: 63 | # Note that this will raise an exception if any of the expected metrics were not set on the object: 64 | return {fn: getattr(self, fn) for fn in Metrics.FIELD_NAMES} 65 | -------------------------------------------------------------------------------- /cdc_kafka/metric_reporting/reporter_base.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | from abc import ABC, abstractmethod 4 | from typing import TYPE_CHECKING, Optional, TypeVar, Type 5 | 6 | if TYPE_CHECKING: 7 | from .metrics import Metrics 8 | 9 | ReporterBaseType = TypeVar('ReporterBaseType', bound='ReporterBase') 10 | 11 | 12 | class ReporterBase(ABC): 13 | @abstractmethod 14 | def emit(self, metrics: 'Metrics') -> None: 15 | pass 16 | 17 | @staticmethod 18 | def add_arguments(parser: argparse.ArgumentParser) -> None: 19 | pass 20 | 21 | @classmethod 22 | @abstractmethod 23 | def construct_with_options(cls: Type[ReporterBaseType], opts: argparse.Namespace) -> ReporterBaseType: 24 | pass 25 | 26 | @staticmethod 27 | def json_serialize_datetimes(obj: object) -> Optional[str]: 28 | if isinstance(obj, (datetime.datetime, datetime.date)): 29 | return obj.isoformat() 30 | raise TypeError("Type %s not serializable" % type(obj)) 31 | -------------------------------------------------------------------------------- /cdc_kafka/metric_reporting/stdout_reporter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | from typing import TYPE_CHECKING, TypeVar, Type 5 | 6 | from . import reporter_base 7 | 8 | if TYPE_CHECKING: 9 | from .metrics import Metrics 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | StdoutReporterType = TypeVar('StdoutReporterType', bound='StdoutReporter') 14 | 15 | 16 | class StdoutReporter(reporter_base.ReporterBase): 17 | def emit(self, metrics: 'Metrics') -> None: 18 | logger.info('Metrics recorded in last interval: %s', json.dumps( 19 | metrics.as_dict(), default=StdoutReporter.json_serialize_datetimes)) 20 | 21 | @staticmethod 22 | def add_arguments(parser: argparse.ArgumentParser) -> None: 23 | pass 24 | 25 | @classmethod 26 | def construct_with_options(cls: Type[StdoutReporterType], opts: argparse.Namespace) -> StdoutReporterType: 27 | return cls() 28 | -------------------------------------------------------------------------------- /cdc_kafka/options.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import importlib 3 | import json 4 | import os 5 | import socket 6 | from typing import Tuple, List, Optional, Callable 7 | 8 | from . 
import constants, kafka_oauth 9 | from .metric_reporting import reporter_base 10 | from .serializers import SerializerAbstract 11 | 12 | # String constants for options with discrete choices: 13 | CAPTURE_INSTANCE_VERSION_STRATEGY_REGEX = 'regex' 14 | CAPTURE_INSTANCE_VERSION_STRATEGY_CREATE_DATE = 'create_date' 15 | LSN_GAP_HANDLING_RAISE_EXCEPTION = 'raise_exception' 16 | LSN_GAP_HANDLING_BEGIN_NEW_SNAPSHOT = 'begin_new_snapshot' 17 | LSN_GAP_HANDLING_IGNORE = 'ignore' 18 | NEW_FOLLOW_START_POINT_EARLIEST = 'earliest' 19 | NEW_FOLLOW_START_POINT_LATEST = 'latest' 20 | NEW_CAPTURE_INSTANCE_SNAPSHOT_HANDLING_BEGIN_NEW = 'begin_new_snapshot' 21 | NEW_CAPTURE_INSTANCE_SNAPSHOT_HANDLING_IGNORE = 'ignore' 22 | NEW_CAPTURE_INSTANCE_OVERLAP_HANDLING_REPUBLISH = 'republish_from_new_instance' 23 | NEW_CAPTURE_INSTANCE_OVERLAP_HANDLING_PICKUP = 'start_from_prior_progress' 24 | 25 | 26 | def str2bool(v: str) -> bool: 27 | if isinstance(v, bool): 28 | return v 29 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 30 | return True 31 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 32 | return False 33 | else: 34 | raise argparse.ArgumentTypeError('Boolean value expected.') 35 | 36 | 37 | def get_options_and_metrics_reporters( 38 | arg_adder: Optional[Callable[[argparse.ArgumentParser, ], None]] = None) -> Tuple[ 39 | argparse.Namespace, List[reporter_base.ReporterBase], SerializerAbstract]: 40 | p = argparse.ArgumentParser() 41 | 42 | # Required 43 | p.add_argument('--db-conn-string', 44 | default=os.environ.get('DB_CONN_STRING'), 45 | help='ODBC connection string for the DB from which you wish to consume CDC logs') 46 | 47 | p.add_argument('--kafka-bootstrap-servers', 48 | default=os.environ.get('KAFKA_BOOTSTRAP_SERVERS'), 49 | help='Host and port for your Kafka cluster, e.g. "localhost:9092"') 50 | 51 | p.add_argument('--kafka-transactional-id', 52 | default=os.environ.get('KAFKA_TRANSACTIONAL_ID'), 53 | help='An identifier of your choosing that should stay stable across restarts of a particularly-' 54 | 'configured deployment of this tool. If for example you have several deployments, each ' 55 | 'pointing at a different source DB or set of tables, each of those should have its own unique ' 56 | 'identifier. This value is passed to the Kafka producer as its `transactional.id` config, ' 57 | 'which is used to guarantee atomic writes across multiple topics, including the topic this ' 58 | 'tool uses to track its own progress against CDC data.') 59 | 60 | # Optional 61 | p.add_argument('--extra-kafka-consumer-config', 62 | default=os.environ.get('EXTRA_KAFKA_CONSUMER_CONFIG', {}), type=json.loads, 63 | help='Optional JSON object of additional librdkafka config parameters to be used when instantiating ' 64 | 'the Kafka consumer (used only for checking saved progress upon startup, and when in ' 65 | 'validation mode). For example: ' 66 | '`{"queued.max.messages.kbytes": "500000", "fetch.wait.max.ms": "250"}`') 67 | 68 | p.add_argument('--extra-kafka-producer-config', 69 | default=os.environ.get('EXTRA_KAFKA_PRODUCER_CONFIG', {}), type=json.loads, 70 | help='Optional JSON object of additional librdkafka config parameters to be used when instantiating ' 71 | 'the Kafka producer. For example: `{"linger.ms": "200", "retry.backoff.ms": "250"}`') 72 | 73 | p.add_argument('--extra-topic-config', 74 | default=os.environ.get('EXTRA_TOPIC_CONFIG', {}), type=json.loads, 75 | help='Optional JSON object of additional librdkafka config parameters to be used when creating new ' 76 | 'topics. 
For example: `{"min.insync.replicas": "2"}`.') 77 | 78 | p.add_argument('--table-exclude-regex', 79 | default=os.environ.get('TABLE_EXCLUDE_REGEX'), 80 | help="A regex used to exclude any tables that are tracked by CDC in your DB, but for which you " 81 | "don't wish to publish data using this tool. Tables names are specified in dot-separated " 82 | "'schema_name.table_name' form. Applied after the include regex, if specified.") 83 | 84 | p.add_argument('--table-include-regex', 85 | default=os.environ.get('TABLE_INCLUDE_REGEX'), 86 | help="A regex used to include the specific CDC-tracked tables in your DB that you wish to publish " 87 | "data for with this tool. Tables names are specified in dot-separated 'schema_name.table_name' " 88 | "form.") 89 | 90 | p.add_argument('--topic-name-template', 91 | default=os.environ.get('TOPIC_NAME_TEMPLATE', '{schema_name}_{table_name}_cdc'), 92 | help="Template by which the Kafka topics will be named. Uses curly braces to specify substituted " 93 | "values. Values available for substitution are `schema_name`, `table_name`, and `capture_" 94 | "instance_name`.") 95 | 96 | p.add_argument('--snapshot-table-exclude-regex', 97 | default=os.environ.get('SNAPSHOT_TABLE_EXCLUDE_REGEX'), 98 | help="A regex used to exclude any tables for which you don't want to do a full initial-snapshot " 99 | "read, in the case that this tool is being applied against them for the first time. Table " 100 | "names are specified in dot-separated 'schema_name.table_name' form. Applied after the " 101 | "inclusion regex, if specified.") 102 | 103 | p.add_argument('--snapshot-table-include-regex', 104 | default=os.environ.get('SNAPSHOT_TABLE_INCLUDE_REGEX'), 105 | help="A regex used to include the specific tables for which you want to do a full initial-" 106 | "snapshot read, in the case that this tool is being applied against them for the first time. " 107 | "Tables names are specified in dot-separated 'schema_name.table_name' form.") 108 | 109 | p.add_argument('--capture-instance-version-strategy', 110 | choices=(CAPTURE_INSTANCE_VERSION_STRATEGY_REGEX, CAPTURE_INSTANCE_VERSION_STRATEGY_CREATE_DATE), 111 | default=os.environ.get('CAPTURE_INSTANCE_VERSION_STRATEGY', 112 | CAPTURE_INSTANCE_VERSION_STRATEGY_CREATE_DATE), 113 | help=f"If there is more than one capture instance following a given source table, how do you want " 114 | f"to select which one this tool reads? `{CAPTURE_INSTANCE_VERSION_STRATEGY_CREATE_DATE}` (the " 115 | f"default) will follow the one most recently created. " 116 | f"`{CAPTURE_INSTANCE_VERSION_STRATEGY_REGEX}` allows you to specify a regex against the " 117 | f"capture instance name (as argument `capture-instance-version-regex`, the first captured " 118 | f"group of which will be used in a lexicographic ordering of capture instance names to select " 119 | f"the highest one. This can be useful if your capture instance names have a version number in " 120 | f"them.") 121 | 122 | p.add_argument('--capture-instance-version-regex', 123 | default=os.environ.get('CAPTURE_INSTANCE_VERSION_REGEX'), 124 | help="Regex to use if specifying option `regex` for argument `capture-instance-version-strategy`") 125 | 126 | p.add_argument('--progress-topic-name', 127 | default=os.environ.get('PROGRESS_TOPIC_NAME', '_cdc_to_kafka_progress'), 128 | help="Name of the topic used to store progress details reading change tables (and also source " 129 | "tables, in the case of snapshots). This process will create the topic if it does not yet " 130 | "exist. 
IMPORTANT: It should have only one partition.") 131 | 132 | p.add_argument('--snapshot-logging-topic-name', 133 | default=os.environ.get('SNAPSHOT_LOGGING_TOPIC_NAME'), 134 | help="Optional name of a topic which will receive messages logging events related to table " 135 | f"snapshots. Logged actions include '{constants.SNAPSHOT_LOG_ACTION_STARTED}', " 136 | f"'{constants.SNAPSHOT_LOG_ACTION_RESUMED}', '{constants.SNAPSHOT_LOG_ACTION_COMPLETED}', " 137 | f"'{constants.SNAPSHOT_LOG_ACTION_RESET_AUTO}', and " 138 | f"'{constants.SNAPSHOT_LOG_ACTION_RESET_MANUAL}'.") 139 | 140 | p.add_argument('--disable-deletion-tombstones', 141 | type=str2bool, nargs='?', const=True, 142 | default=str2bool(os.environ.get('DISABLE_DELETION_TOMBSTONES', '0')), 143 | help="When false (the default), CDC deletion events will lead to emitting two records: one with " 144 | "the CDC data and a second with the same key but a null value, to signal Kafka log compaction " 145 | "to remove the entry for that key. If set to true, the null-value 'tombstones' are not " 146 | "emitted.") 147 | 148 | p.add_argument('--lsn-gap-handling', 149 | choices=(LSN_GAP_HANDLING_RAISE_EXCEPTION, LSN_GAP_HANDLING_BEGIN_NEW_SNAPSHOT, 150 | LSN_GAP_HANDLING_IGNORE), 151 | default=os.environ.get('LSN_GAP_HANDLING', LSN_GAP_HANDLING_RAISE_EXCEPTION), 152 | help=f"Controls what happens if the earliest available change LSN in a capture instance is after " 153 | f"the LSN of the latest change published to Kafka. Defaults to " 154 | f"`{LSN_GAP_HANDLING_RAISE_EXCEPTION}`") 155 | 156 | p.add_argument('--new-follow-start-point', 157 | choices=(NEW_FOLLOW_START_POINT_EARLIEST, NEW_FOLLOW_START_POINT_LATEST), 158 | default=os.environ.get('NEW_FOLLOW_START_POINT', NEW_FOLLOW_START_POINT_LATEST), 159 | help=f"Controls how much change data history to read from SQL Server capture tables, for any tables " 160 | f"that are being followed by this process for the first time. Value " 161 | f"`{NEW_FOLLOW_START_POINT_EARLIEST}` will pull all existing data from the capture tables; " 162 | f"value `{NEW_FOLLOW_START_POINT_EARLIEST}` will only process change data added after this " 163 | f"process starts following the table. Note that use of `{NEW_FOLLOW_START_POINT_EARLIEST}` " 164 | f"with unified topics may lead to LSN regressions in the sequence of unified topic messages " 165 | f"in the case where new tables are added to a previously-tracked set. This setting does not " 166 | f"affect the behavior of table snapshots. Defaults to `{NEW_FOLLOW_START_POINT_LATEST}`") 167 | 168 | p.add_argument('--unified-topics', 169 | default=os.environ.get('UNIFIED_TOPICS', {}), type=json.loads, 170 | help=f'A string that is a JSON object mapping topic names to various configuration parameters as ' 171 | f'follows: {{"": {{"included_tables": "", ' 172 | f'"partition_count": , "extra_topic_config": {{}}}}, ... }}. For each specified topic name, all change data entries for source ' 174 | f'tables that match the `included_tables` regex will be produced to the specified topic in ' 175 | f'globally-consistent LSN order. This means the topic may contain messages with varying ' 176 | f'schemas, and ideally should be a single-partition topic to simplify in-order consumption of ' 177 | f'the messages, though if desired this can be overridden via `partition_count` when this ' 178 | f'process is creating the topic. Similarly `extra_topic_config` can be used to specify ' 179 | f'additional parameters passed directly to librdkafka at topic creation time (e.g. 
to specify ' 180 | f'`retention.ms`). For these topics, snapshot entries will not be included, and since the ' 181 | f'messages may have varying key schemas, use of topic compaction is not recommended.') 182 | 183 | p.add_argument('--new-capture-instance-snapshot-handling', 184 | choices=(NEW_CAPTURE_INSTANCE_SNAPSHOT_HANDLING_BEGIN_NEW, 185 | NEW_CAPTURE_INSTANCE_SNAPSHOT_HANDLING_IGNORE), 186 | default=os.environ.get('NEW_CAPTURE_INSTANCE_SNAPSHOT_HANDLING', 187 | NEW_CAPTURE_INSTANCE_SNAPSHOT_HANDLING_BEGIN_NEW), 188 | help=f"When the process begins consuming from a newer capture instance for a given source table, " 189 | f"how is snapshot data handled? `{NEW_CAPTURE_INSTANCE_SNAPSHOT_HANDLING_BEGIN_NEW}`, the " 190 | f"default, will begin a new full snapshot of the corresponding source table to pick up data " 191 | f"from any new columns added in the newer instance. The behavior of " 192 | f"`{NEW_CAPTURE_INSTANCE_SNAPSHOT_HANDLING_IGNORE}` depends on whether there was a snapshot " 193 | f"already in progress: if so, the snapshot will continue from where it left off but will begin " 194 | f"including any new columns added in the newer capture instance. If the snapshot was already " 195 | f"complete, nothing further will happen.") 196 | 197 | p.add_argument('--new-capture-instance-overlap-handling', 198 | choices=(NEW_CAPTURE_INSTANCE_OVERLAP_HANDLING_REPUBLISH, 199 | NEW_CAPTURE_INSTANCE_OVERLAP_HANDLING_PICKUP), 200 | default=os.environ.get('NEW_CAPTURE_INSTANCE_OVERLAP_HANDLING', 201 | NEW_CAPTURE_INSTANCE_OVERLAP_HANDLING_PICKUP), 202 | help=f"When the process begins consuming from a newer capture instance for a given source table, " 203 | f"how should we handle change data that appears in both instances' change tables? " 204 | f"`{NEW_CAPTURE_INSTANCE_OVERLAP_HANDLING_PICKUP}`, the default, will skip over any entries " 205 | f"in the newer change table that were previously published based on the older instance, " 206 | f"preventing duplication of events. `{NEW_CAPTURE_INSTANCE_OVERLAP_HANDLING_REPUBLISH}` will " 207 | f"publish all change entries from the beginning of the new instance's change table, maximizing " 208 | f"the amount of change data published for any columns you may have added at the cost of " 209 | f"duplicate messages.") 210 | 211 | p.add_argument('--run-validations', 212 | type=str2bool, nargs='?', const=True, 213 | default=str2bool(os.environ.get('RUN_VALIDATIONS', '0')), 214 | help="Runs count validations between messages in the Kafka topic and rows in the change and " 215 | "source tables, then quits. Respects the table inclusion/exclusion regexes.") 216 | 217 | p.add_argument('--message-serializer', 218 | default=os.environ.get('MESSAGE_SERIALIZER', 219 | 'cdc_kafka.serializers.avro.AvroSerializer'), 220 | help="The serializer class (from this project's `serializers` module) used to serialize messages" 221 | "sent to Kafka.") 222 | 223 | p.add_argument('--metrics-reporters', 224 | default=os.environ.get('METRICS_REPORTERS', 225 | 'cdc_kafka.metric_reporting.stdout_reporter.StdoutReporter'), 226 | help="Comma-separated list of .s of metric reporters you want this app " 227 | "to emit to.") 228 | 229 | p.add_argument('--metrics-namespace', 230 | default=os.environ.get('METRICS_NAMESPACE', socket.getfqdn()), 231 | help="Namespace used to key metrics for certain metric reporters, and which is embedded in the " 232 | "metrics payload as well. Useful if multiple CDC-to-Kafka instances are emitting metrics to " 233 | "the same destination. 
Defaults to the value returned by `socket.getfqdn()`.") 234 | 235 | p.add_argument('--process-hostname', 236 | default=os.environ.get('PROCESS_HOSTNAME', socket.getfqdn()), 237 | help="Hostname inserted into metrics metadata messages. Defaults to the value returned by " 238 | "`socket.getfqdn()`.") 239 | 240 | p.add_argument('--partition-count', 241 | type=int, 242 | default=os.environ.get('PARTITION_COUNT'), 243 | help="Number of partitions to specify when creating new topics. If left empty, defaults to 1 or " 244 | "the average number of rows per second in the corresponding change table divided by 10, " 245 | "whichever is larger.") 246 | 247 | p.add_argument('--replication-factor', 248 | type=int, 249 | default=os.environ.get('REPLICATION_FACTOR'), 250 | help="Replication factor to specify when creating new topics. If left empty, defaults to 3 or the " 251 | "number of brokers in the cluster, whichever is smaller.") 252 | 253 | p.add_argument('--truncate-fields', 254 | default=os.environ.get('TRUNCATE_FIELDS', {}), type=json.loads, 255 | help='Optional JSON object that maps schema.table.column names to an integer max number of ' 256 | 'UTF-8 encoded bytes that should be serialized into the Kafka message for that field\'s ' 257 | 'values. Only applicable to string types; will raise an exception if used for non-strings. ' 258 | 'Truncation respects UTF-8 character boundaries and will not break in the middle of 2- or ' 259 | '4-byte characters. The schema, table, and column names are case-insensitive. Example: ' 260 | '`{"dbo.order.gift_note": 65536}`. When a field is truncated via this mechanism, a Kafka ' 261 | 'message header of the form key: `cdc_to_kafka_truncated_field__`, value ' 262 | '`,` will be added to the message.') 263 | 264 | p.add_argument('--terminate-on-capture-instance-change', 265 | type=str2bool, nargs='?', const=True, 266 | default=str2bool(os.environ.get('TERMINATE_ON_CAPTURE_INSTANCE_CHANGE', '0')), 267 | help="When true, will cause the process to terminate if it detects a change in the set of capture " 268 | "instances tracked based on the CAPTURE_INSTANCE_VERSION_* settings, BUT NOT UNTIL the " 269 | "existing process has caught up to the minimum LSN available in the new capture instance(s) " 270 | "for all such tables. Checked on a period defined in constants.CAPTURE_INSTANCE_EVAL_INTERVAL. " 271 | "This is intended to be used with a process supervisor (e.g., the Kubernetes restart loop) " 272 | "that will restart the process, to allow transparent migration to updated capture instances. " 273 | "Defaults to False") 274 | 275 | p.add_argument('--report-progress-only', 276 | type=str2bool, nargs='?', const=True, 277 | default=str2bool(os.environ.get('REPORT_PROGRESS_ONLY', '0')), 278 | help="Prints the table of instances being captured and their change data / snapshot data progress, " 279 | "then exits without changing any state. Can be handy for validating other configuration such " 280 | "as the regexes used to control which tables are followed and/or snapshotted.") 281 | 282 | p.add_argument('--db-row-batch-size', 283 | type=int, 284 | default=os.environ.get('DB_ROW_BATCH_SIZE', 2000), 285 | help="Maximum number of rows to retrieve in a single change data or snapshot query. 
Default 2000.") 286 | 287 | kafka_oauth.add_kafka_oauth_arg(p) 288 | if arg_adder: 289 | arg_adder(p) 290 | opts, _ = p.parse_known_args() 291 | 292 | reporter_classes: List[reporter_base.ReporterBase] = [] 293 | reporters: List[reporter_base.ReporterBase] = [] 294 | 295 | if opts.metrics_reporters: 296 | for class_path in opts.metrics_reporters.split(','): 297 | package_module, class_name = class_path.rsplit('.', 1) 298 | module = importlib.import_module(package_module) 299 | reporter_class = getattr(module, class_name) 300 | reporter_classes.append(reporter_class) 301 | reporter_class.add_arguments(p) 302 | 303 | opts, _ = p.parse_known_args() 304 | 305 | for reporter_class in reporter_classes: 306 | reporters.append(reporter_class.construct_with_options(opts)) 307 | 308 | package_module, class_name = opts.message_serializer.rsplit('.', 1) 309 | module = importlib.import_module(package_module) 310 | serializer_class: SerializerAbstract = getattr(module, class_name) 311 | serializer_class.add_arguments(p) 312 | opts, _ = p.parse_known_args() 313 | disable_writes: bool = opts.run_validations or opts.report_progress_only 314 | serializer = serializer_class.construct_with_options(opts, disable_writes) 315 | 316 | return opts, reporters, serializer 317 | -------------------------------------------------------------------------------- /cdc_kafka/parsed_row.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Any, Sequence, List, Optional, Dict 3 | 4 | from . import change_index 5 | 6 | 7 | class ParsedRow(object): 8 | __slots__ = ('destination_topic', 'operation_id', 'cdc_update_mask', 'event_db_time', 9 | 'change_idx', 'ordered_key_field_values', 'table_data_cols', 'extra_headers') 10 | 11 | def __init__(self, destination_topic: str, operation_id: int, cdc_update_mask: bytes, 12 | event_db_time: datetime.datetime, change_idx: change_index.ChangeIndex, 13 | ordered_key_field_values: Sequence[Any], table_data_cols: List[Any], 14 | extra_headers: Optional[Dict[str, str | bytes]] = None) -> None: 15 | self.destination_topic: str = destination_topic 16 | self.operation_id: int = operation_id 17 | self.cdc_update_mask: bytes = cdc_update_mask 18 | self.event_db_time: datetime.datetime = event_db_time 19 | self.change_idx: change_index.ChangeIndex = change_idx 20 | self.ordered_key_field_values: Sequence[Any] = ordered_key_field_values 21 | self.table_data_cols: List[Any] = table_data_cols 22 | self.extra_headers: Optional[Dict[str, str | bytes]] = extra_headers 23 | 24 | def __repr__(self) -> str: 25 | return f'ParsedRow for topic {self.destination_topic}, change index {self.change_idx}' 26 | -------------------------------------------------------------------------------- /cdc_kafka/progress_reset_tool.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import socket 5 | 6 | from . 
import kafka, constants, progress_tracking, options 7 | from .metric_reporting import accumulator 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def main() -> None: 13 | def add_args(p: argparse.ArgumentParser) -> None: 14 | p.add_argument('--topic-names', required=True, 15 | default=os.environ.get('TOPIC_NAMES')) 16 | p.add_argument('--progress-kind', required=True, 17 | choices=(constants.CHANGE_ROWS_KIND, constants.ALL_PROGRESS_KINDS, constants.SNAPSHOT_ROWS_KIND), 18 | default=os.environ.get('PROGRESS_KIND')) 19 | p.add_argument('--execute', 20 | type=options.str2bool, nargs='?', const=True, 21 | default=options.str2bool(os.environ.get('EXECUTE', '0'))) 22 | 23 | opts, _, serializer = options.get_options_and_metrics_reporters(add_args) 24 | 25 | logger.info(f""" 26 | 27 | Progress reset tool: 28 | 29 | WILL {'NOT (because --execute is not set)' if not opts.execute else ''} reset {opts.progress_kind} progress 30 | for topic(s) {opts.topic_names}, if prior progress is found 31 | in progress topic {opts.progress_topic_name} 32 | in Kafka cluster with bootstrap server(s) {opts.kafka_bootstrap_servers} 33 | 34 | Reading progress topic, please wait... 35 | 36 | """) 37 | 38 | with kafka.KafkaClient(accumulator.NoopAccumulator(), opts.kafka_bootstrap_servers, 39 | opts.extra_kafka_consumer_config, opts.extra_kafka_producer_config, 40 | disable_writing=True) as kafka_client: 41 | progress_tracker = progress_tracking.ProgressTracker(kafka_client, serializer, opts.progress_topic_name, 42 | socket.getfqdn(), opts.snapshot_logging_topic_name) 43 | progress_entries = progress_tracker.get_prior_progress() 44 | 45 | def act(topic: str, progress_kind: str) -> None: 46 | if (topic, progress_kind) not in progress_entries: 47 | logger.warning(f'No {progress_kind} progress found for topic {topic}') 48 | return 49 | progress = progress_entries[(topic, progress_kind)] 50 | logger.info(f'Existing {progress_kind} progress found for topic {topic} at ' 51 | f'{progress.progress_msg_coordinates}: {progress}') 52 | if opts.execute: 53 | kafka_client._disable_writing = False 54 | progress_tracker.reset_progress(topic, progress_kind, progress.source_table_name, False, 55 | progress.snapshot_index) 56 | 57 | for topic_name in opts.topic_names.split(','): 58 | topic_name = topic_name.strip() 59 | if opts.progress_kind == constants.ALL_PROGRESS_KINDS: 60 | act(topic_name, constants.CHANGE_ROWS_KIND) 61 | act(topic_name, constants.SNAPSHOT_ROWS_KIND) 62 | else: 63 | act(topic_name, opts.progress_kind) 64 | 65 | 66 | if __name__ == "__main__": 67 | # importing this file to pick up the logging config in __init__; is there a better way?? 68 | # noinspection PyUnresolvedReferences 69 | from cdc_kafka import progress_reset_tool 70 | progress_reset_tool.main() 71 | -------------------------------------------------------------------------------- /cdc_kafka/progress_topic_validator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import copy 4 | import datetime 5 | import logging 6 | import os 7 | import re 8 | from typing import Dict, Optional, Set 9 | 10 | from tabulate import tabulate 11 | 12 | from . 
import kafka, constants, progress_tracking, options, helpers 13 | from .serializers import DeserializedMessage 14 | from .metric_reporting import accumulator 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class TopicProgressInfo(object): 20 | def __init__(self) -> None: 21 | self.change_progress_count: int = 0 22 | self.snapshot_progress_count: int = 0 23 | self.last_change_progress: Optional[DeserializedMessage] = None 24 | self.last_snapshot_progress: Optional[DeserializedMessage] = None 25 | self.distinct_change_tables: Set[str] = set() 26 | self.reset_count: int = 0 27 | self.evolution_count: int = 0 28 | self.heartbeat_count: int = 0 29 | self.problem_count: int = 0 30 | 31 | 32 | def main() -> None: 33 | def add_args(p: argparse.ArgumentParser) -> None: 34 | p.add_argument('--topics-to-include-regex', 35 | default=os.environ.get('TOPICS_TO_INCLUDE_REGEX', '.*')) 36 | p.add_argument('--topics-to-exclude-regex', 37 | default=os.environ.get('TOPICS_TO_EXCLUDE_REGEX')) 38 | p.add_argument('--show-all', 39 | type=options.str2bool, nargs='?', const=True, 40 | default=options.str2bool(os.environ.get('SHOW_ALL', '0'))) 41 | 42 | opts, _, serializer = options.get_options_and_metrics_reporters(add_args) 43 | 44 | with kafka.KafkaClient(accumulator.NoopAccumulator(), opts.kafka_bootstrap_servers, 45 | opts.extra_kafka_consumer_config, {}, disable_writing=True) as kafka_client: 46 | if kafka_client.get_topic_partition_count(opts.progress_topic_name) is None: 47 | logger.error('Progress topic %s not found.', opts.progress_topic_name) 48 | exit(1) 49 | 50 | topic_include_re = re.compile(opts.topics_to_include_regex, re.IGNORECASE) 51 | topic_exclude_re = None 52 | if opts.topics_to_exclude_regex: 53 | topic_exclude_re = re.compile(opts.topics_to_exclude_regex, re.IGNORECASE) 54 | 55 | msg_ctr = 0 56 | topic_info: Dict[str, TopicProgressInfo] = collections.defaultdict(TopicProgressInfo) 57 | 58 | for msg in kafka_client.consume_all(opts.progress_topic_name): 59 | if not msg_ctr: 60 | logger.info('Read first message: %s', helpers.format_coordinates(msg)) 61 | 62 | msg_ctr += 1 63 | 64 | if msg_ctr % 100000 == 0: 65 | logger.info('Read %s messages so far; last was %s', msg_ctr, helpers.format_coordinates(msg)) 66 | 67 | # noinspection PyTypeChecker 68 | deser_msg = serializer.deserialize(msg) 69 | if deser_msg.key_dict is None: 70 | continue 71 | 72 | topic, kind = deser_msg.key_dict['topic_name'], deser_msg.key_dict['progress_kind'] 73 | 74 | if not topic_include_re.match(topic): 75 | continue 76 | if topic_exclude_re and topic_exclude_re.match(topic): 77 | continue 78 | 79 | prior = copy.copy(topic_info.get(topic)) 80 | 81 | # noinspection PyArgumentList 82 | if not deser_msg.value_dict: 83 | logger.warning('%s progress for topic %s reset at %s', kind, topic, helpers.format_coordinates(msg)) 84 | topic_info[topic].reset_count += 1 85 | continue 86 | 87 | # noinspection PyTypeChecker,PyArgumentList 88 | current_change_table = deser_msg.value_dict['change_table_name'] 89 | topic_info[topic].distinct_change_tables.add(current_change_table) 90 | current_pe = progress_tracking.ProgressEntry.from_message(deser_msg) 91 | 92 | if kind == constants.CHANGE_ROWS_KIND: 93 | if not current_pe.change_index: 94 | raise Exception('Unexpected state.') 95 | current_change_index = current_pe.change_index 96 | topic_info[topic].change_progress_count += 1 97 | topic_info[topic].last_change_progress = deser_msg 98 | if current_change_index.is_probably_heartbeat: 99 | topic_info[topic].heartbeat_count += 1 
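                        # Note on the checks just below (added commentary, not original to this file): they compare
                        # the change index recorded by the *previous* progress message for this topic against the
                        # one just read. Assuming ChangeIndex instances compare lexicographically on their
                        # LSN / seqval / operation fields (which is what their use here implies), the outcomes are:
                        #   * equal indexes that are not heartbeats  -> counted as a duplicate progress entry;
                        #   * a prior index greater than the current -> counted as an out-of-order (regressed) entry;
                        #   * a prior index less than the current    -> normal forward progress, nothing is flagged.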
100 | 101 | if prior and prior.last_change_progress: 102 | prior_pe = progress_tracking.ProgressEntry.from_message(prior.last_change_progress) 103 | if not prior_pe.change_index: 104 | raise Exception('Unexpected state.') 105 | prior_change_index = prior_pe.change_index 106 | if prior_change_index == current_change_index and not \ 107 | current_change_index.is_probably_heartbeat: 108 | topic_info[topic].problem_count += 1 109 | logger.warning('Duplicate change entry for topic %s between %s and %s', topic, 110 | helpers.format_coordinates(prior.last_change_progress.raw_msg), 111 | helpers.format_coordinates(msg)) 112 | if prior_change_index > current_change_index: 113 | topic_info[topic].problem_count += 1 114 | log_msg = ''' 115 | Unordered change entry for topic %s 116 | Prior : progress message %s, index %s 117 | Current: progress message %s, index %s 118 | ''' 119 | logger.error(log_msg, topic, helpers.format_coordinates(prior.last_change_progress.raw_msg), 120 | prior_change_index, helpers.format_coordinates(msg), current_change_index) 121 | 122 | if kind == constants.SNAPSHOT_ROWS_KIND: 123 | if not current_pe.snapshot_index: 124 | raise Exception('Unexpected state.') 125 | current_snapshot_index = current_pe.snapshot_index 126 | topic_info[topic].snapshot_progress_count += 1 127 | topic_info[topic].last_snapshot_progress = deser_msg 128 | 129 | if prior and prior.last_snapshot_progress: 130 | prior_pe = progress_tracking.ProgressEntry.from_message(prior.last_snapshot_progress) 131 | if not prior_pe.snapshot_index: 132 | raise Exception('Unexpected state.') 133 | prior_snapshot_index = prior_pe.snapshot_index 134 | if current_snapshot_index == constants.SNAPSHOT_COMPLETION_SENTINEL: 135 | pass 136 | elif prior_snapshot_index == constants.SNAPSHOT_COMPLETION_SENTINEL or \ 137 | tuple(prior_snapshot_index.values()) < tuple(current_snapshot_index.values()): 138 | if prior_pe.change_table_name != current_change_table: 139 | topic_info[topic].evolution_count += 1 140 | logger.info('Snapshot restart due to schema evolution to capture instance %s for topic %s ' 141 | 'at progress message %s', current_change_table, topic, 142 | helpers.format_coordinates(msg)) 143 | else: 144 | topic_info[topic].problem_count += 1 145 | log_msg = ''' 146 | Unordered snapshot entry for topic %s 147 | Prior : progress message %s, index %s 148 | Current: progress message %s, index %s 149 | ''' 150 | logger.error(log_msg, topic, 151 | helpers.format_coordinates(prior.last_snapshot_progress.raw_msg), 152 | prior_pe.snapshot_index, helpers.format_coordinates(msg), 153 | current_pe.snapshot_index) 154 | 155 | logger.info('Read last message: %s', helpers.format_coordinates(msg)) 156 | 157 | headers = ('Topic', 158 | 'Change entries', 159 | 'Snapshot entries', 160 | 'Snapshot complete', 161 | 'Change tables', 162 | 'Last snapshot progress', 163 | 'Last change progress', 164 | 'Progress resets', 165 | 'Problems', 166 | 'Evolution re-snaps', 167 | 'Heartbeat entries') 168 | 169 | table = [[k, 170 | v.change_progress_count, 171 | v.snapshot_progress_count, 172 | 'yes' if (v.last_snapshot_progress and 173 | progress_tracking.ProgressEntry.from_message(v.last_snapshot_progress).snapshot_index == 174 | constants.SNAPSHOT_COMPLETION_SENTINEL) else 'no', 175 | len(v.distinct_change_tables), 176 | datetime.datetime.fromtimestamp(v.last_snapshot_progress.raw_msg.timestamp()[1] / 1000, 177 | datetime.UTC) if v.last_snapshot_progress else None, 178 | datetime.datetime.fromtimestamp(v.last_change_progress.raw_msg.timestamp()[1] / 
1000, 179 | datetime.UTC) if v.last_change_progress else None, 180 | v.reset_count, 181 | v.problem_count, 182 | v.evolution_count, 183 | v.heartbeat_count] 184 | for k, v in topic_info.items() if ( 185 | opts.show_all or 186 | len(v.distinct_change_tables) > 1 or 187 | v.reset_count > 0 or 188 | v.problem_count > 0 or 189 | v.evolution_count > 0 190 | )] 191 | 192 | if not opts.show_all: 193 | logger.warning('Only showing topics with anomalies. Use --show-all to see all topics.') 194 | 195 | if not table: 196 | logger.warning('No topics to show.') 197 | else: 198 | table = sorted(table) 199 | print(tabulate(table, headers, tablefmt='fancy_grid')) 200 | 201 | logger.info('Progress data parsed for %s topic(s). Check above for possible warnings.', len(topic_info)) 202 | logger.info('Checked %s progress messages from topic %s', msg_ctr, opts.progress_topic_name) 203 | 204 | 205 | if __name__ == "__main__": 206 | # importing this file to pick up the logging config in __init__; is there a better way?? 207 | # noinspection PyUnresolvedReferences 208 | from cdc_kafka import progress_topic_validator 209 | progress_topic_validator.main() 210 | -------------------------------------------------------------------------------- /cdc_kafka/progress_tracking.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | from typing import Dict, Tuple, Any, Optional, Mapping, TypeVar, Type, TYPE_CHECKING 4 | 5 | from . import constants, helpers, tracked_tables 6 | from .change_index import ChangeIndex 7 | from .serializers import SerializerAbstract, DeserializedMessage 8 | 9 | if TYPE_CHECKING: 10 | from .kafka import KafkaClient 11 | import confluent_kafka 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | ProgressEntryType = TypeVar('ProgressEntryType', bound='ProgressEntry') 17 | 18 | 19 | class ProgressEntry(object): 20 | @classmethod 21 | def from_message(cls: Type[ProgressEntryType], message: DeserializedMessage) -> ProgressEntryType: 22 | # noinspection PyTypeChecker,PyArgumentList 23 | k, v = message.key_dict, message.value_dict 24 | 25 | if k is None or v is None: 26 | raise Exception("Malformed message received by ProgressEntry.from_message") 27 | 28 | kind: str = k['progress_kind'] 29 | 30 | if kind not in (constants.CHANGE_ROWS_KIND, constants.SNAPSHOT_ROWS_KIND): 31 | raise Exception(f"Unrecognized progress kind from message: {kind}") 32 | 33 | msg_coordinates = helpers.format_coordinates(message.raw_msg) 34 | 35 | if kind == constants.SNAPSHOT_ROWS_KIND: 36 | return cls(kind, k['topic_name'], v['source_table_name'], v['change_table_name'], 37 | v['last_ack_position']['key_fields'], None, msg_coordinates) 38 | 39 | else: 40 | return cls(kind, k['topic_name'], v['source_table_name'], v['change_table_name'], 41 | None, ChangeIndex.from_dict(v['last_ack_position']), msg_coordinates) 42 | 43 | def __init__(self, progress_kind: str, topic_name: str, source_table_name: str, change_table_name: str, 44 | snapshot_index: Optional[Mapping[str, str | int]] = None, 45 | change_index: Optional[ChangeIndex] = None, progress_msg_coordinates: Optional[str] = None) -> None: 46 | if progress_kind not in (constants.CHANGE_ROWS_KIND, constants.SNAPSHOT_ROWS_KIND): 47 | raise Exception(f'Unrecognized progress kind: {progress_kind}') 48 | 49 | self.progress_kind: str = progress_kind 50 | self.topic_name: str = topic_name 51 | self.source_table_name: str = source_table_name 52 | self.change_table_name: str = change_table_name 53 | 
self.snapshot_index: Optional[Mapping[str, str | int]] = snapshot_index 54 | self.change_index: Optional[ChangeIndex] = change_index 55 | self.progress_msg_coordinates: Optional[str] = progress_msg_coordinates 56 | 57 | @property 58 | def key(self) -> Dict[str, str]: 59 | return { 60 | 'topic_name': self.topic_name, 61 | 'progress_kind': self.progress_kind 62 | } 63 | 64 | @property 65 | def value(self) -> Optional[Dict[str, Any]]: 66 | if not (self.change_index or self.snapshot_index): 67 | return None 68 | pos: Dict[str, Any] 69 | if self.change_index: 70 | pos = self.change_index.as_dict() 71 | else: 72 | pos = {'key_fields': self.snapshot_index} 73 | return { 74 | 'source_table_name': self.source_table_name, 75 | 'change_table_name': self.change_table_name, 76 | 'last_ack_position': pos 77 | } 78 | 79 | def __repr__(self) -> str: 80 | progress = self.snapshot_index if self.progress_kind == constants.SNAPSHOT_ROWS_KIND else self.change_index 81 | return f'ProgressEntry for {self.topic_name}; {self.progress_kind} progress: {progress}' 82 | 83 | 84 | class ProgressTracker(object): 85 | _instance = None 86 | 87 | def __init__(self, kafka_client: 'KafkaClient', serializer: SerializerAbstract, progress_topic_name: str, 88 | process_hostname: str, snapshot_logging_topic_name: Optional[str] = None) -> None: 89 | if ProgressTracker._instance is not None: 90 | raise Exception('ProgressTracker class should be used as a singleton.') 91 | 92 | self._kafka_client: 'KafkaClient' = kafka_client 93 | self._serializer: SerializerAbstract = serializer 94 | self._progress_topic_name: str = progress_topic_name 95 | self._process_hostname: str = process_hostname 96 | self._snapshot_logging_topic_name: Optional[str] = snapshot_logging_topic_name 97 | self._last_recorded_progress_by_topic: Dict[str, ProgressEntry] = {} 98 | self._topic_to_source_table_map: Dict[str, str] = {} 99 | self._topic_to_change_table_map: Dict[str, str] = {} 100 | 101 | ProgressTracker._instance = self 102 | 103 | def register_table(self, table: 'tracked_tables.TrackedTable') -> None: 104 | self._topic_to_source_table_map[table.topic_name] = table.fq_name 105 | self._topic_to_change_table_map[table.topic_name] = helpers.get_fq_change_table_name( 106 | table.capture_instance_name) 107 | 108 | def get_last_recorded_progress_for_topic(self, topic_name: str) -> Optional[ProgressEntry]: 109 | return self._last_recorded_progress_by_topic.get(topic_name) 110 | 111 | def record_changes_progress(self, topic_name: str, change_index: ChangeIndex) -> None: 112 | progress_entry = ProgressEntry( 113 | progress_kind=constants.CHANGE_ROWS_KIND, 114 | topic_name=topic_name, 115 | source_table_name=self._topic_to_source_table_map[topic_name], 116 | change_table_name=self._topic_to_change_table_map[topic_name], 117 | change_index=change_index 118 | ) 119 | 120 | key, value = self._serializer.serialize_progress_tracking_message(progress_entry) 121 | 122 | self._kafka_client.produce( 123 | topic=self._progress_topic_name, 124 | key=key, 125 | value=value, 126 | message_type=constants.CHANGE_PROGRESS_MESSAGE 127 | ) 128 | 129 | self._last_recorded_progress_by_topic[topic_name] = progress_entry 130 | 131 | def record_snapshot_progress(self, topic_name: str, snapshot_index: Mapping[str, str | int]) -> None: 132 | progress_entry = ProgressEntry( 133 | progress_kind=constants.SNAPSHOT_ROWS_KIND, 134 | topic_name=topic_name, 135 | source_table_name=self._topic_to_source_table_map[topic_name], 136 | change_table_name=self._topic_to_change_table_map[topic_name], 
137 | snapshot_index=snapshot_index 138 | ) 139 | 140 | key, value = self._serializer.serialize_progress_tracking_message(progress_entry) 141 | 142 | self._kafka_client.produce( 143 | topic=self._progress_topic_name, 144 | key=key, 145 | value=value, 146 | message_type=constants.SNAPSHOT_PROGRESS_MESSAGE 147 | ) 148 | 149 | def _log_snapshot_event(self, topic_name: str, table_name: str, action: str, 150 | event_time: Optional[datetime.datetime] = None, 151 | starting_snapshot_index: Optional[Mapping[str, str | int]] = None, 152 | ending_snapshot_index: Optional[Mapping[str, str | int]] = None) -> None: 153 | if self._snapshot_logging_topic_name is None: 154 | return 155 | 156 | low_wms: Dict[str, int] = {} 157 | high_wms: Dict[str, int] = {} 158 | for partition, (lo_wm, hi_wm) in enumerate(self._kafka_client.get_topic_watermarks([topic_name])[topic_name]): 159 | low_wms[str(partition)] = lo_wm 160 | high_wms[str(partition)] = hi_wm 161 | 162 | event_time_iso = event_time.isoformat() if event_time is not None \ 163 | else helpers.naive_utcnow().isoformat() 164 | 165 | msg = { 166 | "action": action, 167 | "ending_snapshot_index": ending_snapshot_index, 168 | "event_time_utc": event_time_iso, 169 | "partition_watermarks_high": high_wms, 170 | "partition_watermarks_low": low_wms, 171 | "process_hostname": self._process_hostname, 172 | "starting_snapshot_index": starting_snapshot_index, 173 | "table_name": table_name, 174 | "topic_name": topic_name 175 | } 176 | 177 | logger.debug('Logging snapshot event: %s', msg) 178 | _, value = self._serializer.serialize_snapshot_logging_message(msg) 179 | self._kafka_client.produce( 180 | topic=self._snapshot_logging_topic_name, 181 | key=None, 182 | value=value, 183 | message_type=constants.SNAPSHOT_LOGGING_MESSAGE 184 | ) 185 | 186 | def log_snapshot_started(self, topic_name: str, table_name: str, 187 | starting_snapshot_index: Mapping[str, str | int]) -> None: 188 | return self._log_snapshot_event(topic_name, table_name, constants.SNAPSHOT_LOG_ACTION_STARTED, 189 | starting_snapshot_index=starting_snapshot_index) 190 | 191 | def log_snapshot_resumed(self, topic_name: str, table_name: str, 192 | starting_snapshot_index: Mapping[str, str | int]) -> None: 193 | return self._log_snapshot_event(topic_name, table_name, constants.SNAPSHOT_LOG_ACTION_RESUMED, 194 | starting_snapshot_index=starting_snapshot_index) 195 | 196 | def log_snapshot_completed(self, topic_name: str, table_name: str, event_time: datetime.datetime, 197 | ending_snapshot_index: Mapping[str, str | int]) -> None: 198 | return self._log_snapshot_event(topic_name, table_name, constants.SNAPSHOT_LOG_ACTION_COMPLETED, 199 | event_time=event_time, ending_snapshot_index=ending_snapshot_index) 200 | 201 | def log_snapshot_progress_reset(self, topic_name: str, table_name: str, is_auto_reset: bool, 202 | prior_progress_snapshot_index: Optional[Mapping[str, str | int]]) -> None: 203 | action = constants.SNAPSHOT_LOG_ACTION_RESET_AUTO if is_auto_reset \ 204 | else constants.SNAPSHOT_LOG_ACTION_RESET_MANUAL 205 | return self._log_snapshot_event(topic_name, table_name, action, 206 | ending_snapshot_index=prior_progress_snapshot_index) 207 | 208 | def get_prior_progress_or_create_progress_topic(self) -> Dict[Tuple[str, str], ProgressEntry]: 209 | if not self._kafka_client.get_topic_partition_count(self._progress_topic_name): 210 | logger.warning('No existing progress storage topic found; creating topic %s', self._progress_topic_name) 211 | 212 | # log.segment.bytes set to 16 MB. 
Compaction will not run until the next log segment rolls, so we set this 213 | # a bit low (the default is 1 GB!) to prevent having to read too much from the topic on process startup: 214 | self._kafka_client.create_topic(self._progress_topic_name, 1, 215 | extra_config={"segment.bytes": 16 * 1024 * 1024}) 216 | return {} 217 | return self.get_prior_progress() 218 | 219 | def maybe_create_snapshot_logging_topic(self) -> None: 220 | if (self._snapshot_logging_topic_name and not 221 | self._kafka_client.get_topic_partition_count(self._snapshot_logging_topic_name)): 222 | logger.warning('No existing snapshot logging topic found; creating topic %s', 223 | self._snapshot_logging_topic_name) 224 | self._kafka_client.create_topic(self._snapshot_logging_topic_name, 1, 225 | extra_config={'cleanup.policy': 'delete', 226 | 'retention.ms': 365 * 24 * 60 * 60 * 1000}) # 1 year 227 | 228 | # the keys in the returned dictionary are tuples of (topic_name, progress_kind) 229 | def get_prior_progress(self) -> Dict[Tuple[str, str], ProgressEntry]: 230 | raw_msgs: Dict[bytes | str | None, confluent_kafka.Message] = {} 231 | 232 | progress_msg_ctr = 0 233 | for progress_msg in self._kafka_client.consume_all(self._progress_topic_name): 234 | progress_msg_ctr += 1 235 | # noinspection PyArgumentList 236 | if progress_msg.value() is None: 237 | if progress_msg.key() is not None and progress_msg.key() in raw_msgs: 238 | del raw_msgs[progress_msg.key()] 239 | continue 240 | raw_msgs[progress_msg.key()] = progress_msg 241 | 242 | logger.info('Read %s prior progress messages from Kafka topic %s', progress_msg_ctr, self._progress_topic_name) 243 | 244 | result: Dict[Tuple[str, str], ProgressEntry] = {} 245 | for msg in raw_msgs.values(): 246 | deser_msg = self._serializer.deserialize(msg) 247 | if deser_msg.key_dict is None: 248 | raise Exception('Unexpected state: None value from deserializing progress message key') 249 | result[(deser_msg.key_dict['topic_name'], deser_msg.key_dict['progress_kind'])] = \ 250 | ProgressEntry.from_message(message=deser_msg) 251 | 252 | return result 253 | 254 | def reset_progress(self, topic_name: str, kind_to_reset: str, source_table_name: str, is_auto_reset: bool, 255 | prior_progress_snapshot_index: Optional[Mapping[str, str | int]] = None) -> None: 256 | # Produce messages with empty values to "delete" them from Kafka 257 | matched = False 258 | 259 | if kind_to_reset in (constants.CHANGE_ROWS_KIND, constants.ALL_PROGRESS_KINDS): 260 | progress_entry = ProgressEntry(constants.CHANGE_ROWS_KIND, topic_name, '', '') 261 | key, _ = self._serializer.serialize_progress_tracking_message(progress_entry) 262 | self._kafka_client.produce( 263 | topic=self._progress_topic_name, 264 | key=key, 265 | value=None, 266 | message_type=constants.PROGRESS_DELETION_TOMBSTONE_MESSAGE 267 | ) 268 | logger.info('Deleted existing change rows progress records for topic %s.', topic_name) 269 | matched = True 270 | 271 | if kind_to_reset in (constants.SNAPSHOT_ROWS_KIND, constants.ALL_PROGRESS_KINDS): 272 | progress_entry = ProgressEntry(constants.SNAPSHOT_ROWS_KIND, topic_name, '', '') 273 | key, _ = self._serializer.serialize_progress_tracking_message(progress_entry) 274 | self._kafka_client.produce( 275 | topic=self._progress_topic_name, 276 | key=key, 277 | value=None, 278 | message_type=constants.PROGRESS_DELETION_TOMBSTONE_MESSAGE 279 | ) 280 | logger.info('Deleted existing snapshot progress records for topic %s.', topic_name) 281 | self.maybe_create_snapshot_logging_topic() 282 | 
self.log_snapshot_progress_reset(topic_name, source_table_name, is_auto_reset, 283 | prior_progress_snapshot_index) 284 | matched = True 285 | 286 | if not matched: 287 | raise Exception(f'Function reset_progress received unrecognized argument "{kind_to_reset}" for ' 288 | f'kind_to_reset.') 289 | -------------------------------------------------------------------------------- /cdc_kafka/serializers/__init__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from abc import ABC, abstractmethod 3 | from typing import TypeVar, Type, Tuple, Dict, Any, Optional, TYPE_CHECKING 4 | 5 | import confluent_kafka 6 | 7 | if TYPE_CHECKING: 8 | from ..parsed_row import ParsedRow 9 | from ..progress_tracking import ProgressEntry 10 | from ..tracked_tables import TrackedTable 11 | 12 | SerializerAbstractType = TypeVar('SerializerAbstractType', bound='SerializerAbstract') 13 | 14 | 15 | class DeserializedMessage(object): 16 | def __init__(self, raw_msg: confluent_kafka.Message, key_dict: Optional[Dict[str, Any]], 17 | value_dict: Optional[Dict[str, Any]]): 18 | self.raw_msg = raw_msg 19 | self.key_dict = key_dict 20 | self.value_dict = value_dict 21 | 22 | 23 | class SerializerAbstract(ABC): 24 | 25 | @abstractmethod 26 | def register_table(self, table: 'TrackedTable') -> None: 27 | pass 28 | 29 | @abstractmethod 30 | def serialize_table_data_message(self, row: 'ParsedRow') -> Tuple[bytes, bytes]: 31 | pass 32 | 33 | @abstractmethod 34 | def serialize_progress_tracking_message(self, progress_entry: 'ProgressEntry') -> Tuple[bytes, Optional[bytes]]: 35 | pass 36 | 37 | @abstractmethod 38 | def serialize_metrics_message(self, metrics_namespace: str, metrics: Dict[str, Any]) -> Tuple[bytes, bytes]: 39 | pass 40 | 41 | @abstractmethod 42 | def serialize_snapshot_logging_message(self, snapshot_log: Dict[str, Any]) -> Tuple[None, bytes]: 43 | pass 44 | 45 | @abstractmethod 46 | def deserialize(self, msg: confluent_kafka.Message) -> DeserializedMessage: 47 | pass 48 | 49 | @staticmethod 50 | def add_arguments(parser: argparse.ArgumentParser) -> None: 51 | pass 52 | 53 | @classmethod 54 | @abstractmethod 55 | def construct_with_options(cls: Type[SerializerAbstractType], opts: argparse.Namespace, 56 | disable_writes: bool) -> SerializerAbstractType: 57 | pass 58 | 59 | 60 | -------------------------------------------------------------------------------- /cdc_kafka/show_snapshot_history.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | from typing import List, Any, Dict 5 | 6 | import confluent_kafka 7 | 8 | from tabulate import tabulate 9 | 10 | from . 
import kafka, constants, options 11 | from .metric_reporting import accumulator 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | RETENTION_CONFIG_NAMES = ( 17 | 'retention.ms', 18 | 'retention.bytes', 19 | 'cleanup.policy' 20 | ) 21 | 22 | 23 | def main() -> None: 24 | def add_args(p: argparse.ArgumentParser) -> None: 25 | p.add_argument('--topic-names', required=True) 26 | p.add_argument('--script-output-file', type=argparse.FileType('w')) 27 | p.add_argument('--extra-kafka-cli-command-arg', nargs='*') 28 | 29 | opts, _, serializer = options.get_options_and_metrics_reporters(add_args) 30 | print(opts.topic_names) 31 | topic_names: List[str] = [x.strip() for x in opts.topic_names.strip().split(',') if x.strip()] 32 | display_table: List[List[str]] = [] 33 | completions_seen_since_start: Dict[str, bool] = {tn: False for tn in topic_names} 34 | last_starts: Dict[str, Dict[str, Any]] = {tn: {} for tn in topic_names} 35 | consumed_count: int = 0 36 | relevant_count: int = 0 37 | table_headers = [ 38 | "Topic", 39 | "Time", 40 | "Action", 41 | "Source table", 42 | "Starting Key", 43 | "Ending key", 44 | "Low watermarks", 45 | "High watermarks", 46 | "Acting hostname", 47 | "Key schema ID", 48 | "Value schema ID", 49 | ] 50 | 51 | with kafka.KafkaClient(accumulator.NoopAccumulator(), opts.kafka_bootstrap_servers, 52 | opts.extra_kafka_consumer_config, {}, disable_writing=True) as kafka_client: 53 | for msg in kafka_client.consume_all(opts.snapshot_logging_topic_name): 54 | deser_msg = serializer.deserialize(msg) 55 | log = deser_msg.value_dict or {} 56 | consumed_count += 1 57 | if log['topic_name'] not in topic_names: 58 | continue 59 | relevant_count += 1 60 | display_table.append([ 61 | log['topic_name'], 62 | log["event_time_utc"], 63 | log["action"], 64 | log["table_name"], 65 | log["starting_snapshot_index"], 66 | log["ending_snapshot_index"], 67 | log["partition_watermarks_low"], 68 | log["partition_watermarks_high"], 69 | log["process_hostname"], 70 | log["key_schema_id"], 71 | log["value_schema_id"], 72 | ]) 73 | if log["action"] == constants.SNAPSHOT_LOG_ACTION_STARTED: 74 | last_starts[log['topic_name']] = log 75 | completions_seen_since_start[log['topic_name']] = False 76 | if log["action"] == constants.SNAPSHOT_LOG_ACTION_COMPLETED: 77 | completions_seen_since_start[log['topic_name']] = True 78 | 79 | print(f''' 80 | Consumed {consumed_count} messages from snapshot logging topic {opts.snapshot_logging_topic_name}. 81 | {relevant_count} were related to requested topics {topic_names}. 
82 | ''') 83 | 84 | if not relevant_count: 85 | exit(0) 86 | 87 | display_table = sorted(display_table, key=lambda x: (x[0], x[1])) 88 | print(tabulate(display_table, table_headers)) 89 | 90 | watermarks = kafka_client.get_topic_watermarks(topic_names) 91 | 92 | for topic_name in topic_names: 93 | try: 94 | all_topic_configs = kafka_client.get_topic_config(topic_name) 95 | except confluent_kafka.KafkaException as e: 96 | if e.args[0].code() == confluent_kafka.KafkaError.UNKNOWN_TOPIC_OR_PART: 97 | print(f'Topic {topic_name} does not seem to exist; skipping.') 98 | continue 99 | else: 100 | raise 101 | topic_has_delete_cleanup_policy = 'delete' in all_topic_configs['cleanup.policy'].value 102 | topic_level_retention_configs = { 103 | k: v.value for k, v in all_topic_configs.items() 104 | if k in RETENTION_CONFIG_NAMES 105 | and v.source == confluent_kafka.admin.ConfigSource.DYNAMIC_TOPIC_CONFIG.value 106 | } 107 | 108 | print(f''' 109 | 110 | ----- Topic {topic_name} ----- 111 | Current retention-related configs for the topic are: {topic_level_retention_configs}. 112 | Current topic watermarks are: {watermarks[topic_name]}.''') 113 | 114 | if not last_starts[topic_name]: 115 | print('No logged snapshot event history was found for this topic.') 116 | else: 117 | topic_starts_with_last_snapshot = True 118 | for part, (low_wm, _) in enumerate(watermarks[topic_name]): 119 | if low_wm != last_starts[topic_name]['partition_watermarks_high'][str(part)]: 120 | topic_starts_with_last_snapshot = False 121 | if topic_starts_with_last_snapshot: 122 | print('The first message in all topic-partitions appears to coincide with the beginning of the most recent snapshot.') 123 | else: 124 | if not completions_seen_since_start[topic_name]: 125 | print(''' 126 | -------------------------------------------------------------------------------------------------------------------------------------------------- 127 | *** NOTE: A snapshot completion has not been logged since the most recent start! Take note in considering whether to run the commands below... 
*** 128 | -------------------------------------------------------------------------------------------------------------------------------------------------- 129 | ''') 130 | delete_parts = [{"topic": topic_name, "partition": int(part), "offset": wm} 131 | for part, wm in last_starts[topic_name]['partition_watermarks_high'].items()] 132 | delete_spec = json.dumps({"partitions": delete_parts}, indent=4) 133 | if not topic_has_delete_cleanup_policy: 134 | extra_args_str = f' --{" --".join(opts.extra_kafka_cli_command_arg)}' if opts.extra_kafka_cli_command_arg else '' 135 | config_alter = (f'kafka-configs{extra_args_str} --bootstrap-server {opts.kafka_bootstrap_servers} ' 136 | f'--alter --entity-type topics --entity-name {topic_name} --add-config ' 137 | f'cleanup.policy=delete,retention.ms=-1,retention.bytes=-1') 138 | to_add = [] 139 | to_delete = [] 140 | for ret_con in RETENTION_CONFIG_NAMES: 141 | if ret_con in topic_level_retention_configs: 142 | to_add.append(f'{ret_con}=[{topic_level_retention_configs[ret_con]}]') 143 | else: 144 | to_delete.append(ret_con) 145 | restore_by_add = '' 146 | if to_add: 147 | restore_by_add = (f"kafka-configs{extra_args_str} --bootstrap-server {opts.kafka_bootstrap_servers} --alter " 148 | f"--entity-type topics --entity-name {topic_name} " 149 | f"--add-config {','.join(to_add)}") 150 | restore_by_delete = '' 151 | if to_delete: 152 | restore_by_delete = (f"kafka-configs{extra_args_str} --bootstrap-server {opts.kafka_bootstrap_servers} --alter " 153 | f"--entity-type topics --entity-name {topic_name} " 154 | f"--delete-config {','.join(to_delete)}") 155 | 156 | commands = f''' 157 | {config_alter} 158 | cat << "EOF" > delete-records-{topic_name}.json 159 | {delete_spec} 160 | EOF 161 | kafka-delete-records{extra_args_str} --bootstrap-server {opts.kafka_bootstrap_servers} --offset-json-file delete-records-{topic_name}.json 162 | {restore_by_add} 163 | {restore_by_delete} 164 | ''' 165 | print(f'''The following sequence of Kafka CLI tool commands would allow you to delete all messages in the topic prior to the beginning of the most recent snapshot: 166 | 167 | {commands}''') 168 | if opts.script_output_file: 169 | opts.script_output_file.write(commands) 170 | 171 | 172 | if __name__ == "__main__": 173 | # importing this file to pick up the logging config in __init__; is there a better way?? 174 | # noinspection PyUnresolvedReferences 175 | from cdc_kafka import show_snapshot_history 176 | show_snapshot_history.main() 177 | -------------------------------------------------------------------------------- /cdc_kafka/sql_queries.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Iterable, Collection, Optional, Sequence 2 | 3 | import pyodbc 4 | 5 | from . 
import constants 6 | 7 | # Methods in this module should return (, ), 8 | # where the param specs are tuples of (odbc_type, column_size, decimal_digits) 9 | 10 | 11 | def get_cdc_capture_instances_metadata() -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 12 | return f''' 13 | -- cdc-to-kafka: get_cdc_capture_instances_metadata 14 | SELECT 15 | OBJECT_SCHEMA_NAME(source_object_id) AS schema_name 16 | , OBJECT_NAME(source_object_id) AS table_name 17 | , capture_instance 18 | , start_lsn 19 | , create_date 20 | , COALESCE(CHECKSUM_AGG(CHECKSUM( 21 | cc.column_name, cc.column_id, cc.column_type, cc.column_ordinal, cc.is_computed 22 | )), -1) AS col_types_checksum 23 | FROM [{constants.CDC_DB_SCHEMA_NAME}].[change_tables] AS ct 24 | JOIN [{constants.CDC_DB_SCHEMA_NAME}].[captured_columns] AS cc on ct.object_id = cc.object_id 25 | GROUP BY source_object_id, capture_instance, start_lsn, create_date 26 | ORDER BY source_object_id 27 | ''', [] 28 | 29 | 30 | def get_cdc_tracked_tables_metadata(capture_instance_names: List[str]) -> \ 31 | Tuple[str, List[Tuple[int, int, Optional[int]]]]: 32 | ci_list = ", ".join([f"'{x}'" for x in capture_instance_names]) 33 | return f''' 34 | -- cdc-to-kafka: get_cdc_tracked_tables_metadata 35 | SELECT 36 | OBJECT_SCHEMA_NAME(ct.source_object_id) AS schema_name 37 | , OBJECT_NAME(ct.source_object_id) AS table_name 38 | , ct.capture_instance AS capture_instance_name 39 | , ct.start_lsn AS capture_min_lsn 40 | , cc.column_ordinal AS change_table_ordinal 41 | , cc.column_name AS column_name 42 | , cc.column_type AS sql_type_name 43 | , cc.is_computed AS is_computed 44 | , ic.index_ordinal AS primary_key_ordinal 45 | , sc.precision AS decimal_precision 46 | , sc.scale AS decimal_scale 47 | , sc.is_nullable AS is_nullable 48 | FROM 49 | [{constants.CDC_DB_SCHEMA_NAME}].[change_tables] AS ct 50 | INNER JOIN [{constants.CDC_DB_SCHEMA_NAME}].[captured_columns] AS cc ON (ct.object_id = cc.object_id) 51 | LEFT JOIN [{constants.CDC_DB_SCHEMA_NAME}].[index_columns] AS ic 52 | ON (cc.object_id = ic.object_id AND cc.column_id = ic.column_id) 53 | LEFT JOIN sys.columns AS sc ON (sc.object_id = ct.source_object_id AND sc.column_id = cc.column_id) 54 | WHERE ct.capture_instance IN ({ci_list}) 55 | ORDER BY ct.object_id, cc.column_ordinal 56 | ''', [] 57 | 58 | 59 | def get_latest_cdc_entry_time() -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 60 | return f''' 61 | -- cdc-to-kafka: get_latest_cdc_entry_time 62 | SELECT TOP 1 tran_end_time 63 | FROM [{constants.CDC_DB_SCHEMA_NAME}].[lsn_time_mapping] 64 | ORDER BY tran_end_time DESC 65 | ''', [] 66 | 67 | 68 | def get_change_rows_per_second(fq_change_table_name: str) -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 69 | return f''' 70 | -- cdc-to-kafka: get_change_rows_per_second 71 | SELECT ISNULL(COUNT(*) / NULLIF(DATEDIFF(second, MIN(ltm.tran_end_time), MAX(ltm.tran_end_time)), 0), 0) 72 | FROM {fq_change_table_name} AS ct WITH (NOLOCK) 73 | INNER JOIN [{constants.CDC_DB_SCHEMA_NAME}].[lsn_time_mapping] AS ltm WITH (NOLOCK) ON ct.__$start_lsn = ltm.start_lsn 74 | ''', [] 75 | 76 | 77 | def get_change_table_index_cols() -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 78 | return f''' 79 | -- cdc-to-kafka: get_change_table_index_cols 80 | SELECT COL_NAME(ic.object_id, ic.column_id) 81 | FROM sys.indexes AS i 82 | INNER JOIN sys.index_columns AS ic ON i.object_id = ic.object_id AND i.index_id = ic.index_id 83 | WHERE i.object_id = OBJECT_ID(?) 
AND type_desc = 'CLUSTERED' 84 | ORDER BY key_ordinal 85 | ''', [(pyodbc.SQL_VARCHAR, 255, None)] 86 | 87 | 88 | def get_date() -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 89 | return 'SELECT GETDATE()', [] 90 | 91 | 92 | def get_indexed_cols() -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 93 | return f''' 94 | -- cdc-to-kafka: get_indexed_cols 95 | SELECT DISTINCT c.[name] 96 | FROM sys.index_columns AS ic 97 | INNER JOIN sys.indexes AS i 98 | ON ic.[object_id] = i.[object_id] 99 | AND ic.[index_id] = i.[index_id] 100 | INNER JOIN sys.columns AS c 101 | ON ic.[object_id] = c.[object_id] 102 | AND ic.[column_id] = c.[column_id] 103 | WHERE ic.[object_id] = OBJECT_ID(?) 104 | AND ic.[key_ordinal] = 1 105 | AND i.[is_disabled] = 0 106 | AND i.[type] != 0 107 | AND i.has_filter = 0 108 | ''', [(pyodbc.SQL_VARCHAR, 255, None)] 109 | 110 | 111 | def get_ddl_history_for_capture_table() -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 112 | return f''' 113 | -- cdc-to-kafka: get_ddl_history_for_capture_table 114 | SELECT ddl_command, DATEDIFF(second, ddl_time, GETDATE()) AS age_seconds 115 | FROM [{constants.CDC_DB_SCHEMA_NAME}].[ddl_history] 116 | WHERE object_id = OBJECT_ID(?) AND required_column_update = 0 117 | ''', [(pyodbc.SQL_VARCHAR, 255, None)] 118 | 119 | 120 | def get_table_count(schema_name: str, table_name: str, pk_cols: Sequence[str], 121 | odbc_columns: Tuple[pyodbc.Row, ...]) -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 122 | declarations, where_spec, params = _get_snapshot_query_bits(pk_cols, odbc_columns, ('>=', '<=')) 123 | 124 | return f''' 125 | -- cdc-to-kafka: get_table_count 126 | DECLARE 127 | {declarations} 128 | ; 129 | 130 | SELECT COUNT(*) 131 | FROM [{schema_name}].[{table_name}] WITH (NOLOCK) 132 | WHERE {where_spec} 133 | ''', params 134 | 135 | 136 | def get_table_rowcount_bounded(table_fq_name: str, max_count: int) -> \ 137 | Tuple[str, List[Tuple[int, int, Optional[int]]]]: 138 | assert max_count > 0 139 | return f''' 140 | -- cdc-to-kafka: get_table_rowcount_bounded 141 | SELECT COUNT(*) FROM ( 142 | SELECT TOP {max_count} 1 AS nbr 143 | FROM {table_fq_name} WITH (NOLOCK) 144 | ) AS ctr 145 | ''', [] 146 | 147 | 148 | def get_max_key_value(schema_name: str, table_name: str, pk_cols: Sequence[str]) -> \ 149 | Tuple[str, List[Tuple[int, int, Optional[int]]]]: 150 | select_spec = ", ".join([f'[{x}]' for x in pk_cols]) 151 | order_by_spec = ", ".join([f'[{x}] DESC' for x in pk_cols]) 152 | return f''' 153 | -- cdc-to-kafka: get_max_key_value 154 | SELECT TOP 1 {select_spec} 155 | FROM [{schema_name}].[{table_name}] ORDER BY {order_by_spec} 156 | ''', [] 157 | 158 | 159 | def get_min_key_value(schema_name: str, table_name: str, pk_cols: Sequence[str]) -> \ 160 | Tuple[str, List[Tuple[int, int, Optional[int]]]]: 161 | select_spec = ", ".join([f'[{x}]' for x in pk_cols]) 162 | order_by_spec = ", ".join([f'[{x}] ASC' for x in pk_cols]) 163 | return f''' 164 | -- cdc-to-kafka: get_min_key_value 165 | SELECT TOP 1 {select_spec} 166 | FROM [{schema_name}].[{table_name}] ORDER BY {order_by_spec} 167 | ''', [] 168 | 169 | 170 | def get_change_table_count_by_operation(fq_change_table_name: str) -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 171 | return f''' 172 | -- cdc-to-kafka: get_change_table_count_by_operation 173 | DECLARE 174 | @LSN BINARY(10) = ? 175 | , @SEQVAL BINARY(10) = ? 176 | , @OPERATION INT = ? 
177 | ; 178 | 179 | SELECT 180 | COUNT(*) 181 | , __$operation AS op 182 | FROM {fq_change_table_name} WITH (NOLOCK) 183 | WHERE __$operation != 3 184 | AND ( 185 | __$start_lsn < @LSN 186 | OR __$start_lsn = @LSN AND __$seqval < @SEQVAL 187 | OR __$start_lsn = @LSN AND __$seqval = @SEQVAL AND __$operation <= @OPERATION 188 | ) 189 | GROUP BY __$operation 190 | ''', [(pyodbc.SQL_BINARY, 10, None), (pyodbc.SQL_BINARY, 10, None), (pyodbc.SQL_INTEGER, 4, None)] 191 | 192 | 193 | def get_max_lsn() -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 194 | return 'SELECT sys.fn_cdc_get_max_lsn()', [] 195 | 196 | 197 | def get_max_lsn_for_change_table(fq_change_table_name: str) -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 198 | return f''' 199 | -- cdc-to-kafka: get_max_lsn_for_change_table 200 | WITH lsns AS ( 201 | SELECT __$start_lsn, __$command_id, __$seqval, __$operation 202 | FROM {fq_change_table_name} 203 | 204 | UNION ALL 205 | 206 | SELECT sys.fn_cdc_increment_lsn(start_lsn), 0, 0x00000000000000000000, 0 207 | FROM [{constants.CDC_DB_SCHEMA_NAME}].[change_tables] 208 | WHERE object_id = OBJECT_ID('{fq_change_table_name}') 209 | ) 210 | SELECT TOP 1 __$start_lsn, __$command_id, __$seqval, __$operation 211 | FROM lsns 212 | ORDER BY __$start_lsn DESC, __$command_id DESC, __$seqval DESC, __$operation DESC 213 | ''', [] 214 | 215 | 216 | def get_change_rows(batch_size: int, fq_change_table_name: str, field_names: Iterable[str], 217 | ct_index_cols: Iterable[str]) -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 218 | # You may feel tempted to change or simplify this query. TREAD CAREFULLY. There was a lot of iterating here to 219 | # craft something that would not induce SQL Server to resort to a full index scan. If you change it, run some 220 | # EXPLAINs and ensure that the steps are still only index SEEKs, not scans. 221 | 222 | # See comments in _get_snapshot_query_bits to understand other details of why these queries look as they do, 223 | # esp. in regard to the presence of DECLARE statements within them. 224 | 225 | select_column_specs = ', '.join([f'ct.[{f}]' for f in field_names]) 226 | order_spec = ', '.join([f'[{f}]' for f in ct_index_cols]) 227 | return f''' 228 | -- cdc-to-kafka: get_change_rows 229 | DECLARE 230 | @LSN BINARY(10) = ? 231 | , @SEQ BINARY(10) = ? 232 | , @MAX_LSN BINARY(10) = ? 
233 | ; 234 | 235 | WITH ct AS ( 236 | SELECT * 237 | FROM {fq_change_table_name} AS ct WITH (NOLOCK) 238 | WHERE ct.__$start_lsn = @LSN AND ct.__$seqval > @SEQ AND ct.__$start_lsn <= @MAX_LSN 239 | 240 | UNION ALL 241 | 242 | SELECT * 243 | FROM {fq_change_table_name} AS ct WITH (NOLOCK) 244 | WHERE ct.__$start_lsn > @LSN AND ct.__$start_lsn <= @MAX_LSN 245 | ) 246 | SELECT TOP ({batch_size}) 247 | ct.__$operation AS {constants.OPERATION_NAME} 248 | , ltm.tran_end_time AS {constants.EVENT_TIME_NAME} 249 | , ct.__$start_lsn AS {constants.LSN_NAME} 250 | , ct.__$seqval AS {constants.SEQVAL_NAME} 251 | , ct.__$update_mask AS {constants.UPDATED_FIELDS_NAME} 252 | , {select_column_specs} 253 | FROM ct 254 | INNER JOIN [{constants.CDC_DB_SCHEMA_NAME}].[lsn_time_mapping] AS ltm WITH (NOLOCK) ON (ct.__$start_lsn = ltm.start_lsn) 255 | WHERE ct.__$operation = 1 OR ct.__$operation = 2 OR ct.__$operation = 4 256 | ORDER BY {order_spec} 257 | ''', [(pyodbc.SQL_BINARY, 10, None)] * 3 258 | 259 | 260 | def get_snapshot_rows( 261 | batch_size: int, schema_name: str, table_name: str, field_names: Collection[str], 262 | removed_field_names: Collection[str], pk_cols: Sequence[str], first_read: bool, 263 | odbc_columns: Tuple[pyodbc.Row, ...]) -> Tuple[str, List[Tuple[int, int, Optional[int]]]]: 264 | select_cols = [] 265 | for fn in field_names: 266 | if fn in removed_field_names: 267 | select_cols.append(f'NULL AS [{fn}]') 268 | else: 269 | select_cols.append(f'[{fn}]') 270 | select_column_specs = ', '.join(select_cols) 271 | order_spec = ', '.join([f'[{x}] DESC' for x in pk_cols]) 272 | 273 | params: List[Tuple[int, int, Optional[int]]] 274 | if first_read: 275 | declarations = '@K0 int = 0' 276 | where_spec = '1=1' 277 | params = [] 278 | else: 279 | declarations, where_spec, params = _get_snapshot_query_bits(pk_cols, odbc_columns, ('<', )) 280 | 281 | return f''' 282 | -- cdc-to-kafka: get_snapshot_rows 283 | DECLARE 284 | {declarations} 285 | ; 286 | 287 | SELECT TOP ({batch_size}) 288 | {constants.SNAPSHOT_OPERATION_ID} AS {constants.OPERATION_NAME} 289 | , GETDATE() AS {constants.EVENT_TIME_NAME} 290 | , NULL AS {constants.LSN_NAME} 291 | , NULL AS {constants.SEQVAL_NAME} 292 | , NULL AS {constants.UPDATED_FIELDS_NAME} 293 | , {select_column_specs} 294 | FROM 295 | [{schema_name}].[{table_name}] 296 | WHERE {where_spec} 297 | ORDER BY {order_spec} 298 | ''', params 299 | 300 | 301 | def _get_snapshot_query_bits(pk_cols: Sequence[str], odbc_columns: Tuple[pyodbc.Row, ...], 302 | comparators: Iterable[str]) -> Tuple[str, str, List[Tuple[int, int, Optional[int]]]]: 303 | # For multi-column primary keys, this builds a WHERE clause of the following form, assuming 304 | # for example a PK on (field_a, field_b, field_c): 305 | # WHERE (field_a < @K0) 306 | # OR (field_a = @K0 AND field_b < @K1) 307 | # OR (field_a = @K0 AND field_b = @K1 AND field_c < @K2) 308 | 309 | # You may find it odd that this query (as well as the change data query) has `DECLARE` statements in it. 310 | # Why not just pass the parameters with the query like usual? We found that in composite-key cases, 311 | # the need to pass the parameter for the bounding value of the non-last column(s) more than once caused 312 | # SQL Server to treat those as different values (even though they were actually the same), and this 313 | # messed up query plans and caused poor performance esp. 
since we're asking for results ordered 314 | # backwards against the PK's index 315 | # 316 | # Having the second layer of "declare indirection" seemed to be the only way to arrange reuse of the 317 | # same passed parameter in more than one place via pyodbc, which only supports '?' positional 318 | # placeholders for parameters. 319 | 320 | odbc_types = {x[3]: (x[3], x[4], x[5], x[6], x[8]) for x in odbc_columns} 321 | pk_odbc_cols = [odbc_types[col_name] for col_name in pk_cols] 322 | 323 | comparator_where_clauses = [] 324 | param_declarations = [] 325 | params = [] 326 | 327 | for comparator_ix, comparator in enumerate(comparators): 328 | also_equal = '=' in comparator 329 | comparator = comparator.replace('=', '') 330 | key_where_clauses = [] 331 | 332 | for pk_ix, (col_name, data_type, type_name, column_size, decimal_digits) in enumerate(pk_odbc_cols): 333 | type_name = type_name.replace('identity', '') 334 | if 'char' in type_name: 335 | type_name += f'({column_size})' 336 | params.append((data_type, column_size, decimal_digits)) 337 | param_ix = len(params) - 1 338 | param_declarations.append(f'@K{param_ix} {type_name} = ?') 339 | 340 | inner_clauses = [] 341 | 342 | for jx, prior_field in enumerate(pk_cols[0:pk_ix]): 343 | prior_ix = jx + comparator_ix * len(pk_cols) 344 | inner_clauses.append(f'[{prior_field}] = @K{prior_ix}') 345 | inner_clauses.append(f'[{col_name}] {comparator} @K{param_ix}') 346 | if also_equal and pk_ix == len(pk_odbc_cols) - 1: 347 | inner_clauses[-1] = inner_clauses[-1] .replace(comparator, comparator + '=') 348 | 349 | key_where_clauses.append(f"\n ({' AND '.join(inner_clauses)})") 350 | 351 | comparator_where_clauses.append(f"({' OR '.join(key_where_clauses)})") 352 | 353 | declarations = ', '.join(param_declarations) 354 | where_spec = '\n AND '.join(comparator_where_clauses) 355 | 356 | return declarations, where_spec, params 357 | -------------------------------------------------------------------------------- /cdc_kafka/sql_query_subprocess.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import datetime 3 | import logging 4 | import threading 5 | import queue 6 | import re 7 | import struct 8 | import time 9 | from types import TracebackType 10 | from typing import Any, Tuple, Dict, Optional, NamedTuple, List, Sequence, Type, Callable 11 | 12 | import pyodbc 13 | 14 | from . 
import constants, helpers, parsed_row 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class SQLQueryRequest(NamedTuple): 20 | queue_name: str 21 | query_metadata_to_reflect: Any 22 | query_text: str 23 | query_param_types: Sequence[Tuple[int, int, Optional[int]]] 24 | query_params: Sequence[Any] 25 | parser: Callable[[pyodbc.Row], parsed_row.ParsedRow] 26 | 27 | 28 | class SQLQueryResult(NamedTuple): 29 | queue_name: str 30 | reflected_query_request_metadata: Any 31 | query_executed_utc: datetime.datetime 32 | query_took_sec: float 33 | result_rows: List[parsed_row.ParsedRow] 34 | query_params: Sequence[Any] 35 | 36 | 37 | class SQLQueryProcessor(object): 38 | _instance = None 39 | 40 | def __init__(self, odbc_conn_string: str) -> None: 41 | if SQLQueryProcessor._instance is not None: 42 | raise Exception('SQLQueryProcessor class should be used as a singleton.') 43 | 44 | self.odbc_conn_string: str = odbc_conn_string 45 | self._stop_event: threading.Event = threading.Event() 46 | self._request_queue: 'queue.Queue[SQLQueryRequest]' = queue.Queue(1000) 47 | self._output_queues: Dict[str, 'collections.deque[SQLQueryResult]'] = {} 48 | self._threads: List[threading.Thread] = [ 49 | threading.Thread(target=self.querier_thread, name=f'sql-querier-{i + 1}') 50 | for i in range(constants.DB_QUERIER_CONCURRENT_THREADS) 51 | ] 52 | self._results_wait_time: datetime.timedelta = datetime.timedelta( 53 | seconds=(constants.SQL_QUERY_RETRIES + 1) * 54 | (constants.SQL_QUERY_TIMEOUT_SECONDS + constants.SQL_QUERY_INTER_RETRY_INTERVAL_SECONDS) 55 | ) 56 | self._ended: bool = False 57 | 58 | SQLQueryProcessor._instance = self 59 | 60 | def __enter__(self) -> 'SQLQueryProcessor': 61 | for t in self._threads: 62 | t.start() 63 | return self 64 | 65 | def __exit__(self, exc_type: Optional[Type[BaseException]], exc: Optional[BaseException], 66 | traceback: Optional[TracebackType]) -> None: 67 | if not self._ended: 68 | self._stop_event.set() 69 | self._check_if_ended() 70 | 71 | def _check_if_ended(self) -> bool: 72 | if self._stop_event.is_set() and not self._ended: 73 | logger.info('Ending SQL querier thread...') 74 | self._ended = True 75 | self._stop_event.set() 76 | for t in self._threads: 77 | if t.is_alive(): 78 | t.join(timeout=3) 79 | while not self._request_queue.empty(): 80 | try: 81 | self._request_queue.get_nowait() 82 | except (queue.Empty, EOFError): 83 | break 84 | logger.info("Done.") 85 | return self._ended 86 | 87 | def enqueue_query(self, request: 'SQLQueryRequest') -> None: 88 | if self._check_if_ended(): 89 | return 90 | if request.queue_name not in self._output_queues: 91 | self._output_queues[request.queue_name] = collections.deque() 92 | self._request_queue.put_nowait(request) 93 | 94 | def get_result(self, queue_name: str) -> Optional['SQLQueryResult']: 95 | deadline = helpers.naive_utcnow() + self._results_wait_time 96 | while helpers.naive_utcnow() < deadline: 97 | try: 98 | return self._output_queues[queue_name].popleft() 99 | except IndexError: 100 | time.sleep(0.001) 101 | if self._check_if_ended(): 102 | return None 103 | return None 104 | 105 | # This runs in the separate thread, and uses its own DB connection: 106 | def querier_thread(self) -> None: 107 | try: 108 | with get_db_conn(self.odbc_conn_string) as db_conn: 109 | logger.debug("SQL querier thread started.") 110 | while not self._stop_event.is_set(): 111 | try: 112 | request = self._request_queue.get(block=True, timeout=0.1) 113 | except queue.Empty: 114 | continue 115 | 116 | start_time = 
time.perf_counter()
117 |                     with db_conn.cursor() as cursor:
118 |                         if request.query_param_types is not None:
119 |                             cursor.setinputsizes(request.query_param_types) # type: ignore[arg-type]
120 |                         retry_count = 0
121 |                         while True:
122 |                             try:
123 |                                 if request.query_params is None:
124 |                                     cursor.execute(request.query_text)
125 |                                 else:
126 |                                     cursor.execute(request.query_text, request.query_params)
127 |                                 break
128 |                             except pyodbc.OperationalError as exc:
129 |                                 # HYT00 is the error code for "Timeout expired"
130 |                                 if exc.args[0].startswith('HYT00') and retry_count < constants.SQL_QUERY_RETRIES:
131 |                                     retry_count += 1
132 |                                     logger.warning('SQL query timed out, retrying...')
133 |                                     time.sleep(constants.SQL_QUERY_INTER_RETRY_INTERVAL_SECONDS)
134 |                                     continue
135 |                                 raise exc
136 |                         query_executed_utc = helpers.naive_utcnow()
137 |                         result_rows = []
138 |                         fetch_batch_count = constants.DB_FETCH_BATCH_SIZE
139 |                         while fetch_batch_count >= constants.DB_FETCH_BATCH_SIZE:
140 |                             fetch_batch_count = 0
141 |                             for row in cursor.fetchmany(constants.DB_FETCH_BATCH_SIZE):
142 |                                 result_rows.append(request.parser(row))
143 |                                 fetch_batch_count += 1
144 |                         query_took_sec = (time.perf_counter() - start_time)
145 |                         self._output_queues[request.queue_name].append(
146 |                             SQLQueryResult(request.queue_name, request.query_metadata_to_reflect, query_executed_utc,
147 |                                            query_took_sec, result_rows, request.query_params)
148 |                         )
149 |         except pyodbc.OperationalError as exc:
150 |             # 08S01 is the error code for "Communication link failure" which may be raised in response to KeyboardInterrupt
151 |             if not exc.args[0].startswith('08S01'):
152 |                 raise exc
153 |         except KeyboardInterrupt:
154 |             pass
155 |         except Exception as exc:
156 |             logger.exception('SQL querier thread raised an exception.', exc_info=exc)
157 |         finally:
158 |             self._stop_event.set()
159 |             logger.info("SQL querier thread exiting.")
160 | 
161 | 
162 | def get_db_conn(odbc_conn_string: str) -> pyodbc.Connection:
163 |     # The Linux ODBC driver doesn't do failover, so we're hacking it in here. This will only work for initial
164 |     # connections. If a failover happens while this process is running, the app will crash. Have a process supervisor
165 |     # that can restart it if that happens, and it'll connect to the failover on restart:
166 |     # THIS ASSUMES that you are using the exact keywords 'SERVER' and 'Failover_Partner' in your connection string!
167 |     try:
168 |         conn = pyodbc.connect(odbc_conn_string)
169 |     except pyodbc.DatabaseError as e:
170 |         server_match = re.match(r".*SERVER=(?P<hostname>.*?);", odbc_conn_string)
171 |         failover_partner_match = re.match(r".*Failover_Partner=(?P<hostname>.*?);", odbc_conn_string)
172 | 
173 |         if failover_partner_match is None or server_match is None or e.args[0] not in ('42000', 'HYT00'):
174 |             raise
175 | 
176 |         failover_partner = failover_partner_match.groups('hostname')[0]
177 |         server = server_match.groups('hostname')[0]
178 |         odbc_conn_string = odbc_conn_string.replace(server, failover_partner)
179 |         logger.warning('Connection to PRIMARY failed, trying failover... (primary: "%s", failover: "%s")',
180 |                        server, failover_partner)
181 |         conn = pyodbc.connect(odbc_conn_string)
182 | 
183 |     def decode_truncated_utf16(raw_bytes: bytes) -> str:
184 |         # SQL Server generally uses UTF-16-LE encoding for text. The length of NCHAR and NVARCHAR columns is the number
185 |         # of byte pairs that can be stored in the column. But some higher UTF-16 codepoints are 4 bytes long.
So it's 186 | # possible for a 4-byte character to get truncated halfway through, causing decode errors. This is to work 187 | # around that. 188 | try: 189 | return raw_bytes.decode("utf-16le") 190 | except UnicodeDecodeError as ex: 191 | return raw_bytes[:ex.start].decode("utf-16le") 192 | 193 | def decode_datetimeoffset(raw_bytes: bytes) -> datetime.datetime: 194 | tup = struct.unpack("<6hI2h", raw_bytes) 195 | return datetime.datetime( 196 | tup[0], tup[1], tup[2], tup[3], tup[4], tup[5], tup[6] // 1000, 197 | datetime.timezone(datetime.timedelta(hours=tup[7], minutes=tup[8])) 198 | ) 199 | 200 | conn.add_output_converter(pyodbc.SQL_WVARCHAR, decode_truncated_utf16) 201 | conn.add_output_converter(pyodbc.SQL_WCHAR, decode_truncated_utf16) 202 | conn.add_output_converter(-155, decode_datetimeoffset) 203 | conn.timeout = constants.SQL_QUERY_TIMEOUT_SECONDS 204 | 205 | return conn 206 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | sqlserver: 4 | image: mcr.microsoft.com/mssql/server:2017-CU14-ubuntu 5 | ports: 6 | - '1433:1433' 7 | environment: 8 | ACCEPT_EULA: 'Y' 9 | MSSQL_AGENT_ENABLED: 'true' 10 | SA_PASSWORD: 'TestLocalPassword123' 11 | confluent: 12 | image: landoop/fast-data-dev:2.3 13 | ports: 14 | - '2181:2181' 15 | - '3030:3030' 16 | - '8081-8083:8081-8083' 17 | - '9581-9585:9581-9585' 18 | - '9092:9092' 19 | environment: 20 | RUNTESTS: '0' 21 | SAMPLEDATA: '0' 22 | RUNNING_SAMPLEDATA: '0' 23 | ADV_HOST: '127.0.0.1' 24 | volumes: 25 | - /Users/martywoodlee/Downloads/sqljdbc_7.4/enu/mssql-jdbc-7.4.1.jre8.jar:/opt/landoop/connectors/third-party/kafka-connect-jdbc/mssql-jdbc-7.4.1.jre8.jar 26 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | avro==1.11.3 2 | aws-msk-iam-sasl-signer-python==1.0.1 3 | bitarray==2.9.2 4 | confluent-kafka==2.3.0 5 | Jinja2==3.1.3 6 | pyodbc==5.1.0 7 | requests==2.31.0 8 | sentry-sdk==1.45.0 9 | sortedcontainers==2.4.0 10 | tabulate==0.9.0 11 | 12 | # Only used by replayer.py; if using the Docker image, this requires that you apt install freetds-dev and python3-dev too: 13 | # ctds==1.14.0 14 | # faster-fifo==1.4.5 15 | 16 | # Helpers if you're doing development on this project: 17 | # ipython==8.23.0 18 | # line-profiler==4.1.2 19 | # mypy==1.10.0 20 | # sortedcontainers-stubs==2.4.2 21 | # types-confluent-kafka==1.2.0 22 | # types-requests==2.31.0.20240406 23 | # types-tabulate==0.9.0.20240106 24 | # vulture==2.11 25 | --------------------------------------------------------------------------------
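A note on how the two modules above fit together: the functions in `cdc_kafka/sql_queries.py` only return query text plus ODBC parameter-type tuples, while `SQLQueryProcessor` in `cdc_kafka/sql_query_subprocess.py` is what actually executes them on its worker threads. The following is a minimal, illustrative sketch rather than code from the repository: the connection string is a placeholder, and the pass-through `parser` lambda stands in for the `parsed_row.ParsedRow`-building parsers that the real callers supply.

```
# Hypothetical usage sketch -- not part of the repository. It runs the query built by
# sql_queries.get_max_lsn() through the SQLQueryProcessor shown above.
from cdc_kafka import sql_queries
from cdc_kafka.sql_query_subprocess import SQLQueryProcessor, SQLQueryRequest

# Placeholder; substitute your real ODBC connection string.
ODBC_CONN_STRING = 'DRIVER=ODBC Driver 18 for SQL Server; SERVER=localhost; DATABASE=MyTestDb; UID=sa; PWD=...'

query_text, param_types = sql_queries.get_max_lsn()  # ('SELECT sys.fn_cdc_get_max_lsn()', [])

with SQLQueryProcessor(ODBC_CONN_STRING) as processor:
    processor.enqueue_query(SQLQueryRequest(
        queue_name='max_lsn',                  # names the output queue the result will land on
        query_metadata_to_reflect=None,        # echoed back unchanged in the SQLQueryResult
        query_text=query_text,
        query_param_types=param_types,
        query_params=[],
        parser=lambda row: row,                # simplified; real callers return parsed_row.ParsedRow objects
    ))
    result = processor.get_result('max_lsn')   # polls the output queue until a result arrives or the wait time elapses
    if result is not None:
        print(result.result_rows, result.query_took_sec)
```

The change-row and snapshot queries above would travel through the same request/result plumbing, with their parameter-type lists passed as `query_param_types` and a `ParsedRow`-building parser in place of the lambda.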