├── .gitignore ├── .images ├── PostgreSQL_logo.3colors.120x120.png ├── Snowflake_Logo.svg.png ├── color_white_debezium_type_600px.svg ├── database-tables.png ├── debeziumio-ar21.svg ├── horizontal-logo-monochromatic-white.png ├── howto-flow.png ├── logo-mysql-170x115.png ├── snowflake-security.png ├── solution.drawio.png └── solution.png ├── LICENSE ├── README.md ├── articles ├── .images │ ├── docker-compose.png │ ├── snowflake_console.png │ ├── solution-capture-data-changes.png │ ├── solution-debezium.png │ ├── solution-kafka-to-snowflake.png │ ├── solution-replication.png │ ├── solution-sink-snowflake.png │ ├── solution-solution-points.png │ └── solution-solution.png └── dzone_howto_building-an-enterprise-cdc-solution.md ├── database ├── README.md ├── init_db.sh ├── mysql_crud.sh ├── postgres_crud.sh └── sql │ ├── 00_mysql_init.sql │ ├── 00_postgres_init.sql │ ├── 01_mysql_changes.sql │ └── 01_postgres_changes.sql ├── debezium ├── README.md ├── connect │ ├── debezium-mysql-inventory-connector.json │ └── debezium-postgres-inventory-connector.json ├── delete_cdc.sh ├── init_cdc.sh └── status_cdc.sh ├── services ├── .env ├── README.md ├── docker-compose.png ├── docker-compose.yml └── render_compose_image.sh └── snowflake ├── README.md ├── connect └── snowflake-sink-connector.json ├── delete_sink.sh ├── init_sink.sh ├── keys ├── README.md ├── snowflake_rsa_key.p8 ├── snowflake_rsa_key.pem └── snowflake_rsa_key.pub ├── sql ├── 00-security.sql ├── 01-cdc-to-replica-mysql.sql └── 01-cdc-to-replica-postgres.sql └── status_sink.sh /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.class 3 | 4 | database/data_mysql.csv 5 | 6 | database/data_postgres.csv 7 | 8 | services/.cache/ 9 | -------------------------------------------------------------------------------- /.images/PostgreSQL_logo.3colors.120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/PostgreSQL_logo.3colors.120x120.png -------------------------------------------------------------------------------- /.images/Snowflake_Logo.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/Snowflake_Logo.svg.png -------------------------------------------------------------------------------- /.images/color_white_debezium_type_600px.svg: -------------------------------------------------------------------------------- 1 | color_white -------------------------------------------------------------------------------- /.images/database-tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/database-tables.png -------------------------------------------------------------------------------- /.images/debeziumio-ar21.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 13 | 14 | 18 | 22 | 26 | 30 | 31 | 33 | 35 | 39 | 40 | 41 | 42 | 43 | 44 | 47 | 48 | 49 | 50 | 51 | 52 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- 
/.images/horizontal-logo-monochromatic-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/horizontal-logo-monochromatic-white.png -------------------------------------------------------------------------------- /.images/howto-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/howto-flow.png -------------------------------------------------------------------------------- /.images/logo-mysql-170x115.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/logo-mysql-170x115.png -------------------------------------------------------------------------------- /.images/snowflake-security.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/snowflake-security.png -------------------------------------------------------------------------------- /.images/solution.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/solution.drawio.png -------------------------------------------------------------------------------- /.images/solution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/solution.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Debezium to Snowflake
2 | 
3 | - [Debezium to Snowflake](#debezium-to-snowflake)
4 |   - [Requirements](#requirements)
5 |   - [Organization](#organization)
6 |   - [How-to steps](#how-to-steps)
7 |   - [I need more!!](#i-need-more)
8 | 
9 | This repo is a demo of how to use Debezium to capture changes over tables in MySQL and PostgreSQL
10 | and generate a near-real-time replica in Snowflake. The approach is extensible to other databases and
11 | covers several common points about CDC, Kafka, Kafka connect, and Snowflake tooling.
12 | 
13 | [Miguel García] and I worked together on the DZone article [Data Platform: Building an Enterprise CDC Solution],
14 | and as a next step I published this repo as [HOWTO: Building an Enterprise CDC Solution].
15 | 
16 | ![solution.png](./.images/solution.png)
17 | 
18 | ## Requirements
19 | 
20 | To make the howto easy to run, the services are deployed using **[docker-compose]**,
21 | which depends on the **[docker engine]**. For better compatibility we use version 2 of the docker-compose specification,
22 | so a **docker engine 1.10.0** or later should work.
23 | 
24 | As part of the howto, you will create a Snowflake account, and the guide walks you through creating a key pair for authentication.
25 | To perform these actions, you need an **[OpenSSL toolkit]**. It is commonly available in Linux distributions and
26 | can be installed on Windows or Mac. If you need it, you can also run it inside a docker image (this is mentioned in the howto).
27 | 
28 | For hardware requirements, review the **[docker engine]** requirements.
29 | 
30 | ## Organization
31 | 
32 | Well, this demo has several parts. To keep things simple, it has been split into several folders in this repo.
33 | In each folder you can find a README file with explanations:
34 | 
35 | - **[services]**: everything related to docker images and services
36 | - **[database]**: SQL statements and scripts to run inside the local databases
37 | - **[debezium]**: configuration and scripts to start and check the status of the Debezium connectors
38 | - **[snowflake]**: Snowflake scripts and the configuration of the Snowflake sink connector
39 | 
40 | ## How-to steps
41 | 
42 | You can see a detailed howto in the DZone article [HOWTO: Building an Enterprise CDC Solution], which follows these steps:
43 | 
44 | ![howto-flow](.images/howto-flow.png)
45 | 
46 | In this flow:
47 | - Gray: local services
48 | - Yellow: external resources
49 | 
50 | ## I need more!!
51 | 
52 | Well, check the README available in each folder. Each one includes some detail about its components
53 | and some additional scripts or functions that you can use to explore this solution.
54 | 
55 | I hope this tutorial has been helpful for you and that you have enjoyed it.
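A note on the **[OpenSSL toolkit]** requirement: the repo already ships a sample key pair under `snowflake/keys`, but if you prefer to generate your own, the commands look roughly like this (a minimal sketch based on Snowflake's key-pair authentication docs; the file names simply mirror the ones bundled in this repo, and you will be prompted for a passphrase):

```sh
# Sketch: create an encrypted private key and its matching public key
openssl genrsa 2048 | openssl pkcs8 -topk8 -v2 des3 -inform PEM -out snowflake_rsa_key.p8
openssl rsa -in snowflake_rsa_key.p8 -pubout -out snowflake_rsa_key.pub
```

Register the resulting public key in Snowflake as described in the howto, and keep the passphrase at hand for the sink connector configuration.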
56 | 57 | 58 | [Miguel García]: https://dzone.com/users/4531976/miguelglor.html 59 | [Data Platform: Building an Enterprise CDC Solution]: https://dzone.com/articles/data-platform-building-an-enterprise-cdc-solution 60 | [HOWTO: Building an Enterprise CDC Solution]: https://dzone.com/articles/howto_building-an-enterprise-cdc-solution 61 | [docker-compose]: https://docs.docker.com/compose/install/ 62 | [docker engine]: https://docs.docker.com/engine/ 63 | [OpenSSL toolkit]: https://github.com/openssl/openssl#build-and-install 64 | [services]: services/README.md 65 | [database]: database/README.md 66 | [debezium]: debezium/README.md 67 | [snowflake]: snowflake/README.md 68 | [snowflake/keys README]: snowflake/keys 69 | [snowflake/sql/00-security.sql]: snowflake/sql/00-security.sql 70 | [snowflake/connect/snowflake-sink-connector.json]: snowflake/connect/snowflake-sink-connector.json 71 | [snowflake/sql/01-cdc-to-replica-mysql.sql]: snowflake/sql/01-cdc-to-replica-mysql.sql 72 | [snowflake/sql/01-cdc-to-replica-postgres.sql]: snowflake/sql/01-cdc-to-replica-postgres.sql 73 | -------------------------------------------------------------------------------- /articles/.images/docker-compose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/docker-compose.png -------------------------------------------------------------------------------- /articles/.images/snowflake_console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/snowflake_console.png -------------------------------------------------------------------------------- /articles/.images/solution-capture-data-changes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-capture-data-changes.png -------------------------------------------------------------------------------- /articles/.images/solution-debezium.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-debezium.png -------------------------------------------------------------------------------- /articles/.images/solution-kafka-to-snowflake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-kafka-to-snowflake.png -------------------------------------------------------------------------------- /articles/.images/solution-replication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-replication.png -------------------------------------------------------------------------------- /articles/.images/solution-sink-snowflake.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-sink-snowflake.png -------------------------------------------------------------------------------- /articles/.images/solution-solution-points.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-solution-points.png -------------------------------------------------------------------------------- /articles/.images/solution-solution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-solution.png -------------------------------------------------------------------------------- /articles/dzone_howto_building-an-enterprise-cdc-solution.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This article is a follow-up to the [Data Platform: Building an Enterprise CDC Solution](https://dzone.com/articles/data-platform-building-an-enterprise-cdc-solution), where [Miguel García](https://dzone.com/users/4531976/miguelglor.html) and I described: 4 | 5 | * Several Change Data Capture (CDC) use cases and common scenarios in an enterprise platform 6 | * A proposal using Debezium (as log-based CDC) to capture data from the relational databases, and Kafka as a channel that enables several consumers to propagate data changes for different use cases. 7 | 8 | One of the common scenarios for this solution consists of data replication from OLTP Database to OLAP Database (from the operational database to the data warehouse). 9 | 10 | In this article, I'm going to provide a "how-to" to deploy a sample of a CDC process to replicate data from two different relational databases to Snowflake: 11 | * Manage the data changes in a common format. 12 | * Set up a Debezium in Kafka Connect to get data changes and push into Kafka topics. 13 | * Set up Snowflake Sink in Kafka Connect to get data changes from Kafka topics and push the data to Snowflake. 14 | * Apply a specific replication logic to consolidate the data change events in Snowflake, avoiding the use of the JDBC connector for better cost-effectiveness. 15 | 16 | ![solution](.images/solution-solution.png) 17 | 18 | # Step-by-step 19 | 20 | [The GitHub repository](https://github.com/dariocazas/howto-debezium-to-snowflake) includes a detailed description as well as several scripts that you will need in this "how-to": 21 | 22 | ```sh 23 | git clone https://github.com/dariocazas/howto-debezium-to-snowflake.git 24 | ``` 25 | 26 | > Note: every folder in this repository has a README file with more info about the process. 27 | 28 | The steps are: 29 | 30 | 1. Pre-requirements 31 | 1. Local environment 32 | 2. Snowflake database 33 | 3. Snowflake authentication 34 | 2. How to capture data changes from databases to a Kafka topic 35 | 1. Start local services 36 | 2. Prepare the databases 37 | 3. Start Debezium 38 | 4. Check data capture 39 | 3. How to push data changes from a Kafka topic into Snowflake 40 | 1. Start local sink process 41 | 2. Check data capture into CDC tables 42 | 3. Apply replication logic 43 | 4. Check data replication 44 | 45 | ![steps](.images/solution-solution-points.png) 46 | 47 | ## 1. 
Pre-requirements
48 | 
49 | ### 1.1. Local environment
50 | - [docker-compose](https://docs.docker.com/compose/install/) and [docker engine](https://docs.docker.com/engine/) 1.10.0 or later.
51 | - [jq](https://stedolan.github.io/jq/download/) as a JSON parser used in the scripts.
52 | 
53 | ### 1.2. Snowflake database
54 | 
55 | You need a Snowflake account. To create a trial one, follow the [Snowflake Trial Accounts doc](https://docs.snowflake.com/en/user-guide/admin-trial-account.html).
56 | 
57 | Log in to your Snowflake account, create a database, and run the next steps in it:
58 | 
59 | ```sql
60 | USE ROLE ACCOUNTADMIN;
61 | CREATE DATABASE HOWTO_DB;
62 | ```
63 | 
64 | > Note: in a production environment, it is not recommended to use the ACCOUNTADMIN role for all the tasks, as I do in this howto for simplicity.
65 | 
66 | ### 1.3. Snowflake authentication
67 | 
68 | In this howto, we use key-pair authentication. The detailed process is documented [here](https://docs.snowflake.com/en/user-guide/kafka-connector-install.html#using-key-pair-authentication-key-rotation). You can use the key pair provided by the repository:
69 | * Encrypted private key: `snowflake/keys/snowflake_rsa_key.p8`
70 | * Private passphrase to decrypt it: `mypassphrase`
71 | * Public key: `snowflake/keys/snowflake_rsa_key.pub`
72 | 
73 | As the next step, register the public key in the Snowflake Worksheet (replace the key in this script with the content of your `snowflake/keys/snowflake_rsa_key.pub`, without the header and footer lines):
74 | 
75 | ```sql
76 | USE ROLE ACCOUNTADMIN;
77 | ALTER USER dariocazas SET rsa_public_key='MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAwBwYbPtbEUXueQ6u3KDw
78 | zlKu4IhAkGdcUBVbdTdUVBLNVsZX+eiKOedN3EnMtDeVzRlaT8JAwHX0LVXkgXtn
79 | KzMBp6TpS4j+2kKvbZc5p0KfZHjn42G+C/DXI4ZNQZEBQ/Q4UY6OkTZepFaOX3ev
80 | 2icxB6LnnVYI3WHkSnq3vTthhYhTuUOQ4YRudadOtoT4By09hxbsaanVl42FXIZP
81 | AXX1jwawzKe52V1+FB5/UMv+JMUFfczlO+acn/EaZvKbR55Vk/+OVrUP4KIKvdWn
82 | s/n4ASYqxiw9xjrizGCoUyl+b+Ch6A02fTU02HrT9jOOj+dVAeFD2QGOqaze0eCD
83 | dwIDAQAB';
84 | ```
85 | 
86 | ## 2. How to capture data changes from databases to a Kafka topic
87 | 
88 | In this step, you start two different database engines and enable a CDC process. As a result, you have two Kafka topics with Debezium events that you can consume.
89 | 
90 | ![capture-data-changes](.images/solution-capture-data-changes.png)
91 | 
92 | ### 2.1. Start local services
93 | 
94 | The repository contains a docker-compose file to run several services in your local environment:
95 | * Two database engines: MySQL and PostgreSQL
96 | * One Kafka broker (and its zookeeper)
97 | * Two Kafka connect services: one to run the Debezium CDC tasks and another to send the events to Snowflake
98 | 
99 | ![docker-compose](.images/docker-compose.png)
100 | 
101 | In a terminal run:
102 | 
103 | ```sh
104 | cd services
105 | docker-compose up
106 | ```
107 | 
108 | It can take several minutes to download and start the services. Keep this terminal open to be able to see the service logs. When the tutorial is finished, you can stop everything using `Ctrl+C`.
109 | 
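Optionally, before moving on, check from a second terminal that all the containers are up. This is just a quick sanity check; the exact service names are defined in `services/docker-compose.yml`, so they may differ slightly from the summary above:

```sh
# List the services started by docker-compose and their state
cd services
docker-compose ps
# Expect the two databases, zookeeper, kafka and both Kafka Connect services in the "Up" state
```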
110 | ### 2.2. Prepare the databases
111 | 
112 | There are two SQL initialization scripts:
113 | * `database/sql/00_mysql_init.sql`: creates the `users` table
114 | * `database/sql/00_postgres_init.sql`: creates the `product` table
115 | 
116 | To apply these SQL scripts in the dockerized services and populate some data, run the following lines in a terminal:
117 | 
118 | ```sh
119 | cd database
120 | # create tables
121 | ./init_db.sh
122 | # Populate data
123 | ./mysql_crud.sh
124 | ./postgres_crud.sh
125 | ```
126 | 
127 | In the output, you can see several CRUD operations on the tables and the final state after those operations. You can close this terminal.
128 | 
129 | ### 2.3. Start Debezium
130 | 
131 | The docker service `cdc_connect` has the necessary dependencies to run Debezium over MySQL and Postgres. The configuration is available in:
132 | * `debezium/connect/debezium-mysql-inventory-connector.json`
133 | * `debezium/connect/debezium-postgres-inventory-connector.json`
134 | 
135 | Open a terminal and initialize the capture of the tables:
136 | ```sh
137 | cd debezium
138 | ./init_cdc.sh
139 | ```
140 | 
141 | In the docker-compose terminal, you can see how the connectors start. When the log stabilizes, you can check the status of the Debezium connectors in the previous terminal using:
142 | ```sh
143 | # I assume you are in the debezium folder
144 | ./status_cdc.sh
145 | ```
146 | 
147 | You can close this terminal.
148 | 
149 | ### 2.4. Check data capture
150 | 
151 | You can test that the capture is working with this strategy:
152 | * Open a terminal with a live consumer of the events
153 | * Run CRUD operations in the database
154 | 
155 | First, for MySQL, open a terminal and run:
156 | ```sh
157 | cd services
158 | docker-compose exec kafka /kafka/bin/kafka-console-consumer.sh \
159 |     --bootstrap-server kafka:9092 --from-beginning \
160 |     --topic mysqldb.inventory.users \
161 |     --property print.key=true --property print.timestamp=true
162 | ```
163 | 
164 | The terminal will show every new event pushed by Debezium to Kafka, one for each insert/update/delete done in `inventory.users` in MySQL.
165 | 
166 | Now do the same for PostgreSQL in another terminal:
167 | ```sh
168 | cd services
169 | docker-compose exec kafka /kafka/bin/kafka-console-consumer.sh \
170 |     --bootstrap-server kafka:9092 --from-beginning \
171 |     --topic postgresdb.inventory.product \
172 |     --property print.key=true --property print.timestamp=true
173 | ```
174 | 
175 | To generate new events, open a terminal and run:
176 | ```sh
177 | cd database
178 | ./mysql_crud.sh
179 | ./postgres_crud.sh
180 | ```
181 | 
182 | You should see new data change events in the consumer terminals.
183 | 
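If you want to inspect the structure of a single change event instead of watching the live stream, you can combine the console consumer with the `jq` prerequisite. This is only a sketch, and it assumes at least one event already exists in the topic:

```sh
# Print the operation code and the before/after images of the first captured event
cd services
docker-compose exec kafka /kafka/bin/kafka-console-consumer.sh \
    --bootstrap-server kafka:9092 --from-beginning --max-messages 1 \
    --topic mysqldb.inventory.users | jq '.payload | {op, before, after}'
```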
184 | ## 3. How to push data changes from a Kafka topic into Snowflake
185 | 
186 | In this step, you send the Kafka events to Snowflake and generate a replica of the source tables.
187 | 
188 | ![sink-snowflake](.images/solution-sink-snowflake.png)
189 | 
190 | ### 3.1. Start local sink process
191 | 
192 | The docker service `sink_connect` has the necessary dependencies to run the Snowflake Sink connector to push new Kafka events into the Snowflake tables. The configuration is available in `snowflake/connect/snowflake-sink-connector.json`, and you need to update:
193 | * The Snowflake URL with yours, in the field `snowflake.url.name`
194 | * The authentication fields, if you generated your own key pair in the previous step: `snowflake.private.key` and `snowflake.private.key.passphrase`
195 | 
196 | Open a terminal and start the upload of the Kafka topics:
197 | ```sh
198 | cd snowflake
199 | ./init_sink.sh
200 | ```
201 | 
202 | In the docker-compose terminal, you can see how the connector starts. When the log stabilizes, you can check the status of the Snowflake connector in the previous terminal using:
203 | ```sh
204 | # From the snowflake folder
205 | ./status_sink.sh
206 | ```
207 | 
208 | ### 3.2. Check data capture into CDC tables
209 | 
210 | When the sink connector uploads the events from the Kafka topics, it creates these tables:
211 | * `CDC_MYSQL_INVENTORY_USERS`
212 | * `CDC_POSTGRESDB_INVENTORY_PRODUCT`
213 | 
214 | The upload to Snowflake is done in batches, so it may take some time until the data is available in Snowflake (on the order of 30-60 seconds).
215 | 
216 | From your Snowflake Worksheet, validate that your events are populated in the new tables:
217 | ```sql
218 | USE ROLE ACCOUNTADMIN;
219 | USE SCHEMA HOWTO_DB.PUBLIC;
220 | SELECT * FROM CDC_MYSQL_INVENTORY_USERS;
221 | SELECT * FROM CDC_POSTGRESDB_INVENTORY_PRODUCT;
222 | ```
223 | 
224 | Applying new changes in your dockerized databases produces new rows in these tables:
225 | 
226 | 1. In the Snowflake Worksheet, check the current event counts:
227 | ```sql
228 | SELECT 'Events MySQL', COUNT(1) FROM CDC_MYSQL_INVENTORY_USERS
229 | UNION ALL
230 | SELECT 'Events PostgreSQL', COUNT(1) FROM CDC_POSTGRESDB_INVENTORY_PRODUCT;
231 | ```
232 | 2. From a terminal, apply changes in your databases:
233 | ```sh
234 | cd database
235 | ./mysql_crud.sh
236 | ./postgres_crud.sh
237 | ```
238 | 3. Wait until the events are sent to Snowflake (you can see the log in the docker-compose terminal)
239 | 4. Repeat the query in the Snowflake Worksheet
240 | 
241 | ### 3.3. Apply replication logic
242 | 
243 | In the repository there are two scripts with the SQL logic to generate the replica of the source tables:
244 | * `snowflake/sql/01-cdc-to-replica-mysql.sql`
245 | * `snowflake/sql/01-cdc-to-replica-postgres.sql`
246 | 
247 | From your Snowflake Worksheet, execute these two scripts. As a result, you have two views with the same structure as the source tables:
248 | * `MYSQL_INVENTORY_USERS`
249 | * `POSTGRESDB_INVENTORY_PRODUCT`
250 | 
251 | Both scripts follow the same logic, creating a scheduled task that processes the new events as they arrive and updates the replica table.
252 | 
253 | ![replication](.images/solution-replication.png)
254 | 
255 | > Note: one part of these SQL scripts (the MERGE statement) depends on the source database engine. The Debezium events carry metadata about the source engine, which is used to determine the last event for an entity. Take this into account if you replicate this logic in your production systems.
256 | 
257 | ### 3.4. Check data replication
258 | 
259 | The end-to-end flow is running now. You can check the data available in your local databases and validate it against the Snowflake views:
260 | 1. In a terminal, get the current state of the MySQL users table:
261 | ```sh
262 | cd services
263 | echo "SELECT * FROM users ORDER BY id" | docker-compose \
264 |     exec -T mysql \
265 |     bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory'
266 | ```
267 | 2. Go to the Snowflake Worksheet and validate the result with:
268 | ```sql
269 | USE ROLE ACCOUNTADMIN;
270 | USE SCHEMA HOWTO_DB.PUBLIC;
271 | SELECT * FROM MYSQL_INVENTORY_USERS;
272 | ```
273 | 3. In a terminal, get the current state of the PostgreSQL product table:
274 | ```sh
275 | # I assume you are in the services folder
276 | echo "SELECT * FROM product ORDER BY id" | docker-compose \
277 |     exec -T postgres \
278 |     env PGOPTIONS="--search_path=inventory" \
279 |     bash -c 'psql -U $POSTGRES_USER postgres'
280 | ```
281 | 4. And validate it in the Snowflake Worksheet:
282 | ```sql
283 | USE ROLE ACCOUNTADMIN;
284 | USE SCHEMA HOWTO_DB.PUBLIC;
285 | SELECT * FROM POSTGRESDB_INVENTORY_PRODUCT;
286 | ```
287 | 5. Generate new insert-delete-update operations from a terminal:
288 | ```sh
289 | cd database
290 | ./mysql_crud.sh
291 | ./postgres_crud.sh
292 | ```
293 | 6. Wait until the events are sent to Snowflake (review the docker-compose terminal log).
294 | 7. Wait until the scheduled task is triggered in Snowflake:
295 | ```sql
296 | USE ROLE ACCOUNTADMIN;
297 | select name, state, error_code, error_message, scheduled_time, next_scheduled_time
298 | from table(HOWTO_DB.information_schema.task_history())
299 | order by scheduled_time desc;
300 | ```
301 | 8. Validate the content of the tables in Snowflake again
302 | 
303 | # Conclusions
304 | 
305 | **Debezium provides an easy way to capture changes from databases** and publish the change events to a Kafka service, where you can consume them in several ways.
306 | 
307 | To propagate these changes to another database you can use the simplest approach (a JDBC sink), but this is not always the best option **in the context of the new generation of data warehouses**, and you probably need to **consider another kind of strategy, closer to that service,** for better performance and a lower cost of use.
308 | 
309 | First of all, study the possibilities; then, after **testing them with a POC similar to this howto** (including aspects like performance and cost review), **proceed with the next steps** (security, naming, automation, data quality, failover, ...).
310 | 
--------------------------------------------------------------------------------
/database/README.md:
--------------------------------------------------------------------------------
1 | # Howto - Database description
2 | 
3 | 
4 | ![PostgreSQL-logo](../.images/PostgreSQL_logo.3colors.120x120.png)
5 | ![MySQL-logo](../.images/logo-mysql-170x115.png)
6 | 
7 | * [Access to database shell](#access-to-database-shell)
8 | * [Tables](#tables)
9 | * [CRUD operations](#crud-operations)
10 | 
11 | 
12 | As part of this howto, I provide:
13 | 
14 | - SQL scripts to create new tables and data
15 | - Bash scripts to apply the SQL over the dockerized databases
16 | 
17 | ## Access to database shell
18 | 
19 | You can open the shell of your database and run your commands:
20 | 
21 | ```sh
22 | # Go to services folder (important)
23 | cd howto-debezium-to-snowflake/services
24 | 
25 | # Access to MySQL shell
26 | docker-compose exec mysql \
27 |     bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory'
28 | 
29 | # Access to Postgres shell
30 | docker-compose exec postgres \
31 |     env PGOPTIONS="--search_path=inventory" \
32 |     bash -c 'psql -U $POSTGRES_USER postgres'
33 | ```
34 | 
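As a quick smoke test that the shell access works, you can also pipe a statement into each client, following the same pattern used by the scripts in this folder (a sketch; run it from the `services` folder):

```sh
# List the tables in the MySQL inventory database
echo "SHOW TABLES;" | docker-compose exec -T mysql \
    bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory'

# And the equivalent for Postgres
echo "\dt" | docker-compose exec -T postgres \
    env PGOPTIONS="--search_path=inventory" \
    bash -c 'psql -U $POSTGRES_USER postgres'
```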
35 | ## Tables
36 | 
37 | Well, to simplify the howto, we use the database images provided by Debezium.
38 | When the database services are up, you should run this script:
39 | 
40 | ```sh
41 | ./init_db.sh
42 | ```
43 | 
44 | This script initializes tables in both database instances (MySQL and PostgreSQL),
45 | loading them from the `./sql` folder.
46 | 
47 | The SQL script [`sql/00_mysql_init.sql`](./sql/00_mysql_init.sql) creates the
48 | **users table** with five basic fields, common to a lot of databases.
49 | 
50 | The SQL script [`sql/00_postgres_init.sql`](./sql/00_postgres_init.sql) creates the
51 | **product table** with five basic fields, common to a lot of databases.
52 | 
53 | The `init_db.sh` script uses these SQL files to initialize the tables (one for each database)
54 | in the preconfigured `inventory` database.
55 | 
56 | Both tables have a `created_on` field with the timestamp of creation. This field
57 | is not necessary for CDC, but it can be useful to perform some checks in the sink destination.
58 | 
59 | ## CRUD operations
60 | 
61 | Well, as part of the demo, you need to run actions over the databases. For this reason,
62 | I provide two scripts:
63 | 
64 | - `mysql_crud.sh`: triggers several inserts, updates and deletes, and shows the final state of the **users** table
65 | - `postgres_crud.sh`: the same, but over the PostgreSQL **product** table
66 | 
67 | You can launch these scripts over and over again to generate new data in the databases,
68 | which will be replicated via CDC as events in Kafka.
69 | 
70 | 
--------------------------------------------------------------------------------
/database/init_db.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DOCKER_COMPOSE_FILE=docker-compose.yml
3 | DOCKER_COMPOSE_RELATIVE_PATH=../services
4 | 
5 | MYSQL=`cat sql/00_mysql_init.sql`
6 | POSTGRES=`cat sql/00_postgres_init.sql`
7 | 
8 | cd $DOCKER_COMPOSE_RELATIVE_PATH
9 | 
10 | echo "MySQL new table"
11 | echo "$MYSQL"
12 | echo "$MYSQL" | docker-compose \
13 |     -f $DOCKER_COMPOSE_FILE \
14 |     exec -T mysql \
15 |     bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory'
16 | 
17 | echo "PostgreSQL new table"
18 | echo "$POSTGRES"
19 | echo "$POSTGRES" | docker-compose \
20 |     -f $DOCKER_COMPOSE_FILE \
21 |     exec -T postgres \
22 |     env PGOPTIONS="--search_path=inventory" \
23 |     bash -c 'psql -U $POSTGRES_USER postgres'
24 | 
--------------------------------------------------------------------------------
/database/mysql_crud.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DOCKER_COMPOSE_FILE=docker-compose.yml
3 | DOCKER_COMPOSE_RELATIVE_PATH=../services
4 | 
5 | run_sql() {
6 |     echo "$1"
7 |     echo "$1" | docker-compose \
8 |         -f $DOCKER_COMPOSE_FILE \
9 |         exec -T mysql \
10 |         bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory 2> /dev/null'
11 | }
12 | 
13 | DML=$(cat sql/01_mysql_changes.sql)
14 | 
15 | cd $DOCKER_COMPOSE_RELATIVE_PATH
16 | run_sql "$DML"
17 | 
--------------------------------------------------------------------------------
/database/postgres_crud.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DOCKER_COMPOSE_FILE=docker-compose.yml
3 | DOCKER_COMPOSE_RELATIVE_PATH=../services
4 | 
5 | run_sql() {
6 |     echo "$1"
7 |     echo "$1" | docker-compose \
8 |         -f $DOCKER_COMPOSE_FILE \
9 |         exec -T postgres \
10 |         env PGOPTIONS="--search_path=inventory" \
11 |         bash -c 'psql -U $POSTGRES_USER postgres 2> /dev/null'
12 | }
13 | 
14 | DML=$(cat sql/01_postgres_changes.sql)
15 | 
16 | cd $DOCKER_COMPOSE_RELATIVE_PATH
17 | run_sql "$DML"
18 | 
-------------------------------------------------------------------------------- /database/sql/00_mysql_init.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE users ( 2 | id MEDIUMINT PRIMARY KEY AUTO_INCREMENT, 3 | name VARCHAR(20), 4 | email VARCHAR(255), 5 | password VARCHAR(100), 6 | created_on TIMESTAMP DEFAULT CURRENT_TIMESTAMP 7 | ); 8 | -------------------------------------------------------------------------------- /database/sql/00_postgres_init.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE product ( 2 | id serial PRIMARY KEY, 3 | name VARCHAR(100), 4 | description VARCHAR(255), 5 | created_on TIMESTAMP NOT NULL DEFAULT NOW() 6 | ); 7 | -------------------------------------------------------------------------------- /database/sql/01_mysql_changes.sql: -------------------------------------------------------------------------------- 1 | -- Insert six users in three sentences 2 | INSERT INTO users(name, email, password) 3 | SELECT 'Lara', concat('lara', LEFT(UUID(), 4), '@email.com'), LEFT(UUID(), 25) 4 | ; 5 | INSERT INTO users(name, email, password) 6 | SELECT 'Jackson', concat('jackson', LEFT(UUID(), 4), '@email.com'), LEFT(UUID(), 25) 7 | ; 8 | INSERT INTO users(name, email, password) 9 | SELECT name, concat(lower(name), LEFT(UUID(), 4), '@email.com'), LEFT(UUID(), 25) 10 | FROM ( 11 | SELECT 'Hana' AS name 12 | UNION SELECT 'Morgan' 13 | UNION SELECT 'Willie' 14 | UNION SELECT 'Bruce' 15 | ) t; 16 | -- Update last two user passwords 17 | UPDATE users SET password=LEFT(UUID(), 10) ORDER BY id DESC LIMIT 2 18 | ; 19 | -- Update first user password 20 | UPDATE users SET password=LEFT(UUID(), 5) ORDER BY id LIMIT 1 21 | ; 22 | -- Delete last user 23 | DELETE FROM users ORDER BY id DESC LIMIT 1 24 | ; 25 | -- Show actual state 26 | SELECT * FROM users ORDER BY id 27 | ; -------------------------------------------------------------------------------- /database/sql/01_postgres_changes.sql: -------------------------------------------------------------------------------- 1 | -- Insert six products in two sentences 2 | INSERT INTO product(name, description) 3 | SELECT name, concat('Description for ', name) 4 | FROM ( 5 | VALUES ('Harley Davidson Ultimate Chopper'), 6 | ('1996 Moto Guzzi 1100i') 7 | ) t (name) 8 | ; 9 | INSERT INTO product(name, description) 10 | SELECT name, concat('Description for ', name) 11 | FROM ( 12 | VALUES ('1985 Toyota Supra'), 13 | ('1957 Ford Thunderbird'), 14 | ('1938 Cadillac V-16 Presidential Limousine'), 15 | ('1982 Lamborghini Diablo') 16 | ) t (name) 17 | ; 18 | -- Update last two descriptions 19 | UPDATE product 20 | SET description=concat('(Update ', NOW(), ') - Desc. for ', name) 21 | WHERE id in ( 22 | SELECT id FROM product ORDER BY id DESC LIMIT 2 23 | ) 24 | ; 25 | -- Update first description 26 | UPDATE product 27 | SET description=concat('(Up. ', NOW(), ') - Desc. 
for ', name)
28 | WHERE id in (
29 |     SELECT min(id) FROM product
30 | )
31 | ;
32 | -- Delete last product
33 | DELETE FROM product
34 | WHERE id in (
35 |     SELECT id FROM product ORDER BY id DESC LIMIT 1
36 | )
37 | ;
38 | -- Show actual state
39 | SELECT * FROM product ORDER BY id
40 | ;
--------------------------------------------------------------------------------
/debezium/README.md:
--------------------------------------------------------------------------------
1 | # Howto - CDC with Debezium
2 | 
3 | ![Debezium-logo](../.images/color_white_debezium_type_600px.svg)
4 | 
5 | * [Usage](#usage)
6 | * [Context](#context)
7 |   + [Change Events](#change-events)
8 |   + [Connector actions](#connector-actions)
9 |   + [Connectors config](#connectors-config)
10 |     - [MySQL connector](#mysql-connector)
11 |     - [PostgreSQL connector](#postgresql-connector)
12 |     - [Secret management](#secret-management)
13 | 
14 | As part of this howto, I provide:
15 | 
16 | - Kafka connect configurations to capture changes from the MySQL and PostgreSQL databases
17 | - Scripts to create, destroy and check the status of these connectors
18 | 
19 | ## Usage
20 | 
21 | This folder includes three scripts that perform actions against the docker service `cdc_connect`:
22 | 
23 | - `init_cdc.sh`: takes the configurations available in the `./connect` folder and calls
24 |   the Kafka connect REST API to create the connectors that capture the changes
25 |   in the databases and push them to Kafka
26 | - `status_cdc.sh`: calls the Kafka connect REST API to get the list of configured
27 |   connectors, and then shows the status of each one
28 | - `delete_cdc.sh`: similar to the status script, but deletes all the connectors in this
29 |   Kafka connect service
30 | 
31 | With these scripts, you can run whatever tests you wish:
32 | 
33 | - Create connectors before or after the tables exist or have data
34 | - Destroy connectors, insert new data, and create them again to check for data loss
35 | - Whatever other test you can think of
36 | 
37 | ## Context
38 | 
39 | Kafka connect makes it possible to push events to Kafka from other systems, or pull events from Kafka into
40 | other systems, using only a configuration file, without developing a source/sink application.
41 | 
42 | The Kafka connector plugin needs to be deployed into the Kafka connect nodes (called
43 | worker nodes). After doing this, you can call a REST API with a configuration to
44 | enable a connector that pushes data from an external source to Kafka (as the CDC connector does for you)
45 | or pulls data from Kafka to other sink destinations.
46 | 
47 | ### Change Events
48 | 
49 | In Kafka, a topic can have one or more partitions. This enables parallel reads from consumers
50 | in the same consumer group. A consumer group is a group of consumers that sees the topic as
51 | a queue: each consumer can pull events from several partitions, but one partition cannot
52 | be read by more than one consumer of the same consumer group. This is the main point to understand
53 | one part of the event: the key.
54 | 
55 | An event has three parts:
56 | - Key:
57 |   - By default, all events with the same key are pushed to the same partition.
58 |   - It can be null; in this case, by default, a round-robin between partitions is performed on push.
59 | - Value: the event data
60 | - Headers: a collection of key-value pairs that can be set
61 | 
62 | Compared to the native CDC of each database, Debezium provides decoupling between the
63 | database engine and the events it emits, standardizing them and making them as uniform as possible.
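Since the ordering guarantees described above depend on the key-to-partition mapping, it can be useful to check how a CDC topic is partitioned. A minimal sketch against the dockerized Kafka, run from the `services` folder (older Kafka images may require `--zookeeper` instead of `--bootstrap-server`):

```sh
# Describe a CDC topic: partition count, leader and replicas
docker-compose exec kafka /kafka/bin/kafka-topics.sh \
    --bootstrap-server kafka:9092 --describe \
    --topic mysqldb.inventory.users
```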
64 | 
65 | As a key, Debezium (and other change data capture tools) includes the key fields of the table.
66 | 
67 | As a value, Debezium sends these common fields:
68 | - source: a metadata document about the connector and the source database
69 | - op: the operation code, which can be `r` (read, snapshot), `c` (create, insert), `u` (update), `d` (delete)
70 | - after: a document with the data state after the database operation
71 | - before: a document with the data state before the database operation
72 | 
73 | 
74 | Example of a key serialized as JSON:
75 | 
76 | ```JSON
77 | {
78 |   "payload": {
79 |     "id": 1
80 |   },
81 |   "schema": {
82 |     "fields": [
83 |       {
84 |         "field": "id",
85 |         "optional": false,
86 |         "type": "int32"
87 |       }
88 |     ],
89 |     "name": "mysqldb.inventory.users.Key",
90 |     "optional": false,
91 |     "type": "struct"
92 |   }
93 | }
94 | ```
95 | 
96 | 
97 | 98 |
99 | Example of value seralized as JSON 100 | 101 | ```JSON 102 | { 103 | "payload": { 104 | "after": { 105 | "created_on": "2021-07-28T16:32:45Z", 106 | "email": "lara7012@email.com", 107 | "id": 1, 108 | "name": "Lara", 109 | "password": "701280aa-efc1-11eb-a7c9-0" 110 | }, 111 | "before": null, 112 | "op": "c", 113 | "source": { 114 | "connector": "mysql", 115 | "db": "inventory", 116 | "file": "mysql-bin.000003", 117 | "gtid": null, 118 | "name": "mysqldb", 119 | "pos": 703, 120 | "query": null, 121 | "row": 0, 122 | "sequence": null, 123 | "server_id": 223344, 124 | "snapshot": "false", 125 | "table": "users", 126 | "thread": null, 127 | "ts_ms": 1627489965000, 128 | "version": "1.6.1.Final" 129 | }, 130 | "transaction": null, 131 | "ts_ms": 1627489965300 132 | }, 133 | "schema": { 134 | "fields": [ 135 | { 136 | "field": "before", 137 | "fields": [ 138 | { 139 | "field": "id", 140 | "optional": false, 141 | "type": "int32" 142 | }, 143 | { 144 | "field": "name", 145 | "optional": true, 146 | "type": "string" 147 | }, 148 | { 149 | "field": "email", 150 | "optional": true, 151 | "type": "string" 152 | }, 153 | { 154 | "field": "password", 155 | "optional": true, 156 | "type": "string" 157 | }, 158 | { 159 | "field": "created_on", 160 | "name": "io.debezium.time.ZonedTimestamp", 161 | "optional": true, 162 | "type": "string", 163 | "version": 1 164 | } 165 | ], 166 | "name": "mysqldb.inventory.users.Value", 167 | "optional": true, 168 | "type": "struct" 169 | }, 170 | { 171 | "field": "after", 172 | "fields": [ 173 | { 174 | "field": "id", 175 | "optional": false, 176 | "type": "int32" 177 | }, 178 | { 179 | "field": "name", 180 | "optional": true, 181 | "type": "string" 182 | }, 183 | { 184 | "field": "email", 185 | "optional": true, 186 | "type": "string" 187 | }, 188 | { 189 | "field": "password", 190 | "optional": true, 191 | "type": "string" 192 | }, 193 | { 194 | "field": "created_on", 195 | "name": "io.debezium.time.ZonedTimestamp", 196 | "optional": true, 197 | "type": "string", 198 | "version": 1 199 | } 200 | ], 201 | "name": "mysqldb.inventory.users.Value", 202 | "optional": true, 203 | "type": "struct" 204 | }, 205 | { 206 | "field": "source", 207 | "fields": [ 208 | { 209 | "field": "version", 210 | "optional": false, 211 | "type": "string" 212 | }, 213 | { 214 | "field": "connector", 215 | "optional": false, 216 | "type": "string" 217 | }, 218 | { 219 | "field": "name", 220 | "optional": false, 221 | "type": "string" 222 | }, 223 | { 224 | "field": "ts_ms", 225 | "optional": false, 226 | "type": "int64" 227 | }, 228 | { 229 | "default": "false", 230 | "field": "snapshot", 231 | "name": "io.debezium.data.Enum", 232 | "optional": true, 233 | "parameters": { 234 | "allowed": "true,last,false" 235 | }, 236 | "type": "string", 237 | "version": 1 238 | }, 239 | { 240 | "field": "db", 241 | "optional": false, 242 | "type": "string" 243 | }, 244 | { 245 | "field": "sequence", 246 | "optional": true, 247 | "type": "string" 248 | }, 249 | { 250 | "field": "table", 251 | "optional": true, 252 | "type": "string" 253 | }, 254 | { 255 | "field": "server_id", 256 | "optional": false, 257 | "type": "int64" 258 | }, 259 | { 260 | "field": "gtid", 261 | "optional": true, 262 | "type": "string" 263 | }, 264 | { 265 | "field": "file", 266 | "optional": false, 267 | "type": "string" 268 | }, 269 | { 270 | "field": "pos", 271 | "optional": false, 272 | "type": "int64" 273 | }, 274 | { 275 | "field": "row", 276 | "optional": false, 277 | "type": "int32" 278 | }, 279 | { 280 | "field": "thread", 
281 | "optional": true, 282 | "type": "int64" 283 | }, 284 | { 285 | "field": "query", 286 | "optional": true, 287 | "type": "string" 288 | } 289 | ], 290 | "name": "io.debezium.connector.mysql.Source", 291 | "optional": false, 292 | "type": "struct" 293 | }, 294 | { 295 | "field": "op", 296 | "optional": false, 297 | "type": "string" 298 | }, 299 | { 300 | "field": "ts_ms", 301 | "optional": true, 302 | "type": "int64" 303 | }, 304 | { 305 | "field": "transaction", 306 | "fields": [ 307 | { 308 | "field": "id", 309 | "optional": false, 310 | "type": "string" 311 | }, 312 | { 313 | "field": "total_order", 314 | "optional": false, 315 | "type": "int64" 316 | }, 317 | { 318 | "field": "data_collection_order", 319 | "optional": false, 320 | "type": "int64" 321 | } 322 | ], 323 | "optional": true, 324 | "type": "struct" 325 | } 326 | ], 327 | "name": "mysqldb.inventory.users.Envelope", 328 | "optional": false, 329 | "type": "struct" 330 | } 331 | } 332 | ``` 333 | 334 |
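A quick way to get a feel for these events from the command line is to count the operation codes captured so far. This is only a sketch: it is run from the `services` folder, needs `jq` on the host, and stops after an idle timeout (tombstone events show up as `null`):

```sh
# Count how many r/c/u/d operations have been captured for the users table so far
docker-compose exec kafka /kafka/bin/kafka-console-consumer.sh \
    --bootstrap-server kafka:9092 --from-beginning --timeout-ms 10000 \
    --topic mysqldb.inventory.users | jq -r '.payload.op' | sort | uniq -c
```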
335 | 336 | To maintain simplicity, this demo works with JSON events with the schema included in the event. 337 | In a non-test environment, the recommended approach is to use a Schema Registry to store the schemas 338 | and other serialization format like Avro to store it. 339 | 340 | ### Connector actions 341 | 342 | When connectors perform the first run, you can see an initial snapshot of the database (which is a configurable option). 343 | After doing this, every change applied to the tables that these connectors listen will be the track to Kafka. This include: 344 | - When you add new rows, one event per row will be inserted 345 | - When you update rows, 346 | - One event per row will be updated 347 | - If an update affects the key of the table, Debezium throw like a delete action and a new insert of data 348 | - When you delete rows, two events per row will be deleted (configurable option): 349 | - One event with info about the operation DELETE 350 | - Another event with a null value (events in Kafka have key, value, and headers, and any can be null) 351 | 352 | Each event has as key the key of the table, that enables guarantees of order. The topics of Kafka 353 | have properties to identify data retention and clean policies: 354 | - Retention by time 355 | - Retention by size 356 | - Retention by compaction 357 | 358 | When using compaction hold, when Kafka triggers the cleanup process, it keeps the last event for each key on the topic. 359 | If the last event for a key has a null value, Kafka removes all events for this key. With this approach, 360 | when a new consumer begins to read the topic, he does not have to download the changes from the origin of the replica: 361 | he first obtains the state of the table from the moment of the last compaction, and then continues reading 362 | the changes captured since then. 363 | 364 | 365 | ### Connectors config 366 | 367 | The Kafka connectors have common configuration properties and others that depend of 368 | the Kafka connector plugin that you use. A FileStreamSource connector needs 369 | the configuration of the file to read, and a CDC connector need info about the 370 | database that should be read: the configuration is not the same, but 371 | some parts are common: 372 | - name: all connectors should have a name to reference it 373 | - connector.class: the class that implements the connector, that may be a 374 | source (push external data to Kafka) or sink (pull data from Kafka to another system) 375 | - tasks.max: the maximum number of tasks that perform the source/sink action 376 | 377 | To review other common configurations, you can review [the official doc about kafka connect configuring]. 378 | 379 | Another main point of the Kafka connector is the ability to do some basic transformations (called SMT) 380 | of the event, like add some field or change the event key. We don't perform this 381 | in this howto, but can be interested in some use cases. 382 | 383 | #### MySQL connector 384 | 385 | You can see all the documentation about this Kafka connector plugin in 386 | the [Debezium connector for MySQL] page. 387 | 388 | This connector supports several MySQL topologies, but this demo will track 389 | changes for a standalone MySQL server. 390 | 391 | When you start the connector, you can see three new topics: 392 | 393 | - `mysqldb`: schema change topic, with schema change events that include all DDL 394 | statements applied to databases in the MySQL server. 
#### MySQL connector

You can find all the documentation about this Kafka connector plugin on
the [Debezium connector for MySQL] page.

This connector supports several MySQL topologies, but this demo tracks
changes for a standalone MySQL server.

When you start the connector, you can see three new topics:

- `mysqldb`: schema change topic, with schema change events that include all DDL
  statements applied to databases in the MySQL server. The name of this topic is
  the one set in the `database.server.name` property
- `mysqldb.schema-changes.inventory`: tracks DDL changes in the database and is
  required for the internal management of the CDC connector. You can configure the
  topic name with `database.history.kafka.topic`
- `mysqldb.inventory.users`:
  - If you ran the steps in the [database readme], you should have a topic for this table
  - This topic holds the change events for the `users` table

You can see the connector config in [`connect/debezium-mysql-inventory-connector.json`](./connect/debezium-mysql-inventory-connector.json):

- Connection properties:
  - `database.hostname`: IP address or hostname of the MySQL database server.
  - `database.port`: integer port number of the MySQL database server.
  - `database.user`: name of the MySQL user to use when connecting to the MySQL database server.
  - `database.password`: password to use when connecting to the MySQL database server.
  - `database.server.id`: a numeric ID of this database client, which must be unique across all
    currently-running database processes in the MySQL cluster. If not set, a random number is used.
  - `database.server.name`: logical name that identifies and provides a namespace for the particular
    MySQL database server/cluster in which Debezium is capturing changes.
- CDC properties:
  - `database.history.kafka.bootstrap.servers`: a list of host/port pairs that the connector uses for
    establishing an initial connection to the Kafka cluster. Each pair should point to the same Kafka
    cluster used by the Kafka Connect process.
  - `database.history.kafka.topic`: the full name of the Kafka topic where the connector stores the
    database schema history.
  - `database.include`: name of the database for which to capture changes. The connector does not capture
    changes in any database whose name is not in this property or in `database.include.list`.
  - `table.include.list`: an optional, comma-separated list of regular expressions that match
    fully-qualified table identifiers of tables whose changes you want to capture.
    The connector does not capture changes in any table not included in `table.include.list`.
  - There are also properties to exclude (rather than include) databases/tables, and many other
    options. Review the [official doc](https://debezium.io/documentation/reference/connectors/mysql.html#mysql-connector-properties).

#### PostgreSQL connector

You can find all the documentation about this Kafka connector plugin on
the [Debezium connector for PostgreSQL] page.

In this case, when you start the connector you only see one topic:
- `postgresdb.inventory.product`:
  - If you ran the steps in the [database readme], you should have a topic for this table
  - This topic holds the change events for the `product` table

The properties used are very similar to the MySQL connector, so no additional description is needed.

#### Secret management

It is good practice to keep your secrets out of the connector configs. You can review [KIP-297] to use
an external config provider and reference them instead; a minimal sketch follows.
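The snippet below is only a sketch of that approach (it is not wired into this howto, and the file path and property names are illustrative assumptions): the Connect worker registers a `FileConfigProvider` (`config.providers=file`, `config.providers.file.class=org.apache.kafka.common.config.provider.FileConfigProvider`), and the connector config then references values stored in an external properties file instead of embedding them:

```json
{
  "database.user": "${file:/secrets/mysql-credentials.properties:user}",
  "database.password": "${file:/secrets/mysql-credentials.properties:password}"
}
```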
444 | 445 | 446 | [database readme]: ../database/README.md 447 | [docker readme]: ../docker/README.md 448 | [Debezium connector for MySQL]: https://debezium.io/documentation/reference/connectors/mysql.html 449 | [Debezium connector for PostgreSQL]: https://debezium.io/documentation/reference/connectors/postgresql.html 450 | [the official doc about kafka connect configuring]: https://kafka.apache.org/documentation.html#connect_configuring 451 | [KIP-297]: https://cwiki.apache.org/confluence/display/KAFKA/KIP-297%3A+Externalizing+Secrets+for+Connect+Configurations -------------------------------------------------------------------------------- /debezium/connect/debezium-mysql-inventory-connector.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "debezium-mysql-inventory-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.mysql.MySqlConnector", 5 | "tasks.max": "1", 6 | "database.hostname": "mysql", 7 | "database.port": "3306", 8 | "database.user": "debezium", 9 | "database.password": "dbz", 10 | "database.server.id": "184054", 11 | "database.server.name": "mysqldb", 12 | "database.include": "inventory", 13 | "database.history.kafka.bootstrap.servers": "kafka:9092", 14 | "database.history.kafka.topic": "mysqldb.schema-changes.inventory", 15 | "table.include.list": "inventory.users" 16 | } 17 | } -------------------------------------------------------------------------------- /debezium/connect/debezium-postgres-inventory-connector.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "debezium-postgres-inventory-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector", 5 | "tasks.max": "1", 6 | "database.hostname": "postgres", 7 | "database.port": "5432", 8 | "database.user": "postgres", 9 | "database.password": "postgres", 10 | "database.dbname": "postgres", 11 | "database.server.name": "postgresdb", 12 | "schema.include": "inventory", 13 | "table.include.list": "inventory.product" 14 | } 15 | } -------------------------------------------------------------------------------- /debezium/delete_cdc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONNECT_URL=http://localhost:8083 4 | 5 | CONNECTORS=$(curl -s -k ${CONNECT_URL}/connectors) 6 | echo Connector list: 7 | echo $CONNECTORS 8 | echo 9 | 10 | for row in $(echo "${CONNECTORS}" | jq -c -r '.[]'); do 11 | status=$(curl -s -k -X DELETE "${CONNECT_URL}/connectors/${row}") 12 | echo Deleted ${row} 13 | done 14 | -------------------------------------------------------------------------------- /debezium/init_cdc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Based on https://github.com/debezium/debezium-examples/tree/master/tutorial 4 | 5 | CONNECT_URL=http://localhost:8083 6 | MYSQL_CONNECT_CONFIG=connect/debezium-mysql-inventory-connector.json 7 | POSTGRES_CONNECT_CONFIG=connect/debezium-postgres-inventory-connector.json 8 | 9 | echo "### Creating MySQL CDC connect ###" 10 | curl -i -X POST $CONNECT_URL/connectors \ 11 | -H "Content-Type:application/json" \ 12 | -d @$MYSQL_CONNECT_CONFIG 13 | echo . 14 | 15 | echo "### Creating Postgres CDC connect ###" 16 | curl -i -X POST $CONNECT_URL/connectors \ 17 | -H "Accept:application/json" \ 18 | -H "Content-Type:application/json" \ 19 | -d @$POSTGRES_CONNECT_CONFIG 20 | echo . 
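# Tip: after creating the connectors you can check that they are RUNNING
# with ./status_cdc.sh, or directly with:
#   curl -s http://localhost:8083/connectors | jq .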

--------------------------------------------------------------------------------
/debezium/status_cdc.sh:
--------------------------------------------------------------------------------
#!/bin/bash

CONNECT_URL=http://localhost:8083

CONNECTORS=$(curl -s -k ${CONNECT_URL}/connectors)
echo Connector list:
echo $CONNECTORS
echo

echo Connector status:
echo

for row in $(echo "${CONNECTORS}" | jq -c -r '.[]'); do
    status=$(curl -s -k -X GET "${CONNECT_URL}/connectors/${row}/status")
    echo $status
    echo
done
--------------------------------------------------------------------------------
/services/.env:
--------------------------------------------------------------------------------
DEBEZIUM_VERSION=1.6
COMPOSE_PROJECT_NAME=howto-debeizum-to-snowflake-${DEBEZIUM_VERSION}
CONFLUENT_VERSION=5.5.5
--------------------------------------------------------------------------------
/services/README.md:
--------------------------------------------------------------------------------
# Services

![docker-logo](../.images/horizontal-logo-monochromatic-white.png)

* [Usage](#usage)
* [Context](#context)
  + [Docker-compose detail](#docker-compose-detail)
  + [Access to containers](#access-to-containers)
    - [Kafka commands](#kafka-commands)
    - [Database commands](#database-commands)
* [References](#references)

As part of this howto, I provide:

- A docker-compose to run everything
- The `snowflake/keys` folder with the Snowflake keys
- A `.env` file with the product versions

## Usage

You can run it with a single command and see all the logs in your terminal. Clone this repository and go to the
`services` folder to run it:
```sh
git clone https://github.com/dariocazas/howto-debezium-to-snowflake.git
cd howto-debezium-to-snowflake/services
docker-compose up
```

You can stop it using `Ctrl+C`.

**It is important** to run this from the `services` folder, because Docker Compose reads the `.env` file located there.

## Context

### Docker-compose detail

The compose YML runs several images and exposes several ports. For simplicity, I use Debezium images for many parts:

- **mysql**: database instance provided by the Debezium team
- **postgres**: database instance provided by the Debezium team
- **zookeeper**: as part of the Kafka ecosystem
- **kafka**: a single Kafka broker, exposing its port 9092
- **cdc_connect**: Kafka Connect worker node, provided by the Debezium team, with the connector plugins for its supported databases
- **sink_connect**: Kafka Connect worker node, provided by Confluent. I include the installation of the Snowflake connector plugin

![Docker compose info](docker-compose.png)

### Access to containers

Inside the docker-compose file, you can see several commands that give you access to the containers.
Run them from the `services` folder so Docker Compose can read the `.env` file.
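For example, before running the commands below you can check that all the services are up and follow the logs of a single one (standard `docker-compose` subcommands, not specific to this howto):

```sh
# List the compose services and their current state
docker-compose ps

# Follow the logs of the CDC Kafka Connect worker
docker-compose logs -f cdc_connect
```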
#### Kafka commands

```sh
# List topics
docker-compose -f docker-compose.yml exec kafka /kafka/bin/kafka-topics.sh --bootstrap-server kafka:9092 --list

# Show all CDC MySQL data (including keys for the events)
docker-compose -f docker-compose.yml exec kafka /kafka/bin/kafka-console-consumer.sh \
    --bootstrap-server kafka:9092 --from-beginning \
    --topic mysqldb.inventory.users

# Show all CDC MySQL data (including keys for the events and the timestamp when the event was received in Kafka)
docker-compose -f docker-compose.yml exec kafka /kafka/bin/kafka-console-consumer.sh \
    --bootstrap-server kafka:9092 --from-beginning \
    --topic mysqldb.inventory.users \
    --property print.key=true --property print.timestamp=true

# Show all CDC Postgres data
docker-compose -f docker-compose.yml exec kafka /kafka/bin/kafka-console-consumer.sh \
    --bootstrap-server kafka:9092 --from-beginning \
    --topic postgresdb.inventory.product
```

#### Database commands

```sh
# Access the MySQL shell
docker-compose -f docker-compose.yml exec mysql \
    bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory'

# Access the Postgres shell
docker-compose -f docker-compose.yml exec postgres \
    env PGOPTIONS="--search_path=inventory" \
    bash -c 'psql -U $POSTGRES_USER postgres'
```

## References

- [Debezium tutorial](https://debezium.io/documentation/reference/1.6/tutorial.html)
- [Debezium images github](https://github.com/debezium/docker-images)
- [Confluent: kafka connect zero to hero](https://github.com/confluentinc/demo-scene/tree/master/kafka-connect-zero-to-hero)
- [Docker compose graph visualization](https://github.com/pmsipilot/docker-compose-viz)
--------------------------------------------------------------------------------
/services/docker-compose.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/services/docker-compose.png
--------------------------------------------------------------------------------
/services/docker-compose.yml:
--------------------------------------------------------------------------------
# Based on https://debezium.io/documentation/reference/1.5/tutorial.html
# Run as:
#   docker-compose up
---
version: "2"
services:

  zookeeper:
    image: debezium/zookeeper:${DEBEZIUM_VERSION}
    ports:
      - 2181:2181
      - 2888:2888
      - 3888:3888

  # You can list the existing topics with
  #   docker-compose exec kafka /kafka/bin/kafka-topics.sh --bootstrap-server kafka:9092 --list
  # After starting CDC, you can consume the events using these commands
  # MySQL:
  #   docker-compose exec kafka /kafka/bin/kafka-console-consumer.sh --bootstrap-server kafka:9092 --from-beginning --property print.key=true --topic mysqldb.inventory.users
  # Postgres:
  #   docker-compose exec kafka /kafka/bin/kafka-console-consumer.sh --bootstrap-server kafka:9092 --from-beginning --property print.key=true --topic postgresdb.inventory.product
  kafka:
    image: debezium/kafka:${DEBEZIUM_VERSION}
    ports:
      - 9092:9092
    environment:
      #ADVERTISED_HOST_NAME: localhost
      ZOOKEEPER_CONNECT: zookeeper:2181
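      # Note (assumption, not required for this howto): if you want to run a Kafka
      # consumer from the host machine instead of from inside the containers, you may
      # need to uncomment ADVERTISED_HOST_NAME above so the broker advertises an
      # address reachable from the host.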
depends_on: 30 | - zookeeper 31 | links: 32 | - zookeeper:zookeeper 33 | 34 | # docker-compose exec mysql bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory' 35 | mysql: 36 | image: debezium/example-mysql:${DEBEZIUM_VERSION} 37 | ports: 38 | - 3306:3306 39 | environment: 40 | MYSQL_ROOT_PASSWORD: debezium 41 | MYSQL_USER: mysqluser 42 | MYSQL_PASSWORD: mysqlpw 43 | 44 | # docker-compose exec postgres env PGOPTIONS="--search_path=inventory" bash -c 'psql -U $POSTGRES_USER postgres' 45 | postgres: 46 | image: debezium/example-postgres:${DEBEZIUM_VERSION} 47 | ports: 48 | - 5432:5432 49 | environment: 50 | POSTGRES_USER: postgres 51 | POSTGRES_PASSWORD: postgres 52 | 53 | cdc_connect: 54 | image: debezium/connect:${DEBEZIUM_VERSION} 55 | ports: 56 | - 8083:8083 57 | environment: 58 | BOOTSTRAP_SERVERS: kafka:9092 59 | GROUP_ID: cdc_connect_group 60 | REST_PORT: 8083 61 | REST_ADVERTISED_HOST_NAME: localhost 62 | CONFIG_STORAGE_TOPIC: my_cdc_connect_configs 63 | OFFSET_STORAGE_TOPIC: my_cdc_connect_offsets 64 | STATUS_STORAGE_TOPIC: my_cdc_connect_statuses 65 | CONFIG_STORAGE_REPLICATION_FACTOR: "1" 66 | OFFSET_STORAGE_REPLICATION_FACTOR: "1" 67 | STATUS_STORAGE_REPLICATION_FACTOR: "1" 68 | depends_on: 69 | - zookeeper 70 | - kafka 71 | - mysql 72 | - postgres 73 | links: 74 | - zookeeper:zookeeper 75 | - kafka:kafka 76 | - mysql:mysql 77 | - postgres:postgres 78 | 79 | sink_connect: 80 | image: confluentinc/cp-kafka-connect-base:${CONFLUENT_VERSION} 81 | ports: 82 | - 8085:8085 83 | environment: 84 | CONNECT_BOOTSTRAP_SERVERS: kafka:9092 85 | CONNECT_REST_PORT: 8085 86 | CONNECT_REST_ADVERTISED_HOST_NAME: "localhost" 87 | CONNECT_GROUP_ID: sink_connect_group 88 | CONNECT_CONFIG_STORAGE_TOPIC: my_sink_connect_configs 89 | CONNECT_OFFSET_STORAGE_TOPIC: my_sink_connect_offsets 90 | CONNECT_STATUS_STORAGE_TOPIC: my_sink_connect_statuses 91 | CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: "1" 92 | CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: "1" 93 | CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: "1" 94 | CONNECT_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 95 | CONNECT_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 96 | DEBEZIUM_VERSION: ${DEBEZIUM_VERSION} 97 | depends_on: 98 | - zookeeper 99 | - kafka 100 | links: 101 | - zookeeper:zookeeper 102 | - kafka:kafka 103 | # https://github.com/confluentinc/demo-scene/blob/master/kafka-connect-zero-to-hero/docker-compose.yml#L89-L101 104 | command: 105 | - bash 106 | - -c 107 | - | 108 | echo "Installing Connector" 109 | confluent-hub install --no-prompt snowflakeinc/snowflake-kafka-connector:1.5.5 110 | # 111 | echo "Launching Kafka Connect worker" 112 | /etc/confluent/docker/run & 113 | # 114 | sleep infinity 115 | -------------------------------------------------------------------------------- /services/render_compose_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script update the PNG with the services of the docker-compose.yml 4 | # Is based on https://github.com/pmsipilot/docker-compose-viz 5 | 6 | docker run --rm -it --name dcv -v $(pwd):/input pmsipilot/docker-compose-viz render -m image docker-compose.yml --force 7 | -------------------------------------------------------------------------------- /snowflake/README.md: -------------------------------------------------------------------------------- 1 | # Snowflake 2 | 3 | ![Snowflake-logo](../.images/Snowflake_Logo.svg.png) 4 | 5 | * [Sink to Snowflake scripts](#sink-to-snowflake-scripts) 6 | 
+ [Snowflake scripts](#snowflake-scripts)
* [Context](#context)
  + [Sink connector](#sink-connector)
  + [Snowflake security](#snowflake-security)
  + [Snowflake resource naming used](#snowflake-resource-naming-used)
  + [Snowflake CDC Debezium table](#snowflake-cdc-debezium-table)
  + [Snowflake replica table](#snowflake-replica-table)
  + [The final view](#the-final-view)

As part of this howto, I provide:

- Kafka Connect configurations to push change events from the CDC topics to Snowflake
- Scripts to create, destroy, and check the status of these connectors
- Snowflake SQL scripts with the replica transformation of the change event tables

## Sink to Snowflake scripts

This folder includes three bash scripts that perform actions against the Docker service `sink_connect`:

- `init_sink.sh`: takes the configuration available in the `./connect/snowflake-sink-connector.json` file and calls
  the Kafka Connect REST API to create the connector that sinks the CDC topics into Snowflake event tables
- `status_sink.sh`: calls the Kafka Connect REST API, gets the list of configured
  connectors, and shows you the status of each one
- `delete_sink.sh`: similar to status, but deletes all the connectors in this
  Kafka Connect service

**IMPORTANT**: you MUST change several parameters in the `./connect/snowflake-sink-connector.json` file:
- `snowflake.url.name`: the entry point of your Snowflake environment
- `snowflake.user.name`: your user name
- `snowflake.private.key`: your private key (the body of the `.p8` file on a single line, without header and footer)
- `snowflake.private.key.passphrase`: the passphrase of your private key (not needed if the generated key is unencrypted)

It is good practice to keep your secrets out of the connector configs. You can review [KIP-297] to use
an external provider to reference them.

With these scripts, you can run whatever test you want:

- Create the connector before or after the topics exist or have data
- Destroy the connector, insert new data, and create it again to check for data loss
- Whatever other test you can think of

### Snowflake scripts

Configure the replication in your Snowflake account with:

- `sql/00-security.sql`: you already ran part of it while following the [snowflake/keys] README. The script is documented.
- `sql/01-cdc-to-replica-mysql.sql`: creates a view similar to the original MySQL table, plus everything needed to replicate
  the events uploaded to Snowflake
- `sql/01-cdc-to-replica-postgres.sql`: like the MySQL one, but for the PostgreSQL table

## Context

### Sink connector

If you reviewed the [debezium] detail, you already have context about Kafka Connect
and how to configure it. As you can see, [this connector] is very similar:

- Common connector parts (name, connector class, ...)
- Snowflake connection properties and destination definition
  - You should configure your Snowflake account (url, user, keys...)
  - A `topic2table` mapping is recommended
- Other configs:
  - `key.converter`:
    - Tells the connector how to interpret the key of the events received from the topics.
    - You can use a generic JsonConverter, but Snowflake offers its own implementation, which supports some additional options
  - `value.converter`: like the `key.converter`, but focused on the value of the event
  - `behavior.on.null.values`:
    - A property specific to the Snowflake converters (generic alternatives exist)
    - The [debezium] readme explains how Debezium transforms DELETE actions
      into two events (one with the delete operation, and another with a `null` value)
    - A `null` value makes sense in a Kafka context, but not for a database like Snowflake; for this reason it is configured as `IGNORE`:
      these events are not uploaded to Snowflake

### Snowflake security

For simplicity, this demo should be run with the SYSADMIN role in Snowflake, after granting that role the privilege to execute tasks.

### Snowflake resource naming used

In this demo:
- All resources include the topic name in upper case, replacing the `.` with `_`
- The Debezium events are ingested into tables with the prefix `CDC_`
- The tables holding the replicated state use the prefix `REPLICA_`
- The streams (listeners over changes in Snowflake tables) used to batch new events for replication end with `_STREAM_REPLICATION`
- The tasks in charge of triggering the replication end with `_TASK_REPLICATION`

### Snowflake CDC Debezium table

In the sink Kafka connector configuration, you specify the database, schema, and table where the events are populated.
All the tables share the same format, with two columns:
- `RECORD_METADATA`: variant column with a JSON document that includes info about the original topic and the key of the event
- `RECORD_CONTENT`: variant column with a JSON document containing the value of the event.

Regarding the key and the value, this demo works with JSON serialization without a schema registry. The events generated by
the CDC include the JSON Schema of each event. If you look closely, `RECORD_CONTENT` holds exactly the same event value that
you see in the Kafka topic. The `RECORD_METADATA` record includes:

- CreateTime: when Kafka received the event
- topic: the name of the source topic
- partition: the number of the topic partition that contains the event
- offset: the position of the event in the partition
- key: the event key

```json
{
  "CreateTime": 1627490826351,
  "topic": "mysqldb.inventory.users",
  "partition": 0,
  "offset": 12,
  "key": {
    "payload": {
      "id": 1
    },
    "schema": {
      "fields": [
        {
          "field": "id",
          "optional": false,
          "type": "int32"
        }
      ],
      "name": "mysqldb.inventory.users.Key",
      "optional": false,
      "type": "struct"
    }
  }
}
```

You can use this table as the historical evolution of the source table, which can be useful for analytical purposes.
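As an illustrative example (assuming the sink connector has already ingested some events), you can project fields out of the variant columns with the same paths that the replication scripts use:

```sql
-- Peek at the raw CDC events: key, operation and row state from the variant columns
select record_metadata:key.payload.id::number as id,
       record_content:payload.op::string      as op,
       record_content:payload.after           as row_state,
       record_metadata:"CreateTime"::number   as kafka_create_time_ms
from "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS"
limit 10;
```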
### Snowflake replica table

One of the goals of this demo is to replicate the state of the source databases in Snowflake. This can be done
with other targets too (you can populate the topic data into another database via a JDBC sink connector), but in the case
of Snowflake there are several points to consider that add some extra complexity.

When you replicate using a JDBC connector, the order of the operations is simply the order in which you read
the topic. In Snowflake, however, you process a batch of information (or the partial/entire event table, while you
don't yet have a task doing it for you). In this case, you need to sort the events and take the last one for each key.

The replication script performs these actions:
- Creates the replication table
- Creates a view over the replication table (to expose the same structure as the original database table)
- Creates a stream over the event table (in our case, capturing newly ingested rows)
- Merges the current content of the event table into the replication table
- Creates a task with the `MERGE INTO` statement, reading from the stream (not from the event table)
- Enables the task (which runs every minute)
- And some other useful check statements

It is important (to avoid losing data) to create the stream before running the `MERGE INTO` statement over
the event table (I assume that you are already ingesting data before creating the replication table).

The `MERGE INTO` statement includes:
- A projection of the fields that matter to the process (not from a functional data perspective). This includes:
  - Fields used for sorting the events (binlog, lsn, ...)
  - The functional data (payload of the event)
  - The CDC operation (read, insert, update, delete)
  - Metadata about the CDC process (the `source` field of the Debezium change event), useful for traceability
  - Some fields useful to compute latencies
- Sorting of the input. This depends on your source database engine and its configuration:
  - MySQL supports different topologies. This demo uses a standalone server and builds a binlog sequence
    from file name and position to sort the events
  - For PostgreSQL, the LSN is used
- Taking the last operation for each key
  - You must guarantee that the query returns only one row per key
  - If the merge operation matches several source rows to one target row, the operation is not deterministic and may apply any of them.
- Checking whether the key of the source row matches the target (replica) table:
  - If there is no match and the operation is `delete`, the event is discarded
  - If there is no match and the operation is anything else, the event is inserted
  - If there is a match and the operation is `delete`, the row in the replica table is deleted
  - If there is a match and the operation is anything else, the event is applied to the replica table

Once your query runs fine over the source table, you should schedule a task that runs it for you. If you run
this query over the events table again and again, you reprocess all the events every time. To avoid this,
run the task over the created stream, not over the event table. The stream is cleaned automatically after every
successful iteration, so you only process the new events. You can also add a condition so the task only
runs if there is data in the stream.

After creating the task, you need to enable it using an `ALTER TASK` statement. You can see the task execution history with
```sql
select *
from table(demo_db.information_schema.task_history())
order by scheduled_time desc;
```

### The final view

The replication table contains columns with info about the CDC and replication process, useful for checks. But your
final consumers do not expect this information: they want the same table they have in the source database system.

One column holds the valuable data: the `PAYLOAD` column.
This content the functional data, in JSON format. 199 | You can create a view over this field, projecting the data like the source databases. 200 | 201 | This has one additional benefit: **evolution**. If your source database evolves (adding columns, removing it, wherever) 202 | all the process is not affected, all runs fine. The unique change is the view: 203 | - No changes in your data pipeline 204 | - No changes in your data 205 | - Coexistence of old and new data 206 | - The schema of each data is included with the data 207 | 208 | [debezium]: ../debezium/README.md 209 | [this connector]: ./connect/snowflake-sink-connector.json 210 | [snowflake/keys]: keys/ 211 | [KIP-297]: https://cwiki.apache.org/confluence/display/KAFKA/KIP-297%3A+Externalizing+Secrets+for+Connect+Configurations -------------------------------------------------------------------------------- /snowflake/connect/snowflake-sink-connector.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "snowflake-sink-connector", 3 | "config": { 4 | "connector.class": "com.snowflake.kafka.connector.SnowflakeSinkConnector", 5 | "tasks.max": "1", 6 | "topics": "mysqldb.inventory.users,postgresdb.inventory.product", 7 | "snowflake.topic2table.map": "mysqldb.inventory.users:cdc_mysql_inventory_users,postgresdb.inventory.product:cdc_postgresdb_inventory_product", 8 | "snowflake.url.name": "mh16247.eu-west-2.aws.snowflakecomputing.com:443", 9 | "snowflake.user.name": "dariocazas", 10 | "snowflake.private.key": "MIIFLTBXBgkqhkiG9w0BBQ0wSjApBgkqhkiG9w0BBQwwHAQIHl29yM4BvgICAggAMAwGCCqGSIb3DQIJBQAwHQYJYIZIAWUDBAEqBBCkFIfNB88Urq5VaPCCzze1BIIE0In6kYmdUnVvH5Q+nPXkPj3VCXd0/aPceHSbC4BsWRtli39bIrWCch1EQXZxoj3xt8QNtOL9XGKH3XqG9rYpu0VmR2MZjC+FteNZ98RXrDqdwkoN/ZWTeaQ+MaeZtiCn93N3dhh70Woee/JgVEcO38vV/i0eJ4ryM07a0eV4d5Y8JQHRBoVVxTPm0Ha/af+p7loS5AKvwfiHndFgQPNbILfweGuhRUe8AQf9Bo0vzeXHBj5nO4RqnaTcfmRDIul4ZtMD7zxxTMJnhezTGFDPqlxEvOOZAudakm01C3y5mfPUs+veMWjNcz0AfPPeyvJPO5Xmu7kiIGtM1UHRojnQOtZ9QWBmhLfMsRZ3GbVbycCLgZOKhW1OIy+PbbykiiRQD6AHszJiFKroZz3yqMRTh9QJFJ4mpa6XjkLGCE8CiPns5Tl7qX2BN78Qs5vxsWJC0Z7wqNuoldsNSFKVtfW6Qm81j12XQw1fsk5zqCnabpsiK/uWo1NowhEa5xAAeRW95wqTyWYi0tu8/u3EQo/xwBCCbDiYFxvbbOmWZjsxf95sO5yHrBxGTs8wCduj0I1UqTXWzTZ4JoAPdSFHwLS61slvujqlSvNvla12nqTYGBtWO/qgLh5egaTmGupLhu4b6FiO5CCXg4sfyOoKeZtykbM0wT0Ud8oK8fx9HwlUNxAaW8NrIo3EuRg7dsKdhtD2hJrqM1dyorVIT7bHSJ5YRLfXHdFGnmaOmJOGvMqXC2yfivEFbMI0nxnrJGDJ4KLS9a8DLmgsQZS8PySmWS+cGuvq4nUcxHnhX/j0ZWCZhSUxQ/z/lRx+RmZM+ey/PnzBuOQGaQrIHe44taN2skz97oopQu9lS6OANE9TPG1Vp1NqanU2Mxkz07++5swdeYp0WEJLWhkLpn3Ce7ImcceLlFI0B9TlAih4rEiE3REfbGCTvLKpaRPHmwYNmZIAhlhKm0Q+v/4Isk4hpce5MuOTiR7yz4neV3VCl66sw7o3tJSRnXtoVKFA2QlN0emdOj6ji0iPvRtKsU/9r8+8EkO3WTg/YO59aLM/pX8V9Rd87jnDidLuO2gVzIsghRiElg2g/4cC9zmvBSZLfF/TJZGs6pX9WxDh3VjLEjdqvU8weepk/LrxyJADp7Up7GuALSytFaMbDPRTLXICsu5q0C/ne//sHeVjiKcz0WgIzeUGqC4wt7ht/G1DDd4/gxAp6ZPmlnh5WjNPTtmfU2TVV14EYUs9UzrUYm+2G0uG/+da+WpB6hRKZkHNSoFKVq3g5IHlB2Lc7SFKYnQhpHxmpmCeoQ2/DlzSWS/EHrV54ej68TdPa2MnrrdeeDCGB3Oo4oSmHSh0bTO4vVOLS9ezLDiFfT0KnhI2HmN3JOGm/2njXwp/qnk3oscyYIxBocsmYeQ91EfS9M4iNjryFNLHNuyWq/9WsDF/LrWPJIoQ+7qZm9AmLZ9yx3ED8YbqIjiK1Q48gl0NwpyvCFEfWDCjmxUA+W1SnAhf4VK3pRLBbkr5UwNcW+FSQWNtoZ8eHASDab5l4HH1NoswYqzEc4jmssQG+3nDimNvenbXvuOjwMF9+wC5LVryysZ2nMeKql4lSr8hlHe4xkvquTyPbJCSsViAueAHmHxSNW/i6QVNukc24UtP", 11 | "snowflake.private.key.passphrase": "mypassphrase", 12 | "snowflake.database.name": "HOWTO_DB", 13 | "snowflake.schema.name": "public", 14 | "key.converter": "com.snowflake.kafka.connector.records.SnowflakeJsonConverter", 15 | 
"value.converter": "com.snowflake.kafka.connector.records.SnowflakeJsonConverter", 16 | "behavior.on.null.values": "IGNORE" 17 | } 18 | } -------------------------------------------------------------------------------- /snowflake/delete_sink.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONNECT_URL=http://localhost:8085 4 | 5 | CONNECTORS=$(curl -s -k ${CONNECT_URL}/connectors) 6 | echo Connector list: 7 | echo $CONNECTORS 8 | echo 9 | 10 | for row in $(echo "${CONNECTORS}" | jq -c -r '.[]'); do 11 | status=$(curl -s -k -X DELETE "${CONNECT_URL}/connectors/${row}") 12 | echo Deleted ${row} 13 | done 14 | -------------------------------------------------------------------------------- /snowflake/init_sink.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONNECT_URL=http://localhost:8085 4 | SINK_SNOWPIPE_CONNECT_CONFIG=connect/snowflake-sink-connector.json 5 | 6 | echo "### Creating Snowpipe sink connector ###" 7 | curl -i -X POST $CONNECT_URL/connectors \ 8 | -H "Accept:application/json" \ 9 | -H "Content-Type:application/json" \ 10 | -d @$SINK_SNOWPIPE_CONNECT_CONFIG 11 | echo . 12 | -------------------------------------------------------------------------------- /snowflake/keys/README.md: -------------------------------------------------------------------------------- 1 | # Credentials management 2 | 3 | ## Snowflake 4 | 5 | ### Create your account 6 | 7 | To use snowflake need to create a free trial: https://signup.snowflake.com 8 | 9 | You can select a Standard Snowflake edition over several clouds. 10 | After validate email and access to the web console, you can see that exists: 11 | 12 | - The host accessed in the URL is your configuration for the snowflake connector 13 | - In left panel, you can see the DEMO_DB database with a PUBLIC schema 14 | - In top-right panel, you can see 15 | - Your role (SYSADMIN) 16 | - Your warehouse (COMPUTE_WH) 17 | 18 | ### Create your key pair 19 | 20 | In [Kafka connector install - Using Key Pair Authentication & Key Rotation], you can 21 | see more detail about it. 

To simplify the management, we generate an unencrypted private key (and a public key)
to use with Snowflake:

```sh
cd snowflake/keys
openssl genrsa -out snowflake_rsa_key.pem 2048
openssl pkcs8 -topk8 -inform PEM -in snowflake_rsa_key.pem -out snowflake_rsa_key.p8
openssl rsa -in snowflake_rsa_key.p8 -pubout -out snowflake_rsa_key.pub
```

If you don't have an [OpenSSL toolkit] installed in your environment, you can run
these commands with Docker:

```sh
cd snowflake
docker run -v $PWD:/work -it nginx openssl genrsa -out /work/keys/snowflake_rsa_key.pem 2048
docker run -v $PWD:/work -it nginx openssl pkcs8 -topk8 -inform PEM -in /work/keys/snowflake_rsa_key.pem -out /work/keys/snowflake_rsa_key.p8
docker run -v $PWD:/work -it nginx openssl rsa -in /work/keys/snowflake_rsa_key.pem -pubout -out /work/keys/snowflake_rsa_key.pub
sudo chown -R $USER:$USER keys/*
```

The content of the keys is similar to what is committed in this repo
(we upload a valid key pair, but it does not authenticate against our trial Snowflake service):

```sh
cat snowflake/keys/snowflake_rsa_key.p8
-----BEGIN ENCRYPTED PRIVATE KEY-----
MIIFLTBXBgkqhkiG9w0BBQ0wSjApBgkqhkiG9w0BBQwwHAQIHl29yM4BvgICAggA
MAwGCCqGSIb3DQIJBQAwHQYJYIZIAWUDBAEqBBCkFIfNB88Urq5VaPCCzze1BIIE
...
-----END ENCRYPTED PRIVATE KEY-----
```
```sh
cat snowflake/keys/snowflake_rsa_key.pub
-----BEGIN PUBLIC KEY-----
MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAwBwYbPtbEUXueQ6u3KDw
zlKu4IhAkGdcUBVbdTdUVBLNVsZX+eiKOedN3EnMtDeVzRlaT8JAwHX0LVXkgXtn
...
-----END PUBLIC KEY-----
```

### Register the key pair in Snowflake

Access the Snowflake web console and locate your username at the top right.
The Snowflake documentation says to switch your role to SECURITYADMIN, but
in our case we need to change to ACCOUNTADMIN.

Take your public key (without header and footer) and register it for your user in Snowflake
using the web console:

```sql
alter user dariocazas set rsa_public_key='MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAwBwYbPtbEUXueQ6u3KDw
zlKu4IhAkGdcUBVbdTdUVBLNVsZX+eiKOedN3EnMtDeVzRlaT8JAwHX0LVXkgXtn
KzMBp6TpS4j+2kKvbZc5p0KfZHjn42G+C/DXI4ZNQZEBQ/Q4UY6OkTZepFaOX3ev
2icxB6LnnVYI3WHkSnq3vTthhYhTuUOQ4YRudadOtoT4By09hxbsaanVl42FXIZP
AXX1jwawzKe52V1+FB5/UMv+JMUFfczlO+acn/EaZvKbR55Vk/+OVrUP4KIKvdWn
s/n4ASYqxiw9xjrizGCoUyl+b+Ch6A02fTU02HrT9jOOj+dVAeFD2QGOqaze0eCD
dwIDAQAB';
```

After doing this, you can use the __snowflake_rsa_key.pem__ private key from Kafka
85 | 86 | [Kafka connector install - Using Key Pair Authentication & Key Rotation]: https://docs.snowflake.com/en/user-guide/kafka-connector-install.html#using-key-pair-authentication-key-rotation 87 | -------------------------------------------------------------------------------- /snowflake/keys/snowflake_rsa_key.p8: -------------------------------------------------------------------------------- 1 | -----BEGIN ENCRYPTED PRIVATE KEY----- 2 | MIIFLTBXBgkqhkiG9w0BBQ0wSjApBgkqhkiG9w0BBQwwHAQIHl29yM4BvgICAggA 3 | MAwGCCqGSIb3DQIJBQAwHQYJYIZIAWUDBAEqBBCkFIfNB88Urq5VaPCCzze1BIIE 4 | 0In6kYmdUnVvH5Q+nPXkPj3VCXd0/aPceHSbC4BsWRtli39bIrWCch1EQXZxoj3x 5 | t8QNtOL9XGKH3XqG9rYpu0VmR2MZjC+FteNZ98RXrDqdwkoN/ZWTeaQ+MaeZtiCn 6 | 93N3dhh70Woee/JgVEcO38vV/i0eJ4ryM07a0eV4d5Y8JQHRBoVVxTPm0Ha/af+p 7 | 7loS5AKvwfiHndFgQPNbILfweGuhRUe8AQf9Bo0vzeXHBj5nO4RqnaTcfmRDIul4 8 | ZtMD7zxxTMJnhezTGFDPqlxEvOOZAudakm01C3y5mfPUs+veMWjNcz0AfPPeyvJP 9 | O5Xmu7kiIGtM1UHRojnQOtZ9QWBmhLfMsRZ3GbVbycCLgZOKhW1OIy+PbbykiiRQ 10 | D6AHszJiFKroZz3yqMRTh9QJFJ4mpa6XjkLGCE8CiPns5Tl7qX2BN78Qs5vxsWJC 11 | 0Z7wqNuoldsNSFKVtfW6Qm81j12XQw1fsk5zqCnabpsiK/uWo1NowhEa5xAAeRW9 12 | 5wqTyWYi0tu8/u3EQo/xwBCCbDiYFxvbbOmWZjsxf95sO5yHrBxGTs8wCduj0I1U 13 | qTXWzTZ4JoAPdSFHwLS61slvujqlSvNvla12nqTYGBtWO/qgLh5egaTmGupLhu4b 14 | 6FiO5CCXg4sfyOoKeZtykbM0wT0Ud8oK8fx9HwlUNxAaW8NrIo3EuRg7dsKdhtD2 15 | hJrqM1dyorVIT7bHSJ5YRLfXHdFGnmaOmJOGvMqXC2yfivEFbMI0nxnrJGDJ4KLS 16 | 9a8DLmgsQZS8PySmWS+cGuvq4nUcxHnhX/j0ZWCZhSUxQ/z/lRx+RmZM+ey/PnzB 17 | uOQGaQrIHe44taN2skz97oopQu9lS6OANE9TPG1Vp1NqanU2Mxkz07++5swdeYp0 18 | WEJLWhkLpn3Ce7ImcceLlFI0B9TlAih4rEiE3REfbGCTvLKpaRPHmwYNmZIAhlhK 19 | m0Q+v/4Isk4hpce5MuOTiR7yz4neV3VCl66sw7o3tJSRnXtoVKFA2QlN0emdOj6j 20 | i0iPvRtKsU/9r8+8EkO3WTg/YO59aLM/pX8V9Rd87jnDidLuO2gVzIsghRiElg2g 21 | /4cC9zmvBSZLfF/TJZGs6pX9WxDh3VjLEjdqvU8weepk/LrxyJADp7Up7GuALSyt 22 | FaMbDPRTLXICsu5q0C/ne//sHeVjiKcz0WgIzeUGqC4wt7ht/G1DDd4/gxAp6ZPm 23 | lnh5WjNPTtmfU2TVV14EYUs9UzrUYm+2G0uG/+da+WpB6hRKZkHNSoFKVq3g5IHl 24 | B2Lc7SFKYnQhpHxmpmCeoQ2/DlzSWS/EHrV54ej68TdPa2MnrrdeeDCGB3Oo4oSm 25 | HSh0bTO4vVOLS9ezLDiFfT0KnhI2HmN3JOGm/2njXwp/qnk3oscyYIxBocsmYeQ9 26 | 1EfS9M4iNjryFNLHNuyWq/9WsDF/LrWPJIoQ+7qZm9AmLZ9yx3ED8YbqIjiK1Q48 27 | gl0NwpyvCFEfWDCjmxUA+W1SnAhf4VK3pRLBbkr5UwNcW+FSQWNtoZ8eHASDab5l 28 | 4HH1NoswYqzEc4jmssQG+3nDimNvenbXvuOjwMF9+wC5LVryysZ2nMeKql4lSr8h 29 | lHe4xkvquTyPbJCSsViAueAHmHxSNW/i6QVNukc24UtP 30 | -----END ENCRYPTED PRIVATE KEY----- 31 | -------------------------------------------------------------------------------- /snowflake/keys/snowflake_rsa_key.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEpAIBAAKCAQEAwBwYbPtbEUXueQ6u3KDwzlKu4IhAkGdcUBVbdTdUVBLNVsZX 3 | +eiKOedN3EnMtDeVzRlaT8JAwHX0LVXkgXtnKzMBp6TpS4j+2kKvbZc5p0KfZHjn 4 | 42G+C/DXI4ZNQZEBQ/Q4UY6OkTZepFaOX3ev2icxB6LnnVYI3WHkSnq3vTthhYhT 5 | uUOQ4YRudadOtoT4By09hxbsaanVl42FXIZPAXX1jwawzKe52V1+FB5/UMv+JMUF 6 | fczlO+acn/EaZvKbR55Vk/+OVrUP4KIKvdWns/n4ASYqxiw9xjrizGCoUyl+b+Ch 7 | 6A02fTU02HrT9jOOj+dVAeFD2QGOqaze0eCDdwIDAQABAoIBABdl1JvBaXALImZg 8 | IXABshKPA3mZXrO3wwiF8WOvX3f16kh9U82+QseWomcIHgR9GKOoSNWaBSTruNek 9 | tHYIv6IFTlhKv5dRkiinIpKobn8uoBcump+ZEfqGVM2g19v6ezr5jbpxMHADPTeq 10 | yyBZhXN+fnw9nRQOnHnKKHHhzGshkEWEwHSxteG9HYqmID68bEta3azpq+T+Rrig 11 | N9UKLqgUXBD9G8i4T0iuoBZcq0sN7YtJMg6sHOu0JbZgPHlTJliNuQ3OLt3TGYew 12 | /kiQJOsZofVbHVQXvxpAU/Dy+87yUVwl9tCBbsFKmtxgyzZR7w2WxMdCrIjc940t 13 | Zi4UUSECgYEA6CGO6NiYjogmVFhs+BEMi7oRdBtqouQ3qQW5byk8F1bPruWdRnmB 14 | 
Ekmu5sYhg9oS8PWxXKJHdZ7in/kmbXke/UQ8I+R+RqjKJVNKX0nBAyQY7XNsM4pv 15 | CuStp0XWnsnwP5MO3SYIkmaaushsL6AxR9RGgJZISTKcktA+v5S2cwkCgYEA090L 16 | w3qLfu9egox2/YWykaPsOQnwIEFRueowcJp/ZyAARA2A3gvyoiQt1CVcT9KJ0nPx 17 | ryXb6mQ2rf5qHG0JceQ1DI+mVXhbs+AzPI/n0pPnCW50J5+kNVGQ4fBpbXmh61Tr 18 | VM+b2lTHoSjDisVToaQHYn/BpzaK8aVQggm0Yn8CgYEAwP0VaTSaMPW0mC8j+WGD 19 | Qq+hTxx0HZULSXS+5FIt6WF9LPUtOqhNzLyBss9Kkeo+ESLTICayrnE4DLQBZMZs 20 | IzgVn+mZqnkuBrYmgO46j7f1GYT6kicnhrD0RrtjYYSWPuSuWOIEAmNXhK6Yc0gF 21 | cKhlLQbEdkajsdN8N58VyLECgYAf4mltztiFjvKzRP53YxKftoLLhsJbqFjrWOJX 22 | X/kChR9lHn8ha7zlR/qZrdG5tZ7GTGq4CEOTf+d2wg4oHwTH3idZr5jBzi5G4Nv1 23 | JlcmKtofYj8a43ysBY1/Y1YKgr6qkwojpmb3McElcOQU02OltPDjkwSK7Lt2aIG4 24 | QEukcwKBgQC55ORnqG6548zBg1+eryNKYrzTEJpaFSZ4gRFwcPJMogkQoGRLvW7K 25 | 7P++3fqYvOHTa0dCIHqMXjcalzcyM/N6VSiZExi6N5BhZtwkcGO0YVi+6FFRfGxW 26 | K6ITKTgeTj409QzpwH2qPszq1zsfiHz6HWcKbsJ18thU9ISnod3u9g== 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /snowflake/keys/snowflake_rsa_key.pub: -------------------------------------------------------------------------------- 1 | -----BEGIN PUBLIC KEY----- 2 | MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAwBwYbPtbEUXueQ6u3KDw 3 | zlKu4IhAkGdcUBVbdTdUVBLNVsZX+eiKOedN3EnMtDeVzRlaT8JAwHX0LVXkgXtn 4 | KzMBp6TpS4j+2kKvbZc5p0KfZHjn42G+C/DXI4ZNQZEBQ/Q4UY6OkTZepFaOX3ev 5 | 2icxB6LnnVYI3WHkSnq3vTthhYhTuUOQ4YRudadOtoT4By09hxbsaanVl42FXIZP 6 | AXX1jwawzKe52V1+FB5/UMv+JMUFfczlO+acn/EaZvKbR55Vk/+OVrUP4KIKvdWn 7 | s/n4ASYqxiw9xjrizGCoUyl+b+Ch6A02fTU02HrT9jOOj+dVAeFD2QGOqaze0eCD 8 | dwIDAQAB 9 | -----END PUBLIC KEY----- 10 | -------------------------------------------------------------------------------- /snowflake/sql/00-security.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | -- Set public key in snowflake to your user account to enable access via kafka connect 4 | -- Snowflake doc refers to SECURITYADMIN, but for me didn't work (I need use ACCOUNTADMIN) 5 | -- https://docs.snowflake.com/en/user-guide/kafka-connector-install.html#using-key-pair-authentication-key-rotation 6 | use role accountadmin; 7 | 8 | alter user dariocazas set rsa_public_key='MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEArJFv7/40nuy8D4FC76wQ 9 | Qkz1FHnEhS8jvXVTrSGzlJoTRrKm3Nx039+PPgz0EkzW/WiUdyPF6G4ZJh5L9+WU 10 | 6xEQo9HGFJhA4U4rOOXv9q3SlZEMndpg9qbGd6mp/ym5GZ9lznBVc33oQO2lIWum 11 | j8EmuYn7SLpceY7iCUtCrGgu2gE+OxHcajvQPccdMtNlz+LfXXCe+4By7PGQuBkR 12 | 9wO0wkhoYfRdInvATRSpGJK8jtAmxe9UelobyeEFsbFVqsXruOw1LbNF2bq3IAaQ 13 | TvD5OVYcfyQ+nDrE55AngRAfewpur09laqYfqzYvVZjutZc2InD4VuSVouGc8bYg 14 | qwIDAQAB'; 15 | 16 | -------------------------------------------------------------------------------- /snowflake/sql/01-cdc-to-replica-mysql.sql: -------------------------------------------------------------------------------- 1 | -- Based on: 2 | -- https://docs.snowflake.com/en/user-guide/data-pipelines-examples.html#transforming-loaded-json-data-on-a-schedule 3 | -- https://docs.snowflake.com/en/sql-reference/sql/merge.html 4 | 5 | -- Use this role is not recomendable in production environments 6 | use role accountadmin; 7 | 8 | -- Create the replica table, including extra columns to support replica logic and process trazability 9 | create or replace 10 | table "HOWTO_DB"."PUBLIC"."REPLICA_MYSQL_INVENTORY_USERS" 11 | ( id number PRIMARY KEY comment 'primary key of the source table' 12 | , sourcedb_binlog_gtid string comment 'database log position, gtid used in HA MySQL (null in other cases), used for 
ordering events (RECORD_CONTENT:payload.source.gtid)' 13 | , sourcedb_binlog_file string comment 'database log position, file log name, used for ordering events (RECORD_CONTENT:payload.source.file)' 14 | , sourcedb_binlog_pos string comment 'database log position, position in log file, used for ordering events (RECORD_CONTENT:payload.source.pos)' 15 | , payload variant comment 'data after operation (RECORD_CONTENT:payload.after)' 16 | , cdc_operation char comment 'CDC registered operation in source DB (RECORD_CONTENT:payload.op)' 17 | , cdc_source_info variant comment 'Debezium source field, for trazability (RECORD_CONTENT:payload.source)' 18 | , ts_ms_sourcedb number comment 'the timestamp when database register the event, not available on database snapshot (RECORD_CONTENT:payload.source.ts_ms)' 19 | , ts_ms_cdc number comment 'the timestamp when the CDC connector capture the event (RECORD_CONTENT:payload.ts_ms)' 20 | , ts_ms_replica_sf number comment 'the timestamp when snowflake task fills the record') 21 | comment = 'Replica from CDC over MySQL Inventory Users'; 22 | 23 | -- Create final view with same columns as MySQL database to use like the same table 24 | create or replace view "HOWTO_DB"."PUBLIC"."MYSQL_INVENTORY_USERS" 25 | as 26 | select payload:id id, payload:name name, payload:email email, payload:password password, payload:created_on created_on 27 | from "HOWTO_DB"."PUBLIC"."REPLICA_MYSQL_INVENTORY_USERS"; 28 | 29 | -- Create a stream from CDC events table, to process new events into replica table 30 | create or replace 31 | stream "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS_STREAM_REPLICATION" 32 | on table "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS"; 33 | 34 | 35 | -- After create stream (avoid loss events), process all events available in CDC events table 36 | merge into "HOWTO_DB"."PUBLIC"."REPLICA_MYSQL_INVENTORY_USERS" replica_table 37 | using 38 | (with 39 | prequery as (select RECORD_METADATA:key.payload.id id 40 | , COALESCE(RECORD_CONTENT:payload.source.gtid, '') sourcedb_binlog_gtid 41 | , COALESCE(RECORD_CONTENT:payload.source.file, '') sourcedb_binlog_file 42 | , to_number(RECORD_CONTENT:payload.source.pos) sourcedb_binlog_pos 43 | , RECORD_CONTENT:payload.after payload 44 | , RECORD_CONTENT:payload.op cdc_operation 45 | , RECORD_CONTENT:payload.source cdc_source_info 46 | , RECORD_CONTENT:payload.source.ts_ms ts_ms_sourcedb 47 | , RECORD_CONTENT:payload.ts_ms ts_ms_cdc 48 | from "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS"), 49 | rank_query as (select * 50 | , ROW_NUMBER() over (PARTITION BY id 51 | order by ts_ms_cdc desc, sourcedb_binlog_file desc, sourcedb_binlog_pos desc) as row_num 52 | from prequery) 53 | select * from rank_query where row_num = 1) event_data 54 | on replica_table.id = to_number(event_data.id) 55 | when not matched and event_data.cdc_operation <> 'd' 56 | then insert 57 | (id, sourcedb_binlog_gtid, sourcedb_binlog_file, sourcedb_binlog_pos, payload 58 | , cdc_operation, cdc_source_info, ts_ms_sourcedb, ts_ms_cdc, ts_ms_replica_sf) 59 | values 60 | (event_data.id, event_data.sourcedb_binlog_gtid, event_data.sourcedb_binlog_file 61 | , event_data.sourcedb_binlog_pos, event_data.payload, event_data.cdc_operation 62 | , event_data.cdc_source_info, event_data.ts_ms_sourcedb, event_data.ts_ms_cdc 63 | , date_part(epoch_millisecond, CURRENT_TIMESTAMP)) 64 | when matched and event_data.cdc_operation = 'd' 65 | then delete 66 | when matched and event_data.cdc_operation <> 'd' 67 | then update set id=event_data.id 68 | , 
sourcedb_binlog_gtid=event_data.sourcedb_binlog_gtid 69 | , sourcedb_binlog_file=event_data.sourcedb_binlog_file 70 | , sourcedb_binlog_pos=event_data.sourcedb_binlog_pos 71 | , payload=event_data.payload 72 | , cdc_operation=event_data.cdc_operation 73 | , cdc_source_info=event_data.cdc_source_info 74 | , ts_ms_sourcedb=event_data.ts_ms_sourcedb 75 | , ts_ms_cdc=event_data.ts_ms_cdc 76 | , ts_ms_replica_sf=date_part(epoch_millisecond, CURRENT_TIMESTAMP); 77 | 78 | 79 | -- Create task with previous tested query, but read data from the created stream (not CDC events table). 80 | create or replace task "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS_TASK_REPLICATION" 81 | warehouse = compute_wh 82 | schedule = '1 minute' 83 | allow_overlapping_execution = false 84 | when 85 | system$stream_has_data('HOWTO_DB.PUBLIC.CDC_MYSQL_INVENTORY_USERS_STREAM_REPLICATION') 86 | as 87 | merge into "HOWTO_DB"."PUBLIC"."REPLICA_MYSQL_INVENTORY_USERS" replica_table 88 | using 89 | (with 90 | prequery as (select RECORD_METADATA:key.payload.id id 91 | , COALESCE(RECORD_CONTENT:payload.source.gtid, '') sourcedb_binlog_gtid 92 | , COALESCE(RECORD_CONTENT:payload.source.file, '') sourcedb_binlog_file 93 | , to_number(RECORD_CONTENT:payload.source.pos) sourcedb_binlog_pos 94 | , RECORD_CONTENT:payload.after payload 95 | , RECORD_CONTENT:payload.op cdc_operation 96 | , RECORD_CONTENT:payload.source cdc_source_info 97 | , RECORD_CONTENT:payload.source.ts_ms ts_ms_sourcedb 98 | , RECORD_CONTENT:payload.ts_ms ts_ms_cdc 99 | from "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS_STREAM_REPLICATION"), 100 | rank_query as (select * 101 | , ROW_NUMBER() over (PARTITION BY id 102 | order by ts_ms_cdc desc, sourcedb_binlog_file desc, sourcedb_binlog_pos desc) as row_num 103 | from prequery) 104 | select * from rank_query where row_num = 1) event_data 105 | on replica_table.id = to_number(event_data.id) 106 | when not matched and event_data.cdc_operation <> 'd' 107 | then insert 108 | (id, sourcedb_binlog_gtid, sourcedb_binlog_file, sourcedb_binlog_pos, payload 109 | , cdc_operation, cdc_source_info, ts_ms_sourcedb, ts_ms_cdc, ts_ms_replica_sf) 110 | values 111 | (event_data.id, event_data.sourcedb_binlog_gtid, event_data.sourcedb_binlog_file 112 | , event_data.sourcedb_binlog_pos, event_data.payload, event_data.cdc_operation 113 | , event_data.cdc_source_info, event_data.ts_ms_sourcedb, event_data.ts_ms_cdc 114 | , date_part(epoch_millisecond, CURRENT_TIMESTAMP)) 115 | when matched and event_data.cdc_operation = 'd' 116 | then delete 117 | when matched and event_data.cdc_operation <> 'd' 118 | then update set id=event_data.id 119 | , sourcedb_binlog_gtid=event_data.sourcedb_binlog_gtid 120 | , sourcedb_binlog_file=event_data.sourcedb_binlog_file 121 | , sourcedb_binlog_pos=event_data.sourcedb_binlog_pos 122 | , payload=event_data.payload 123 | , cdc_operation=event_data.cdc_operation 124 | , cdc_source_info=event_data.cdc_source_info 125 | , ts_ms_sourcedb=event_data.ts_ms_sourcedb 126 | , ts_ms_cdc=event_data.ts_ms_cdc 127 | , ts_ms_replica_sf=date_part(epoch_millisecond, CURRENT_TIMESTAMP); 128 | 129 | 130 | -- Enable task 131 | ALTER TASK "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS_TASK_REPLICATION" RESUME; 132 | 133 | -- Check info about the task executions (STATE and NEXT_SCHEDULED_TIME columns) 134 | -- If you see error "Cannot execute task , EXECUTE TASK privilege must be granted to owner role" 135 | -- review 00-security.sql script 136 | select * 137 | from table(HOWTO_DB.information_schema.task_history()) 138 | 
order by scheduled_time desc; 139 | 140 | 141 | -- Check counts (you don't see the same results in event table against the replica table) 142 | select to_char(RECORD_CONTENT:payload.op) cdc_operation, count(*), 'CDC_MYSQL_INVENTORY_USERS' table_name 143 | from "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS" group by RECORD_CONTENT:payload.op 144 | union all 145 | select cdc_operation, count(*), 'REPLICA_MYSQL_INVENTORY_USERS' table_name 146 | from "HOWTO_DB"."PUBLIC"."REPLICA_MYSQL_INVENTORY_USERS" group by cdc_operation 147 | order by table_name, cdc_operation; 148 | -------------------------------------------------------------------------------- /snowflake/sql/01-cdc-to-replica-postgres.sql: -------------------------------------------------------------------------------- 1 | -- Based on: 2 | -- https://docs.snowflake.com/en/user-guide/data-pipelines-examples.html#transforming-loaded-json-data-on-a-schedule 3 | -- https://docs.snowflake.com/en/sql-reference/sql/merge.html 4 | 5 | -- Use this role is not recomendable in production environments 6 | use role accountadmin; 7 | 8 | -- Create the replica table, including extra columns to support replica logic and process trazability 9 | create or replace 10 | table "HOWTO_DB"."PUBLIC"."REPLICA_POSTGRESDB_INVENTORY_PRODUCT" 11 | ( id number PRIMARY KEY comment 'primary key of the source table' 12 | , sourcedb_lsn string comment 'postgres log sequence number, used for ordering events (RECORD_CONTENT:payload.source.lsn)' 13 | , payload variant comment 'data after operation (RECORD_CONTENT:payload.after)' 14 | , cdc_operation char comment 'CDC registered operation in source DB (RECORD_CONTENT:payload.op)' 15 | , cdc_source_info variant comment 'Debezium source field, for trazability (RECORD_CONTENT:payload.source)' 16 | , ts_ms_sourcedb number comment 'the timestamp when database register the event, not available on database snapshot (RECORD_CONTENT:payload.source.ts_ms)' 17 | , ts_ms_cdc number comment 'the timestamp when the CDC connector capture the event (RECORD_CONTENT:payload.ts_ms)' 18 | , ts_ms_replica_sf number comment 'the timestamp when snowflake task fills the record') 19 | comment = 'Replica from CDC over PostgreSQL Inventory Products'; 20 | 21 | -- Create final view with same columns as PostgreSQL database to use like the same table 22 | create or replace view "HOWTO_DB"."PUBLIC"."POSTGRESDB_INVENTORY_PRODUCT" 23 | as 24 | select payload:id id, payload:name name, payload:description description, payload:created_on created_on 25 | from "HOWTO_DB"."PUBLIC"."REPLICA_POSTGRESDB_INVENTORY_PRODUCT"; 26 | 27 | -- Create a stream from CDC events table, to process new events into replica table 28 | create or replace 29 | stream "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT_STREAM_REPLICATION" 30 | on table "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT"; 31 | 32 | 33 | -- After create stream (avoid loss events), process all events available in CDC events table 34 | merge into "HOWTO_DB"."PUBLIC"."REPLICA_POSTGRESDB_INVENTORY_PRODUCT" replica_table 35 | using 36 | (with 37 | prequery as (select RECORD_METADATA:key.payload.id id 38 | , to_number(RECORD_CONTENT:payload.source.lsn) sourcedb_lsn 39 | , RECORD_CONTENT:payload.after payload 40 | , RECORD_CONTENT:payload.op cdc_operation 41 | , RECORD_CONTENT:payload.source cdc_source_info 42 | , RECORD_CONTENT:payload.source.ts_ms ts_ms_sourcedb 43 | , RECORD_CONTENT:payload.ts_ms ts_ms_cdc 44 | from "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT"), 45 | rank_query as (select * 
46 | , ROW_NUMBER() over (PARTITION BY id 47 | order by ts_ms_cdc desc, sourcedb_lsn desc) as row_num 48 | from prequery) 49 | select * from rank_query where row_num = 1) event_data 50 | on replica_table.id = to_number(event_data.id) 51 | when not matched and event_data.cdc_operation <> 'd' 52 | then insert 53 | (id, sourcedb_lsn, payload, cdc_operation, cdc_source_info, ts_ms_sourcedb 54 | , ts_ms_cdc, ts_ms_replica_sf) 55 | values 56 | (event_data.id, event_data.sourcedb_lsn, event_data.payload, event_data.cdc_operation 57 | , event_data.cdc_source_info, event_data.ts_ms_sourcedb, event_data.ts_ms_cdc 58 | , date_part(epoch_millisecond, CURRENT_TIMESTAMP)) 59 | when matched and event_data.cdc_operation = 'd' 60 | then delete 61 | when matched and event_data.cdc_operation <> 'd' 62 | then update set id=event_data.id 63 | , sourcedb_lsn=event_data.sourcedb_lsn 64 | , payload=event_data.payload 65 | , cdc_operation=event_data.cdc_operation 66 | , cdc_source_info=event_data.cdc_source_info 67 | , ts_ms_sourcedb=event_data.ts_ms_sourcedb 68 | , ts_ms_cdc=event_data.ts_ms_cdc 69 | , ts_ms_replica_sf=date_part(epoch_millisecond, CURRENT_TIMESTAMP); 70 | 71 | 72 | -- Create task with previous tested query, but read data from the created stream (not CDC events table). 73 | create or replace task "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT_TASK_REPLICATION" 74 | warehouse = compute_wh 75 | schedule = '1 minute' 76 | allow_overlapping_execution = false 77 | when 78 | system$stream_has_data('HOWTO_DB.PUBLIC.CDC_POSTGRESDB_INVENTORY_PRODUCT_STREAM_REPLICATION') 79 | as 80 | merge into "HOWTO_DB"."PUBLIC"."REPLICA_POSTGRESDB_INVENTORY_PRODUCT" replica_table 81 | using 82 | (with 83 | prequery as (select RECORD_METADATA:key.payload.id id 84 | , to_number(RECORD_CONTENT:payload.source.lsn) sourcedb_lsn 85 | , RECORD_CONTENT:payload.after payload 86 | , RECORD_CONTENT:payload.op cdc_operation 87 | , RECORD_CONTENT:payload.source cdc_source_info 88 | , RECORD_CONTENT:payload.source.ts_ms ts_ms_sourcedb 89 | , RECORD_CONTENT:payload.ts_ms ts_ms_cdc 90 | from "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT_STREAM_REPLICATION"), 91 | rank_query as (select * 92 | , ROW_NUMBER() over (PARTITION BY id 93 | order by ts_ms_cdc desc, sourcedb_lsn desc) as row_num 94 | from prequery) 95 | select * from rank_query where row_num = 1) event_data 96 | on replica_table.id = to_number(event_data.id) 97 | when not matched and event_data.cdc_operation <> 'd' 98 | then insert 99 | (id, sourcedb_lsn, payload, cdc_operation, cdc_source_info, ts_ms_sourcedb 100 | , ts_ms_cdc, ts_ms_replica_sf) 101 | values 102 | (event_data.id, event_data.sourcedb_lsn, event_data.payload, event_data.cdc_operation 103 | , event_data.cdc_source_info, event_data.ts_ms_sourcedb, event_data.ts_ms_cdc 104 | , date_part(epoch_millisecond, CURRENT_TIMESTAMP)) 105 | when matched and event_data.cdc_operation = 'd' 106 | then delete 107 | when matched and event_data.cdc_operation <> 'd' 108 | then update set id=event_data.id 109 | , sourcedb_lsn=event_data.sourcedb_lsn 110 | , payload=event_data.payload 111 | , cdc_operation=event_data.cdc_operation 112 | , cdc_source_info=event_data.cdc_source_info 113 | , ts_ms_sourcedb=event_data.ts_ms_sourcedb 114 | , ts_ms_cdc=event_data.ts_ms_cdc 115 | , ts_ms_replica_sf=date_part(epoch_millisecond, CURRENT_TIMESTAMP); 116 | 117 | 118 | -- Enable task 119 | ALTER TASK "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT_TASK_REPLICATION" RESUME; 120 | 121 | -- Check info about the task executions 
(STATE and NEXT_SCHEDULED_TIME columns) 122 | -- If you see error "Cannot execute task , EXECUTE TASK privilege must be granted to owner role" 123 | -- review 00-security.sql script 124 | select * 125 | from table(HOWTO_DB.information_schema.task_history()) 126 | order by scheduled_time desc; 127 | 128 | 129 | -- Check counts (you don't see the same results in event table against the replica table) 130 | select to_char(RECORD_CONTENT:payload.op) cdc_operation, count(*), 'CDC_POSTGRESDB_INVENTORY_PRODUCT' table_name 131 | from "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT" group by RECORD_CONTENT:payload.op 132 | union all 133 | select cdc_operation, count(*), 'REPLICA_POSTGRESDB_INVENTORY_PRODUCT' table_name 134 | from "HOWTO_DB"."PUBLIC"."REPLICA_POSTGRESDB_INVENTORY_PRODUCT" group by cdc_operation 135 | order by table_name, cdc_operation; 136 | -------------------------------------------------------------------------------- /snowflake/status_sink.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CONNECT_URL=http://localhost:8085 4 | 5 | CONNECTORS=$(curl -s -k ${CONNECT_URL}/connectors) 6 | echo Connector list: 7 | echo $CONNECTORS 8 | echo 9 | 10 | echo Connector status: 11 | echo 12 | 13 | for row in $(echo "${CONNECTORS}" | jq -c -r '.[]'); do 14 | status=$(curl -s -k -X GET "${CONNECT_URL}/connectors/${row}/status") 15 | echo $status 16 | echo 17 | done 18 | 19 | --------------------------------------------------------------------------------