├── .gitignore
├── .images
│   ├── PostgreSQL_logo.3colors.120x120.png
│   ├── Snowflake_Logo.svg.png
│   ├── color_white_debezium_type_600px.svg
│   ├── database-tables.png
│   ├── debeziumio-ar21.svg
│   ├── horizontal-logo-monochromatic-white.png
│   ├── howto-flow.png
│   ├── logo-mysql-170x115.png
│   ├── snowflake-security.png
│   ├── solution.drawio.png
│   └── solution.png
├── LICENSE
├── README.md
├── articles
│   ├── .images
│   │   ├── docker-compose.png
│   │   ├── snowflake_console.png
│   │   ├── solution-capture-data-changes.png
│   │   ├── solution-debezium.png
│   │   ├── solution-kafka-to-snowflake.png
│   │   ├── solution-replication.png
│   │   ├── solution-sink-snowflake.png
│   │   ├── solution-solution-points.png
│   │   └── solution-solution.png
│   └── dzone_howto_building-an-enterprise-cdc-solution.md
├── database
│   ├── README.md
│   ├── init_db.sh
│   ├── mysql_crud.sh
│   ├── postgres_crud.sh
│   └── sql
│       ├── 00_mysql_init.sql
│       ├── 00_postgres_init.sql
│       ├── 01_mysql_changes.sql
│       └── 01_postgres_changes.sql
├── debezium
│   ├── README.md
│   ├── connect
│   │   ├── debezium-mysql-inventory-connector.json
│   │   └── debezium-postgres-inventory-connector.json
│   ├── delete_cdc.sh
│   ├── init_cdc.sh
│   └── status_cdc.sh
├── services
│   ├── .env
│   ├── README.md
│   ├── docker-compose.png
│   ├── docker-compose.yml
│   └── render_compose_image.sh
└── snowflake
    ├── README.md
    ├── connect
    │   └── snowflake-sink-connector.json
    ├── delete_sink.sh
    ├── init_sink.sh
    ├── keys
    │   ├── README.md
    │   ├── snowflake_rsa_key.p8
    │   ├── snowflake_rsa_key.pem
    │   └── snowflake_rsa_key.pub
    ├── sql
    │   ├── 00-security.sql
    │   ├── 01-cdc-to-replica-mysql.sql
    │   └── 01-cdc-to-replica-postgres.sql
    └── status_sink.sh
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | *.class
3 |
4 | database/data_mysql.csv
5 |
6 | database/data_postgres.csv
7 |
8 | services/.cache/
9 |
--------------------------------------------------------------------------------
/.images/PostgreSQL_logo.3colors.120x120.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/PostgreSQL_logo.3colors.120x120.png
--------------------------------------------------------------------------------
/.images/Snowflake_Logo.svg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/Snowflake_Logo.svg.png
--------------------------------------------------------------------------------
/.images/color_white_debezium_type_600px.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.images/database-tables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/database-tables.png
--------------------------------------------------------------------------------
/.images/debeziumio-ar21.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
75 |
--------------------------------------------------------------------------------
/.images/horizontal-logo-monochromatic-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/horizontal-logo-monochromatic-white.png
--------------------------------------------------------------------------------
/.images/howto-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/howto-flow.png
--------------------------------------------------------------------------------
/.images/logo-mysql-170x115.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/logo-mysql-170x115.png
--------------------------------------------------------------------------------
/.images/snowflake-security.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/snowflake-security.png
--------------------------------------------------------------------------------
/.images/solution.drawio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/solution.drawio.png
--------------------------------------------------------------------------------
/.images/solution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/.images/solution.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Debezium to Snowflake
2 |
3 | - [Debezium to Snowflake](#debezium-to-snowflake)
4 | - [Requirements](#requirements)
5 | - [Organization](#organization)
6 | - [How-to steps](#how-to-steps)
7 | - [I need more!!](#i-need-more)
8 |
9 | This repo is a demo of how to use Debezium to capture changes on tables in MySQL and PostgreSQL
10 | and generate a near-real-time replica in Snowflake. The approach is extensible to other databases and
11 | covers several common points about CDC, Kafka, Kafka Connect, and Snowflake tooling.
12 |
13 | [Miguel García] and I worked together on the DZone article [Data Platform: Building an Enterprise CDC Solution],
14 | and as a next step I published this repo as [HOWTO: Building an Enterprise CDC Solution].
15 |
16 | 
17 |
18 | ## Requirements
19 |
20 | To facilitate the execution of the howto, the services are deployed using **[docker-compose]**,
21 | which depends on the **[docker engine]**. For better compatibility, we use version 2 of the docker-compose specification,
22 | so **docker engine 1.10.0** or later should work.
23 |
24 | As part of the howto, you will create a Snowflake account, and the guide walks you through creating a key pair for authentication.
25 | To perform these actions, you need an **[OpenSSL toolkit]**. It is commonly available in Linux distributions and
26 | can be installed on Windows or macOS. If you prefer, you can run it inside a Docker image (as noted in the howto).
27 |
28 | For hardware requirements, review the **[docker engine]** documentation.
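
If you want to double-check the prerequisites before starting, a quick sanity check could look like this (a minimal sketch; the exact version strings will differ on your machine):

```sh
# Verify the local tooling used by this howto
docker --version            # docker engine 1.10.0 or later
docker-compose --version    # any release supporting the compose file format v2
openssl version             # used later to (optionally) generate your own key pair
jq --version                # used by the helper scripts to parse REST API responses
```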
29 |
30 | ## Organization
31 |
32 | Well, this demo has several parts. To keep things simple, it has been split into several folders in this repo.
33 | In each folder you can find a README file with explanations:
34 |
35 | - **[services]**: relative to docker images and services
36 | - **[database]**: sentences and scripts to run inside the local databases
37 | - **[debezium]**: configuration and scripts to start and check the status of Debezium connectors
38 | - **[snowflake]**: Snowflake scripts, and configuration of the Snowflake sink connector
39 |
40 | ## How-to steps
41 |
42 | You can see a detailed howto in the DZone article [HOWTO: Building an Enterprise CDC Solution], which follows these steps:
43 |
44 | 
45 |
46 | In this flow:
47 | - Gray: local services
48 | - Yellow: external resources
49 |
50 | ## I need more!!
51 |
52 | Well, check the README available in each folder. It includes some detail about its components
53 | and some additional scripts or functions that you can use to explore this solution.
54 |
55 | I hope this tutorial has been helpful for you and you have enjoyed it.
56 |
57 |
58 | [Miguel García]: https://dzone.com/users/4531976/miguelglor.html
59 | [Data Platform: Building an Enterprise CDC Solution]: https://dzone.com/articles/data-platform-building-an-enterprise-cdc-solution
60 | [HOWTO: Building an Enterprise CDC Solution]: https://dzone.com/articles/howto_building-an-enterprise-cdc-solution
61 | [docker-compose]: https://docs.docker.com/compose/install/
62 | [docker engine]: https://docs.docker.com/engine/
63 | [OpenSSL toolkit]: https://github.com/openssl/openssl#build-and-install
64 | [services]: services/README.md
65 | [database]: database/README.md
66 | [debezium]: debezium/README.md
67 | [snowflake]: snowflake/README.md
68 | [snowflake/keys README]: snowflake/keys
69 | [snowflake/sql/00-security.sql]: snowflake/sql/00-security.sql
70 | [snowflake/connect/snowflake-sink-connector.json]: snowflake/connect/snowflake-sink-connector.json
71 | [snowflake/sql/01-cdc-to-replica-mysql.sql]: snowflake/sql/01-cdc-to-replica-mysql.sql
72 | [snowflake/sql/01-cdc-to-replica-postgres.sql]: snowflake/sql/01-cdc-to-replica-postgres.sql
73 |
--------------------------------------------------------------------------------
/articles/.images/docker-compose.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/docker-compose.png
--------------------------------------------------------------------------------
/articles/.images/snowflake_console.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/snowflake_console.png
--------------------------------------------------------------------------------
/articles/.images/solution-capture-data-changes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-capture-data-changes.png
--------------------------------------------------------------------------------
/articles/.images/solution-debezium.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-debezium.png
--------------------------------------------------------------------------------
/articles/.images/solution-kafka-to-snowflake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-kafka-to-snowflake.png
--------------------------------------------------------------------------------
/articles/.images/solution-replication.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-replication.png
--------------------------------------------------------------------------------
/articles/.images/solution-sink-snowflake.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-sink-snowflake.png
--------------------------------------------------------------------------------
/articles/.images/solution-solution-points.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-solution-points.png
--------------------------------------------------------------------------------
/articles/.images/solution-solution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/articles/.images/solution-solution.png
--------------------------------------------------------------------------------
/articles/dzone_howto_building-an-enterprise-cdc-solution.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | This article is a follow-up to the [Data Platform: Building an Enterprise CDC Solution](https://dzone.com/articles/data-platform-building-an-enterprise-cdc-solution), where [Miguel García](https://dzone.com/users/4531976/miguelglor.html) and I described:
4 |
5 | * Several Change Data Capture (CDC) use cases and common scenarios in an enterprise platform
6 | * A proposal using Debezium (as log-based CDC) to capture data from the relational databases, and Kafka as a channel that enables several consumers to propagate data changes for different use cases.
7 |
8 | One of the common scenarios for this solution consists of data replication from OLTP Database to OLAP Database (from the operational database to the data warehouse).
9 |
10 | In this article, I'm going to provide a "how-to" to deploy a sample of a CDC process to replicate data from two different relational databases to Snowflake:
11 | * Manage the data changes in a common format.
12 | * Set up a Debezium in Kafka Connect to get data changes and push into Kafka topics.
13 | * Set up Snowflake Sink in Kafka Connect to get data changes from Kafka topics and push the data to Snowflake.
14 | * Apply a specific replication logic to consolidate the data change events in Snowflake, avoiding the use of the JDBC connector for better cost-effectiveness.
15 |
16 | 
17 |
18 | # Step-by-step
19 |
20 | [The GitHub repository](https://github.com/dariocazas/howto-debezium-to-snowflake) includes a detailed description as well as several scripts that you will need in this "how-to":
21 |
22 | ```sh
23 | git clone https://github.com/dariocazas/howto-debezium-to-snowflake.git
24 | ```
25 |
26 | > Note: every folder in this repository has a README file with more info about the process.
27 |
28 | The steps are:
29 |
30 | 1. Pre-requirements
31 | 1. Local environment
32 | 2. Snowflake database
33 | 3. Snowflake authentication
34 | 2. How to capture data changes from databases to a Kafka topic
35 | 1. Start local services
36 | 2. Prepare the databases
37 | 3. Start Debezium
38 | 4. Check data capture
39 | 3. How to push data changes from a Kafka topic into Snowflake
40 | 1. Start local sink process
41 | 2. Check data capture into CDC tables
42 | 3. Apply replication logic
43 | 4. Check data replication
44 |
45 | 
46 |
47 | ## 1. Pre-requirements
48 |
49 | ### 1.1. Local environment
50 | - [docker-compose](https://docs.docker.com/compose/install/) and [docker engine](https://docs.docker.com/engine/) 1.10.0 or later.
51 | - [jq](https://stedolan.github.io/jq/download/) as a JSON parser used in scripts.
52 |
53 | ### 1.2. Snowflake database
54 |
55 | You need a Snowflake account. To create a trial one, follow the [Snowflake Trial Accounts doc](https://docs.snowflake.com/en/user-guide/admin-trial-account.html).
56 |
57 | Log in to your Snowflake account and create a database to use for the next steps:
58 |
59 | ```sql
60 | USE ROLE ACCOUNTADMIN;
61 | CREATE DATABASE HOWTO_DB;
62 | ```
63 |
64 | > Note: in a production environment, it is not recommended to use the ACCOUNTADMIN role for all the tasks as I do in this howto.
65 |
66 | ### 1.3. Snowflake authentication
67 |
68 | In this howto, we use key-pair authentication. The detailed process is documented [here](https://docs.snowflake.com/en/user-guide/kafka-connector-install.html#using-key-pair-authentication-key-rotation). You can use the key pair provided by the repository:
69 | * Encrypted private key: `snowflake/keys/snowflake_rsa_key.p8`
70 | * Private passphrase to decrypt: `mypassphrase`
71 | * Public key: `snowflake/keys/snowflake_rsa_key.pub`
72 |
73 | As the next step, in the Snowflake Worksheet, we need to register the public key (replace the key in this script with the contents of your snowflake/keys/snowflake_rsa_key.pub, without the header and footer lines):
74 |
75 | ```sql
76 | USE ROLE ACCOUNTADMIN;
77 | ALTER USER dariocazas SET rsa_public_key='MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAwBwYbPtbEUXueQ6u3KDw
78 | zlKu4IhAkGdcUBVbdTdUVBLNVsZX+eiKOedN3EnMtDeVzRlaT8JAwHX0LVXkgXtn
79 | KzMBp6TpS4j+2kKvbZc5p0KfZHjn42G+C/DXI4ZNQZEBQ/Q4UY6OkTZepFaOX3ev
80 | 2icxB6LnnVYI3WHkSnq3vTthhYhTuUOQ4YRudadOtoT4By09hxbsaanVl42FXIZP
81 | AXX1jwawzKe52V1+FB5/UMv+JMUFfczlO+acn/EaZvKbR55Vk/+OVrUP4KIKvdWn
82 | s/n4ASYqxiw9xjrizGCoUyl+b+Ch6A02fTU02HrT9jOOj+dVAeFD2QGOqaze0eCD
83 | dwIDAQAB';
84 | ```
85 |
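If you prefer to generate your own key pair instead of reusing the one committed in `snowflake/keys`, the commands below are a sketch based on the Snowflake key-pair authentication docs (file names are chosen to match the repo; you will be prompted for a passphrase, which is the value you later use as the private key passphrase):

```sh
# Encrypted private key in PKCS#8 format, plus the matching public key
openssl genrsa 2048 | openssl pkcs8 -topk8 -v2 des3 -inform PEM -out snowflake_rsa_key.p8
openssl rsa -in snowflake_rsa_key.p8 -pubout -out snowflake_rsa_key.pub
```

The public key is what you paste (without header and footer) into the `ALTER USER ... SET rsa_public_key` statement above.
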
86 | ## 2. How to capture data changes from databases to a Kafka topic
87 |
88 | In this step, you start two different database engines and enable a CDC process. As a result, you have two Kafka topics with Debezium events that you can consume.
89 |
90 | 
91 |
92 | ### 2.1. Start local services
93 |
94 | The repository contains a docker-compose file to run several services in your local environment:
95 | * Two database engines: MySQL and PostgreSQL
96 | * One Kafka broker (and its zookeeper)
97 | * Two Kafka connect services: one to run CDC Debezium tasks and another to send the events to Snowflake
98 |
99 | 
100 |
101 | In a terminal run:
102 |
103 | ```sh
104 | cd services
105 | docker-compose up
106 | ```
107 |
108 | It can take several minutes to download and start the services. Keep this terminal open so you can see the service logs. When the tutorial is finished, you can stop everything with `Ctrl+C`.
109 |
110 | ### 2.2. Prepare the databases
111 |
112 | There are two SQL initialization scripts:
113 | * `database/sql/00_mysql_init.sql`: creates the `users` table
114 | * `database/sql/00_postgres_init.sql`: creates the `product` table
115 |
116 | To apply these SQL scripts to the dockerized services and populate data, run the following lines in a terminal:
117 |
118 | ```sh
119 | cd database
120 | # create tables
121 | ./init_db.sh
122 | # Populate data
123 | ./mysql_crud.sh
124 | ./postgres_crud.sh
125 | ```
126 |
127 | In the output, you can see several CRUD operations on the tables and the final state after those operations. You can close this terminal.
128 |
129 | ### 2.3. Start Debezium
130 |
131 | The docker service `cdc_connect` has the necessary dependencies to run Debezium over MySQL and Postgres. The configuration is available in:
132 | * `debezium/connect/debezium-mysql-inventory-connector.json`
133 | * `debezium/connect/debezium-postgres-inventory-connector.json`
134 |
135 | Open a terminal and start the capture of the tables:
136 | ```sh
137 | cd debezium
138 | ./init_cdc.sh
139 | ```
140 |
141 | In the docker-compose terminal, you can see how the connectors start. When the log stabilizes, you can check the status of the Debezium connectors in the previous terminal using:
142 | ```sh
143 | # I assume you are in the debezium folder
144 | ./status_cdc.sh
145 | ```
146 |
147 | You can close this terminal.
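
Under the hood, `init_cdc.sh` and `status_cdc.sh` are thin wrappers around the Kafka Connect REST API. If you prefer to call it directly, the sketch below assumes the CDC Connect REST API is published on `localhost:8083` (check `services/docker-compose.yml` for the actual port mapping) and uses a placeholder connector name:

```sh
cd debezium
# Register the MySQL connector from its JSON configuration
curl -i -X POST -H "Content-Type: application/json" \
  --data @connect/debezium-mysql-inventory-connector.json \
  http://localhost:8083/connectors

# List the connectors and check the status of one of them
curl -s http://localhost:8083/connectors | jq
curl -s http://localhost:8083/connectors/<connector-name>/status | jq
```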
148 |
149 | ### 2.4. Check data capture
150 |
151 | You can test if the capture is working with this strategy:
152 | * Open a terminal with live consumer events
153 | * Do CRUD operations in the database
154 |
155 | First for MySQL, open a terminal and run:
156 | ```sh
157 | cd services
158 | docker-compose exec kafka /kafka/bin/kafka-console-consumer.sh \
159 | --bootstrap-server kafka:9092 --from-beginning \
160 | --topic mysqldb.inventory.users \
161 | --property print.key=true --property print.timestamp=true
162 | ```
163 |
164 | The terminal will show every new event pushed by Debezium to Kafka, one for each insert/update/delete performed on `inventory.users` in MySQL.
165 |
166 | Now open another terminal and do the same for PostgreSQL:
167 | ```sh
168 | cd services
169 | docker-compose exec kafka /kafka/bin/kafka-console-consumer.sh \
170 | --bootstrap-server kafka:9092 --from-beginning \
171 | --topic postgresdb.inventory.product \
172 | --property print.key=true --property print.timestamp=true
173 | ```
174 |
175 | To generate new events, open a terminal and run:
176 | ```sh
177 | cd database
178 | ./mysql_crud.sh
179 | ./postgres_crud.sh
180 | ```
181 |
182 | You should see new data change events in the consumer terminals.
183 |
184 | ## 3. How to push data changes from a Kafka topic into Snowflake
185 |
186 | In this step, you send the Kafka events to Snowflake and generate a replica of the source tables.
187 |
188 | 
189 |
190 | ### 3.1. Start local sink process
191 |
192 | The docker service `sink_connect` has the necessary dependencies to run the Snowflake sink connector, which pushes new Kafka events into Snowflake tables. The configuration is available in `snowflake/connect/snowflake-sink-connector.json`, and you need to update (see the fragment below):
193 | * The Snowflake URL with yours, in the `snowflake.url.name` field
194 | * The authentication fields, if you generated your own key pair in the previous step: `snowflake.private.key` and `snowflake.private.key.passphrase`
195 |
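For reference, the fragment with the fields you typically need to touch might look like this (placeholder values; the rest of the file can stay as provided by the repo):

```json
{
  "snowflake.url.name": "<your_account>.snowflakecomputing.com:443",
  "snowflake.private.key": "<contents of snowflake_rsa_key.p8 without header, footer, and line breaks>",
  "snowflake.private.key.passphrase": "mypassphrase"
}
```
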
196 | Open a terminal and start the upload of the Kafka topics:
197 | ```sh
198 | cd snowflake
199 | ./init_sink.sh
200 | ```
201 |
202 | In the docker-compose terminal, you can see how the connector starts. When the log stabilizes, you can check the status of the Snowflake connector in the previous terminal using:
203 | ```sh
204 | # From the snowflake folder
205 | ./status_sink.sh
206 | ```
207 |
208 | ### 3.2. Check data capture into CDC tables
209 |
210 | When the sink connector uploads the events from the Kafka topics, it creates these tables:
211 | * `CDC_MYSQL_INVENTORY_USERS`
212 | * `CDC_POSTGRESDB_INVENTORY_PRODUCT`
213 |
214 | The upload to Snowflake is done in batches, so it may take some time until the data is available in Snowflake (on the order of 30-60 seconds).
215 |
216 | From your Snowflake Worksheet, validate that your events are populated in the new tables:
217 | ```sql
218 | USE ROLE ACCOUNTADMIN;
219 | USE SCHEMA HOWTO_DB.PUBLIC;
220 | SELECT * FROM CDC_MYSQL_INVENTORY_USERS;
221 | SELECT * FROM CDC_POSTGRESDB_INVENTORY_PRODUCT;
222 | ```
223 |
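The Snowflake Kafka connector stores each event in two VARIANT columns, `RECORD_METADATA` and `RECORD_CONTENT`, so you can also inspect the raw Debezium envelope. A hedged example (the exact JSON layout depends on the converter configuration, so adjust the path expressions if needed):

```sql
-- Peek at the Debezium envelope: operation code and the row state after the change
SELECT RECORD_METADATA:offset::NUMBER    AS kafka_offset,
       RECORD_CONTENT:payload:op::STRING AS op,
       RECORD_CONTENT:payload:after      AS after_state
FROM CDC_MYSQL_INVENTORY_USERS
LIMIT 10;
```
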
224 | Adding new changes in your dockerized databases produces new rows in your tables.
225 |
226 | 1. In the Snowflake Worksheet:
227 | ```sql
228 | SELECT 'Events MySQL', COUNT(1) FROM CDC_MYSQL_INVENTORY_USERS
229 | UNION ALL
230 | SELECT 'Events PostgreSQL', COUNT(1) FROM CDC_POSTGRESDB_INVENTORY_PRODUCT;
231 | ```
232 | 2. From a terminal, apply changes in your databases:
233 | ```sh
234 | cd database
235 | ./mysql_crud.sh
236 | ./postgres_crud.sh
237 | ```
238 | 3. Wait until the events are sent to Snowflake (you can see the log in the docker-compose terminal)
239 | 4. Repeat the query in the Snowflake Worksheet
240 |
241 | ### 3.3. Apply replication logic
242 |
243 | In the repository there are two scripts with the SQL logic to generate the replica of the source tables:
244 | * `snowflake/sql/01-cdc-to-replica-mysql.sql`
245 | * `snowflake/sql/01-cdc-to-replica-postgres.sql`
246 |
247 | From your Snowflake Worksheet, execute these two scripts. As a result, you have two views with the same structure as the source tables:
248 | * `MYSQL_INVENTORY_USERS`
249 | * `POSTGRESDB_INVENTORY_PRODUCT`
250 |
251 | These scripts follow the same logic, creating a scheduled task that processes the new events that arrive and updates the replica table.
252 |
253 | 
254 |
255 | > Note: one part of these SQL scripts (the MERGE statement) depends on the source database engine. The Debezium events carry metadata about the source engine, which is used to determine the latest event for each entity. Take this into account if you replicate this logic in your production systems.
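
The repo scripts contain the complete, engine-specific logic; the fragment below is only a highly simplified sketch of the stream + scheduled task + MERGE pattern they follow (object names, the warehouse, and the column list are hypothetical, and deduplication of multiple events per key is omitted):

```sql
-- Track new CDC rows as they land in the raw table
CREATE OR REPLACE STREAM CDC_MYSQL_INVENTORY_USERS_STREAM
  ON TABLE CDC_MYSQL_INVENTORY_USERS;

-- Periodically merge the captured changes into a replica table
CREATE OR REPLACE TASK REPLICATE_MYSQL_INVENTORY_USERS
  WAREHOUSE = COMPUTE_WH
  SCHEDULE = '1 minute'
AS
MERGE INTO MYSQL_INVENTORY_USERS_REPLICA dst
USING (
  SELECT RECORD_CONTENT:payload AS p FROM CDC_MYSQL_INVENTORY_USERS_STREAM
) src
ON dst.ID = COALESCE(src.p:after:id, src.p:before:id)::NUMBER
WHEN MATCHED AND src.p:op::STRING = 'd' THEN DELETE
WHEN MATCHED THEN UPDATE SET dst.NAME = src.p:after:name::STRING
WHEN NOT MATCHED AND src.p:op::STRING <> 'd' THEN
  INSERT (ID, NAME) VALUES (src.p:after:id::NUMBER, src.p:after:name::STRING);

ALTER TASK REPLICATE_MYSQL_INVENTORY_USERS RESUME;
```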
256 |
257 | ### 3.4. Check data replication
258 |
259 | The end-to-end flow is running now. You can check the data in your local databases and validate it against the Snowflake views:
260 | 1. In a terminal, get the current state of the MySQL users table:
261 | ```sh
262 | cd services
263 | echo "SELECT * FROM users ORDER BY id" | docker-compose \
264 | exec -T mysql \
265 | bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory'
266 | ```
267 | 2. Go to the Snowflake Worksheet and validate the result with:
268 | ```sql
269 | USE ROLE ACCOUNTADMIN;
270 | USE SCHEMA HOWTO_DB.PUBLIC;
271 | SELECT * FROM MYSQL_INVENTORY_USERS;
272 | ```
273 | 3. In a terminal, get the current state of the PostgreSQL product table:
274 | ```sh
275 | # I assume you are in the services folder
276 | echo "SELECT * FROM product ORDER BY id" | docker-compose \
277 | exec -T postgres \
278 | env PGOPTIONS="--search_path=inventory" \
279 | bash -c 'psql -U $POSTGRES_USER postgres'
280 | ```
281 | 4. And validate in the Snowflake Worksheet.
282 | ```sql
283 | USE ROLE ACCOUNTADMIN;
284 | USE SCHEMA HOWTO_DB.PUBLIC;
285 | SELECT * FROM POSTGRESDB_INVENTORY_PRODUCT;
286 | ```
287 | 5. Generate new insert-delete-update operations from a terminal:
288 | ```sh
289 | cd database
290 | ./mysql_crud.sh
291 | ./postgres_crud.sh
292 | ```
293 | 6. Wait until the events are sent to Snowflake (review docker-compose terminal log).
294 | 7. Wait until the scheduled task is triggered in Snowflake:
295 | ```sql
296 | USE ROLE ACCOUNTADMIN;
297 | select name, state, error_code, error_message,scheduled_time, next_scheduled_time
298 | from table(HOWTO_DB.information_schema.task_history())
299 | order by scheduled_time desc;
300 | ```
301 | 8. Validate the content of the tables in Snowflake again.
302 |
303 | # Conclusions
304 |
305 | **Debezium provides an easy way to capture changes from databases** and publish change events to a Kafka service, which you can consume in several ways.
306 |
307 | To propagate these changes to another database, you can take the simplest route, but that is not always the best option **in the context of the new generation of data warehouses**; you probably need to **consider a different strategy, closer to the target service**, for better performance and a lower cost of use.
308 |
309 | First of all, study the possibilities, and after **testing them with a POC similar to this howto**, including aspects like performance and cost review, **proceed with the next steps** (security, naming, automation, data quality, failover, ...).
310 |
--------------------------------------------------------------------------------
/database/README.md:
--------------------------------------------------------------------------------
1 | # Howto - Database description
2 |
3 |
4 | 
5 | 
6 |
7 | * [Access to database shell](#access-to-database-shell)
8 | * [Tables](#tables)
9 | * [CRUD operations](#crud-operations)
10 |
11 |
12 | As part of this howto, I provide:
13 |
14 | - SQL scripts to create new tables and data
15 | - Bash scripts to apply the SQL over the dockerized databases
16 |
17 | ## Access to database shell
18 |
19 | You can open your database's shell and run your own commands:
20 |
21 | ```sh
22 | # Go to services folder (important)
23 | cd howto-debezium-to-snowflake/services
24 |
25 | # Access to MySQL shell
26 | docker-compose exec mysql \
27 | bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory'
28 |
29 | # Access to Postgres shell
30 | docker-compose exec postgres \
31 | env PGOPTIONS="--search_path=inventory" \
32 | bash -c 'psql -U $POSTGRES_USER postgres'
33 | ```
34 |
35 | ## Tables
36 |
37 | Well, to simplify the howto, we use the database images provided by Debezium.
38 | When the database services are up, you should run this script:
39 |
40 | ```sh
41 | ./init_db.sh
42 | ```
43 |
44 | This script initializes tables in both database instances (MySQL and PostgreSQL)
45 | loaded from `./sql` folder.
46 |
47 | The SQL script [`sql/00_mysql_init.sql`](./sql/00_mysql_init.sql) creates the
48 | **users table** with five basic fields, common to many databases.
49 |
50 | The SQL script [`sql/00_postgres_init.sql`](./sql/00_postgres_init.sql) creates the
51 | **product table** with four basic fields, common to many databases.
52 |
53 | The `init_db.sh` script uses these SQL files to initialize the tables (one per database engine)
54 | in the preconfigured `inventory` database.
55 |
56 | Both tables have a `created_on` field with the timestamp of creation. This field
57 | is not necessary for CDC, but can be useful to perform some checks in the sink destination.
58 |
59 | ## CRUD operations
60 |
61 | Well, as part of the demo, you need to perform actions on the databases. For this reason,
62 | I provide two scripts:
63 |
64 | - `mysql_crud.sh`: triggers several inserts, updates, and deletes, and shows the final state of the **users** table
65 | - `postgres_crud.sh`: the same, but for the PostgreSQL **product** table
66 |
67 | You can launch these scripts over and over again to generate new data in the database,
68 | which via CDC will be replicated as events in Kafka.
69 |
70 |
--------------------------------------------------------------------------------
/database/init_db.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DOCKER_COMPOSE_FILE=docker-compose.yml
3 | DOCKER_COMPOSE_RELATIVE_PATH=../services
4 |
5 | MYSQL=`cat sql/00_mysql_init.sql`
6 | POSTGRES=`cat sql/00_postgres_init.sql`
7 |
8 | cd $DOCKER_COMPOSE_RELATIVE_PATH
9 |
10 | echo "MySQL new table"
11 | echo "$MYSQL"
12 | echo "$MYSQL" | docker-compose \
13 | -f $DOCKER_COMPOSE_FILE \
14 | exec -T mysql \
15 | bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory'
16 |
17 | echo "PostgreSQL new table"
18 | echo "$POSTGRES"
19 | echo "$POSTGRES" | docker-compose \
20 | -f $DOCKER_COMPOSE_FILE \
21 | exec -T postgres \
22 | env PGOPTIONS="--search_path=inventory" \
23 | bash -c 'psql -U $POSTGRES_USER postgres'
24 |
--------------------------------------------------------------------------------
/database/mysql_crud.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DOCKER_COMPOSE_FILE=docker-compose.yml
3 | DOCKER_COMPOSE_RELATIVE_PATH=../services
4 |
5 | run_sql() {
6 | echo "$1"
7 | echo "$1" | docker-compose \
8 | -f $DOCKER_COMPOSE_FILE \
9 | exec -T mysql \
10 | bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory 2> /dev/null'
11 | }
12 |
13 | DML=$(cat sql/01_mysql_changes.sql)
14 |
15 | cd $DOCKER_COMPOSE_RELATIVE_PATH
16 | run_sql "$DML"
17 |
--------------------------------------------------------------------------------
/database/postgres_crud.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DOCKER_COMPOSE_FILE=docker-compose.yml
3 | DOCKER_COMPOSE_RELATIVE_PATH=../services
4 |
5 | run_sql() {
6 | echo "$1"
7 | echo "$1" | docker-compose \
8 | -f $DOCKER_COMPOSE_FILE \
9 | exec -T postgres \
10 | env PGOPTIONS="--search_path=inventory" \
11 | bash -c 'psql -U $POSTGRES_USER postgres 2> /dev/null'
12 | }
13 |
14 | DML=$(cat sql/01_postgres_changes.sql)
15 |
16 | cd $DOCKER_COMPOSE_RELATIVE_PATH
17 | run_sql "$DML"
18 |
--------------------------------------------------------------------------------
/database/sql/00_mysql_init.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE users (
2 | id MEDIUMINT PRIMARY KEY AUTO_INCREMENT,
3 | name VARCHAR(20),
4 | email VARCHAR(255),
5 | password VARCHAR(100),
6 | created_on TIMESTAMP DEFAULT CURRENT_TIMESTAMP
7 | );
8 |
--------------------------------------------------------------------------------
/database/sql/00_postgres_init.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE product (
2 | id serial PRIMARY KEY,
3 | name VARCHAR(100),
4 | description VARCHAR(255),
5 | created_on TIMESTAMP NOT NULL DEFAULT NOW()
6 | );
7 |
--------------------------------------------------------------------------------
/database/sql/01_mysql_changes.sql:
--------------------------------------------------------------------------------
1 | -- Insert six users in three statements
2 | INSERT INTO users(name, email, password)
3 | SELECT 'Lara', concat('lara', LEFT(UUID(), 4), '@email.com'), LEFT(UUID(), 25)
4 | ;
5 | INSERT INTO users(name, email, password)
6 | SELECT 'Jackson', concat('jackson', LEFT(UUID(), 4), '@email.com'), LEFT(UUID(), 25)
7 | ;
8 | INSERT INTO users(name, email, password)
9 | SELECT name, concat(lower(name), LEFT(UUID(), 4), '@email.com'), LEFT(UUID(), 25)
10 | FROM (
11 | SELECT 'Hana' AS name
12 | UNION SELECT 'Morgan'
13 | UNION SELECT 'Willie'
14 | UNION SELECT 'Bruce'
15 | ) t;
16 | -- Update last two user passwords
17 | UPDATE users SET password=LEFT(UUID(), 10) ORDER BY id DESC LIMIT 2
18 | ;
19 | -- Update first user password
20 | UPDATE users SET password=LEFT(UUID(), 5) ORDER BY id LIMIT 1
21 | ;
22 | -- Delete last user
23 | DELETE FROM users ORDER BY id DESC LIMIT 1
24 | ;
25 | -- Show current state
26 | SELECT * FROM users ORDER BY id
27 | ;
--------------------------------------------------------------------------------
/database/sql/01_postgres_changes.sql:
--------------------------------------------------------------------------------
1 | -- Insert six products in two statements
2 | INSERT INTO product(name, description)
3 | SELECT name, concat('Description for ', name)
4 | FROM (
5 | VALUES ('Harley Davidson Ultimate Chopper'),
6 | ('1996 Moto Guzzi 1100i')
7 | ) t (name)
8 | ;
9 | INSERT INTO product(name, description)
10 | SELECT name, concat('Description for ', name)
11 | FROM (
12 | VALUES ('1985 Toyota Supra'),
13 | ('1957 Ford Thunderbird'),
14 | ('1938 Cadillac V-16 Presidential Limousine'),
15 | ('1982 Lamborghini Diablo')
16 | ) t (name)
17 | ;
18 | -- Update last two descriptions
19 | UPDATE product
20 | SET description=concat('(Update ', NOW(), ') - Desc. for ', name)
21 | WHERE id in (
22 | SELECT id FROM product ORDER BY id DESC LIMIT 2
23 | )
24 | ;
25 | -- Update first description
26 | UPDATE product
27 | SET description=concat('(Up. ', NOW(), ') - Desc. for ', name)
28 | WHERE id in (
29 | SELECT min(id) FROM product
30 | )
31 | ;
32 | -- Delete last product
33 | DELETE FROM product
34 | WHERE id in (
35 | SELECT id FROM product ORDER BY id DESC LIMIT 1
36 | )
37 | ;
38 | -- Show current state
39 | SELECT * FROM product ORDER BY id
40 | ;
--------------------------------------------------------------------------------
/debezium/README.md:
--------------------------------------------------------------------------------
1 | # Howto - CDC with Debezium
2 |
3 | 
4 |
5 | * [Usage](#usage)
6 | * [Context](#context)
7 | + [Change Events](#change-events)
8 | + [Connector actions](#connector-actions)
9 | + [Connectors config](#connectors-config)
10 | - [MySQL connector](#mysql-connector)
11 | - [PostgreSQL connector](#postgresql-connector)
12 | - [Secret management](#secret-management)
13 |
14 | As part of this howto, I provide:
15 |
16 | - Kafka connect configurations to capture changes from MySQL and PostgreSQL databases
17 | - Scripts to create, destroy and check the status of these connectors
18 |
19 | ## Usage
20 |
21 | This folder includes three scripts that perform actions against the docker service `cdc_connector`:
22 |
23 | - `init_cdc.sh`: takes the configurations available in the `./connect` folder and calls
24 | the Kafka Connect REST API to create the connectors that capture the changes
25 | in the databases and push them to Kafka
26 | - `status_cdc.sh`: calls the Kafka Connect REST API, gets the list of configured
27 | connectors, and shows the status of each one
28 | - `delete_cdc.sh`: similar to status, but deletes all the connectors in this
29 | Kafka Connect service
30 |
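These scripts are thin wrappers around the Kafka Connect REST API. A hedged sketch of the
underlying calls, assuming the REST API of this service is reachable on `localhost:8083`
(check `services/docker-compose.yml` for the real port mapping):

```sh
# init_cdc.sh: register a connector from its JSON configuration
curl -i -X POST -H "Content-Type: application/json" \
  --data @connect/debezium-mysql-inventory-connector.json \
  http://localhost:8083/connectors

# status_cdc.sh: list the connectors and show the status of each one
curl -s http://localhost:8083/connectors | jq
curl -s http://localhost:8083/connectors/<connector-name>/status | jq

# delete_cdc.sh: remove a connector
curl -i -X DELETE http://localhost:8083/connectors/<connector-name>
```
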
31 | With these scripts, you can perform your tests as you wish:
32 |
33 | - Create connectors before or after the tables exist or contain data
34 | - Destroy connectors, insert new data, and create them again to check for data loss
35 | - Whatever other test you want to do
36 |
37 | ## Context
38 |
39 | Kafka Connect lets you push events to Kafka from other systems (and pull events from Kafka
40 | to other systems) using only a configuration file, without developing a source/sink application.
41 |
42 | The Kafka connector plugin needs to be deployed on the Kafka Connect nodes (called
43 | worker nodes); after doing this, you can call a REST API with a configuration to
44 | enable a connector that pushes data from an external source to Kafka (as the CDC connector does for you)
45 | or pulls data from Kafka to other sink destinations.
46 |
47 | ### Change Events
48 |
49 | In Kafka, a topic can have one or more partitions. This enables parallel reads by consumers
50 | in the same consumer group. A consumer group is a set of consumers that see the topic as
51 | a queue: each consumer can pull events from several partitions, but one partition cannot
52 | have more than one consumer per consumer group. This is the main point needed to understand
53 | one part of the event: the key.
54 |
55 | An event has three parts:
56 | - Key:
57 | - By default, all events with the same key are pushed to the same partition.
58 | - It can be null; in that case, by default, events are distributed round-robin between partitions on push.
59 | - Value: the event data
60 | - Headers: a collection of key-value pairs that can be set
61 |
62 | Compared to the native CDC of each database, Debezium provides decoupling between the
63 | database engine and the events it emits, standardizing them and making them as uniform as possible.
64 |
65 | As the key, Debezium (and other change data capture tools) includes the key fields of the table.
66 |
67 | As a value, Debezium sends these common fields:
68 | - source: a metadata document about the connector and the source database
69 | - op: the operation code; can be `r` (read, snapshot), `c` (create, insert), `u` (update), `d` (delete)
70 | - after: a document with the data state after database operation
71 | - before: a document with the data state before database operation
72 |
73 |
74 | Example of a key serialized as JSON
75 |
76 | ```JSON
77 | {
78 | "payload": {
79 | "id": 1
80 | },
81 | "schema": {
82 | "fields": [
83 | {
84 | "field": "id",
85 | "optional": false,
86 | "type": "int32"
87 | }
88 | ],
89 | "name": "mysqldb.inventory.users.Key",
90 | "optional": false,
91 | "type": "struct"
92 | }
93 | }
94 | ```
95 |
96 |
97 |
98 |
99 | Example of a value serialized as JSON
100 |
101 | ```JSON
102 | {
103 | "payload": {
104 | "after": {
105 | "created_on": "2021-07-28T16:32:45Z",
106 | "email": "lara7012@email.com",
107 | "id": 1,
108 | "name": "Lara",
109 | "password": "701280aa-efc1-11eb-a7c9-0"
110 | },
111 | "before": null,
112 | "op": "c",
113 | "source": {
114 | "connector": "mysql",
115 | "db": "inventory",
116 | "file": "mysql-bin.000003",
117 | "gtid": null,
118 | "name": "mysqldb",
119 | "pos": 703,
120 | "query": null,
121 | "row": 0,
122 | "sequence": null,
123 | "server_id": 223344,
124 | "snapshot": "false",
125 | "table": "users",
126 | "thread": null,
127 | "ts_ms": 1627489965000,
128 | "version": "1.6.1.Final"
129 | },
130 | "transaction": null,
131 | "ts_ms": 1627489965300
132 | },
133 | "schema": {
134 | "fields": [
135 | {
136 | "field": "before",
137 | "fields": [
138 | {
139 | "field": "id",
140 | "optional": false,
141 | "type": "int32"
142 | },
143 | {
144 | "field": "name",
145 | "optional": true,
146 | "type": "string"
147 | },
148 | {
149 | "field": "email",
150 | "optional": true,
151 | "type": "string"
152 | },
153 | {
154 | "field": "password",
155 | "optional": true,
156 | "type": "string"
157 | },
158 | {
159 | "field": "created_on",
160 | "name": "io.debezium.time.ZonedTimestamp",
161 | "optional": true,
162 | "type": "string",
163 | "version": 1
164 | }
165 | ],
166 | "name": "mysqldb.inventory.users.Value",
167 | "optional": true,
168 | "type": "struct"
169 | },
170 | {
171 | "field": "after",
172 | "fields": [
173 | {
174 | "field": "id",
175 | "optional": false,
176 | "type": "int32"
177 | },
178 | {
179 | "field": "name",
180 | "optional": true,
181 | "type": "string"
182 | },
183 | {
184 | "field": "email",
185 | "optional": true,
186 | "type": "string"
187 | },
188 | {
189 | "field": "password",
190 | "optional": true,
191 | "type": "string"
192 | },
193 | {
194 | "field": "created_on",
195 | "name": "io.debezium.time.ZonedTimestamp",
196 | "optional": true,
197 | "type": "string",
198 | "version": 1
199 | }
200 | ],
201 | "name": "mysqldb.inventory.users.Value",
202 | "optional": true,
203 | "type": "struct"
204 | },
205 | {
206 | "field": "source",
207 | "fields": [
208 | {
209 | "field": "version",
210 | "optional": false,
211 | "type": "string"
212 | },
213 | {
214 | "field": "connector",
215 | "optional": false,
216 | "type": "string"
217 | },
218 | {
219 | "field": "name",
220 | "optional": false,
221 | "type": "string"
222 | },
223 | {
224 | "field": "ts_ms",
225 | "optional": false,
226 | "type": "int64"
227 | },
228 | {
229 | "default": "false",
230 | "field": "snapshot",
231 | "name": "io.debezium.data.Enum",
232 | "optional": true,
233 | "parameters": {
234 | "allowed": "true,last,false"
235 | },
236 | "type": "string",
237 | "version": 1
238 | },
239 | {
240 | "field": "db",
241 | "optional": false,
242 | "type": "string"
243 | },
244 | {
245 | "field": "sequence",
246 | "optional": true,
247 | "type": "string"
248 | },
249 | {
250 | "field": "table",
251 | "optional": true,
252 | "type": "string"
253 | },
254 | {
255 | "field": "server_id",
256 | "optional": false,
257 | "type": "int64"
258 | },
259 | {
260 | "field": "gtid",
261 | "optional": true,
262 | "type": "string"
263 | },
264 | {
265 | "field": "file",
266 | "optional": false,
267 | "type": "string"
268 | },
269 | {
270 | "field": "pos",
271 | "optional": false,
272 | "type": "int64"
273 | },
274 | {
275 | "field": "row",
276 | "optional": false,
277 | "type": "int32"
278 | },
279 | {
280 | "field": "thread",
281 | "optional": true,
282 | "type": "int64"
283 | },
284 | {
285 | "field": "query",
286 | "optional": true,
287 | "type": "string"
288 | }
289 | ],
290 | "name": "io.debezium.connector.mysql.Source",
291 | "optional": false,
292 | "type": "struct"
293 | },
294 | {
295 | "field": "op",
296 | "optional": false,
297 | "type": "string"
298 | },
299 | {
300 | "field": "ts_ms",
301 | "optional": true,
302 | "type": "int64"
303 | },
304 | {
305 | "field": "transaction",
306 | "fields": [
307 | {
308 | "field": "id",
309 | "optional": false,
310 | "type": "string"
311 | },
312 | {
313 | "field": "total_order",
314 | "optional": false,
315 | "type": "int64"
316 | },
317 | {
318 | "field": "data_collection_order",
319 | "optional": false,
320 | "type": "int64"
321 | }
322 | ],
323 | "optional": true,
324 | "type": "struct"
325 | }
326 | ],
327 | "name": "mysqldb.inventory.users.Envelope",
328 | "optional": false,
329 | "type": "struct"
330 | }
331 | }
332 | ```
333 |
334 |
335 |
336 | To maintain simplicity, this demo works with JSON events with the schema included in the event.
337 | In a non-test environment, the recommended approach is to use a Schema Registry to store the schemas
338 | and another serialization format, such as Avro, for the events.
339 |
340 | ### Connector actions
341 |
342 | On their first run, the connectors take an initial snapshot of the database (which is a configurable option).
343 | After that, every change applied to the tables these connectors listen to is tracked to Kafka. This includes:
344 | - When you add new rows, one event per inserted row
345 | - When you update rows,
346 | - One event per updated row
347 | - If an update affects the key of the table, Debezium emits a delete event followed by a new insert of the data
348 | - When you delete rows, two events per deleted row (configurable option):
349 | - One event with info about the DELETE operation
350 | - Another event with a null value (events in Kafka have key, value, and headers, and any of them can be null)
351 |
352 | Each event uses the table key as its key, which enables ordering guarantees. Kafka topics
353 | have properties that define data retention and cleanup policies:
354 | - Retention by time
355 | - Retention by size
356 | - Retention by compaction
357 |
358 | With compaction, when Kafka triggers the cleanup process, it keeps the last event for each key in the topic.
359 | If the last event for a key has a null value, Kafka removes all events for that key. With this approach,
360 | a new consumer that begins to read the topic does not have to download all changes since the origin of the replica:
361 | it first obtains the state of the table as of the last compaction, and then continues reading
362 | the changes captured since then.
363 |
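As an illustration of the compaction policy mentioned above, this is a hedged example of how you
could enable log compaction on one of the CDC topics using the Kafka tooling shipped in the `kafka`
container (the howto scripts do not do this for you; topic name and broker address follow the
docker-compose setup used elsewhere in this repo):

```sh
cd services
docker-compose exec kafka /kafka/bin/kafka-configs.sh \
  --bootstrap-server kafka:9092 --alter \
  --entity-type topics --entity-name mysqldb.inventory.users \
  --add-config cleanup.policy=compact
```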
364 |
365 | ### Connectors config
366 |
367 | The Kafka connectors have common configuration properties and others that depend on
368 | the Kafka connector plugin that you use. A FileStreamSource connector needs
369 | the configuration of the file to read, and a CDC connector needs info about the
370 | database that should be read: the configuration is not the same, but
371 | some parts are common:
372 | - name: all connectors should have a name to reference it
373 | - connector.class: the class that implements the connector, that may be a
374 | source (push external data to Kafka) or sink (pull data from Kafka to another system)
375 | - tasks.max: the maximum number of tasks that perform the source/sink action
376 |
377 | To review other common configurations, you can review [the official doc about kafka connect configuring].
378 |
379 | Another main point of the Kafka connector is the ability to do some basic transformations (called SMT)
380 | of the event, like add some field or change the event key. We don't perform this
381 | in this howto, but can be interested in some use cases.
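A quick way to explore what a Connect worker offers is its REST API. A minimal sketch, assuming the `cdc_connect` service is up on port 8083 and `jq` is installed:

```sh
# List the connector plugins installed in this Connect worker
curl -s http://localhost:8083/connector-plugins | jq

# Show the configuration of an already-created connector (after running init_cdc.sh)
curl -s http://localhost:8083/connectors/debezium-mysql-inventory-connector/config | jq
```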
382 |
383 | #### MySQL connector
384 |
385 | You can see all the documentation about this Kafka connector plugin in
386 | the [Debezium connector for MySQL] page.
387 |
388 | This connector supports several MySQL topologies, but this demo will track
389 | changes for a standalone MySQL server.
390 |
391 | When you start the connector, you can see three new topics:
392 |
393 | - `mysqldb`: schema change topic, with schema change events that include all DDL
394 | statements applied to databases in the MySQL server. The name of this topic matches
395 | the value of the `database.server.name` property
396 | - `mysqldb.schema-changes.inventory`: stores the database schema history, which the
397 | CDC connector needs for its internal management. You can configure the
398 | topic name in `database.history.kafka.topic`
399 | - `mysqldb.inventory.users`:
400 |   - If you have run the steps in [database readme], you should have a topic for this table
401 |   - This topic manages the change events for the `users` table
402 |
403 | You can see the connector config in [`connect/debezium-mysql-inventory-connector.json`](./connect/debezium-mysql-inventory-connector.json). You can also validate it against the Connect REST API, as shown after the property list below.
404 |
405 | - Connection properties:
406 | - `database.hostname`: IP address or hostname of the MySQL database server.
407 | - `database.port`: integer port number of the MySQL database server.
408 | - `database.user`: name of the MySQL user to use when connecting to the MySQL database server.
409 | - `database.password`: password to use when connecting to the MySQL database server.
410 | - `database.server.id`: a numeric ID of this database client, which must be unique across all
411 | currently-running database processes in the MySQL cluster. If not set, a random number will be used.
412 | - `database.server.name`: logical name that identifies and provides a namespace for the particular
413 | MySQL database server/cluster in which Debezium is capturing changes.
414 | - CDC properties:
415 | - `database.history.kafka.bootstrap.servers`: a list of host/port pairs that the connector uses for
416 | establishing an initial connection to the Kafka cluster. Each pair should point to the same Kafka
417 | cluster used by the Kafka Connect process.
418 | - `database.history.kafka.topic`: the full name of the Kafka topic where the connector stores the
419 | database schema history.
420 | - `database.include`: name of the database for which to capture changes. The connector does not capture
421 | changes in any database whose name is not in this property or `database.include.list`
422 | - `table.include.list`: an optional, comma-separated list of regular expressions that match
423 | fully-qualified table identifiers of tables whose changes you want to capture.
424 | The connector does not capture changes in any table not included in table.include.list.
425 |   - There are properties to exclude instead of include databases/tables, and many other
426 |     parameterized options. Review the [official doc](https://debezium.io/documentation/reference/connectors/mysql.html#mysql-connector-properties).
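Before creating the connector you can ask the Connect REST API to validate the configuration. A minimal sketch, assuming you run it from the `debezium` folder and have `jq` installed:

```sh
# Extract the "config" object from the connector file and send it to the
# validation endpoint of the MySQL connector plugin; error_count should be 0.
jq .config connect/debezium-mysql-inventory-connector.json | \
  curl -s -X PUT \
    http://localhost:8083/connector-plugins/io.debezium.connector.mysql.MySqlConnector/config/validate \
    -H "Content-Type: application/json" \
    -d @- | jq '.error_count'
```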
427 |
428 | #### PostgreSQL connector
429 |
430 | You can see all the documentation about this Kafka connector plugin in
431 | the [Debezium connector for PostgreSQL] page.
432 |
433 | In this case, when you start the connector you only see one new topic:
434 | - `postgresdb.inventory.product`:
435 |   - If you have run the steps in [database readme], you should have a topic for this table
436 |   - This topic manages the change events for the `product` table
437 |
438 | If you review the properties used, they are very similar to the MySQL connector's, so no new description is needed.
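As with MySQL, you can peek at the change events from the topic. A minimal sketch, assuming you run it from the `services` folder:

```sh
# Consume the PostgreSQL CDC topic from the beginning, printing keys
docker-compose -f docker-compose.yml exec kafka /kafka/bin/kafka-console-consumer.sh \
    --bootstrap-server kafka:9092 --from-beginning \
    --topic postgresdb.inventory.product \
    --property print.key=true
```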
439 |
440 | #### Secret management
441 |
442 | It is a good practice to keep your secrets out of the connector configs. You can review [KIP-297] to
443 | reference them through an external config provider.
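As an illustration only (this howto does not wire it up), a minimal sketch of the `FileConfigProvider` approach from KIP-297; the file path used here is hypothetical:

```sh
# 1) Put the secret in a properties file readable only by the Connect worker
cat > /tmp/mysql-secrets.properties <<'EOF'
password=dbz
EOF

# 2) Enable the provider in the Connect worker configuration:
#      config.providers=file
#      config.providers.file.class=org.apache.kafka.common.config.provider.FileConfigProvider
#
# 3) Reference the secret from the connector config instead of the literal value:
#      "database.password": "${file:/tmp/mysql-secrets.properties:password}"
```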
444 |
445 |
446 | [database readme]: ../database/README.md
447 | [docker readme]: ../services/README.md
448 | [Debezium connector for MySQL]: https://debezium.io/documentation/reference/connectors/mysql.html
449 | [Debezium connector for PostgreSQL]: https://debezium.io/documentation/reference/connectors/postgresql.html
450 | [the official doc about kafka connect configuring]: https://kafka.apache.org/documentation.html#connect_configuring
451 | [KIP-297]: https://cwiki.apache.org/confluence/display/KAFKA/KIP-297%3A+Externalizing+Secrets+for+Connect+Configurations
--------------------------------------------------------------------------------
/debezium/connect/debezium-mysql-inventory-connector.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "debezium-mysql-inventory-connector",
3 | "config": {
4 | "connector.class": "io.debezium.connector.mysql.MySqlConnector",
5 | "tasks.max": "1",
6 | "database.hostname": "mysql",
7 | "database.port": "3306",
8 | "database.user": "debezium",
9 | "database.password": "dbz",
10 | "database.server.id": "184054",
11 | "database.server.name": "mysqldb",
12 | "database.include": "inventory",
13 | "database.history.kafka.bootstrap.servers": "kafka:9092",
14 | "database.history.kafka.topic": "mysqldb.schema-changes.inventory",
15 | "table.include.list": "inventory.users"
16 | }
17 | }
--------------------------------------------------------------------------------
/debezium/connect/debezium-postgres-inventory-connector.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "debezium-postgres-inventory-connector",
3 | "config": {
4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector",
5 | "tasks.max": "1",
6 | "database.hostname": "postgres",
7 | "database.port": "5432",
8 | "database.user": "postgres",
9 | "database.password": "postgres",
10 | "database.dbname": "postgres",
11 | "database.server.name": "postgresdb",
12 | "schema.include": "inventory",
13 | "table.include.list": "inventory.product"
14 | }
15 | }
--------------------------------------------------------------------------------
/debezium/delete_cdc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CONNECT_URL=http://localhost:8083
4 |
5 | CONNECTORS=$(curl -s -k ${CONNECT_URL}/connectors)
6 | echo Connector list:
7 | echo $CONNECTORS
8 | echo
9 |
10 | for row in $(echo "${CONNECTORS}" | jq -c -r '.[]'); do
11 | status=$(curl -s -k -X DELETE "${CONNECT_URL}/connectors/${row}")
12 | echo Deleted ${row}
13 | done
14 |
--------------------------------------------------------------------------------
/debezium/init_cdc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Based on https://github.com/debezium/debezium-examples/tree/master/tutorial
4 |
5 | CONNECT_URL=http://localhost:8083
6 | MYSQL_CONNECT_CONFIG=connect/debezium-mysql-inventory-connector.json
7 | POSTGRES_CONNECT_CONFIG=connect/debezium-postgres-inventory-connector.json
8 |
9 | echo "### Creating MySQL CDC connect ###"
10 | curl -i -X POST $CONNECT_URL/connectors \
11 | -H "Content-Type:application/json" \
12 | -d @$MYSQL_CONNECT_CONFIG
13 | echo .
14 |
15 | echo "### Creating Postgres CDC connect ###"
16 | curl -i -X POST $CONNECT_URL/connectors \
17 | -H "Accept:application/json" \
18 | -H "Content-Type:application/json" \
19 | -d @$POSTGRES_CONNECT_CONFIG
20 | echo .
21 |
22 |
23 |
--------------------------------------------------------------------------------
/debezium/status_cdc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CONNECT_URL=http://localhost:8083
4 |
5 | CONNECTORS=$(curl -s -k ${CONNECT_URL}/connectors)
6 | echo Connector list:
7 | echo $CONNECTORS
8 | echo
9 |
10 | echo Connector status:
11 | echo
12 |
13 | for row in $(echo "${CONNECTORS}" | jq -c -r '.[]'); do
14 | status=$(curl -s -k -X GET "${CONNECT_URL}/connectors/${row}/status")
15 | echo $status
16 | echo
17 | done
18 |
--------------------------------------------------------------------------------
/services/.env:
--------------------------------------------------------------------------------
1 | DEBEZIUM_VERSION=1.6
2 | COMPOSE_PROJECT_NAME=howto-debezium-to-snowflake-${DEBEZIUM_VERSION}
3 | CONFLUENT_VERSION=5.5.5
4 |
--------------------------------------------------------------------------------
/services/README.md:
--------------------------------------------------------------------------------
1 | # Services
2 |
3 | 
4 |
5 | * [Usage](#usage)
6 | * [Context](#context)
7 | + [Docker-compose detail](#docker-compose-detail)
8 | + [Access to containers](#access-to-containers)
9 | - [Kafka commands](#kafka-commands)
10 | - [Database commands](#database-commands)
11 | * [References](#references)
12 |
13 | As part of this howto, I provide:
14 |
15 | - A docker-compose to run it
16 | - The Snowflake keys used by the sink connector (stored in the `../snowflake/keys` folder)
17 | - `.env` file with product versions
18 |
19 | ## Usage
20 |
21 | You can run it with a single command and see all logs in your terminal. Clone this repository and go to the
22 | `services` folder to run it:
23 | ```sh
24 | git clone https://github.com/dariocazas/howto-debezium-to-snowflake.git
25 | cd howto-debezium-to-snowflake/services
26 | docker-compose up
27 | ```
28 |
29 | You can stop this using `Ctrl+C`
30 |
31 | **It is important** to run the command from the `services` folder, because docker-compose uses the `.env` file available there
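If you prefer to keep your terminal free, a minimal variant using standard docker-compose commands:

```sh
# Start everything in the background
docker-compose up -d

# Check container status and follow the CDC connect logs
docker-compose ps
docker-compose logs -f cdc_connect

# Stop and remove the containers when you finish
docker-compose down
```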
32 |
33 | ## Context
34 |
35 | ### Docker-compose detail
36 |
37 | The compose YML runs several images and exposes several ports. For simplicity, I use Debezium images for many parts:
38 |
39 | - **mysql**: database instance provided by the Debezium team
40 | - **postgres**: database instance provided by the Debezium team
41 | - **zookeeper**: as part of the Kafka ecosystem
42 | - **kafka**: single Kafka broker, exposing its 9092 port
43 | - **cdc_connect**: Kafka connect worker node, provided by the Debezium team, with the connector plugins for its supported databases
44 | - **sink_connect**: Kafka connect worker node, provided by Confluent. I include the installation of the Snowflake connector plugin
45 |
46 | 
47 |
48 | ### Access to containers
49 |
50 | Inside the docker-compose file, you can see several commands to access the containers.
51 | You should run these commands from the `services` folder (so docker-compose can read the `.env` file)
52 |
53 | #### Kafka commands
54 |
55 | ```sh
56 | # List topics
57 | docker-compose -f docker-compose.yml exec kafka /kafka/bin/kafka-topics.sh --bootstrap-server kafka:9092 --list
58 |
59 | # Show all CDC MySQL data (including keys for the events)
60 | docker-compose -f docker-compose.yml exec kafka /kafka/bin/kafka-console-consumer.sh \
61 | --bootstrap-server kafka:9092 --from-beginning \
62 | --topic mysqldb.inventory.users
63 |
64 | # Show all CDC MySQL data (including keys for the events and the timestamp at which the event was received in Kafka)
65 | docker-compose -f docker-compose.yml exec kafka /kafka/bin/kafka-console-consumer.sh \
66 | --bootstrap-server kafka:9092 --from-beginning \
67 | --topic mysqldb.inventory.users \
68 | --property print.key=true --property print.timestamp=true
69 |
70 | # Show all CDC Postgres data
71 | docker-compose -f docker-compose.yml exec kafka /kafka/bin/kafka-console-consumer.sh \
72 | --bootstrap-server kafka:9092 --from-beginning \
73 | --topic postgresdb.inventory.product
74 | ```
75 |
76 | #### Database commands
77 |
78 | ```sh
79 | # Access to MySQL shell
80 | docker-compose -f docker-compose.yml exec mysql \
81 | bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory'
82 |
83 | # Access to Postgres shell
84 | docker-compose -f docker-compose.yml exec postgres \
85 | env PGOPTIONS="--search_path=inventory" \
86 | bash -c 'psql -U $POSTGRES_USER postgres'
87 | ```
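To quickly compare the row counts of the source tables with what you see in the topics, a minimal sketch (the `inventory.users` and `inventory.product` tables are the ones used across this howto):

```sh
# Count rows in the MySQL users table
docker-compose -f docker-compose.yml exec mysql \
    bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory -e "select count(*) from users;"'

# Count rows in the Postgres product table
docker-compose -f docker-compose.yml exec postgres \
    bash -c 'psql -U $POSTGRES_USER postgres -c "select count(*) from inventory.product;"'
```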
88 |
89 | ## References
90 |
91 | - [Debezium tutorial](https://debezium.io/documentation/reference/1.6/tutorial.html)
92 | - [Debezium images github](https://github.com/debezium/docker-images)
93 | - [Confluent: kafka connect zero to hero](https://github.com/confluentinc/demo-scene/tree/master/kafka-connect-zero-to-hero)
94 | - [Docker compose graph visualization](https://github.com/pmsipilot/docker-compose-viz)
--------------------------------------------------------------------------------
/services/docker-compose.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dariocazas/howto-debezium-to-snowflake/a2d8a9689929f2025cb6113af355e6ad77b77046/services/docker-compose.png
--------------------------------------------------------------------------------
/services/docker-compose.yml:
--------------------------------------------------------------------------------
1 | # Based on https://debezium.io/documentation/reference/1.5/tutorial.html
2 | # Run as:
3 | # docker-compose up
4 | ---
5 | version: "2"
6 | services:
7 |
8 | zookeeper:
9 | image: debezium/zookeeper:${DEBEZIUM_VERSION}
10 | ports:
11 | - 2181:2181
12 | - 2888:2888
13 | - 3888:3888
14 |
15 | # You can list the existing topics with
16 | # docker-compose exec kafka /kafka/bin/kafka-topics.sh --bootstrap-server kafka:9092 --list
17 | # After start CDC, you can consume the events using this command
18 | # MySQL:
19 | # docker-compose exec kafka /kafka/bin/kafka-console-consumer.sh --bootstrap-server kafka:9092 --from-beginning --property print.key=true --topic mysqldb.inventory.users
20 | # Postgres:
21 | # docker-compose exec kafka /kafka/bin/kafka-console-consumer.sh --bootstrap-server kafka:9092 --from-beginning --property print.key=true --topic postgresdb.inventory.product
22 | kafka:
23 | image: debezium/kafka:${DEBEZIUM_VERSION}
24 | ports:
25 | - 9092:9092
26 | environment:
27 | #ADVERTISED_HOST_NAME: localhost
28 | ZOOKEEPER_CONNECT: zookeeper:2181
29 | depends_on:
30 | - zookeeper
31 | links:
32 | - zookeeper:zookeeper
33 |
34 | # docker-compose exec mysql bash -c 'mysql -u $MYSQL_USER -p$MYSQL_PASSWORD inventory'
35 | mysql:
36 | image: debezium/example-mysql:${DEBEZIUM_VERSION}
37 | ports:
38 | - 3306:3306
39 | environment:
40 | MYSQL_ROOT_PASSWORD: debezium
41 | MYSQL_USER: mysqluser
42 | MYSQL_PASSWORD: mysqlpw
43 |
44 | # docker-compose exec postgres env PGOPTIONS="--search_path=inventory" bash -c 'psql -U $POSTGRES_USER postgres'
45 | postgres:
46 | image: debezium/example-postgres:${DEBEZIUM_VERSION}
47 | ports:
48 | - 5432:5432
49 | environment:
50 | POSTGRES_USER: postgres
51 | POSTGRES_PASSWORD: postgres
52 |
53 | cdc_connect:
54 | image: debezium/connect:${DEBEZIUM_VERSION}
55 | ports:
56 | - 8083:8083
57 | environment:
58 | BOOTSTRAP_SERVERS: kafka:9092
59 | GROUP_ID: cdc_connect_group
60 | REST_PORT: 8083
61 | REST_ADVERTISED_HOST_NAME: localhost
62 | CONFIG_STORAGE_TOPIC: my_cdc_connect_configs
63 | OFFSET_STORAGE_TOPIC: my_cdc_connect_offsets
64 | STATUS_STORAGE_TOPIC: my_cdc_connect_statuses
65 | CONFIG_STORAGE_REPLICATION_FACTOR: "1"
66 | OFFSET_STORAGE_REPLICATION_FACTOR: "1"
67 | STATUS_STORAGE_REPLICATION_FACTOR: "1"
68 | depends_on:
69 | - zookeeper
70 | - kafka
71 | - mysql
72 | - postgres
73 | links:
74 | - zookeeper:zookeeper
75 | - kafka:kafka
76 | - mysql:mysql
77 | - postgres:postgres
78 |
79 | sink_connect:
80 | image: confluentinc/cp-kafka-connect-base:${CONFLUENT_VERSION}
81 | ports:
82 | - 8085:8085
83 | environment:
84 | CONNECT_BOOTSTRAP_SERVERS: kafka:9092
85 | CONNECT_REST_PORT: 8085
86 | CONNECT_REST_ADVERTISED_HOST_NAME: "localhost"
87 | CONNECT_GROUP_ID: sink_connect_group
88 | CONNECT_CONFIG_STORAGE_TOPIC: my_sink_connect_configs
89 | CONNECT_OFFSET_STORAGE_TOPIC: my_sink_connect_offsets
90 | CONNECT_STATUS_STORAGE_TOPIC: my_sink_connect_statuses
91 | CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: "1"
92 | CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: "1"
93 | CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: "1"
94 | CONNECT_KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter
95 | CONNECT_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter
96 | DEBEZIUM_VERSION: ${DEBEZIUM_VERSION}
97 | depends_on:
98 | - zookeeper
99 | - kafka
100 | links:
101 | - zookeeper:zookeeper
102 | - kafka:kafka
103 | # https://github.com/confluentinc/demo-scene/blob/master/kafka-connect-zero-to-hero/docker-compose.yml#L89-L101
104 | command:
105 | - bash
106 | - -c
107 | - |
108 | echo "Installing Connector"
109 | confluent-hub install --no-prompt snowflakeinc/snowflake-kafka-connector:1.5.5
110 | #
111 | echo "Launching Kafka Connect worker"
112 | /etc/confluent/docker/run &
113 | #
114 | sleep infinity
115 |
--------------------------------------------------------------------------------
/services/render_compose_image.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # This script updates the PNG with the services of the docker-compose.yml
4 | # It is based on https://github.com/pmsipilot/docker-compose-viz
5 |
6 | docker run --rm -it --name dcv -v $(pwd):/input pmsipilot/docker-compose-viz render -m image docker-compose.yml --force
7 |
--------------------------------------------------------------------------------
/snowflake/README.md:
--------------------------------------------------------------------------------
1 | # Snowflake
2 |
3 | 
4 |
5 | * [Sink to Snowflake scripts](#sink-to-snowflake-scripts)
6 | + [Snowflake scripts](#snowflake-scripts)
7 | * [Context](#context)
8 | + [Sink connector](#sink-connector)
9 | + [Snowflake security](#snowflake-security)
10 | + [Snowflake resource naming used](#snowflake-resource-naming-used)
11 |   + [Snowflake CDC Debezium table](#snowflake-cdc-debezium-table)
12 | + [Snowflake replica table](#snowflake-replica-table)
13 | + [The final view](#the-final-view)
14 |
15 | As part of this howto, I provide:
16 |
17 | - Kafka connect configurations to push event changes from CDC topics to Snowflake
18 | - Scripts to create, destroy and check the status of these connectors
19 | - Snowflake SQL scripts with replica transformation of the change event tables
20 |
21 | ## Sink to Snowflake scripts
22 |
23 | This folder includes three bash scripts that perform actions against the docker service `sink_connect`:
24 |
25 | - `init_sink.sh`: takes the configuration available in the `./connect/snowflake-sink-connector.json` file and calls
26 | the Kafka connect REST API to create the connector that sinks the CDC topics to Snowflake event tables
27 | - `status_sink.sh`: calls the Kafka connect REST API, gets the list of configured
28 | connectors, and shows the status of each one
29 | - `delete_sink.sh`: similar to status, but deletes all the connectors in this
30 | Kafka connect service
31 |
32 | **IMPORTANT**: you MUST change several parameters in the `./connect/snowflake-sink-connector.json` file:
33 | - `snowflake.url.name`: the entry point for your Snowflake environment
34 | - `snowflake.user.name`: your user name
35 | - `snowflake.private.key`: your private key (the body of the generated key, without header and footer)
36 | - `snowflake.private.key.passphrase`: not needed in this demo, because the generated key isn't encrypted
37 |
38 | It is a good practice to keep your secrets out of the connector configs. You can review [KIP-297] to
39 | reference them through an external config provider.
40 |
41 | With these scripts, you can run whatever tests you wish, for example the sequence shown below:
42 |
43 | - Create the connector before or after the topics exist or have data
44 | - Destroy the connector, insert new data, and create it again to check for data loss
45 | - Whatever other test you can think of
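A minimal sketch of one such cycle, assuming the docker services are up and that you run the scripts from the `snowflake` folder:

```sh
# Create the sink connector and check that it is RUNNING
./init_sink.sh
./status_sink.sh

# ... insert / update / delete some rows in the source databases ...

# Remove the connector; new changes keep accumulating in the Kafka topics
./delete_sink.sh

# Create it again and compare the data that arrives in Snowflake with the source tables
./init_sink.sh
```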
46 |
47 | ### Snowflake scripts
48 |
49 | Configure your Snowflake account replication with:
50 |
51 | - `sql/00-security.sql`: you partially ran it when following the [snowflake/keys] README. The script is documented.
52 | - `sql/01-cdc-to-replica-mysql.sql`: creates a view similar to the original MySQL table, plus everything needed to replicate
53 | the events uploaded to Snowflake
54 | - `sql/01-cdc-to-replica-postgres.sql`: like the MySQL script, but for the PostgreSQL table
55 |
56 | ## Context
57 |
58 | ### Sink connector
59 |
60 | If you reviewed the [debezium] detail, you already have context about Kafka connect
61 | and how to configure it. As you can see, [this connector] is very similar:
62 |
63 | - Common connector parts (name, connector class, ...)
64 | - Snowflake connection properties and destination definition
65 |   - You should configure your Snowflake account (url, user, keys...)
66 |   - It is recommended to apply a topic2table mapping
67 | - Other configs:
68 |   - `key.converter`:
69 |     - Tells the connector how to interpret the key of the events received from the topics.
70 |     - You can use a generic JsonConverter, but Snowflake offers its own implementation, which supports some additional options
71 |   - `value.converter`: like the `key.converter`, but focused on the value of the event
72 |   - `behavior.on.null.values`
73 |     - Specific property of the Snowflake converters (generic alternatives exist)
74 |     - The [debezium] detail explains how Debezium transforms DELETE actions
75 |     into two events (one with the delete operation, and another with a `null` value)
76 |     - A `null` value makes sense in a Kafka context, but not for a database like Snowflake; for this reason, configure it as `IGNORE`:
77 |     these events will not be uploaded to Snowflake
78 |
79 | ### Snowflake security
80 |
81 | For simplicity, this demo should be run with the SYSADMIN role in Snowflake, after granting this role the privilege to execute TASKs.
82 |
83 | ### Snowflake resource naming used
84 |
85 | In this demo:
86 | - All resources include the topic name in upper case, replacing the `.` with `_`
87 | - The Debezium events are ingested into tables with the prefix `CDC_`
88 | - The tables with the replica of the state use the prefix `REPLICA_`
89 | - The streams (listeners over changes in Snowflake tables) used to batch new events for replication end with `_STREAM_REPLICATION`
90 | - The tasks in charge of triggering the replica end with `_TASK_REPLICATION`
91 |
92 | ### Snowflake CDC Debezium table
93 |
94 | In the configuration of the sink Kafka connector, you specify the database, schema, and table where the events are populated.
95 | The tables all have the same format, with two columns:
96 | - `RECORD_METADATA`: variant column with a JSON that includes info about the original topic and the key of the event
97 | - `RECORD_CONTENT`: variant column with a JSON that contains the value of the event
98 |
99 | About the key and the value, this demo works with JSON serialization without a schema registry. The events generated by
100 | the CDC include the JSON schema of the event. If you check, `RECORD_CONTENT` holds the same value that
101 | you see as the event value in the Kafka topic. The `RECORD_METADATA` column includes:
102 |
103 | - CreateTime: when Kafka received the event
104 | - topic: the name of the source topic
105 | - partition: the number of the topic partition that contains the event
106 | - offset: the position of the event in the partition
107 | - key: the event key
108 |
109 | ```json
110 | {
111 | "CreateTime": 1627490826351,
112 | "topic": "mysqldb.inventory.users",
113 | "partition": 0,
114 | "offset": 12,
115 | "key": {
116 | "payload": {
117 | "id": 1
118 | },
119 | "schema": {
120 | "fields": [
121 | {
122 | "field": "id",
123 | "optional": false,
124 | "type": "int32"
125 | }
126 | ],
127 | "name": "mysqldb.inventory.users.Key",
128 | "optional": false,
129 | "type": "struct"
130 | }
131 | }
132 | }
133 |
134 | ```
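If you want to flatten some of these metadata fields with plain SQL, a minimal sketch (assuming the SnowSQL CLI is configured for your account; you can paste the query into the web console instead):

```sh
# Project a few metadata fields from the variant column of the event table
snowsql -q 'select record_metadata:topic::string      as topic,
                   record_metadata:CreateTime::number as create_time,
                   record_metadata:key.payload.id     as key_id
            from HOWTO_DB.PUBLIC.CDC_MYSQL_INVENTORY_USERS
            limit 5;'
```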
135 |
136 | You can use this table as the historical evolution of the source table, which can be useful for analytical purposes.
137 |
138 | ### Snowflake replica table
139 |
140 | One of the objectives of this demo is to replicate the state of the source databases in Snowflake. This can be done
141 | not only with Snowflake (you can populate the topic data into another database via a JDBC sink connector), but in the case
142 | of Snowflake there are several points to consider that add a bit of extra complexity.
143 |
144 | When you perform a replica using a JDBC connector, the order of the operations is directly the order in which you read
145 | from the topic. But in Snowflake, you process a batch of information (or the partial/entire event table while you
146 | don't have a task doing it). In this case, you need to sort the events and take the last one for each key.
147 |
148 | The replication script does these actions:
149 | - Create the replication table
150 | - Create a view over the replication table (to see the same structure as the original database table)
151 | - Create a stream over the event table (in our case, to capture newly ingested rows)
152 | - Merge the current event table into the replication table
153 | - Create a task with the `MERGE INTO` statement, reading from the stream (not from the event table)
154 | - Enable the task (which runs every minute)
155 | - And other useful check statements
156 |
157 | It is important (to avoid losing data) to create the stream before running the `MERGE INTO` statement over
158 | the event table (I assume that you are ingesting data before creating the replication table).
159 |
160 | The `MERGE INTO` statement includes:
161 | - Projection of the fields important for the process (not from a functional data perspective). This includes:
162 |   - Fields used for sorting the events (binlog, lsn, ...)
163 |   - The functional data (payload of the event)
164 |   - The CDC operation (read, insert, update, delete)
165 |   - Metadata about the CDC process (the source field of the Debezium change event), useful for traceability
166 |   - Some fields useful to calculate latencies
167 | - Sorting the input. This operation depends on your source database engine and its configuration:
168 |   - For MySQL, different topologies exist. In our demo we use a standalone server and build a binlog sequence
169 |   from filename and position to sort the events
170 |   - For PostgreSQL, the lsn id is used
171 | - Taking the last operation for each key
172 |   - You should guarantee that the query returns only one row per key
173 |   - If the merge operation matches several source rows to one target row, the operation is not deterministic and may apply any of them
174 | - Checking whether the key of the source row matches a row in the target (replica) table
175 |   - If there is no match and the operation is `delete`, the event should be discarded
176 |   - If there is no match and the operation is anything else, the event should be inserted
177 |   - If there is a match and the operation is `delete`, the row in the replica table should be deleted
178 |   - If there is a match and the operation is anything else, the event should be applied to the replica table
179 |
180 | When your query runs fine over the event table, you should schedule a task that runs it for you. If you run
181 | this query again and again over the event table, you process all the events every time. To avoid that,
182 | run the task over the created stream, not over the event table. The stream is cleaned automatically after every
183 | successful iteration, so you only process the new events. You can add a condition so that the task only
184 | runs if there is data in the stream.
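You can check this condition by hand. A minimal sketch, again assuming the SnowSQL CLI is configured (or paste the queries into the web console):

```sh
# Does the stream currently hold unprocessed events?
snowsql -q "select system\$stream_has_data('HOWTO_DB.PUBLIC.CDC_MYSQL_INVENTORY_USERS_STREAM_REPLICATION');"

# Peek at how many pending events the stream has captured (a plain SELECT does not consume it)
snowsql -q "select count(*) from HOWTO_DB.PUBLIC.CDC_MYSQL_INVENTORY_USERS_STREAM_REPLICATION;"
```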
185 |
186 | After creating the task, you must enable it using an `ALTER TASK ... RESUME` statement. You can see the task execution history with
187 | ```sql
188 | select *
189 | from table(demo_db.information_schema.task_history())
190 | order by scheduled_time desc;
191 | ```
192 |
193 | ### The final view
194 |
195 | The replication table contains columns with info about the CDC and replication process, useful for checks. But your
196 | final consumers do not expect this information. They want the same table that they have in the source database system.
197 |
198 | One column holds the valuable data: the `PAYLOAD` column. It contains the functional data, in JSON format.
199 | You can create a view over this field, projecting the data just like the source database does.
200 |
201 | This has one additional benefit: **evolution**. If your source database evolves (adding columns, removing them, whatever),
202 | the process is not affected and keeps running fine. The only thing that changes is the view:
203 | - No changes in your data pipeline
204 | - No changes in your data
205 | - Coexistence of old and new data
206 | - The schema of each record is included with the data
207 |
208 | [debezium]: ../debezium/README.md
209 | [this connector]: ./connect/snowflake-sink-connector.json
210 | [snowflake/keys]: keys/
211 | [KIP-297]: https://cwiki.apache.org/confluence/display/KAFKA/KIP-297%3A+Externalizing+Secrets+for+Connect+Configurations
--------------------------------------------------------------------------------
/snowflake/connect/snowflake-sink-connector.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "snowflake-sink-connector",
3 | "config": {
4 | "connector.class": "com.snowflake.kafka.connector.SnowflakeSinkConnector",
5 | "tasks.max": "1",
6 | "topics": "mysqldb.inventory.users,postgresdb.inventory.product",
7 | "snowflake.topic2table.map": "mysqldb.inventory.users:cdc_mysql_inventory_users,postgresdb.inventory.product:cdc_postgresdb_inventory_product",
8 | "snowflake.url.name": "mh16247.eu-west-2.aws.snowflakecomputing.com:443",
9 | "snowflake.user.name": "dariocazas",
10 | "snowflake.private.key": "MIIFLTBXBgkqhkiG9w0BBQ0wSjApBgkqhkiG9w0BBQwwHAQIHl29yM4BvgICAggAMAwGCCqGSIb3DQIJBQAwHQYJYIZIAWUDBAEqBBCkFIfNB88Urq5VaPCCzze1BIIE0In6kYmdUnVvH5Q+nPXkPj3VCXd0/aPceHSbC4BsWRtli39bIrWCch1EQXZxoj3xt8QNtOL9XGKH3XqG9rYpu0VmR2MZjC+FteNZ98RXrDqdwkoN/ZWTeaQ+MaeZtiCn93N3dhh70Woee/JgVEcO38vV/i0eJ4ryM07a0eV4d5Y8JQHRBoVVxTPm0Ha/af+p7loS5AKvwfiHndFgQPNbILfweGuhRUe8AQf9Bo0vzeXHBj5nO4RqnaTcfmRDIul4ZtMD7zxxTMJnhezTGFDPqlxEvOOZAudakm01C3y5mfPUs+veMWjNcz0AfPPeyvJPO5Xmu7kiIGtM1UHRojnQOtZ9QWBmhLfMsRZ3GbVbycCLgZOKhW1OIy+PbbykiiRQD6AHszJiFKroZz3yqMRTh9QJFJ4mpa6XjkLGCE8CiPns5Tl7qX2BN78Qs5vxsWJC0Z7wqNuoldsNSFKVtfW6Qm81j12XQw1fsk5zqCnabpsiK/uWo1NowhEa5xAAeRW95wqTyWYi0tu8/u3EQo/xwBCCbDiYFxvbbOmWZjsxf95sO5yHrBxGTs8wCduj0I1UqTXWzTZ4JoAPdSFHwLS61slvujqlSvNvla12nqTYGBtWO/qgLh5egaTmGupLhu4b6FiO5CCXg4sfyOoKeZtykbM0wT0Ud8oK8fx9HwlUNxAaW8NrIo3EuRg7dsKdhtD2hJrqM1dyorVIT7bHSJ5YRLfXHdFGnmaOmJOGvMqXC2yfivEFbMI0nxnrJGDJ4KLS9a8DLmgsQZS8PySmWS+cGuvq4nUcxHnhX/j0ZWCZhSUxQ/z/lRx+RmZM+ey/PnzBuOQGaQrIHe44taN2skz97oopQu9lS6OANE9TPG1Vp1NqanU2Mxkz07++5swdeYp0WEJLWhkLpn3Ce7ImcceLlFI0B9TlAih4rEiE3REfbGCTvLKpaRPHmwYNmZIAhlhKm0Q+v/4Isk4hpce5MuOTiR7yz4neV3VCl66sw7o3tJSRnXtoVKFA2QlN0emdOj6ji0iPvRtKsU/9r8+8EkO3WTg/YO59aLM/pX8V9Rd87jnDidLuO2gVzIsghRiElg2g/4cC9zmvBSZLfF/TJZGs6pX9WxDh3VjLEjdqvU8weepk/LrxyJADp7Up7GuALSytFaMbDPRTLXICsu5q0C/ne//sHeVjiKcz0WgIzeUGqC4wt7ht/G1DDd4/gxAp6ZPmlnh5WjNPTtmfU2TVV14EYUs9UzrUYm+2G0uG/+da+WpB6hRKZkHNSoFKVq3g5IHlB2Lc7SFKYnQhpHxmpmCeoQ2/DlzSWS/EHrV54ej68TdPa2MnrrdeeDCGB3Oo4oSmHSh0bTO4vVOLS9ezLDiFfT0KnhI2HmN3JOGm/2njXwp/qnk3oscyYIxBocsmYeQ91EfS9M4iNjryFNLHNuyWq/9WsDF/LrWPJIoQ+7qZm9AmLZ9yx3ED8YbqIjiK1Q48gl0NwpyvCFEfWDCjmxUA+W1SnAhf4VK3pRLBbkr5UwNcW+FSQWNtoZ8eHASDab5l4HH1NoswYqzEc4jmssQG+3nDimNvenbXvuOjwMF9+wC5LVryysZ2nMeKql4lSr8hlHe4xkvquTyPbJCSsViAueAHmHxSNW/i6QVNukc24UtP",
11 | "snowflake.private.key.passphrase": "mypassphrase",
12 | "snowflake.database.name": "HOWTO_DB",
13 | "snowflake.schema.name": "public",
14 | "key.converter": "com.snowflake.kafka.connector.records.SnowflakeJsonConverter",
15 | "value.converter": "com.snowflake.kafka.connector.records.SnowflakeJsonConverter",
16 | "behavior.on.null.values": "IGNORE"
17 | }
18 | }
--------------------------------------------------------------------------------
/snowflake/delete_sink.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CONNECT_URL=http://localhost:8085
4 |
5 | CONNECTORS=$(curl -s -k ${CONNECT_URL}/connectors)
6 | echo Connector list:
7 | echo $CONNECTORS
8 | echo
9 |
10 | for row in $(echo "${CONNECTORS}" | jq -c -r '.[]'); do
11 | status=$(curl -s -k -X DELETE "${CONNECT_URL}/connectors/${row}")
12 | echo Deleted ${row}
13 | done
14 |
--------------------------------------------------------------------------------
/snowflake/init_sink.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CONNECT_URL=http://localhost:8085
4 | SINK_SNOWPIPE_CONNECT_CONFIG=connect/snowflake-sink-connector.json
5 |
6 | echo "### Creating Snowpipe sink connector ###"
7 | curl -i -X POST $CONNECT_URL/connectors \
8 | -H "Accept:application/json" \
9 | -H "Content-Type:application/json" \
10 | -d @$SINK_SNOWPIPE_CONNECT_CONFIG
11 | echo .
12 |
--------------------------------------------------------------------------------
/snowflake/keys/README.md:
--------------------------------------------------------------------------------
1 | # Credentials management
2 |
3 | ## Snowflake
4 |
5 | ### Create your account
6 |
7 | To use Snowflake you need to create a free trial account: https://signup.snowflake.com
8 |
9 | You can select a Standard Snowflake edition on several clouds.
10 | After validating your email and accessing the web console, you can see that:
11 |
12 | - The host in the URL you access is the one to configure in the Snowflake connector
13 | - In the left panel, you can see the DEMO_DB database with a PUBLIC schema
14 | - In the top-right panel, you can see
15 | - Your role (SYSADMIN)
16 | - Your warehouse (COMPUTE_WH)
17 |
18 | ### Create your key pair
19 |
20 | In [Kafka connector install - Using Key Pair Authentication & Key Rotation], you can
21 | see more detail about it.
22 |
23 | To simplify management, we generate an unencrypted private key (and a public key)
24 | to use with Snowflake:
25 |
26 | ```sh
27 | cd snowflake/keys
28 | openssl genrsa -out snowflake_rsa_key.pem 2048
29 | openssl pkcs8 -topk8 -inform PEM -in snowflake_rsa_key.pem -out snowflake_rsa_key.p8
30 | openssl rsa -in snowflake_rsa_key.p8 -pubout -out snowflake_rsa_key.pub
31 | ```
32 |
33 | If you don't have the [OpenSSL toolkit] installed in your environment, you can run
34 | these commands with Docker:
35 |
36 | ```sh
37 | cd snowflake
38 | docker run -v $PWD:/work -it nginx openssl genrsa -out /work/keys/snowflake_rsa_key.pem 2048
39 | docker run -v $PWD:/work -it nginx openssl pkcs8 -topk8 -inform PEM -in /work/keys/snowflake_rsa_key.pem -out /work/keys/snowflake_rsa_key.p8
40 | docker run -v $PWD:/work -it nginx openssl rsa -in /work/keys/snowflake_rsa_key.pem -pubout -out /work/keys/snowflake_rsa_key.pub
41 | sudo chown -R $USER:$USER keys/*
42 | ```
43 |
44 | The content of the keys is similar to the content in this repo
45 | (we uploaded a valid key pair, but it does not authenticate against our trial Snowflake service)
46 |
47 | ```sh
48 | cat keys/snowflake_rsa_key.pem
49 | -----BEGIN ENCRYPTED PRIVATE KEY-----
50 | MIIFLTBXBgkqhkiG9w0BBQ0wSjApBgkqhkiG9w0BBQwwHAQIHl29yM4BvgICAggA
51 | MAwGCCqGSIb3DQIJBQAwHQYJYIZIAWUDBAEqBBCkFIfNB88Urq5VaPCCzze1BIIE
52 | ...
53 | -----END ENCRYPTED PRIVATE KEY-----
54 | ```
55 | ```sh
56 | cat keys/snowflake_rsa_key.pub
57 | -----BEGIN PUBLIC KEY-----
58 | MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAwBwYbPtbEUXueQ6u3KDw
59 | zlKu4IhAkGdcUBVbdTdUVBLNVsZX+eiKOedN3EnMtDeVzRlaT8JAwHX0LVXkgXtn
60 | ...
61 | -----END PUBLIC KEY-----
62 | ```
63 |
64 | ### Registry key pair in snowflake
65 |
66 | Access the Snowflake web console and locate your username in the top-right corner.
67 | The Snowflake documentation refers to switching your role to SECURITYADMIN, but
68 | in our case we need to change to ACCOUNTADMIN.
69 |
70 | Take your public key (without header and footer) and register it in Snowflake
71 | using the web console against your user:
72 |
73 | ```sql
74 | alter user dariocazas set rsa_public_key='MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAwBwYbPtbEUXueQ6u3KDw
75 | zlKu4IhAkGdcUBVbdTdUVBLNVsZX+eiKOedN3EnMtDeVzRlaT8JAwHX0LVXkgXtn
76 | KzMBp6TpS4j+2kKvbZc5p0KfZHjn42G+C/DXI4ZNQZEBQ/Q4UY6OkTZepFaOX3ev
77 | 2icxB6LnnVYI3WHkSnq3vTthhYhTuUOQ4YRudadOtoT4By09hxbsaanVl42FXIZP
78 | AXX1jwawzKe52V1+FB5/UMv+JMUFfczlO+acn/EaZvKbR55Vk/+OVrUP4KIKvdWn
79 | s/n4ASYqxiw9xjrizGCoUyl+b+Ch6A02fTU02HrT9jOOj+dVAeFD2QGOqaze0eCD
80 | dwIDAQAB';
81 | ```
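To obtain the key body without header, footer or line breaks (ready to paste into the `alter user` statement), a quick helper, assuming you are in the `snowflake/keys` folder:

```sh
# Print the public key as a single line, dropping the BEGIN/END markers
grep -v "PUBLIC KEY" snowflake_rsa_key.pub | tr -d '\n'; echo
```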
82 |
83 | After doing this, you can use the __snowflake_rsa_key.pem__ private key from Kafka
84 | connect.
85 |
86 | [Kafka connector install - Using Key Pair Authentication & Key Rotation]: https://docs.snowflake.com/en/user-guide/kafka-connector-install.html#using-key-pair-authentication-key-rotation
87 | [OpenSSL toolkit]: https://www.openssl.org/
--------------------------------------------------------------------------------
/snowflake/keys/snowflake_rsa_key.p8:
--------------------------------------------------------------------------------
1 | -----BEGIN ENCRYPTED PRIVATE KEY-----
2 | MIIFLTBXBgkqhkiG9w0BBQ0wSjApBgkqhkiG9w0BBQwwHAQIHl29yM4BvgICAggA
3 | MAwGCCqGSIb3DQIJBQAwHQYJYIZIAWUDBAEqBBCkFIfNB88Urq5VaPCCzze1BIIE
4 | 0In6kYmdUnVvH5Q+nPXkPj3VCXd0/aPceHSbC4BsWRtli39bIrWCch1EQXZxoj3x
5 | t8QNtOL9XGKH3XqG9rYpu0VmR2MZjC+FteNZ98RXrDqdwkoN/ZWTeaQ+MaeZtiCn
6 | 93N3dhh70Woee/JgVEcO38vV/i0eJ4ryM07a0eV4d5Y8JQHRBoVVxTPm0Ha/af+p
7 | 7loS5AKvwfiHndFgQPNbILfweGuhRUe8AQf9Bo0vzeXHBj5nO4RqnaTcfmRDIul4
8 | ZtMD7zxxTMJnhezTGFDPqlxEvOOZAudakm01C3y5mfPUs+veMWjNcz0AfPPeyvJP
9 | O5Xmu7kiIGtM1UHRojnQOtZ9QWBmhLfMsRZ3GbVbycCLgZOKhW1OIy+PbbykiiRQ
10 | D6AHszJiFKroZz3yqMRTh9QJFJ4mpa6XjkLGCE8CiPns5Tl7qX2BN78Qs5vxsWJC
11 | 0Z7wqNuoldsNSFKVtfW6Qm81j12XQw1fsk5zqCnabpsiK/uWo1NowhEa5xAAeRW9
12 | 5wqTyWYi0tu8/u3EQo/xwBCCbDiYFxvbbOmWZjsxf95sO5yHrBxGTs8wCduj0I1U
13 | qTXWzTZ4JoAPdSFHwLS61slvujqlSvNvla12nqTYGBtWO/qgLh5egaTmGupLhu4b
14 | 6FiO5CCXg4sfyOoKeZtykbM0wT0Ud8oK8fx9HwlUNxAaW8NrIo3EuRg7dsKdhtD2
15 | hJrqM1dyorVIT7bHSJ5YRLfXHdFGnmaOmJOGvMqXC2yfivEFbMI0nxnrJGDJ4KLS
16 | 9a8DLmgsQZS8PySmWS+cGuvq4nUcxHnhX/j0ZWCZhSUxQ/z/lRx+RmZM+ey/PnzB
17 | uOQGaQrIHe44taN2skz97oopQu9lS6OANE9TPG1Vp1NqanU2Mxkz07++5swdeYp0
18 | WEJLWhkLpn3Ce7ImcceLlFI0B9TlAih4rEiE3REfbGCTvLKpaRPHmwYNmZIAhlhK
19 | m0Q+v/4Isk4hpce5MuOTiR7yz4neV3VCl66sw7o3tJSRnXtoVKFA2QlN0emdOj6j
20 | i0iPvRtKsU/9r8+8EkO3WTg/YO59aLM/pX8V9Rd87jnDidLuO2gVzIsghRiElg2g
21 | /4cC9zmvBSZLfF/TJZGs6pX9WxDh3VjLEjdqvU8weepk/LrxyJADp7Up7GuALSyt
22 | FaMbDPRTLXICsu5q0C/ne//sHeVjiKcz0WgIzeUGqC4wt7ht/G1DDd4/gxAp6ZPm
23 | lnh5WjNPTtmfU2TVV14EYUs9UzrUYm+2G0uG/+da+WpB6hRKZkHNSoFKVq3g5IHl
24 | B2Lc7SFKYnQhpHxmpmCeoQ2/DlzSWS/EHrV54ej68TdPa2MnrrdeeDCGB3Oo4oSm
25 | HSh0bTO4vVOLS9ezLDiFfT0KnhI2HmN3JOGm/2njXwp/qnk3oscyYIxBocsmYeQ9
26 | 1EfS9M4iNjryFNLHNuyWq/9WsDF/LrWPJIoQ+7qZm9AmLZ9yx3ED8YbqIjiK1Q48
27 | gl0NwpyvCFEfWDCjmxUA+W1SnAhf4VK3pRLBbkr5UwNcW+FSQWNtoZ8eHASDab5l
28 | 4HH1NoswYqzEc4jmssQG+3nDimNvenbXvuOjwMF9+wC5LVryysZ2nMeKql4lSr8h
29 | lHe4xkvquTyPbJCSsViAueAHmHxSNW/i6QVNukc24UtP
30 | -----END ENCRYPTED PRIVATE KEY-----
31 |
--------------------------------------------------------------------------------
/snowflake/keys/snowflake_rsa_key.pem:
--------------------------------------------------------------------------------
1 | -----BEGIN RSA PRIVATE KEY-----
2 | MIIEpAIBAAKCAQEAwBwYbPtbEUXueQ6u3KDwzlKu4IhAkGdcUBVbdTdUVBLNVsZX
3 | +eiKOedN3EnMtDeVzRlaT8JAwHX0LVXkgXtnKzMBp6TpS4j+2kKvbZc5p0KfZHjn
4 | 42G+C/DXI4ZNQZEBQ/Q4UY6OkTZepFaOX3ev2icxB6LnnVYI3WHkSnq3vTthhYhT
5 | uUOQ4YRudadOtoT4By09hxbsaanVl42FXIZPAXX1jwawzKe52V1+FB5/UMv+JMUF
6 | fczlO+acn/EaZvKbR55Vk/+OVrUP4KIKvdWns/n4ASYqxiw9xjrizGCoUyl+b+Ch
7 | 6A02fTU02HrT9jOOj+dVAeFD2QGOqaze0eCDdwIDAQABAoIBABdl1JvBaXALImZg
8 | IXABshKPA3mZXrO3wwiF8WOvX3f16kh9U82+QseWomcIHgR9GKOoSNWaBSTruNek
9 | tHYIv6IFTlhKv5dRkiinIpKobn8uoBcump+ZEfqGVM2g19v6ezr5jbpxMHADPTeq
10 | yyBZhXN+fnw9nRQOnHnKKHHhzGshkEWEwHSxteG9HYqmID68bEta3azpq+T+Rrig
11 | N9UKLqgUXBD9G8i4T0iuoBZcq0sN7YtJMg6sHOu0JbZgPHlTJliNuQ3OLt3TGYew
12 | /kiQJOsZofVbHVQXvxpAU/Dy+87yUVwl9tCBbsFKmtxgyzZR7w2WxMdCrIjc940t
13 | Zi4UUSECgYEA6CGO6NiYjogmVFhs+BEMi7oRdBtqouQ3qQW5byk8F1bPruWdRnmB
14 | Ekmu5sYhg9oS8PWxXKJHdZ7in/kmbXke/UQ8I+R+RqjKJVNKX0nBAyQY7XNsM4pv
15 | CuStp0XWnsnwP5MO3SYIkmaaushsL6AxR9RGgJZISTKcktA+v5S2cwkCgYEA090L
16 | w3qLfu9egox2/YWykaPsOQnwIEFRueowcJp/ZyAARA2A3gvyoiQt1CVcT9KJ0nPx
17 | ryXb6mQ2rf5qHG0JceQ1DI+mVXhbs+AzPI/n0pPnCW50J5+kNVGQ4fBpbXmh61Tr
18 | VM+b2lTHoSjDisVToaQHYn/BpzaK8aVQggm0Yn8CgYEAwP0VaTSaMPW0mC8j+WGD
19 | Qq+hTxx0HZULSXS+5FIt6WF9LPUtOqhNzLyBss9Kkeo+ESLTICayrnE4DLQBZMZs
20 | IzgVn+mZqnkuBrYmgO46j7f1GYT6kicnhrD0RrtjYYSWPuSuWOIEAmNXhK6Yc0gF
21 | cKhlLQbEdkajsdN8N58VyLECgYAf4mltztiFjvKzRP53YxKftoLLhsJbqFjrWOJX
22 | X/kChR9lHn8ha7zlR/qZrdG5tZ7GTGq4CEOTf+d2wg4oHwTH3idZr5jBzi5G4Nv1
23 | JlcmKtofYj8a43ysBY1/Y1YKgr6qkwojpmb3McElcOQU02OltPDjkwSK7Lt2aIG4
24 | QEukcwKBgQC55ORnqG6548zBg1+eryNKYrzTEJpaFSZ4gRFwcPJMogkQoGRLvW7K
25 | 7P++3fqYvOHTa0dCIHqMXjcalzcyM/N6VSiZExi6N5BhZtwkcGO0YVi+6FFRfGxW
26 | K6ITKTgeTj409QzpwH2qPszq1zsfiHz6HWcKbsJ18thU9ISnod3u9g==
27 | -----END RSA PRIVATE KEY-----
28 |
--------------------------------------------------------------------------------
/snowflake/keys/snowflake_rsa_key.pub:
--------------------------------------------------------------------------------
1 | -----BEGIN PUBLIC KEY-----
2 | MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAwBwYbPtbEUXueQ6u3KDw
3 | zlKu4IhAkGdcUBVbdTdUVBLNVsZX+eiKOedN3EnMtDeVzRlaT8JAwHX0LVXkgXtn
4 | KzMBp6TpS4j+2kKvbZc5p0KfZHjn42G+C/DXI4ZNQZEBQ/Q4UY6OkTZepFaOX3ev
5 | 2icxB6LnnVYI3WHkSnq3vTthhYhTuUOQ4YRudadOtoT4By09hxbsaanVl42FXIZP
6 | AXX1jwawzKe52V1+FB5/UMv+JMUFfczlO+acn/EaZvKbR55Vk/+OVrUP4KIKvdWn
7 | s/n4ASYqxiw9xjrizGCoUyl+b+Ch6A02fTU02HrT9jOOj+dVAeFD2QGOqaze0eCD
8 | dwIDAQAB
9 | -----END PUBLIC KEY-----
10 |
--------------------------------------------------------------------------------
/snowflake/sql/00-security.sql:
--------------------------------------------------------------------------------
1 |
2 |
3 | -- Set public key in snowflake to your user account to enable access via kafka connect
4 | -- Snowflake doc refers to SECURITYADMIN, but for me it didn't work (I needed to use ACCOUNTADMIN)
5 | -- https://docs.snowflake.com/en/user-guide/kafka-connector-install.html#using-key-pair-authentication-key-rotation
6 | use role accountadmin;
7 |
8 | alter user dariocazas set rsa_public_key='MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEArJFv7/40nuy8D4FC76wQ
9 | Qkz1FHnEhS8jvXVTrSGzlJoTRrKm3Nx039+PPgz0EkzW/WiUdyPF6G4ZJh5L9+WU
10 | 6xEQo9HGFJhA4U4rOOXv9q3SlZEMndpg9qbGd6mp/ym5GZ9lznBVc33oQO2lIWum
11 | j8EmuYn7SLpceY7iCUtCrGgu2gE+OxHcajvQPccdMtNlz+LfXXCe+4By7PGQuBkR
12 | 9wO0wkhoYfRdInvATRSpGJK8jtAmxe9UelobyeEFsbFVqsXruOw1LbNF2bq3IAaQ
13 | TvD5OVYcfyQ+nDrE55AngRAfewpur09laqYfqzYvVZjutZc2InD4VuSVouGc8bYg
14 | qwIDAQAB';
15 |
16 |
--------------------------------------------------------------------------------
/snowflake/sql/01-cdc-to-replica-mysql.sql:
--------------------------------------------------------------------------------
1 | -- Based on:
2 | -- https://docs.snowflake.com/en/user-guide/data-pipelines-examples.html#transforming-loaded-json-data-on-a-schedule
3 | -- https://docs.snowflake.com/en/sql-reference/sql/merge.html
4 |
5 | -- Using this role is not recommendable in production environments
6 | use role accountadmin;
7 |
8 | -- Create the replica table, including extra columns to support the replica logic and process traceability
9 | create or replace
10 | table "HOWTO_DB"."PUBLIC"."REPLICA_MYSQL_INVENTORY_USERS"
11 | ( id number PRIMARY KEY comment 'primary key of the source table'
12 | , sourcedb_binlog_gtid string comment 'database log position, gtid used in HA MySQL (null in other cases), used for ordering events (RECORD_CONTENT:payload.source.gtid)'
13 | , sourcedb_binlog_file string comment 'database log position, file log name, used for ordering events (RECORD_CONTENT:payload.source.file)'
14 | , sourcedb_binlog_pos string comment 'database log position, position in log file, used for ordering events (RECORD_CONTENT:payload.source.pos)'
15 | , payload variant comment 'data after operation (RECORD_CONTENT:payload.after)'
16 | , cdc_operation char comment 'CDC registered operation in source DB (RECORD_CONTENT:payload.op)'
17 | , cdc_source_info variant comment 'Debezium source field, for trazability (RECORD_CONTENT:payload.source)'
18 | , ts_ms_sourcedb number comment 'the timestamp when database register the event, not available on database snapshot (RECORD_CONTENT:payload.source.ts_ms)'
19 | , ts_ms_cdc number comment 'the timestamp when the CDC connector capture the event (RECORD_CONTENT:payload.ts_ms)'
20 | , ts_ms_replica_sf number comment 'the timestamp when snowflake task fills the record')
21 | comment = 'Replica from CDC over MySQL Inventory Users';
22 |
23 | -- Create final view with same columns as MySQL database to use like the same table
24 | create or replace view "HOWTO_DB"."PUBLIC"."MYSQL_INVENTORY_USERS"
25 | as
26 | select payload:id id, payload:name name, payload:email email, payload:password password, payload:created_on created_on
27 | from "HOWTO_DB"."PUBLIC"."REPLICA_MYSQL_INVENTORY_USERS";
28 |
29 | -- Create a stream from CDC events table, to process new events into replica table
30 | create or replace
31 | stream "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS_STREAM_REPLICATION"
32 | on table "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS";
33 |
34 |
35 | -- After creating the stream (to avoid losing events), process all events available in the CDC events table
36 | merge into "HOWTO_DB"."PUBLIC"."REPLICA_MYSQL_INVENTORY_USERS" replica_table
37 | using
38 | (with
39 | prequery as (select RECORD_METADATA:key.payload.id id
40 | , COALESCE(RECORD_CONTENT:payload.source.gtid, '') sourcedb_binlog_gtid
41 | , COALESCE(RECORD_CONTENT:payload.source.file, '') sourcedb_binlog_file
42 | , to_number(RECORD_CONTENT:payload.source.pos) sourcedb_binlog_pos
43 | , RECORD_CONTENT:payload.after payload
44 | , RECORD_CONTENT:payload.op cdc_operation
45 | , RECORD_CONTENT:payload.source cdc_source_info
46 | , RECORD_CONTENT:payload.source.ts_ms ts_ms_sourcedb
47 | , RECORD_CONTENT:payload.ts_ms ts_ms_cdc
48 | from "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS"),
49 | rank_query as (select *
50 | , ROW_NUMBER() over (PARTITION BY id
51 | order by ts_ms_cdc desc, sourcedb_binlog_file desc, sourcedb_binlog_pos desc) as row_num
52 | from prequery)
53 | select * from rank_query where row_num = 1) event_data
54 | on replica_table.id = to_number(event_data.id)
55 | when not matched and event_data.cdc_operation <> 'd'
56 | then insert
57 | (id, sourcedb_binlog_gtid, sourcedb_binlog_file, sourcedb_binlog_pos, payload
58 | , cdc_operation, cdc_source_info, ts_ms_sourcedb, ts_ms_cdc, ts_ms_replica_sf)
59 | values
60 | (event_data.id, event_data.sourcedb_binlog_gtid, event_data.sourcedb_binlog_file
61 | , event_data.sourcedb_binlog_pos, event_data.payload, event_data.cdc_operation
62 | , event_data.cdc_source_info, event_data.ts_ms_sourcedb, event_data.ts_ms_cdc
63 | , date_part(epoch_millisecond, CURRENT_TIMESTAMP))
64 | when matched and event_data.cdc_operation = 'd'
65 | then delete
66 | when matched and event_data.cdc_operation <> 'd'
67 | then update set id=event_data.id
68 | , sourcedb_binlog_gtid=event_data.sourcedb_binlog_gtid
69 | , sourcedb_binlog_file=event_data.sourcedb_binlog_file
70 | , sourcedb_binlog_pos=event_data.sourcedb_binlog_pos
71 | , payload=event_data.payload
72 | , cdc_operation=event_data.cdc_operation
73 | , cdc_source_info=event_data.cdc_source_info
74 | , ts_ms_sourcedb=event_data.ts_ms_sourcedb
75 | , ts_ms_cdc=event_data.ts_ms_cdc
76 | , ts_ms_replica_sf=date_part(epoch_millisecond, CURRENT_TIMESTAMP);
77 |
78 |
79 | -- Create task with previous tested query, but read data from the created stream (not CDC events table).
80 | create or replace task "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS_TASK_REPLICATION"
81 | warehouse = compute_wh
82 | schedule = '1 minute'
83 | allow_overlapping_execution = false
84 | when
85 | system$stream_has_data('HOWTO_DB.PUBLIC.CDC_MYSQL_INVENTORY_USERS_STREAM_REPLICATION')
86 | as
87 | merge into "HOWTO_DB"."PUBLIC"."REPLICA_MYSQL_INVENTORY_USERS" replica_table
88 | using
89 | (with
90 | prequery as (select RECORD_METADATA:key.payload.id id
91 | , COALESCE(RECORD_CONTENT:payload.source.gtid, '') sourcedb_binlog_gtid
92 | , COALESCE(RECORD_CONTENT:payload.source.file, '') sourcedb_binlog_file
93 | , to_number(RECORD_CONTENT:payload.source.pos) sourcedb_binlog_pos
94 | , RECORD_CONTENT:payload.after payload
95 | , RECORD_CONTENT:payload.op cdc_operation
96 | , RECORD_CONTENT:payload.source cdc_source_info
97 | , RECORD_CONTENT:payload.source.ts_ms ts_ms_sourcedb
98 | , RECORD_CONTENT:payload.ts_ms ts_ms_cdc
99 | from "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS_STREAM_REPLICATION"),
100 | rank_query as (select *
101 | , ROW_NUMBER() over (PARTITION BY id
102 | order by ts_ms_cdc desc, sourcedb_binlog_file desc, sourcedb_binlog_pos desc) as row_num
103 | from prequery)
104 | select * from rank_query where row_num = 1) event_data
105 | on replica_table.id = to_number(event_data.id)
106 | when not matched and event_data.cdc_operation <> 'd'
107 | then insert
108 | (id, sourcedb_binlog_gtid, sourcedb_binlog_file, sourcedb_binlog_pos, payload
109 | , cdc_operation, cdc_source_info, ts_ms_sourcedb, ts_ms_cdc, ts_ms_replica_sf)
110 | values
111 | (event_data.id, event_data.sourcedb_binlog_gtid, event_data.sourcedb_binlog_file
112 | , event_data.sourcedb_binlog_pos, event_data.payload, event_data.cdc_operation
113 | , event_data.cdc_source_info, event_data.ts_ms_sourcedb, event_data.ts_ms_cdc
114 | , date_part(epoch_millisecond, CURRENT_TIMESTAMP))
115 | when matched and event_data.cdc_operation = 'd'
116 | then delete
117 | when matched and event_data.cdc_operation <> 'd'
118 | then update set id=event_data.id
119 | , sourcedb_binlog_gtid=event_data.sourcedb_binlog_gtid
120 | , sourcedb_binlog_file=event_data.sourcedb_binlog_file
121 | , sourcedb_binlog_pos=event_data.sourcedb_binlog_pos
122 | , payload=event_data.payload
123 | , cdc_operation=event_data.cdc_operation
124 | , cdc_source_info=event_data.cdc_source_info
125 | , ts_ms_sourcedb=event_data.ts_ms_sourcedb
126 | , ts_ms_cdc=event_data.ts_ms_cdc
127 | , ts_ms_replica_sf=date_part(epoch_millisecond, CURRENT_TIMESTAMP);
128 |
129 |
130 | -- Enable task
131 | ALTER TASK "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS_TASK_REPLICATION" RESUME;
132 |
133 | -- Check info about the task executions (STATE and NEXT_SCHEDULED_TIME columns)
134 | -- If you see error "Cannot execute task , EXECUTE TASK privilege must be granted to owner role"
135 | -- review 00-security.sql script
136 | select *
137 | from table(HOWTO_DB.information_schema.task_history())
138 | order by scheduled_time desc;
139 |
140 |
141 | -- Check counts (note that the event table and the replica table will not show the same results)
142 | select to_char(RECORD_CONTENT:payload.op) cdc_operation, count(*), 'CDC_MYSQL_INVENTORY_USERS' table_name
143 | from "HOWTO_DB"."PUBLIC"."CDC_MYSQL_INVENTORY_USERS" group by RECORD_CONTENT:payload.op
144 | union all
145 | select cdc_operation, count(*), 'REPLICA_MYSQL_INVENTORY_USERS' table_name
146 | from "HOWTO_DB"."PUBLIC"."REPLICA_MYSQL_INVENTORY_USERS" group by cdc_operation
147 | order by table_name, cdc_operation;
148 |
--------------------------------------------------------------------------------
/snowflake/sql/01-cdc-to-replica-postgres.sql:
--------------------------------------------------------------------------------
1 | -- Based on:
2 | -- https://docs.snowflake.com/en/user-guide/data-pipelines-examples.html#transforming-loaded-json-data-on-a-schedule
3 | -- https://docs.snowflake.com/en/sql-reference/sql/merge.html
4 |
5 | -- Using this role is not recommendable in production environments
6 | use role accountadmin;
7 |
8 | -- Create the replica table, including extra columns to support the replica logic and process traceability
9 | create or replace
10 | table "HOWTO_DB"."PUBLIC"."REPLICA_POSTGRESDB_INVENTORY_PRODUCT"
11 | ( id number PRIMARY KEY comment 'primary key of the source table'
12 | , sourcedb_lsn string comment 'postgres log sequence number, used for ordering events (RECORD_CONTENT:payload.source.lsn)'
13 | , payload variant comment 'data after operation (RECORD_CONTENT:payload.after)'
14 | , cdc_operation char comment 'CDC registered operation in source DB (RECORD_CONTENT:payload.op)'
15 | , cdc_source_info variant comment 'Debezium source field, for trazability (RECORD_CONTENT:payload.source)'
16 | , ts_ms_sourcedb number comment 'the timestamp when database register the event, not available on database snapshot (RECORD_CONTENT:payload.source.ts_ms)'
17 | , ts_ms_cdc number comment 'the timestamp when the CDC connector capture the event (RECORD_CONTENT:payload.ts_ms)'
18 | , ts_ms_replica_sf number comment 'the timestamp when snowflake task fills the record')
19 | comment = 'Replica from CDC over PostgreSQL Inventory Products';
20 |
21 | -- Create final view with same columns as PostgreSQL database to use like the same table
22 | create or replace view "HOWTO_DB"."PUBLIC"."POSTGRESDB_INVENTORY_PRODUCT"
23 | as
24 | select payload:id id, payload:name name, payload:description description, payload:created_on created_on
25 | from "HOWTO_DB"."PUBLIC"."REPLICA_POSTGRESDB_INVENTORY_PRODUCT";
26 |
27 | -- Create a stream from CDC events table, to process new events into replica table
28 | create or replace
29 | stream "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT_STREAM_REPLICATION"
30 | on table "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT";
31 |
32 |
33 | -- After creating the stream (to avoid losing events), process all events available in the CDC events table
34 | merge into "HOWTO_DB"."PUBLIC"."REPLICA_POSTGRESDB_INVENTORY_PRODUCT" replica_table
35 | using
36 | (with
37 | prequery as (select RECORD_METADATA:key.payload.id id
38 | , to_number(RECORD_CONTENT:payload.source.lsn) sourcedb_lsn
39 | , RECORD_CONTENT:payload.after payload
40 | , RECORD_CONTENT:payload.op cdc_operation
41 | , RECORD_CONTENT:payload.source cdc_source_info
42 | , RECORD_CONTENT:payload.source.ts_ms ts_ms_sourcedb
43 | , RECORD_CONTENT:payload.ts_ms ts_ms_cdc
44 | from "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT"),
45 | rank_query as (select *
46 | , ROW_NUMBER() over (PARTITION BY id
47 | order by ts_ms_cdc desc, sourcedb_lsn desc) as row_num
48 | from prequery)
49 | select * from rank_query where row_num = 1) event_data
50 | on replica_table.id = to_number(event_data.id)
51 | when not matched and event_data.cdc_operation <> 'd'
52 | then insert
53 | (id, sourcedb_lsn, payload, cdc_operation, cdc_source_info, ts_ms_sourcedb
54 | , ts_ms_cdc, ts_ms_replica_sf)
55 | values
56 | (event_data.id, event_data.sourcedb_lsn, event_data.payload, event_data.cdc_operation
57 | , event_data.cdc_source_info, event_data.ts_ms_sourcedb, event_data.ts_ms_cdc
58 | , date_part(epoch_millisecond, CURRENT_TIMESTAMP))
59 | when matched and event_data.cdc_operation = 'd'
60 | then delete
61 | when matched and event_data.cdc_operation <> 'd'
62 | then update set id=event_data.id
63 | , sourcedb_lsn=event_data.sourcedb_lsn
64 | , payload=event_data.payload
65 | , cdc_operation=event_data.cdc_operation
66 | , cdc_source_info=event_data.cdc_source_info
67 | , ts_ms_sourcedb=event_data.ts_ms_sourcedb
68 | , ts_ms_cdc=event_data.ts_ms_cdc
69 | , ts_ms_replica_sf=date_part(epoch_millisecond, CURRENT_TIMESTAMP);
70 |
71 |
72 | -- Create task with previous tested query, but read data from the created stream (not CDC events table).
73 | create or replace task "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT_TASK_REPLICATION"
74 | warehouse = compute_wh
75 | schedule = '1 minute'
76 | allow_overlapping_execution = false
77 | when
78 | system$stream_has_data('HOWTO_DB.PUBLIC.CDC_POSTGRESDB_INVENTORY_PRODUCT_STREAM_REPLICATION')
79 | as
80 | merge into "HOWTO_DB"."PUBLIC"."REPLICA_POSTGRESDB_INVENTORY_PRODUCT" replica_table
81 | using
82 | (with
83 | prequery as (select RECORD_METADATA:key.payload.id id
84 | , to_number(RECORD_CONTENT:payload.source.lsn) sourcedb_lsn
85 | , RECORD_CONTENT:payload.after payload
86 | , RECORD_CONTENT:payload.op cdc_operation
87 | , RECORD_CONTENT:payload.source cdc_source_info
88 | , RECORD_CONTENT:payload.source.ts_ms ts_ms_sourcedb
89 | , RECORD_CONTENT:payload.ts_ms ts_ms_cdc
90 | from "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT_STREAM_REPLICATION"),
91 | rank_query as (select *
92 | , ROW_NUMBER() over (PARTITION BY id
93 | order by ts_ms_cdc desc, sourcedb_lsn desc) as row_num
94 | from prequery)
95 | select * from rank_query where row_num = 1) event_data
96 | on replica_table.id = to_number(event_data.id)
97 | when not matched and event_data.cdc_operation <> 'd'
98 | then insert
99 | (id, sourcedb_lsn, payload, cdc_operation, cdc_source_info, ts_ms_sourcedb
100 | , ts_ms_cdc, ts_ms_replica_sf)
101 | values
102 | (event_data.id, event_data.sourcedb_lsn, event_data.payload, event_data.cdc_operation
103 | , event_data.cdc_source_info, event_data.ts_ms_sourcedb, event_data.ts_ms_cdc
104 | , date_part(epoch_millisecond, CURRENT_TIMESTAMP))
105 | when matched and event_data.cdc_operation = 'd'
106 | then delete
107 | when matched and event_data.cdc_operation <> 'd'
108 | then update set id=event_data.id
109 | , sourcedb_lsn=event_data.sourcedb_lsn
110 | , payload=event_data.payload
111 | , cdc_operation=event_data.cdc_operation
112 | , cdc_source_info=event_data.cdc_source_info
113 | , ts_ms_sourcedb=event_data.ts_ms_sourcedb
114 | , ts_ms_cdc=event_data.ts_ms_cdc
115 | , ts_ms_replica_sf=date_part(epoch_millisecond, CURRENT_TIMESTAMP);
116 |
117 |
118 | -- Enable task
119 | ALTER TASK "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT_TASK_REPLICATION" RESUME;
120 |
121 | -- Check info about the task executions (STATE and NEXT_SCHEDULED_TIME columns)
122 | -- If you see error "Cannot execute task , EXECUTE TASK privilege must be granted to owner role"
123 | -- review 00-security.sql script
124 | select *
125 | from table(HOWTO_DB.information_schema.task_history())
126 | order by scheduled_time desc;
127 |
128 |
129 | -- Check counts (note that the event table and the replica table will not show the same results)
130 | select to_char(RECORD_CONTENT:payload.op) cdc_operation, count(*), 'CDC_POSTGRESDB_INVENTORY_PRODUCT' table_name
131 | from "HOWTO_DB"."PUBLIC"."CDC_POSTGRESDB_INVENTORY_PRODUCT" group by RECORD_CONTENT:payload.op
132 | union all
133 | select cdc_operation, count(*), 'REPLICA_POSTGRESDB_INVENTORY_PRODUCT' table_name
134 | from "HOWTO_DB"."PUBLIC"."REPLICA_POSTGRESDB_INVENTORY_PRODUCT" group by cdc_operation
135 | order by table_name, cdc_operation;
136 |
--------------------------------------------------------------------------------
/snowflake/status_sink.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CONNECT_URL=http://localhost:8085
4 |
5 | CONNECTORS=$(curl -s -k ${CONNECT_URL}/connectors)
6 | echo Connector list:
7 | echo $CONNECTORS
8 | echo
9 |
10 | echo Connector status:
11 | echo
12 |
13 | for row in $(echo "${CONNECTORS}" | jq -c -r '.[]'); do
14 | status=$(curl -s -k -X GET "${CONNECT_URL}/connectors/${row}/status")
15 | echo $status
16 | echo
17 | done
18 |
19 |
--------------------------------------------------------------------------------