├── assets
│   └── architecture.png
├── debezium-jdbc
│   └── Dockerfile
├── postgresql_sink_connector.sql
├── mysql_source_connector.sql
├── transformation_scripts.sql
├── docker-compose.yml
├── all_commands.sql
└── README.md

/assets/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dursunkoc/ksqlwithconnect/HEAD/assets/architecture.png

--------------------------------------------------------------------------------
/debezium-jdbc/Dockerfile:
--------------------------------------------------------------------------------
ARG DEBEZIUM_VERSION=1.9

FROM quay.io/debezium/connect:$DEBEZIUM_VERSION

ARG POSTGRES_VERSION=42.2.8
ARG KAFKA_JDBC_VERSION=5.3.1

# Fetch and deploy the PostgreSQL JDBC driver into Kafka's lib directory
RUN cd /kafka/libs && \
    curl -sO https://repo1.maven.org/maven2/org/postgresql/postgresql/$POSTGRES_VERSION/postgresql-$POSTGRES_VERSION.jar

# Fetch and deploy the Kafka Connect JDBC connector into its own plugin directory
ENV KAFKA_CONNECT_JDBC_DIR=$KAFKA_CONNECT_PLUGINS_DIR/kafka-connect-jdbc
RUN mkdir $KAFKA_CONNECT_JDBC_DIR

RUN cd $KAFKA_CONNECT_JDBC_DIR && \
    curl -sO https://packages.confluent.io/maven/io/confluent/kafka-connect-jdbc/$KAFKA_JDBC_VERSION/kafka-connect-jdbc-$KAFKA_JDBC_VERSION.jar

--------------------------------------------------------------------------------
/postgresql_sink_connector.sql:
--------------------------------------------------------------------------------
CREATE SINK CONNECTOR `postgres-sink` WITH(
    "connector.class"= 'io.confluent.connect.jdbc.JdbcSinkConnector',
    "tasks.max"= '1',
    "dialect.name"= 'PostgreSqlDatabaseDialect',
    "table.name.format"= 'ENRICHED_ORDER',
    "topics"= 'SA_ENRICHED_ORDER',
    "connection.url"= 'jdbc:postgresql://postgres:5432/inventory?user=postgresuser&password=postgrespw',
    "auto.create"= 'true',
    "insert.mode"= 'upsert',
    "pk.fields"= 'ORDER_NUMBER',
    "pk.mode"= 'record_key',
    "key.converter"= 'org.apache.kafka.connect.converters.IntegerConverter',
    "key.converter.schemas.enable" = 'false',
    "value.converter"= 'io.confluent.connect.avro.AvroConverter',
    "value.converter.schemas.enable" = 'true',
    "value.converter.schema.registry.url"= 'http://schema-registry:8081'
);

--------------------------------------------------------------------------------
/mysql_source_connector.sql:
--------------------------------------------------------------------------------
CREATE SOURCE CONNECTOR `mysql-connector` WITH(
    "connector.class"= 'io.debezium.connector.mysql.MySqlConnector',
    "tasks.max"= '1',
    "database.hostname"= 'mysql',
    "database.port"= '3306',
    "database.user"= 'root',
    "database.password"= 'debezium',
    "database.server.id"= '184054',
    "database.server.name"= 'dbserver1',
    "database.whitelist"= 'inventory',
    "table.whitelist"= 'inventory.customers,inventory.products,inventory.orders',
    "database.history.kafka.bootstrap.servers"= 'kafka:9092',
    "database.history.kafka.topic"= 'schema-changes.inventory',
    "transforms"= 'unwrap',
    "transforms.unwrap.type"= 'io.debezium.transforms.ExtractNewRecordState',
    "key.converter"= 'org.apache.kafka.connect.json.JsonConverter',
    "key.converter.schemas.enable"= 'false',
    "value.converter"= 'org.apache.kafka.connect.json.JsonConverter',
    "value.converter.schemas.enable"= 'false');

--------------------------------------------------------------------------------
/transformation_scripts.sql:
--------------------------------------------------------------------------------
CREATE STREAM S_CUSTOMER (ID INT,
                          FIRST_NAME string,
                          LAST_NAME string,
                          EMAIL string)
  WITH (KAFKA_TOPIC='dbserver1.inventory.customers',
        VALUE_FORMAT='json');

CREATE TABLE T_CUSTOMER
AS
  SELECT id,
         latest_by_offset(first_name) as first_name,
         latest_by_offset(last_name) as last_name,
         latest_by_offset(email) as email
  FROM s_customer
  GROUP BY id
  EMIT CHANGES;

CREATE STREAM S_PRODUCT (ID INT,
                         NAME string,
                         description string,
                         weight DOUBLE)
  WITH (KAFKA_TOPIC='dbserver1.inventory.products',
        VALUE_FORMAT='json');

CREATE TABLE T_PRODUCT
AS
  SELECT id,
         latest_by_offset(name) as name,
         latest_by_offset(description) as description,
         latest_by_offset(weight) as weight
  FROM s_product
  GROUP BY id
  EMIT CHANGES;

CREATE STREAM s_order (
    order_number integer,
    order_date timestamp,
    purchaser integer,
    quantity integer,
    product_id integer)
  WITH (KAFKA_TOPIC='dbserver1.inventory.orders', VALUE_FORMAT='json');

CREATE STREAM SA_ENRICHED_ORDER WITH (VALUE_FORMAT='AVRO') AS
  select o.order_number, o.quantity, p.name as product, c.email as customer, p.id as product_id, c.id as customer_id
  from s_order as o
  left join t_product as p on o.product_id = p.id
  left join t_customer as c on o.purchaser = c.id
  partition by o.order_number
  emit changes;

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '2'

services:
  zookeeper:
    image: confluentinc/cp-zookeeper:5.4.9
    environment:
      ZOOKEEPER_CLIENT_PORT: 32181
      ZOOKEEPER_TICK_TIME: 2000

  kafka:
    image: confluentinc/cp-enterprise-kafka:5.4.9
    ports:
      - "29092:29092"
    depends_on:
      - zookeeper
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:32181
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092,PLAINTEXT_HOST://localhost:29092
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 100

  connect:
    build:
      context: debezium-jdbc
    ports:
      - 8083:8083
    links:
      - kafka
      - mysql
      - postgres
    environment:
      - BOOTSTRAP_SERVERS=kafka:9092
      - GROUP_ID=1
      - CONFIG_STORAGE_TOPIC=my_connect_configs
      - OFFSET_STORAGE_TOPIC=my_connect_offsets
      - STATUS_STORAGE_TOPIC=my_connect_statuses

  schema-registry:
    image: confluentinc/cp-schema-registry:5.4.9
    depends_on:
      - zookeeper
      - kafka
    environment:
      SCHEMA_REGISTRY_HOST_NAME: schema-registry
      SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: PLAINTEXT://kafka:9092

  primary-ksqldb-server:
    image: confluentinc/ksqldb-server:0.27.2
    hostname: primary-ksqldb-server
    container_name: primary-ksqldb-server
    depends_on:
      - kafka
      - schema-registry
    ports:
      - "8088:8088"
    environment:
      KSQL_LISTENERS: http://0.0.0.0:8088
      KSQL_BOOTSTRAP_SERVERS: kafka:9092
      KSQL_KSQL_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      KSQL_KSQL_CONNECT_URL: http://connect:8083
      KSQL_KSQL_LOGGING_PROCESSING_STREAM_AUTO_CREATE: "true"
      KSQL_KSQL_LOGGING_PROCESSING_TOPIC_AUTO_CREATE: "true"

  mysql:
    image: debezium/example-mysql:1.9.5.Final
    ports:
      - 3306:3306
    environment:
      - MYSQL_ROOT_PASSWORD=debezium
      - MYSQL_USER=mysqluser
      - MYSQL_PASSWORD=mysqlpw

  postgres:
    image: debezium/postgres:14-alpine
    ports:
      - "5432:5432"
    environment:
      - POSTGRES_USER=postgresuser
      - POSTGRES_PASSWORD=postgrespw
      - POSTGRES_DB=inventory

  # Access the CLI by running:
  # > docker-compose exec ksqldb-cli ksql http://primary-ksqldb-server:8088
  ksqldb-cli:
    image: confluentinc/ksqldb-cli:0.27.2
    container_name: ksqldb-cli
    depends_on:
      - primary-ksqldb-server
    entrypoint: /bin/sh
    tty: true

--------------------------------------------------------------------------------
/all_commands.sql:
--------------------------------------------------------------------------------
CREATE SOURCE CONNECTOR `mysql-connector` WITH(
    "connector.class"= 'io.debezium.connector.mysql.MySqlConnector',
    "tasks.max"= '1',
    "database.hostname"= 'mysql',
    "database.port"= '3306',
    "database.user"= 'root',
    "database.password"= 'debezium',
    "database.server.id"= '184054',
    "database.server.name"= 'dbserver1',
    "database.whitelist"= 'inventory',
    "table.whitelist"= 'inventory.customers,inventory.products,inventory.orders',
    "database.history.kafka.bootstrap.servers"= 'kafka:9092',
    "database.history.kafka.topic"= 'schema-changes.inventory',
    "transforms"= 'unwrap',
    "transforms.unwrap.type"= 'io.debezium.transforms.ExtractNewRecordState',
    "key.converter"= 'org.apache.kafka.connect.json.JsonConverter',
    "key.converter.schemas.enable"= 'false',
    "value.converter"= 'org.apache.kafka.connect.json.JsonConverter',
    "value.converter.schemas.enable"= 'false');

show topics;

SET 'auto.offset.reset' = 'earliest';

PRINT "dbserver1.inventory.customers" FROM BEGINNING;

CREATE STREAM S_CUSTOMER (ID INT,
                          FIRST_NAME string,
                          LAST_NAME string,
                          EMAIL string)
  WITH (KAFKA_TOPIC='dbserver1.inventory.customers',
        VALUE_FORMAT='json');

CREATE TABLE T_CUSTOMER
AS
  SELECT id,
         latest_by_offset(first_name) as first_name,
         latest_by_offset(last_name) as last_name,
         latest_by_offset(email) as email
  FROM s_customer
  GROUP BY id
  EMIT CHANGES;

CREATE STREAM S_PRODUCT (ID INT,
                         NAME string,
                         description string,
                         weight DOUBLE)
  WITH (KAFKA_TOPIC='dbserver1.inventory.products',
        VALUE_FORMAT='json');

CREATE TABLE T_PRODUCT
AS
  SELECT id,
         latest_by_offset(name) as name,
         latest_by_offset(description) as description,
         latest_by_offset(weight) as weight
  FROM s_product
  GROUP BY id
  EMIT CHANGES;

CREATE STREAM s_order (
    order_number integer,
    order_date timestamp,
    purchaser integer,
    quantity integer,
    product_id integer)
  WITH (KAFKA_TOPIC='dbserver1.inventory.orders', VALUE_FORMAT='json');

select o.order_number, o.quantity, p.name as product
from s_order as o
left join t_product as p on p.id = o.product_id
emit changes;

select o.order_number, o.quantity, p.name as product, c.email as customer, p.id as product_id, c.id as customer_id
from s_order as o
left join t_product as p on o.product_id = p.id
left join t_customer as c on o.purchaser = c.id
emit changes;

CREATE STREAM SA_ENRICHED_ORDER WITH (VALUE_FORMAT='AVRO') AS
  select o.order_number, o.quantity, p.name as product, c.email as customer, p.id as product_id, c.id as customer_id
  from s_order as o
  left join t_product as p on o.product_id = p.id
  left join t_customer as c on o.purchaser = c.id
  partition by o.order_number
  emit changes;

CREATE SINK CONNECTOR `postgres-sink` WITH(
    "connector.class"= 'io.confluent.connect.jdbc.JdbcSinkConnector',
    "tasks.max"= '1',
    "dialect.name"= 'PostgreSqlDatabaseDialect',
    "table.name.format"= 'ENRICHED_ORDER',
    "topics"= 'SA_ENRICHED_ORDER',
    "connection.url"= 'jdbc:postgresql://postgres:5432/inventory?user=postgresuser&password=postgrespw',
    "auto.create"= 'true',
    "insert.mode"= 'upsert',
    "pk.fields"= 'ORDER_NUMBER',
    "pk.mode"= 'record_key',
    "key.converter"= 'org.apache.kafka.connect.converters.IntegerConverter',
    "key.converter.schemas.enable" = 'false',
    "value.converter"= 'io.confluent.connect.avro.AvroConverter',
    "value.converter.schemas.enable" = 'true',
    "value.converter.schema.registry.url"= 'http://schema-registry:8081'
);

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Streaming ETL Using Kafka Connect & ksqlDB

This is a demo project showing how to extract data from multiple tables/databases using Debezium, join the change streams on the fly using ksqlDB, and store the enriched result into a different table/database.

![Architecture](./assets/architecture.png)

## Starting up the architecture

To run the infrastructure, first build the images and create the containers defined in the docker-compose file:

```bash
> docker-compose -f docker-compose.yml up --build --no-start
```

Then start every container described in the `docker-compose.yml`:

```bash
> docker-compose -f docker-compose.yml start
```
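
If you want to sanity-check the stack before continuing, you can list the services and query the Kafka Connect REST API, which `docker-compose.yml` exposes on port 8083 (this quick check is not part of the project's scripts):

```bash
> docker-compose -f docker-compose.yml ps
# Kafka Connect answers with a JSON array of deployed connectors (initially empty)
> curl -s http://localhost:8083/connectors
```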

## Building the ETL Pipeline

First we create a source connector (Debezium) that listens for changes to the inventory schema objects. To do that, open a terminal connected to the ksqlDB server by running the following command; the rest of the walkthrough is executed in the ksqlDB CLI session created here.

```bash
> docker-compose exec ksqldb-cli ksql http://primary-ksqldb-server:8088
```

### Create Source Connector

The script for the source connector is available at [mysql_source_connector](./mysql_source_connector.sql):

```bash
ksql> CREATE SOURCE CONNECTOR `mysql-connector` WITH(
>    "connector.class"= 'io.debezium.connector.mysql.MySqlConnector',
>    "tasks.max"= '1',
>    "database.hostname"= 'mysql',
>    "database.port"= '3306',
>    "database.user"= 'root',
>    "database.password"= 'debezium',
>    "database.server.id"= '184054',
>    "database.server.name"= 'dbserver1',
>    "database.whitelist"= 'inventory',
>    "table.whitelist"= 'inventory.customers,inventory.products,inventory.orders',
>    "database.history.kafka.bootstrap.servers"= 'kafka:9092',
>    "database.history.kafka.topic"= 'schema-changes.inventory',
>    "transforms"= 'unwrap',
>    "transforms.unwrap.type"= 'io.debezium.transforms.ExtractNewRecordState',
>    "key.converter"= 'org.apache.kafka.connect.json.JsonConverter',
>    "key.converter.schemas.enable"= 'false',
>    "value.converter"= 'org.apache.kafka.connect.json.JsonConverter',
>    "value.converter.schemas.enable"= 'false');
```

After that you should be able to see the topics for the tables residing in the inventory schema in MySQL:

```bash
ksql> show topics;
```

> **_NOTE:_** In order to keep the offset at the beginning during the demo, please run the following command first!
>
>```bash
>ksql> SET 'auto.offset.reset' = 'earliest';
>```
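
You can also peek at the raw change events Debezium publishes for a table; this `PRINT` statement is collected in [all_commands](./all_commands.sql) as well:

```bash
ksql> PRINT "dbserver1.inventory.customers" FROM BEGINNING;
```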

### Create Transformations with Streams and Tables

Run the following script, available in [transformation_scripts](./transformation_scripts.sql), which creates the streams and tables for the transformation.

```bash
ksql> CREATE STREAM S_CUSTOMER (ID INT,
>                               FIRST_NAME string,
>                               LAST_NAME string,
>                               EMAIL string)
>  WITH (KAFKA_TOPIC='dbserver1.inventory.customers',
>        VALUE_FORMAT='json');
>
>CREATE TABLE T_CUSTOMER
>AS
>  SELECT id,
>         latest_by_offset(first_name) as first_name,
>         latest_by_offset(last_name) as last_name,
>         latest_by_offset(email) as email
>  FROM s_customer
>  GROUP BY id
>  EMIT CHANGES;
>
>CREATE STREAM S_PRODUCT (ID INT,
>                         NAME string,
>                         description string,
>                         weight DOUBLE)
>  WITH (KAFKA_TOPIC='dbserver1.inventory.products',
>        VALUE_FORMAT='json');
>
>CREATE TABLE T_PRODUCT
>AS
>  SELECT id,
>         latest_by_offset(name) as name,
>         latest_by_offset(description) as description,
>         latest_by_offset(weight) as weight
>  FROM s_product
>  GROUP BY id
>  EMIT CHANGES;
>
>CREATE STREAM s_order (
>    order_number integer,
>    order_date timestamp,
>    purchaser integer,
>    quantity integer,
>    product_id integer)
>  WITH (KAFKA_TOPIC='dbserver1.inventory.orders', VALUE_FORMAT='json');
>
>CREATE STREAM SA_ENRICHED_ORDER WITH (VALUE_FORMAT='AVRO') AS
>  select o.order_number, o.quantity, p.name as product, c.email as customer, p.id as product_id, c.id as customer_id
>  from s_order as o
>  left join t_product as p on o.product_id = p.id
>  left join t_customer as c on o.purchaser = c.id
>  partition by o.order_number
>  emit changes;
```
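
To inspect the joined rows interactively, you can run the equivalent push query; this query is also collected in [all_commands](./all_commands.sql):

```bash
ksql> select o.order_number, o.quantity, p.name as product, c.email as customer, p.id as product_id, c.id as customer_id
>from s_order as o
>left join t_product as p on o.product_id = p.id
>left join t_customer as c on o.purchaser = c.id
>emit changes;
```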

### Create Sink Connector

In order to load the final order stream into the PostgreSQL database, run the script in [postgresql_sink_connector](./postgresql_sink_connector.sql):

```bash
ksql> CREATE SINK CONNECTOR `postgres-sink` WITH(
>    "connector.class"= 'io.confluent.connect.jdbc.JdbcSinkConnector',
>    "tasks.max"= '1',
>    "dialect.name"= 'PostgreSqlDatabaseDialect',
>    "table.name.format"= 'ENRICHED_ORDER',
>    "topics"= 'SA_ENRICHED_ORDER',
>    "connection.url"= 'jdbc:postgresql://postgres:5432/inventory?user=postgresuser&password=postgrespw',
>    "auto.create"= 'true',
>    "insert.mode"= 'upsert',
>    "pk.fields"= 'ORDER_NUMBER',
>    "pk.mode"= 'record_key',
>    "key.converter"= 'org.apache.kafka.connect.converters.IntegerConverter',
>    "key.converter.schemas.enable" = 'false',
>    "value.converter"= 'io.confluent.connect.avro.AvroConverter',
>    "value.converter.schemas.enable" = 'true',
>    "value.converter.schema.registry.url"= 'http://schema-registry:8081'
>);
```

Finally, you can modify the data in MySQL's orders, customers, and products tables, and see the result in the ENRICHED_ORDER table in the PostgreSQL database.
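
For an end-to-end check, one option is to query the sink table with `psql` inside the `postgres` container; this is a sketch that assumes the service name and credentials from `docker-compose.yml` and the `ENRICHED_ORDER` table auto-created by the sink connector:

```bash
# The table name is quoted because the JDBC sink creates it with uppercase letters
> docker-compose exec postgres psql -U postgresuser -d inventory \
    -c 'SELECT * FROM "ENRICHED_ORDER" LIMIT 5;'
```
--------------------------------------------------------------------------------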