├── etc
│   ├── catalog
│   │   ├── tpch.properties
│   │   ├── tpcds.properties
│   │   ├── local_postgres.properties
│   │   └── iceberg.properties
│   ├── log.properties
│   ├── node.properties
│   ├── config.properties
│   └── jvm.config
├── demo.png
├── config.properties
├── docker-compose-db2.yml
├── README.md
├── hive-site.xml
├── conf
│   └── metastore-site.xml
├── Dockerfile
└── docker-compose.yml
/etc/catalog/tpch.properties:
--------------------------------------------------------------------------------
1 | connector.name=tpch
2 | tpch.splits-per-node=4
--------------------------------------------------------------------------------
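A quick way to confirm the TPC-H catalog is wired up is to run a query through the Trino CLI bundled in the coordinator image. A minimal sketch, assuming the `trino-coordinator` service name from docker-compose.yml:

```bash
# Query the built-in tpch catalog from inside the coordinator container.
docker-compose exec trino-coordinator trino --execute \
  "SELECT count(*) FROM tpch.tiny.orders"
```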
/etc/catalog/tpcds.properties:
--------------------------------------------------------------------------------
1 | connector.name=tpcds
2 | tpcds.splits-per-node=4
--------------------------------------------------------------------------------
/etc/log.properties:
--------------------------------------------------------------------------------
1 | # Enable verbose logging from Trino
2 | #io.trino=DEBUG
--------------------------------------------------------------------------------
/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pranav1699/flink-iceberg-minio-trino/HEAD/demo.png
--------------------------------------------------------------------------------
/etc/node.properties:
--------------------------------------------------------------------------------
1 | node.environment=docker
2 | node.data-dir=/data/trino
3 | plugin.dir=/usr/lib/trino/plugin
4 |
--------------------------------------------------------------------------------
/config.properties:
--------------------------------------------------------------------------------
1 | coordinator=true
2 | node-scheduler.include-coordinator=true
3 | http-server.http.port=8080
4 | discovery-server.enabled=true
5 | discovery.uri=http://localhost:8080
--------------------------------------------------------------------------------
/etc/catalog/local_postgres.properties:
--------------------------------------------------------------------------------
1 | connector.name=postgresql
2 | connection-url=jdbc:postgresql://192.168.1.6:5432/demo
3 | connection-user=postgres
4 | connection-password=1999
--------------------------------------------------------------------------------
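This catalog points Trino at a PostgreSQL server on the local network (192.168.1.6), so it only works if such a server is reachable with these credentials. A hedged sketch of inspecting it, assuming the default `public` schema exists:

```bash
# List tables in the PostgreSQL catalog (requires the database at
# 192.168.1.6:5432 to be up and reachable from the Trino container).
docker-compose exec trino-coordinator trino --execute \
  "SHOW TABLES FROM local_postgres.public"
```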
/etc/config.properties:
--------------------------------------------------------------------------------
1 | #single node install config
2 | coordinator=true
3 | node-scheduler.include-coordinator=true
4 | http-server.http.port=8080
5 | discovery-server.enabled=true
6 | discovery.uri=http://localhost:8080
--------------------------------------------------------------------------------
/etc/catalog/iceberg.properties:
--------------------------------------------------------------------------------
1 | connector.name=iceberg
2 | hive.metastore.uri=thrift://hive-metastore:9083
3 | hive.s3.path-style-access=true
4 | hive.s3.endpoint=http://minio:9000
5 | hive.s3.aws-access-key=minio
6 | hive.s3.aws-secret-key=minio123
7 |
--------------------------------------------------------------------------------
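With this catalog in place, Iceberg tables written by Flink become queryable from Trino. A minimal sketch of creating an Iceberg schema backed by the `datalake` bucket that the `minio-setup` service provisions; the schema name `demo` and the location path are illustrative:

```bash
# Create an Iceberg schema whose data lives in the MinIO "datalake" bucket.
docker-compose exec trino-coordinator trino --execute \
  "CREATE SCHEMA IF NOT EXISTS iceberg.demo WITH (location = 's3a://datalake/demo')"
```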
/docker-compose-db2.yml:
--------------------------------------------------------------------------------
1 | version: '2.1'
2 | services:
3 |   db2:
4 |     image: ruanhang/db2-cdc-demo:v1
5 |     privileged: true
6 |     ports:
7 |       - 50000:50000
8 |     environment:
9 |       - LICENSE=accept
10 |       - DB2INSTANCE=db2inst1
11 |       - DB2INST1_PASSWORD=admin
12 |       - DBNAME=testdb
13 |       - ARCHIVE_LOGS=true
14 |
15 |   mysql:
16 |     image: debezium/example-mysql:1.1
17 |     ports:
18 |       - "3307:3306"
19 |     environment:
20 |       - MYSQL_ROOT_PASSWORD=123456
21 |       - MYSQL_USER=mysqluser
22 |       - MYSQL_PASSWORD=mysqlpw
--------------------------------------------------------------------------------
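This compose file is a separate, optional set of CDC source databases (a Db2 demo image and Debezium's example MySQL). A sketch of starting it and connecting to the MySQL instance exposed on host port 3307, assuming a `mysql` client is installed on the host:

```bash
# Start the optional CDC source databases and connect to MySQL on port 3307.
docker-compose -f docker-compose-db2.yml up -d
mysql -h 127.0.0.1 -P 3307 -u root -p123456
```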
/etc/jvm.config:
--------------------------------------------------------------------------------
1 | -server
2 | -Xmx16G
3 | -XX:InitialRAMPercentage=80
4 | -XX:MaxRAMPercentage=80
5 | -XX:G1HeapRegionSize=32M
6 | -XX:+ExplicitGCInvokesConcurrent
7 | -XX:+ExitOnOutOfMemoryError
8 | -XX:+HeapDumpOnOutOfMemoryError
9 | -XX:-OmitStackTraceInFastThrow
10 | -XX:ReservedCodeCacheSize=512M
11 | -XX:PerMethodRecompilationCutoff=10000
12 | -XX:PerBytecodeRecompilationCutoff=10000
13 | -Djdk.attach.allowAttachSelf=true
14 | -Djdk.nio.maxCachedBufferSize=2000000
15 | -Dfile.encoding=UTF-8
16 | # Reduce starvation of threads by GCLocker; recommended to be roughly the number of CPU cores (JDK-8192647)
17 | -XX:+UnlockDiagnosticVMOptions
18 | -XX:GCLockerRetryAllocationCount=32
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Real-Time Streaming of CDC Data from MySQL to Apache Iceberg Using Flink
2 | 
3 |
4 | ## Overview
5 |
6 | This project demonstrates streaming Change Data Capture (CDC) data from MySQL to Apache Iceberg using Apache Flink. The pipeline is driven from Flink's SQL Client and enables fast data analytics as well as downstream machine learning workloads.
7 |
8 | ## Purpose
9 |
10 | This repository provides a complete, working example of a real-time streaming pipeline for CDC data synchronization. The integration of Flink, the MySQL CDC connector, Iceberg, MinIO, Hive Metastore, and Trino shows how modern data tools handle continuously changing data.
11 |
12 | ## Tools Used
13 |
14 | - **Trino:** Distributed SQL query engine used for fast, interactive queries over the Iceberg tables.
15 | - **Apache Flink:** Stream processing framework that runs the real-time CDC ingestion pipeline.
16 | - **Apache Iceberg:** Open table format for the data lake, providing ACID writes and schema evolution.
17 | - **Hive Metastore:** Catalog service that stores table schemas and metadata shared by Flink and Trino.
18 | - **MinIO:** S3-compatible object storage that holds the Iceberg data and metadata files.
19 |
20 | ## Setup Instructions
21 |
22 | 1. Ensure that Docker and Docker Compose are installed on your system.
23 | 2. Clone this repository:
24 |
25 | ```bash
26 | git clone https://github.com/pranav1699/flink-iceberg-minio-trino.git
27 | cd flink-iceberg-minio-trino
28 | ```
29 |
30 | 3. Start the Docker containers:
31 |
32 | ```bash
33 | docker-compose up -d
34 | ```
35 | ## Next Steps
36 | Read this blog post for a detailed walkthrough: https://medium.com/dev-genius/streaming-cdc-data-from-mysql-to-apache-iceberg-with-hive-metastore-using-apache-flink-0de9738fba0d
37 |
--------------------------------------------------------------------------------
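Once the containers from the setup steps above are running, the interactive clients can be opened as sketched below, using the service names from docker-compose.yml (`sql-client` for the Flink SQL Client, `trino-coordinator` for Trino):

```bash
# Open the Flink SQL Client (runs bin/sql-client.sh inside the custom image).
docker-compose run sql-client

# Open an interactive Trino CLI session on the coordinator.
docker-compose exec trino-coordinator trino
```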
/hive-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |     <property>
3 |         <name>metastore.thrift.uris</name>
4 |         <value>thrift://hive-metastore:9083</value>
5 |         <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
6 |     </property>
7 |     <property>
8 |         <name>metastore.task.threads.always</name>
9 |         <value>org.apache.hadoop.hive.metastore.events.EventCleanerTask,org.apache.hadoop.hive.metastore.MaterializationsCacheCleanerTask</value>
10 |     </property>
11 |     <property>
12 |         <name>metastore.expression.proxy</name>
13 |         <value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
14 |     </property>
15 |     <property>
16 |         <name>javax.jdo.option.ConnectionDriverName</name>
17 |         <value>com.mysql.cj.jdbc.Driver</value>
18 |     </property>
19 |     <property>
20 |         <name>javax.jdo.option.ConnectionURL</name>
21 |         <value>jdbc:mysql://mariadb:3306/metastore_db</value>
22 |     </property>
23 |     <property>
24 |         <name>javax.jdo.option.ConnectionUserName</name>
25 |         <value>admin</value>
26 |     </property>
27 |     <property>
28 |         <name>javax.jdo.option.ConnectionPassword</name>
29 |         <value>admin</value>
30 |     </property>
31 |     <property>
32 |         <name>fs.s3a.access.key</name>
33 |         <value>minio</value>
34 |     </property>
35 |     <property>
36 |         <name>fs.s3a.secret.key</name>
37 |         <value>minio123</value>
38 |     </property>
39 |     <property>
40 |         <name>fs.s3a.endpoint</name>
41 |         <value>http://minio:9000</value>
42 |     </property>
43 |     <property>
44 |         <name>fs.s3a.path.style.access</name>
45 |         <value>true</value>
46 |     </property>
47 | </configuration>
--------------------------------------------------------------------------------
/conf/metastore-site.xml:
--------------------------------------------------------------------------------
1 | <configuration>
2 |     <property>
3 |         <name>metastore.thrift.uris</name>
4 |         <value>thrift://hive-metastore:9083</value>
5 |         <description>Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore.</description>
6 |     </property>
7 |     <property>
8 |         <name>metastore.task.threads.always</name>
9 |         <value>org.apache.hadoop.hive.metastore.events.EventCleanerTask,org.apache.hadoop.hive.metastore.MaterializationsCacheCleanerTask</value>
10 |     </property>
11 |     <property>
12 |         <name>metastore.expression.proxy</name>
13 |         <value>org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy</value>
14 |     </property>
15 |     <property>
16 |         <name>javax.jdo.option.ConnectionDriverName</name>
17 |         <value>com.mysql.cj.jdbc.Driver</value>
18 |     </property>
19 |     <property>
20 |         <name>javax.jdo.option.ConnectionURL</name>
21 |         <value>jdbc:mysql://mariadb:3306/metastore_db</value>
22 |     </property>
23 |     <property>
24 |         <name>javax.jdo.option.ConnectionUserName</name>
25 |         <value>admin</value>
26 |     </property>
27 |     <property>
28 |         <name>javax.jdo.option.ConnectionPassword</name>
29 |         <value>admin</value>
30 |     </property>
31 |     <property>
32 |         <name>fs.s3a.access.key</name>
33 |         <value>minio</value>
34 |     </property>
35 |     <property>
36 |         <name>fs.s3a.secret.key</name>
37 |         <value>minio123</value>
38 |     </property>
39 |     <property>
40 |         <name>fs.s3a.endpoint</name>
41 |         <value>http://minio:9000</value>
42 |     </property>
43 |     <property>
44 |         <name>fs.s3a.path.style.access</name>
45 |         <value>true</value>
46 |     </property>
47 | </configuration>
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM flink:1.15
2 |
3 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws/1.3.0/iceberg-aws-1.3.0.jar
4 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/flink/flink-s3-fs-hadoop/1.15.3/flink-s3-fs-hadoop-1.15.3.jar
5 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.15/1.3.0/iceberg-flink-runtime-1.15-1.3.0.jar
6 | RUN wget -P /opt/flink/lib https://repo.maven.apache.org/maven2/mysql/mysql-connector-java/8.0.30/mysql-connector-java-8.0.30.jar
7 | # RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink/1.3.0/iceberg-flink-1.3.0.jar
8 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-mysql-cdc/2.3.0/flink-sql-connector-mysql-cdc-2.3.0.jar
9 | # RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.6/hadoop-aws-3.3.6.jar
10 | # RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/1.11.0/aws-java-sdk-s3-1.11.0.jar
11 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.4.1-10.0/flink-shaded-hadoop-2-uber-2.4.1-10.0.jar
12 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/hudi/hudi-flink1.15-bundle/0.12.2/hudi-flink1.15-bundle-0.12.2.jar
13 | RUN wget -P /opt/flink/lib https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-3.1.2_2.12/1.15.2/flink-sql-connector-hive-3.1.2_2.12-1.15.2.jar
14 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/postgresql/postgresql/42.2.12/postgresql-42.2.12.jar
15 | RUN wget -P /opt/flink/lib https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc/1.15.0/flink-connector-jdbc-1.15.0.jar
16 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/2.20.18/bundle-2.20.18.jar
17 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/2.8.3/hadoop-common-2.8.3.jar
--------------------------------------------------------------------------------
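The jobmanager, taskmanager, and sql-client services build this image automatically via `build: ./`. To build it by hand, something like the following works (the tag name is illustrative):

```bash
# Build the custom Flink image with the Iceberg, CDC, Hive, and JDBC connector jars.
docker build -t flink-iceberg-minio-trino .
```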
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.7'
2 | services:
3 |   jobmanager:
4 |     build: ./
5 |     ports:
6 |       - "8081:8081"
7 |     command: jobmanager
8 |     environment:
9 |       - |
10 |         FLINK_PROPERTIES=
11 |         jobmanager.rpc.address: jobmanager
12 |       - AWS_ACCESS_KEY_ID=minio
13 |       - AWS_SECRET_ACCESS_KEY=minio123
14 |       - AWS_REGION=us-east-1
15 |       - AWS_DEFAULT_REGION=us-east-1
16 |       - S3_ENDPOINT=http://minio:9000
17 |       - S3_PATH_STYLE_ACCESS=true
18 |     volumes:
19 |       - ./data:/tmp/hudi/
20 |       - ./hive-site.xml:/opt/flink/hive/hive-site.xml
21 |
22 |   taskmanager:
23 |     build: ./
24 |     depends_on:
25 |       - jobmanager
26 |     command: taskmanager
27 |     scale: 1
28 |     environment:
29 |       - |
30 |         FLINK_PROPERTIES=
31 |         jobmanager.rpc.address: jobmanager
32 |         taskmanager.numberOfTaskSlots: 20
33 |       - AWS_ACCESS_KEY_ID=minio
34 |       - AWS_SECRET_ACCESS_KEY=minio123
35 |       - AWS_REGION=us-east-1
36 |       - AWS_DEFAULT_REGION=us-east-1
37 |       - S3_ENDPOINT=http://192.168.1.8:9000
38 |       - S3_PATH_STYLE_ACCESS=true
39 |     volumes:
40 |       - ./data:/tmp/hudi
41 |       - ./hive-site.xml:/opt/flink/hive/hive-site.xml
42 |
43 |   sql-client:
44 |     build: ./
45 |     command: bin/sql-client.sh
46 |     depends_on:
47 |       - jobmanager
48 |     environment:
49 |       - |
50 |         FLINK_PROPERTIES=
51 |         jobmanager.rpc.address: jobmanager
52 |         rest.address: jobmanager
53 |       - AWS_ACCESS_KEY_ID=minio
54 |       - AWS_SECRET_ACCESS_KEY=minio123
55 |       - AWS_REGION=us-east-1
56 |       - AWS_DEFAULT_REGION=us-east-1
57 |       - S3_ENDPOINT=http://192.168.1.8:9000
58 |       - S3_PATH_STYLE_ACCESS=true
59 |     volumes:
60 |       - ./data:/tmp/hudi
61 |       - ./hive-site.xml:/opt/flink/hive/hive-site.xml
62 |
63 |   trino-coordinator:
64 |     image: 'trinodb/trino:435'
65 |     hostname: trino-coordinator
66 |     ports:
67 |       - '8080:8080'
68 |     volumes:
69 |       - ./etc:/etc/trino
70 |     networks:
71 |       - trino-network
72 |
73 |   mariadb:
74 |     image: 'mariadb:10.5.8'
75 |     hostname: mariadb
76 |     ports:
77 |       - '3306:3306'
78 |     environment:
79 |       MYSQL_ROOT_PASSWORD: admin
80 |       MYSQL_USER: admin
81 |       MYSQL_PASSWORD: admin
82 |       MYSQL_DATABASE: metastore_db
83 |     networks:
84 |       - trino-network
85 |
86 |   hive-metastore:
87 |     hostname: hive-metastore
88 |     image: 'bitsondatadev/hive-metastore:latest'
89 |     ports:
90 |       - '9083:9083' # Metastore Thrift
91 |     volumes:
92 |       - ./conf/metastore-site.xml:/opt/apache-hive-metastore-3.0.0-bin/conf/metastore-site.xml:ro
93 |     environment:
94 |       METASTORE_DB_HOSTNAME: mariadb
95 |     depends_on:
96 |       - mariadb
97 |     networks:
98 |       - trino-network
99 |
100 |   # https://min.io/docs/minio/linux/operations/install-deploy-manage/migrate-fs-gateway.html#overview
101 |   # https://github.com/minio/minio/discussions/15967
102 |   minio:
103 |     hostname: minio
104 |     image: 'minio/minio'
105 |     container_name: minio
106 |     ports:
107 |       - '9000:9000'
108 |       - '9001:9001'
109 |     volumes:
110 |       - minio-data:/mnt/data
111 |     environment:
112 |       MINIO_ROOT_USER: minio
113 |       MINIO_ROOT_PASSWORD: minio123
114 |       # MINIO_ACCESS_KEY and MINIO_SECRET_KEY are deprecated since version RELEASE.2021-04-22T15-44-28Z.
115 |       # https://min.io/docs/minio/linux/reference/minio-server/minio-server.html#envvar.MINIO_ACCESS_KEY
116 |       MINIO_ACCESS_KEY: minio
117 |       MINIO_SECRET_KEY: minio123
118 |       MINIO_VOLUMES: /mnt/data
119 |     command: server /mnt/data --console-address ":9001"
120 |     networks:
121 |       - trino-network
122 |
123 |   minio-setup:
124 |     depends_on:
125 |       - minio
126 |     image: minio/mc
127 |     container_name: mc
128 |     environment:
129 |       - MINIO_ACCESS_KEY=minio
130 |       - MINIO_SECRET_KEY=minio123
131 |     networks:
132 |       - trino-network
133 |     volumes:
134 |       - minio-data:/mnt/data
135 |     entrypoint: >
136 |       /bin/sh -c "
137 |       until (/usr/bin/mc config host add minio http://minio:9000 minio minio123) do echo '...waiting...' && sleep 1; done;
138 |       /usr/bin/mc rm -r --force minio/datalake;
139 |       /usr/bin/mc mb minio/datalake;
140 |       /usr/bin/mc policy set public minio/datalake;
141 |       exit 0;
142 |       "
143 |
144 | volumes:
145 |   minio-data:
146 |     driver: local
147 |
148 | networks:
149 |   trino-network:
150 |     driver: bridge
--------------------------------------------------------------------------------
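A few hedged sanity checks after `docker-compose up -d`, using the ports published in the compose file above:

```bash
# Flink JobManager REST API should respond on 8081.
curl -s http://localhost:8081/overview
# Trino coordinator should respond on 8080.
curl -s http://localhost:8080/v1/info
# MinIO console is available at http://localhost:9001 (user minio / password minio123).
```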