├── etc ├── catalog │ ├── tpch.properties │ ├── tpcds.properties │ ├── local_postgres.properties │ └── iceberg.properties ├── log.properties ├── node.properties ├── config.properties └── jvm.config ├── demo.png ├── config.properties ├── docker-compose-db2.yml ├── README.md ├── hive-site.xml ├── conf └── metastore-site.xml ├── Dockerfile └── docker-compose.yml /etc/catalog/tpch.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpch 2 | tpch.splits-per-node=4 -------------------------------------------------------------------------------- /etc/catalog/tpcds.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpcds 2 | tpcds.splits-per-node=4 -------------------------------------------------------------------------------- /etc/log.properties: -------------------------------------------------------------------------------- 1 | # Enable verbose logging from Presto 2 | #io.prestosql=DEBUG -------------------------------------------------------------------------------- /demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pranav1699/flink-iceberg-minio-trino/HEAD/demo.png -------------------------------------------------------------------------------- /etc/node.properties: -------------------------------------------------------------------------------- 1 | node.environment=docker 2 | node.data-dir=/data/trino 3 | plugin.dir=/usr/lib/trino/plugin 4 | -------------------------------------------------------------------------------- /config.properties: -------------------------------------------------------------------------------- 1 | coordinator=true 2 | node-scheduler.include-coordinator=true 3 | http-server.http.port=8080 4 | discovery-server.enabled=true 5 | discovery.uri=http://localhost:8080 -------------------------------------------------------------------------------- 
/etc/catalog/local_postgres.properties: -------------------------------------------------------------------------------- 1 | connector.name=postgresql 2 | connection-url=jdbc:postgresql://192.168.1.6:5432/demo 3 | connection-user=postgres 4 | connection-password=1999 -------------------------------------------------------------------------------- /etc/config.properties: -------------------------------------------------------------------------------- 1 | #single node install config 2 | coordinator=true 3 | node-scheduler.include-coordinator=true 4 | http-server.http.port=8080 5 | discovery-server.enabled=true 6 | discovery.uri=http://localhost:8080 -------------------------------------------------------------------------------- /etc/catalog/iceberg.properties: -------------------------------------------------------------------------------- 1 | connector.name=iceberg 2 | hive.metastore.uri=thrift://hive-metastore:9083 3 | hive.s3.path-style-access=true 4 | hive.s3.endpoint=http://minio:9000 5 | hive.s3.aws-access-key=minio 6 | hive.s3.aws-secret-key=minio123 7 | -------------------------------------------------------------------------------- /docker-compose-db2.yml: -------------------------------------------------------------------------------- 1 | version: '2.1' 2 | services: 3 | db2: 4 | image: ruanhang/db2-cdc-demo:v1 5 | privileged: true 6 | ports: 7 | - 50000:50000 8 | environment: 9 | - LICENSE=accept 10 | - DB2INSTANCE=db2inst1 11 | - DB2INST1_PASSWORD=admin 12 | - DBNAME=testdb 13 | - ARCHIVE_LOGS=true 14 | 15 | mysql: 16 | image: debezium/example-mysql:1.1 17 | ports: 18 | - "3307:3306" 19 | environment: 20 | - MYSQL_ROOT_PASSWORD=123456 21 | - MYSQL_USER=mysqluser 22 | - MYSQL_PASSWORD=mysqlpw -------------------------------------------------------------------------------- /etc/jvm.config: -------------------------------------------------------------------------------- 1 | -server 2 | -Xmx16G 3 | -XX:InitialRAMPercentage=80 4 | -XX:MaxRAMPercentage=80 5 | 
-XX:G1HeapRegionSize=32M 6 | -XX:+ExplicitGCInvokesConcurrent 7 | -XX:+ExitOnOutOfMemoryError 8 | -XX:+HeapDumpOnOutOfMemoryError 9 | -XX:-OmitStackTraceInFastThrow 10 | -XX:ReservedCodeCacheSize=512M 11 | -XX:PerMethodRecompilationCutoff=10000 12 | -XX:PerBytecodeRecompilationCutoff=10000 13 | -Djdk.attach.allowAttachSelf=true 14 | -Djdk.nio.maxCachedBufferSize=2000000 15 | -Dfile.encoding=UTF-8 16 | # Reduce starvation of threads by GClocker, recommend to set about the number of cpu cores (JDK-8192647) 17 | -XX:+UnlockDiagnosticVMOptions 18 | -XX:GCLockerRetryAllocationCount=32 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Real-Time Streaming of CDC Data from MySQL to Apache Iceberg using Flink 2 | ![Project Logo](https://github.com/pranav1699/flink-iceberg-minio-trino/blob/master/demo.png) 3 | 4 | ## Overview 5 | 6 | This project demonstrates the seamless streaming of Change Data Capture (CDC) data from MySQL to Apache Iceberg using Apache Flink. By utilizing Flink's SQL Client, we enable fast data analytics and support machine learning workloads. 7 | 8 | ## Purpose 9 | 10 | The purpose of this repository is to provide a comprehensive example of setting up a real-time streaming pipeline for CDC data synchronization. The integration of Flink, MySQL CDC connectors, Iceberg, Minio, Hive Metastore, and Trino showcases the capabilities of modern data tools in handling dynamic data scenarios. 11 | 12 | ## Tools Used 13 | 14 | - **Trino:** High-performance query engine for distributed data processing. 15 | - **Apache Flink:** Robust stream processing framework for real-time data analytics. 16 | - **Apache Iceberg:** Open-source table format and processing framework for efficient data lake management. 17 | - **Hive Metastore:** Schema management tool ensuring seamless evolution and organization of data. 
18 | - **Minio:** Secure object storage solution for reliable data storage in distributed environments. 19 | 20 | ## Setup Instructions 21 | 22 | 1. Ensure that Docker and Docker Compose are installed on your system. 23 | 2. Clone this repository: 24 | 25 | ```bash 26 | git clone https://github.com/pranav1699/flink-iceberg-minio-trino.git 27 | cd flink-iceberg-minio-trino 28 | ``` 29 | 30 | 3. Start the Docker containers: 31 | 32 | ```bash 33 | docker-compose up -d 34 | ``` 35 | ## Next Steps 36 | Read this blog : https://medium.com/dev-genius/streaming-cdc-data-from-mysql-to-apache-iceberg-with-hive-metastore-using-apache-flink-0de9738fba0d 37 | -------------------------------------------------------------------------------- /hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | metastore.thrift.uris 5 | thrift://hive-metastore:9083 6 | Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore. 
7 | 8 | 9 | 10 | metastore.task.threads.always 11 | org.apache.hadoop.hive.metastore.events.EventCleanerTask,org.apache.hadoop.hive.metastore.MaterializationsCacheCleanerTask 12 | 13 | 14 | 15 | metastore.expression.proxy 16 | org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy 17 | 18 | 19 | 20 | javax.jdo.option.ConnectionDriverName 21 | com.mysql.cj.jdbc.Driver 22 | 23 | 24 | 25 | javax.jdo.option.ConnectionURL 26 | jdbc:mysql://mariadb:3306/metastore_db 27 | 28 | 29 | 30 | javax.jdo.option.ConnectionUserName 31 | admin 32 | 33 | 34 | 35 | javax.jdo.option.ConnectionPassword 36 | admin 37 | 38 | 39 | 40 | fs.s3a.access.key 41 | minio 42 | 43 | 44 | 45 | fs.s3a.secret.key 46 | minio123 47 | 48 | 49 | 50 | fs.s3a.endpoint 51 | http://minio:9000 52 | 53 | 54 | 55 | fs.s3a.path.style.access 56 | true 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /conf/metastore-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | metastore.thrift.uris 5 | thrift://hive-metastore:9083 6 | Thrift URI for the remote metastore. Used by metastore client to connect to remote metastore. 
7 | 8 | 9 | 10 | metastore.task.threads.always 11 | org.apache.hadoop.hive.metastore.events.EventCleanerTask,org.apache.hadoop.hive.metastore.MaterializationsCacheCleanerTask 12 | 13 | 14 | 15 | metastore.expression.proxy 16 | org.apache.hadoop.hive.metastore.DefaultPartitionExpressionProxy 17 | 18 | 19 | 20 | javax.jdo.option.ConnectionDriverName 21 | com.mysql.cj.jdbc.Driver 22 | 23 | 24 | 25 | javax.jdo.option.ConnectionURL 26 | jdbc:mysql://mariadb:3306/metastore_db 27 | 28 | 29 | 30 | javax.jdo.option.ConnectionUserName 31 | admin 32 | 33 | 34 | 35 | javax.jdo.option.ConnectionPassword 36 | admin 37 | 38 | 39 | 40 | fs.s3a.access.key 41 | minio 42 | 43 | 44 | 45 | fs.s3a.secret.key 46 | minio123 47 | 48 | 49 | 50 | fs.s3a.endpoint 51 | http://minio:9000 52 | 53 | 54 | 55 | fs.s3a.path.style.access 56 | true 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink:1.15 2 | 3 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws/1.3.0/iceberg-aws-1.3.0.jar 4 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/flink/flink-s3-fs-hadoop/1.15.3/flink-s3-fs-hadoop-1.15.3.jar 5 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.15/1.3.0/iceberg-flink-runtime-1.15-1.3.0.jar 6 | RUN wget -P /opt/flink/lib https://repo.maven.apache.org/maven2/mysql/mysql-connector-java/8.0.30/mysql-connector-java-8.0.30.jar 7 | # RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink/1.3.0/iceberg-flink-1.3.0.jar 8 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/com/ververica/flink-sql-connector-mysql-cdc/2.3.0/flink-sql-connector-mysql-cdc-2.3.0.jar 9 | # RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.6/hadoop-aws-3.3.6.jar 10 | # RUN wget 
-P /opt/flink/lib https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-s3/1.11.0/aws-java-sdk-s3-1.11.0.jar 11 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.4.1-10.0/flink-shaded-hadoop-2-uber-2.4.1-10.0.jar 12 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/hudi/hudi-flink1.15-bundle/0.12.2/hudi-flink1.15-bundle-0.12.2.jar 13 | RUN wget -P /opt/flink/lib https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-3.1.2_2.12/1.15.2/flink-sql-connector-hive-3.1.2_2.12-1.15.2.jar 14 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/postgresql/postgresql/42.2.12/postgresql-42.2.12.jar 15 | RUN wget -P /opt/flink/lib https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc/1.15.0/flink-connector-jdbc-1.15.0.jar 16 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/2.20.18/bundle-2.20.18.jar 17 | RUN wget -P /opt/flink/lib https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/2.8.3/hadoop-common-2.8.3.jar -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | jobmanager: 4 | build: ./ 5 | ports: 6 | - "8081:8081" 7 | command: jobmanager 8 | environment: 9 | - | 10 | FLINK_PROPERTIES= 11 | jobmanager.rpc.address: jobmanager 12 | - AWS_ACCESS_KEY_ID=minio 13 | - AWS_SECRET_ACCESS_KEY=minio123 14 | - AWS_REGION=us-east-1 15 | - AWS_DEFAULT_REGION=us-east-1 16 | - S3_ENDPOINT=http://minio:9000 17 | - S3_PATH_STYLE_ACCESS=true 18 | volumes: 19 | - ./data:/tmp/hudi/ 20 | - ./hive-site.xml:/opt/flink/hive/hive-site.xml 21 | 22 | 23 | taskmanager: 24 | build: ./ 25 | depends_on: 26 | - jobmanager 27 | command: taskmanager 28 | scale: 1 29 | environment: 30 | - | 31 | FLINK_PROPERTIES= 32 | jobmanager.rpc.address: jobmanager 33 | 
taskmanager.numberOfTaskSlots: 20 34 | # NOTE(review): removed invalid env entry "./data:/tmp/hudi" — mounts belong under volumes (already listed below) 35 | # NOTE(review): removed invalid env entry "/hive-site.xml:..." — mounts belong under volumes (already listed below) 36 | - AWS_ACCESS_KEY_ID=minio 37 | - AWS_SECRET_ACCESS_KEY=minio123 38 | - AWS_REGION=us-east-1 39 | - AWS_DEFAULT_REGION=us-east-1 40 | - S3_ENDPOINT=http://minio:9000 41 | - S3_PATH_STYLE_ACCESS=true 42 | volumes: 43 | - ./data:/tmp/hudi/ 44 | - ./hive-site.xml:/opt/flink/hive/hive-site.xml 45 | 46 | 47 | 48 | sql-client: 49 | build: ./ 50 | command: bin/sql-client.sh 51 | depends_on: 52 | - jobmanager 53 | environment: 54 | - | 55 | FLINK_PROPERTIES= 56 | jobmanager.rpc.address: jobmanager 57 | rest.address: jobmanager 58 | - AWS_ACCESS_KEY_ID=minio 59 | - AWS_SECRET_ACCESS_KEY=minio123 60 | - AWS_REGION=us-east-1 61 | - AWS_DEFAULT_REGION=us-east-1 62 | - S3_ENDPOINT=http://minio:9000 63 | - S3_PATH_STYLE_ACCESS=true 64 | volumes: 65 | - ./data:/tmp/hudi 66 | - ./hive-site.xml:/opt/flink/hive/hive-site.xml 67 | 68 | 69 | trino-coordinator: 70 | image: 'trinodb/trino:435' 71 | hostname: trino-coordinator 72 | ports: 73 | - '8080:8080' 74 | volumes: 75 | - ./etc:/etc/trino 76 | networks: 77 | - trino-network 78 | 79 | mariadb: 80 | image: 'mariadb:10.5.8' 81 | hostname: mariadb 82 | ports: 83 | - '3306:3306' 84 | environment: 85 | MYSQL_ROOT_PASSWORD: admin 86 | MYSQL_USER: admin 87 | MYSQL_PASSWORD: admin 88 | MYSQL_DATABASE: metastore_db 89 | networks: 90 | - trino-network 91 | 92 | hive-metastore: 93 | hostname: hive-metastore 94 | image: 'bitsondatadev/hive-metastore:latest' 95 | ports: 96 | - '9083:9083' # Metastore Thrift 97 | volumes: 98 | - ./conf/metastore-site.xml:/opt/apache-hive-metastore-3.0.0-bin/conf/metastore-site.xml:ro 99 | environment: 100 | METASTORE_DB_HOSTNAME: mariadb 101 | depends_on: 102 | - mariadb 103 | networks: 104 | - trino-network 105 | # https://min.io/docs/minio/linux/operations/install-deploy-manage/migrate-fs-gateway.html#overview 106 | # https://github.com/minio/minio/discussions/15967 107 | minio: 108 | 
hostname: minio 109 | image: 'minio/minio' 110 | container_name: minio 111 | ports: 112 | - '9000:9000' 113 | - '9001:9001' 114 | volumes: 115 | - minio-data:/mnt/data 116 | environment: 117 | MINIO_ROOT_USER: minio 118 | MINIO_ROOT_PASSWORD: minio123 119 | # MINIO_ACCESS_KEY and MINIO_SECRET_KEY are deprecated since version RELEASE.2021-04-22T15-44-28Z. 120 | # https://min.io/docs/minio/linux/reference/minio-server/minio-server.html#envvar.MINIO_ACCESS_KEY 121 | MINIO_ACCESS_KEY: minio 122 | MINIO_SECRET_KEY: minio123 123 | MINIO_VOLUMES: /mnt/data 124 | command: server /mnt/data --console-address ":9001" 125 | networks: 126 | - trino-network 127 | 128 | minio-setup: 129 | depends_on: 130 | - minio 131 | image: minio/mc 132 | container_name: mc 133 | environment: 134 | - MINIO_ACCESS_KEY=minio 135 | - MINIO_SECRET_KEY=minio123 136 | networks: 137 | - trino-network 138 | volumes: 139 | - minio-data:/mnt/data 140 | entrypoint: > 141 | /bin/sh -c " 142 | until (/usr/bin/mc alias set minio http://minio:9000 minio minio123) do echo '...waiting...' && sleep 1; done; 143 | /usr/bin/mc rm -r --force minio/datalake; 144 | /usr/bin/mc mb minio/datalake; 145 | /usr/bin/mc anonymous set public minio/datalake; 146 | exit 0; 147 | " 148 | 149 | 150 | volumes: 151 | minio-data: 152 | driver: local 153 | 154 | networks: 155 | trino-network: 156 | driver: bridge --------------------------------------------------------------------------------