├── .gitignore ├── README.md ├── docker-compose.yml └── sql-client ├── Dockerfile └── conf └── flink-conf.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | docker 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Flink SQL and Apache Iceberg 2 | 3 | The full story is published on [Medium blog](https://lazypro.medium.com/71b96817e3c3). 4 | 5 | This repo was inspired by [this article](https://www.dremio.com/blog/getting-started-with-flink-sql-and-apache-iceberg/) and fixes many fatal bugs. 6 | 7 | The entire playground is completely free, and at its core are the following three components. 8 | 1. DynamoDB: This is the catalog store; it uses DynamoDB Local to avoid AWS charges. 9 | 2. MinIO: This is where the actual Iceberg data is stored, again using MinIO locally to avoid the expense of S3. 10 | 3. Flink: the core of the playground. 11 | 12 | Because the download URLs of all dependency packages are hard-coded in the self-contained `sql-client` image, if the image fails to build because a URL is no longer reachable, you will have to update the corresponding URL in the Dockerfile. 13 | 14 | Alternatively, you can use an already created Docker image. 15 | > docker pull wirelessr/flink-iceberg:1.16.3 16 | 17 | ## Playing Steps 18 | 19 | 1. Build the container: `docker compose build sql-client` 20 | 2. Directly activate the entire environment: `docker compose up -d`. 21 | 3. Launch the SQL Client: `docker compose exec sql-client bash -c "./bin/sql-client.sh"` 22 | 4. Create and use the DynamoDB Catalog. 
23 | 24 | ```sql 25 | CREATE CATALOG dynamo_catalog WITH ( 26 | 'type' = 'iceberg', 27 | 'catalog-impl' = 'org.apache.iceberg.aws.dynamodb.DynamoDbCatalog', 28 | 'io-impl' = 'org.apache.iceberg.aws.s3.S3FileIO', 29 | 'client.assume-role.region' = 'us-east-1', 30 | 'warehouse' = 's3://warehouse', 31 | 's3.endpoint' = 'http://storage:9000', 32 | 's3.path-style-access' = 'true', 33 | 'dynamodb.table-name' = 'iceberg-catalog', 34 | 'dynamodb.endpoint' = 'http://dynamodb-local:8000'); 35 | 36 | USE CATALOG dynamo_catalog; 37 | ``` 38 | 39 | 5. Create the table. 40 | 41 | ```sql 42 | CREATE database db; 43 | USE db; 44 | 45 | CREATE TABLE spotify (songid BIGINT, artist STRING, rating BIGINT); 46 | ``` 47 | 48 | 6. Insert and read data. 49 | 50 | ```sql 51 | INSERT INTO spotify VALUES (2, 'drake', 3); 52 | SELECT * FROM spotify; 53 | ``` 54 | 55 | ## Viewing the Iceberg data in MinIO 56 | 57 | 1. Go to http://localhost:9001/ and login (`admin` / `password`) 58 | 2. Inspect the `warehouse` bucket, observe the Parquet files under `data` and JSON files under `metadata` 59 | 60 | ## Viewing the catalog data in DynamoDB 61 | 62 | 1. Install [dynamodb-admin](https://github.com/aaronshaf/dynamodb-admin): `npm install -g dynamodb-admin` 63 | 2. Launch GUI `DYNAMO_ENDPOINT=http://localhost:8000 dynamodb-admin` 64 | 3. 
Go to http://localhost:8001/ and inspect the values held under `iceberg-catalog` 65 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # Flink - Iceberg - DynamoDB Setup 3 | ########################################### 4 | 5 | version: "3" 6 | 7 | services: 8 | sql-client: 9 | build: ./sql-client 10 | image: sql-client 11 | networks: 12 | iceberg-dynamodb-flink-net: 13 | environment: 14 | - AWS_ACCESS_KEY_ID=admin 15 | - AWS_SECRET_ACCESS_KEY=password 16 | - AWS_REGION=us-east-1 17 | - AWS_DEFAULT_REGION=us-east-1 18 | stdin_open: true 19 | tty: true 20 | # Flink Job Manager 21 | flink-jobmanager: 22 | image: alexmerced/flink-iceberg:latest 23 | ports: 24 | - "8081:8081" 25 | command: jobmanager 26 | networks: 27 | iceberg-dynamodb-flink-net: 28 | environment: 29 | - | 30 | FLINK_PROPERTIES= 31 | jobmanager.rpc.address: flink-jobmanager 32 | - AWS_ACCESS_KEY_ID=admin 33 | - AWS_SECRET_ACCESS_KEY=password 34 | - AWS_REGION=us-east-1 35 | - AWS_DEFAULT_REGION=us-east-1 36 | - S3_ENDPOINT=http://minio.storage:9000 37 | - S3_PATH_STYLE_ACCESS=true 38 | platform: linux/amd64 39 | # Flink Task Manager 40 | flink-taskmanager: 41 | image: alexmerced/flink-iceberg:latest 42 | depends_on: 43 | - flink-jobmanager 44 | command: taskmanager 45 | networks: 46 | iceberg-dynamodb-flink-net: 47 | scale: 1 48 | environment: 49 | - | 50 | FLINK_PROPERTIES= 51 | jobmanager.rpc.address: flink-jobmanager 52 | taskmanager.numberOfTaskSlots: 2 53 | - AWS_ACCESS_KEY_ID=admin 54 | - AWS_SECRET_ACCESS_KEY=password 55 | - AWS_REGION=us-east-1 56 | - AWS_DEFAULT_REGION=us-east-1 57 | - S3_ENDPOINT=http://minio.storage:9000 58 | - S3_PATH_STYLE_ACCESS=true 59 | platform: linux/amd64 60 | 61 | # All of the following services are optional 62 | # Catalog 63 | dynamodb-local: 64 | command: "-jar DynamoDBLocal.jar 
-sharedDb -dbPath ./data" 65 | image: "amazon/dynamodb-local:latest" 66 | container_name: dynamodb-local 67 | networks: 68 | iceberg-dynamodb-flink-net: 69 | ports: 70 | - "8000:8000" 71 | volumes: 72 | - "./docker/dynamodb:/home/dynamodblocal/data" 73 | working_dir: /home/dynamodblocal 74 | # Minio Storage Server 75 | storage: 76 | image: minio/minio 77 | container_name: storage 78 | environment: 79 | - MINIO_ROOT_USER=admin 80 | - MINIO_ROOT_PASSWORD=password 81 | - MINIO_DOMAIN=storage 82 | - MINIO_REGION_NAME=us-east-1 83 | - MINIO_REGION=us-east-1 84 | networks: 85 | iceberg-dynamodb-flink-net: 86 | ports: 87 | - 9001:9001 88 | - 9000:9000 89 | command: ["server", "/data", "--console-address", ":9001"] 90 | # Minio Client Container 91 | mc: 92 | depends_on: 93 | - storage 94 | image: minio/mc 95 | container_name: mc 96 | networks: 97 | iceberg-dynamodb-flink-net: 98 | aliases: 99 | - minio.storage 100 | environment: 101 | - AWS_ACCESS_KEY_ID=admin 102 | - AWS_SECRET_ACCESS_KEY=password 103 | - AWS_REGION=us-east-1 104 | - AWS_DEFAULT_REGION=us-east-1 105 | entrypoint: > 106 | /bin/sh -c " 107 | until (/usr/bin/mc config host add minio http://storage:9000 admin password) do echo '...waiting...' 
&& sleep 1; done; 108 | /usr/bin/mc rm -r --force minio/warehouse; 109 | /usr/bin/mc mb minio/warehouse; 110 | /usr/bin/mc mb minio/iceberg; 111 | /usr/bin/mc policy set public minio/warehouse; 112 | /usr/bin/mc policy set public minio/iceberg; 113 | tail -f /dev/null 114 | " 115 | 116 | networks: 117 | iceberg-dynamodb-flink-net: 118 | -------------------------------------------------------------------------------- /sql-client/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM openjdk:11-jre 2 | 3 | RUN curl https://dlcdn.apache.org/flink/flink-1.16.3/flink-1.16.3-bin-scala_2.12.tgz -O && \ 4 | tar -zxf flink-1.16.3-bin-scala_2.12.tgz -C /opt && \ 5 | rm -f flink-1.16.3-bin-scala_2.12.tgz 6 | 7 | WORKDIR /opt/flink-1.16.3 8 | 9 | RUN curl https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.17/1.3.0/iceberg-flink-runtime-1.17-1.3.0.jar -o lib/iceberg-flink-runtime-1.17-1.3.0.jar && \ 10 | curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/2.8.3/hadoop-common-2.8.3.jar -o lib/hadoop-common-2.8.3.jar && \ 11 | curl https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar -o lib/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar && \ 12 | curl https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/2.20.18/bundle-2.20.18.jar -o lib/bundle-2.20.18.jar 13 | 14 | COPY conf/flink-conf.yaml conf/flink-conf.yaml 15 | 16 | 17 | -------------------------------------------------------------------------------- /sql-client/conf/flink-conf.yaml: -------------------------------------------------------------------------------- 1 | jobmanager.rpc.address: flink-jobmanager 2 | --------------------------------------------------------------------------------