├── .gitignore
├── README.md
├── datagen
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── org
│                   └── apache
│                       └── datagen
│                           └── DataGen.java
├── docker-compose.yml
└── images
    ├── .Dockerfile.swp
    ├── datagen
    │   ├── Dockerfile
    │   └── datagen.jar
    └── zeppelin-flink
        └── Dockerfile

/.gitignore:
--------------------------------------------------------------------------------
images/zeppelin-flink/flink-1.15-SNAPSHOT
images/zeppelin-flink/zeppelin-0.11.0-SNAPSHOT

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Flink Dynamic Table Demo

## Demo Components
- MySQL
- Datagen, which continuously generates data into MySQL
- Kafka
- Flink
- Zeppelin

All components are started with Docker. Make sure Docker has more than 4 GB of memory available (6 GB recommended). ([Reference](https://docs.docker.com/desktop/mac/))

## Preparation
- Run `docker-compose up -d`
- Open localhost:8080 to reach the Zeppelin UI
- Click the menu in the upper-right corner
- Click Interpreter
- Search for flink
- Set FLINK_HOME to `/opt/flink-1.15-SNAPSHOT`
- Scroll to the bottom and click SAVE
- Click Notebook in the upper left and choose "Create new note"
- Give the note a name, choose flink as the default interpreter, and click Create
- Run `%flink.ssql show tables;`, then check the Flink UI at localhost:8081
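
If any of the steps above does not respond, it helps to first confirm that every container came up and that the data generator is actually writing rows; a minimal check, assuming the default service names from `docker-compose.yml`:

```shell
# Show the state of the demo services defined in docker-compose.yml
docker-compose ps
# Follow the generator's log; after the historical backfill it prints one line per inserted order
docker-compose logs -f datagen
```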

## Streaming Data Warehouse
![image](https://user-images.githubusercontent.com/9601882/145389495-0f0dad27-9e6d-457e-971d-9a4844151e2b.png)

MySQL CDC DDLs:
```sql
%flink.ssql

-- MySQL CDC: orders table
CREATE TEMPORARY TABLE orders (
    order_id VARCHAR,
    cate_id VARCHAR,
    trans_amount BIGINT,
    gmt_create VARCHAR,
    dt AS DATE_FORMAT(gmt_create, 'yyyy-MM-dd'),
    PRIMARY KEY (order_id) NOT ENFORCED
) WITH (
    'connector' = 'mysql-cdc',
    'hostname' = 'mysql',
    'port' = '3306',
    'username' = 'root',
    'password' = '123456',
    'database-name' = 'retail',
    'table-name' = 'orders'
)
```
```sql
-- MySQL CDC: category table
CREATE TEMPORARY TABLE cate_dim (
    cate_id VARCHAR,
    parent_cate_id VARCHAR,
    PRIMARY KEY (cate_id) NOT ENFORCED
) WITH (
    'connector' = 'mysql-cdc',
    'hostname' = 'mysql',
    'port' = '3306',
    'username' = 'root',
    'password' = '123456',
    'database-name' = 'retail',
    'table-name' = 'category'
)
```

Dynamic Table DDLs:
```sql
%flink.ssql

-- Flink dynamic table: DWD wide table of orders joined with categories
CREATE TEMPORARY TABLE dwd_orders_cate (
    dt STRING,
    parent_cate_id VARCHAR,
    cate_id VARCHAR,
    order_id VARCHAR,
    trans_amount BIGINT,
    gmt_create STRING,
    PRIMARY KEY (order_id, dt) NOT ENFORCED
) PARTITIONED BY (dt)
```

```sql
%flink.ssql

-- Flink dynamic table: DWS per-category daily aggregation table
CREATE TABLE dws_cate_day (
    dt STRING,
    parent_cate_id VARCHAR,
    cate_gmv BIGINT,
    PRIMARY KEY (parent_cate_id, dt) NOT ENFORCED
) PARTITIONED BY (dt)
```

Streaming pipeline:
```sql
%flink.ssql

-- Streaming job: join the two MySQL CDC tables and write the result into the DWD table
INSERT INTO dwd_orders_cate
SELECT
    s.dt,
    d.parent_cate_id,
    s.cate_id,
    s.order_id,
    s.trans_amount,
    s.gmt_create
FROM `orders` s INNER JOIN `cate_dim` `d`
ON s.cate_id = d.cate_id
```
```sql
-- Streaming job: aggregate the DWD table and write the result into the DWS table
INSERT INTO dws_cate_day
SELECT
    dt,
    parent_cate_id,
    SUM(trans_amount) AS cate_gmv
FROM dwd_orders_cate
GROUP BY parent_cate_id, dt
```

## OLAP Queries

Replace the dates with the actual values:
```sql
%flink.ssql

-- Real-time OLAP: join the order wide table with the category metrics table to get
-- each order's share of its parent category's GMV
SELECT
    order_id,
    trans_amount,
    CAST(trans_amount AS DOUBLE) / cate_gmv AS ratio
FROM dwd_orders_cate d JOIN dws_cate_day s
ON d.parent_cate_id = s.parent_cate_id -- join condition
WHERE d.dt = '${TODAY}' AND s.dt = '${TODAY}' -- partition pruning
ORDER BY ratio DESC LIMIT 10
```

```sql
%flink.bsql

-- Historical OLAP: look at the order wide table's data from three days ago
SELECT * FROM dwd_orders_cate WHERE dt = '${3-days-ago}'
```

## Data Correction
![image](https://user-images.githubusercontent.com/9601882/145390269-35318825-6d8c-4e00-9396-37b30178bc0e.png)

Replace the dates with the actual values:
```sql
%flink.bsql

-- Batch query: find the partitions that contain dirty data
SELECT DISTINCT dt FROM dwd_orders_cate WHERE trans_amount <= 0
```

```sql
%flink.bsql

-- Batch correction: overwrite the affected partition
INSERT OVERWRITE dws_cate_day PARTITION (dt = '${3-days-ago}')
SELECT
    parent_cate_id,
    SUM(trans_amount) AS cate_gmv
FROM dwd_orders_cate
WHERE dt = '${3-days-ago}' AND trans_amount > 0
GROUP BY parent_cate_id
```

```sql
%flink.bsql

-- OLAP query: check the corrected data
SELECT * FROM dws_cate_day WHERE dt = '${3-days-ago}'
```

## Appendix: Inspect the Dynamic Table File Store
- `docker-compose exec zeppelin-flink /bin/bash`
- `cd /tmp/store/` (or list the directory in one shot, as shown below)
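
For a quick, non-interactive look, the same information can be pulled with a single command; a sketch, assuming the `zeppelin-flink` service name and the `/tmp/store` mount from `docker-compose.yml` (the exact directory layout under it depends on the tables you created):

```shell
# Recursively list the files the dynamic tables have written to the shared volume
docker-compose exec zeppelin-flink ls -R /tmp/store
```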

## FAQ
- `docker-compose up -d` fails with an `address already in use` error:
```text
ERROR: for flink-dynamic-table-demo_zeppelin-flink_1 Cannot start service zeppelin-flink: Ports are not available: listen tcp 0.0.0.0:8081: bind: address already in use

ERROR: for zeppelin-flink Cannot start service zeppelin-flink: Ports are not available: listen tcp 0.0.0.0:8081: bind: address already in use
ERROR: Encountered errors while bringing up the project.
```

Cause: another local process is already listening on port 8081. Find its pid with the following command:

```shell
sudo lsof -nP -iTCP:8081 | grep LISTEN
```

```shell
sudo kill -9 ${pid}
```
Alternatively, change the HOST_PORT-to-CONTAINER_PORT mapping in `docker-compose.yml`, e.g. change HOST_PORT to 8082:
```yaml
ports:
  - "${HOST_PORT}:${CONTAINER_PORT}"
  - "8082:8081"
```
Once the container is ready, open `localhost:${HOST_PORT}`.

# Thanks for trying the demo

--------------------------------------------------------------------------------
/datagen/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>datagen</artifactId>
    <version>1.0-SNAPSHOT</version>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.27</version>
        </dependency>

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.12.0</version>
        </dependency>
    </dependencies>
</project>

--------------------------------------------------------------------------------
/datagen/src/main/java/org/apache/datagen/DataGen.java:
--------------------------------------------------------------------------------
package org.apache.datagen;

import org.apache.commons.lang3.time.DateUtils;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;

public class DataGen {
    private static Map<String, String> productCateInfo = new HashMap<>();
    private static long orderCnt = 0;
    private static final String RETAIL = "retail";

    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException("Two arguments are required: Hostname Port");
        }
        String host = args[0];
        String port = args[1];

        // Driver class of mysql-connector-java 8.x (the old com.mysql.jdbc.Driver is deprecated).
        Class.forName("com.mysql.cj.jdbc.Driver");
        String url = String.format("jdbc:mysql://%s:%s/", host, port);
        // Wait until MySQL is ready.
        int retries = 0;
        while (retries++ < 20) {
            try {
                DriverManager.getConnection(url, "root", "123456");
                break;
            } catch (SQLException c) {
                Thread.sleep(3000);
            }
        }
        System.out.println("### Mysql is alive...");
        createDbAndTable(url);
        insertData(url);
    }

    private static void createDbAndTable(String url) throws Exception {
        Connection conn = DriverManager.getConnection(url, "root", "123456");
        Statement stmt = conn.createStatement();
        // Create the database.
        String sql = String.format("CREATE DATABASE %s;", RETAIL);
        stmt.executeUpdate(sql);
        stmt.executeUpdate("USE " + RETAIL);

        // Create the orders table.
        String createOrder = "CREATE TABLE orders (\n" +
                "  order_id VARCHAR(255) NOT NULL PRIMARY KEY,\n" +
                "  product_id VARCHAR(255) COMMENT '000~499',\n" +
                "  cate_id VARCHAR(255) COMMENT '000~099',\n" +
                "  trans_amount BIGINT COMMENT '10000~20000',\n" +
                "  gmt_create VARCHAR(255) NOT NULL\n" +
                ");";
        stmt.executeUpdate(createOrder);

        // Create the category table.
        String createCate = "CREATE TABLE category (\n" +
                "  cate_id VARCHAR(255) NOT NULL PRIMARY KEY COMMENT '000~099',\n" +
                "  parent_cate_id VARCHAR(255) COMMENT '000~009'\n" +
                ");";
        stmt.executeUpdate(createCate);
        conn.close();
    }

    private static void insertData(String url) throws Exception {
        Connection conn = DriverManager.getConnection(url + RETAIL, "root", "123456");
        conn.setAutoCommit(false);

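        // Note on the generated data: the category dimension is a two-level hierarchy.
        // The loop below creates 100 leaf categories ("000".."099"), and leaf "i * 10 + j"
        // gets parent "i" (zero-padded), i.e. one of 10 parent categories ("000".."009").
        // Orders reference leaf categories; the parent is resolved by the streaming join
        // in the demo SQL.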
        String cateSql = "INSERT INTO category (cate_id, parent_cate_id) VALUES (?, ?)";
        PreparedStatement pStmt1 = conn.prepareStatement(cateSql);

        // Insert data into `category`.
        for (int i = 0; i < 10; i++) {
            for (int j = 0; j < 10; j++) {
                pStmt1.setString(1, String.format("%03d", i * 10 + j));
                pStmt1.setString(2, String.format("%03d", i));
                pStmt1.addBatch();
            }
        }
        pStmt1.executeBatch();

        // Order insert statement.
        String orderSql = "INSERT INTO orders (order_id, product_id, cate_id, trans_amount, gmt_create)"
                + " VALUES (?, ?, ?, ?, ?)";
        PreparedStatement pStmt2 = conn.prepareStatement(orderSql);

        // Create orders for the last 5 days.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        Random rnd = new Random();
        for (int i = 0; i < 5; i++) {
            // 1000 rows for each day.
            for (int j = 0; j < 1000; j++) {
                pStmt2.setString(1, (++orderCnt) + ""); // order_id
                String productId = String.format("%03d", rnd.nextInt(500));
                pStmt2.setString(2, productId);
                String cateId = productCateInfo.computeIfAbsent(
                        productId, s -> String.format("%03d", rnd.nextInt(100)));
                pStmt2.setString(3, cateId);

                // Insert some invalid data: trans_amount < 0.
                if (rnd.nextInt(10) < 1) {
                    pStmt2.setLong(4, -10000 - rnd.nextInt(10000));
                } else {
                    pStmt2.setLong(4, 10000 + rnd.nextInt(10000));
                }

                Date date = DateUtils.addDays(new Date(), -i - 1);
                date.setHours(0);
                date.setMinutes(0);
                date = DateUtils.addMinutes(date, rnd.nextInt(1000));
                String ts = sdf.format(date);
                pStmt2.setString(5, ts);
                pStmt2.addBatch();
            }
            pStmt2.executeBatch();
        }
        conn.commit();

        conn.setAutoCommit(true);
        // Continuously insert data for today...
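        // From here on the generator switches from the 5-day backfill to a live stream:
        // every ~5 seconds it inserts a single valid order whose timestamp falls at a
        // random minute within the part of the current day that has already elapsed.
        // This is the changelog that the MySQL CDC source in the demo consumes.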
        try {
            while (true) {
                pStmt2.setString(1, (++orderCnt) + ""); // order_id
                String productId = String.format("%03d", rnd.nextInt(500));
                pStmt2.setString(2, productId);
                String cateId = productCateInfo.computeIfAbsent(
                        productId, s -> String.format("%03d", rnd.nextInt(100)));
                pStmt2.setString(3, cateId);
                pStmt2.setLong(4, 10000 + rnd.nextInt(10000));

                Date date = new Date();
                int elapsedMinutes = date.getHours() * 60 + date.getMinutes();
                date.setHours(0);
                date.setMinutes(0);
                // Math.max guards against running exactly at midnight, when no minutes have elapsed yet.
                date = DateUtils.addMinutes(date, rnd.nextInt(Math.max(1, elapsedMinutes)));
                String ts = sdf.format(date);
                pStmt2.setString(5, ts);
                pStmt2.execute();

                System.out.println("insert with order_id: " + orderCnt);

                Thread.sleep(5000);
            }
        } finally {
            conn.close();
        }
    }
}

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '2.1'
services:
  zeppelin-flink:
    image: lzljs3620320/zeppelin-flink:1.0
    ports:
      - "8080:8080"
      - "8081:8081"
    depends_on:
      - mysql
      - datagen
    environment:
      MYSQL_HOST: mysql
    volumes:
      - shared-tmpfs:/tmp/store
  kafka:
    image: johnnypark/kafka-zookeeper
    ports:
      - "2181:2181"
      - "9092:9092"
    environment:
      - NUM_PARTITIONS=8
  mysql:
    image: debezium/example-mysql:1.1
    ports:
      - "3306:3306"
    environment:
      - MYSQL_ROOT_PASSWORD=123456
      - MYSQL_USER=mysqluser
      - MYSQL_PASSWORD=mysqlpw
  datagen:
    image: lzljs3620320/datagen:1.0
    depends_on:
      - mysql
    environment:
      - DB_HOST=mysql
      - DB_PORT=3306

volumes:
  shared-tmpfs:
    driver: local
    driver_opts:
      type: "tmpfs"
      device: "tmpfs"

--------------------------------------------------------------------------------
/images/.Dockerfile.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JingsongLi/flink-dynamic-table-demo/15854b6e68d0f01c3293618aa063f1fcf687e3cb/images/.Dockerfile.swp

--------------------------------------------------------------------------------
/images/datagen/Dockerfile:
--------------------------------------------------------------------------------
FROM openjdk:11
COPY . /usr/src/datagen
WORKDIR /usr/src/datagen
CMD java -cp datagen.jar org.apache.datagen.DataGen $DB_HOST $DB_PORT
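
The `datagen.jar` copied into this image is the assembly built from the `datagen` Maven module. It is checked in, so rebuilding it is optional; a sketch of the rebuild, assuming the assembly plugin configuration from `datagen/pom.xml` and the image tag from `docker-compose.yml` (exact goals and artifact names may differ):

```shell
# Build the fat jar using the jar-with-dependencies descriptor configured in the pom
cd datagen && mvn clean package assembly:single
# Place the assembly next to the Dockerfile under the name its CMD expects
cp target/datagen-1.0-SNAPSHOT-jar-with-dependencies.jar ../images/datagen/datagen.jar
# Rebuild the image referenced by docker-compose.yml
cd ../images/datagen && docker build -t lzljs3620320/datagen:1.0 .
```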

--------------------------------------------------------------------------------
/images/datagen/datagen.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JingsongLi/flink-dynamic-table-demo/15854b6e68d0f01c3293618aa063f1fcf687e3cb/images/datagen/datagen.jar

--------------------------------------------------------------------------------
/images/zeppelin-flink/Dockerfile:
--------------------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

FROM ubuntu:20.04
LABEL maintainer="Apache Software Foundation"

RUN set -ex && \
    apt-get -y update && \
    # Install language and other base packages
    DEBIAN_FRONTEND=noninteractive apt-get install -y language-pack-en openjdk-8-jre-headless tini wget && \
    # Cleanup
    rm -rf /var/lib/apt/lists/* && \
    apt-get autoclean && \
    apt-get clean

ARG version="0.10.0"

ENV LANG=en_US.UTF-8 \
    LC_ALL=en_US.UTF-8 \
    JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
    VERSION="${version}" \
    HOME="/opt/zeppelin" \
    ZEPPELIN_HOME="/opt/zeppelin" \
    ZEPPELIN_ADDR="0.0.0.0" \
    ZEPPELIN_WAR_TEMPDIR="/tmp/webapps"

# Copy Zeppelin related files
COPY zeppelin-0.11.0-SNAPSHOT /opt/zeppelin/
COPY flink-1.15-SNAPSHOT /opt/flink-1.15-SNAPSHOT

RUN mkdir -p "${ZEPPELIN_HOME}/logs" "${ZEPPELIN_HOME}/run" "${ZEPPELIN_HOME}/notebook" "${ZEPPELIN_HOME}/local-repo" && \
    # Allow process to edit /etc/passwd, to create a user entry for zeppelin
    chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
    # Give access to some specific folders
    chmod -R 775 "${ZEPPELIN_HOME}/logs" "${ZEPPELIN_HOME}/run" "${ZEPPELIN_HOME}/conf" "${ZEPPELIN_HOME}/notebook" "${ZEPPELIN_HOME}/local-repo"

USER 1000

EXPOSE 8080

ENTRYPOINT [ "/usr/bin/tini", "--" ]
WORKDIR ${ZEPPELIN_HOME}
CMD ["bin/zeppelin.sh"]
--------------------------------------------------------------------------------
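
The COPY instructions above expect unpacked `zeppelin-0.11.0-SNAPSHOT` and `flink-1.15-SNAPSHOT` distributions to sit next to this Dockerfile; they are deliberately git-ignored (see `.gitignore`), so they have to be built or obtained separately. A sketch of rebuilding the image once those directories are in place, assuming the tag used in `docker-compose.yml`:

```shell
# Run from the repository root after placing the two snapshot distributions
cd images/zeppelin-flink
docker build -t lzljs3620320/zeppelin-flink:1.0 .
```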