├── .gitignore
├── README.md
├── datagen
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── org
│                   └── apache
│                       └── datagen
│                           └── DataGen.java
├── docker-compose.yml
└── images
    ├── .Dockerfile.swp
    ├── datagen
    │   ├── Dockerfile
    │   └── datagen.jar
    └── zeppelin-flink
        └── Dockerfile
/.gitignore:
--------------------------------------------------------------------------------
1 | images/zeppelin-flink/flink-1.15-SNAPSHOT
2 | images/zeppelin-flink/zeppelin-0.11.0-SNAPSHOT
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Flink Dynamic Table Demo
2 |
3 | ## Demo Components
4 | - MySQL
5 | - Datagen, which continuously generates data into MySQL
6 | - Kafka
7 | - Flink
8 | - Zeppelin
9 |
10 | The above components are started with Docker. Please make sure Docker has more than 4 GB of memory (6 GB recommended). ([Reference](https://docs.docker.com/desktop/mac/))
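If you want to double-check how much memory the Docker VM actually has before starting, one optional quick check is:

```shell
# Prints the total memory available to Docker, in bytes
# (should be comfortably above 4 GB, i.e. > 4294967296)
docker info --format '{{.MemTotal}}'
```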
11 |
12 | ## Preparation
13 | - `docker-compose up -d`
14 | - Open localhost:8080 to enter the Zeppelin UI
15 | - Click the drop-down menu in the top-right corner
16 | - Click Interpreter
17 | - Search for flink
18 | - Set FLINK_HOME to `/opt/flink-1.15-SNAPSHOT`
19 | - Scroll down and click SAVE
20 | - Click Notebook in the top-left corner, then Create new note
21 | - Enter a notebook name, choose flink as the Interpreter, and click Create
22 | - Run `%flink.ssql show tables;` and check the Flink UI at localhost:8081 (a quick verification sketch follows this list)
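As a rough sanity check after `docker-compose up -d`, you can verify that all containers are running and that the datagen service keeps writing rows into MySQL. The service names below are the ones defined in `docker-compose.yml`; the commands are only illustrative:

```shell
# All services (zeppelin-flink, kafka, mysql, datagen) should show "Up"
docker-compose ps

# Count the generated orders; the number should grow every few seconds
docker-compose exec mysql mysql -uroot -p123456 -e "SELECT COUNT(*) FROM retail.orders"
```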
23 |
24 | ## Streaming Data Warehouse
25 | 
26 |
27 | MySQL CDC DDLs:
28 | ```sql
29 | %flink.ssql
30 |
31 | -- MySQL CDC: orders table
32 | CREATE TEMPORARY TABLE orders (
33 | order_id VARCHAR,
34 | cate_id VARCHAR,
35 | trans_amount BIGINT,
36 | gmt_create VARCHAR,
37 | dt AS DATE_FORMAT(gmt_create, 'yyyy-MM-dd'),
38 | PRIMARY KEY (order_id) NOT ENFORCED
39 | ) WITH (
40 | 'connector' = 'mysql-cdc',
41 | 'hostname' = 'mysql',
42 | 'port' = '3306',
43 | 'username' = 'root',
44 | 'password' = '123456',
45 | 'database-name' = 'retail',
46 | 'table-name' = 'orders'
47 | )
48 | ```
49 | ```sql
50 | -- MySQL CDC: category table
51 | CREATE TEMPORARY TABLE cate_dim (
52 | cate_id VARCHAR,
53 | parent_cate_id VARCHAR,
54 | PRIMARY KEY (cate_id) NOT ENFORCED
55 | ) WITH (
56 | 'connector' = 'mysql-cdc',
57 | 'hostname' = 'mysql',
58 | 'port' = '3306',
59 | 'username' = 'root',
60 | 'password' = '123456',
61 | 'database-name' = 'retail',
62 | 'table-name' = 'category'
63 | )
64 | ```
65 |
66 | Dynamic Table DDLs:
67 | ```sql
68 | %flink.ssql
69 |
70 | -- Flink dynamic table: DWD order-category wide table
71 | CREATE TEMPORARY TABLE dwd_orders_cate (
72 | dt STRING,
73 | parent_cate_id VARCHAR,
74 | cate_id VARCHAR,
75 | order_id VARCHAR,
76 | trans_amount BIGINT,
77 | gmt_create STRING,
78 | PRIMARY KEY (order_id, dt) NOT ENFORCED
79 | ) PARTITIONED BY (dt)
80 | ```
81 |
82 | ```sql
83 | %flink.ssql
84 |
85 | -- Flink dynamic table: DWS category metrics aggregation table
86 | CREATE TABLE dws_cate_day (
87 | dt STRING,
88 | parent_cate_id VARCHAR,
89 | cate_gmv BIGINT,
90 | PRIMARY KEY (parent_cate_id, dt) NOT ENFORCED
91 | ) PARTITIONED BY (dt)
92 | ```
93 |
94 | Streaming pipeline:
95 | ```sql
96 | %flink.ssql
97 |
98 | -- Streaming job: join the two MySQL CDC tables and write the result into DWD
99 | INSERT INTO dwd_orders_cate
100 | SELECT
101 | s.dt,
102 | d.parent_cate_id,
103 | s.cate_id,
104 | s.order_id,
105 | s.trans_amount,
106 | s.gmt_create
107 | FROM `orders` s INNER JOIN `cate_dim` `d`
108 | ON s.cate_id = d.cate_id
109 | ```
110 | ```sql
111 | -- Streaming job: aggregate DWD and write the result into DWS
112 | INSERT INTO dws_cate_day
113 | SELECT
114 | dt,
115 | parent_cate_id,
116 | SUM(trans_amount) AS cate_gmv
117 | FROM dwd_orders_cate
118 | GROUP BY parent_cate_id, dt
119 | ```
120 |
121 | ## OLAP Queries
122 |
123 | Please replace the date placeholders with actual dates in `yyyy-MM-dd` format (the format of the `dt` column), e.g. `'2022-05-10'`:
124 | ```sql
125 | %flink.ssql
126 |
127 | -- Real-time OLAP: join the order wide table with the category metrics table to get each order's share of its category's total transaction amount (GMV)
128 | SELECT
129 | order_id,
130 | trans_amount,
131 | CAST(trans_amount AS DOUBLE) / cate_gmv AS ratio
132 | FROM dwd_orders_cate d JOIN dws_cate_day s
133 | ON d.parent_cate_id = s.parent_cate_id -- Join condition
134 | WHERE d.dt = '${TODAY}' AND s.dt = '${TODAY}' -- Partition pruning
135 | ORDER BY ratio DESC LIMIT 10
136 | ```
137 |
138 | ```sql
139 | %flink.bsql
140 |
141 | -- Historical OLAP: query the order wide table's data from three days ago
142 | SELECT * FROM dwd_orders_cate WHERE dt = '${3-days-ago}'
143 | ```
144 |
145 | ## Data Correction
146 | 
147 |
148 | Please replace the date placeholder with the corresponding date (same `yyyy-MM-dd` format):
149 | ```sql
150 | %flink.bsql
151 |
152 | -- Batch statistics: find the partitions that contain dirty data
153 | SELECT DISTINCT dt FROM dwd_orders_cate WHERE trans_amount <= 0
154 | ```
155 |
156 | ```sql
157 | %flink.bsql
158 |
159 | -- Batch data correction: overwrite the specified partition
160 | INSERT OVERWRITE dws_cate_day PARTITION (dt = '${3-days-ago}')
161 | SELECT
162 | parent_cate_id,
163 | SUM(trans_amount) AS cate_gmv
164 | FROM dwd_orders_cate
165 | WHERE dt = '${3-days-ago}' AND trans_amount > 0
166 | GROUP BY parent_cate_id
167 | ```
168 |
169 | ```sql
170 | %flink.bsql
171 |
172 | -- OLAP query: check the data after correction
173 | SELECT * FROM dws_cate_day WHERE dt = '${3-days-ago}'
174 | ```
175 |
176 | ## Appendix: Inspect the Dynamic Table File Storage
177 | - `docker-compose exec zeppelin-flink /bin/bash`
178 | - `cd /tmp/store/` (see the sketch below for a one-liner)
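For example, you can list what the dynamic tables have written to the shared volume directly from the host. The exact directory layout depends on the Flink / table store build inside the image, so treat this as a sketch:

```shell
# Show the files backing the dynamic tables in the shared tmpfs volume
docker-compose exec zeppelin-flink ls -lR /tmp/store
```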
179 |
180 | ## FAQ
181 | - `docker-compose up -d` fails with an `address already in use` error
182 | ```text
183 | ERROR: for flink-dynamic-table-demo_zeppelin-flink_1 Cannot start service zeppelin-flink: Ports are not available: listen tcp 0.0.0.0:8081: bind: address already in use
184 |
185 | ERROR: for zeppelin-flink Cannot start service zeppelin-flink: Ports are not available: listen tcp 0.0.0.0:8081: bind: address already in use
186 | ERROR: Encountered errors while bringing up the project.
187 | ```
188 |
189 | Cause: another local process is already using port 8081. You can find the process pid with the following command:
190 |
191 | ```shell
192 | sudo lsof -nP -iTCP:8081 | grep LISTEN
193 | ```
194 |
195 | ```shell
196 | sudo kill -9 ${pid}
197 | ```
198 | Alternatively, change the HOST_PORT to CONTAINER_PORT mapping in `docker-compose.yml`, e.g. change HOST_PORT to 8082:
199 | ```yaml
200 | ports:
201 | - "${HOST_PORT}:${CONTAINER_PORT}"
202 | - "8082:8081"
203 | ```
204 | Once the container is ready, open `localhost:${HOST_PORT}`
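To confirm that Zeppelin is reachable on the remapped host port (8082 in the example above), a quick check is to ask for an HTTP status line:

```shell
# Should print something like "HTTP/1.1 200 OK" once Zeppelin is up
curl -sI http://localhost:8082 | head -n 1
```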
205 |
206 | # Thanks for trying it out
207 |
208 |
--------------------------------------------------------------------------------
/datagen/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>org.example</groupId>
8 |     <artifactId>datagen</artifactId>
9 |     <version>1.0-SNAPSHOT</version>
10 |
11 |     <build>
12 |         <plugins>
13 |             <plugin>
14 |                 <groupId>org.apache.maven.plugins</groupId>
15 |                 <artifactId>maven-compiler-plugin</artifactId>
16 |                 <configuration>
17 |                     <source>8</source>
18 |                     <target>8</target>
19 |                 </configuration>
20 |             </plugin>
21 |             <plugin>
22 |                 <artifactId>maven-assembly-plugin</artifactId>
23 |                 <configuration>
24 |                     <descriptorRefs>
25 |                         <descriptorRef>jar-with-dependencies</descriptorRef>
26 |                     </descriptorRefs>
27 |                 </configuration>
28 |             </plugin>
29 |         </plugins>
30 |     </build>
31 |
32 |     <dependencies>
33 |         <dependency>
34 |             <groupId>mysql</groupId>
35 |             <artifactId>mysql-connector-java</artifactId>
36 |             <version>8.0.27</version>
37 |         </dependency>
38 |
39 |         <dependency>
40 |             <groupId>org.apache.commons</groupId>
41 |             <artifactId>commons-lang3</artifactId>
42 |             <version>3.12.0</version>
43 |         </dependency>
44 |     </dependencies>
45 | </project>
--------------------------------------------------------------------------------
/datagen/src/main/java/org/apache/datagen/DataGen.java:
--------------------------------------------------------------------------------
1 | package org.apache.datagen;
2 |
3 | import org.apache.commons.lang3.time.DateUtils;
4 |
5 | import java.sql.Connection;
6 | import java.sql.DriverManager;
7 | import java.sql.PreparedStatement;
8 | import java.sql.SQLException;
9 | import java.sql.Statement;
10 | import java.text.SimpleDateFormat;
11 | import java.util.Date;
12 | import java.util.HashMap;
13 | import java.util.Map;
14 | import java.util.Random;
15 |
16 | public class DataGen {
17 | private static Map<String, String> productCateInfo = new HashMap<>();
18 | private static long orderCnt = 0;
19 | private static final String RETAIL = "retail";
20 |
21 | public static void main(String[] args) throws Exception {
22 | if (args.length < 2) {
23 | throw new IllegalArgumentException("Two arguments are required: Hostname Port");
24 | }
25 | String host = args[0];
26 | String port = args[1];
27 |
28 | Class.forName("com.mysql.jdbc.Driver");
29 | String url = String.format("jdbc:mysql://%s:%s/", host, port);
30 | // wait until mysql is ready
31 | int retries = 0;
32 | while (retries++ < 20) {
33 | try {
34 | DriverManager.getConnection(url, "root", "123456");
35 | break;
36 | } catch (SQLException c) {
37 | Thread.sleep(3000);
38 | }
39 | }
40 | System.out.println("### Mysql is alive...");
41 | createDbAndTable(url);
42 | insertData(url);
43 | }
44 |
45 | private static void createDbAndTable(String url) throws Exception {
46 | Connection conn = DriverManager.getConnection(url, "root", "123456");
47 | Statement stmt = conn.createStatement();
48 | // create db
49 | String sql = String.format("CREATE DATABASE %s;", RETAIL);
50 | stmt.executeUpdate(sql);
51 | stmt.executeUpdate("USE " + RETAIL);
52 |
53 | // create table orders
54 | String createOrder = "CREATE TABLE orders (\n" +
55 | " order_id VARCHAR(255) NOT NULL PRIMARY KEY,\n" +
56 | " product_id VARCHAR(255) COMMENT '000~499',\n" +
57 | " cate_id VARCHAR(255) COMMENT '00~99',\n" +
58 | " trans_amount BIGINT COMMENT '10000~20000',\n" +
59 | " gmt_create VARCHAR(255) NOT NULL\n" +
60 | ");";
61 | stmt.executeUpdate(createOrder);
62 |
63 | // create tb category
64 | String createCate = "CREATE TABLE category (\n" +
65 | " cate_id VARCHAR(255) NOT NULL PRIMARY KEY COMMENT '00~99',\n" +
66 | " parent_cate_id VARCHAR(255) COMMENT '0~9'\n" +
67 | ");";
68 | stmt.executeUpdate(createCate);
69 | conn.close();
70 | }
71 |
72 | private static void insertData(String url) throws Exception {
73 | Connection conn = DriverManager.getConnection(url + RETAIL, "root", "123456");
74 | conn.setAutoCommit(false);
75 | String cateSql = "INSERT INTO category (cate_id, parent_cate_id) VALUES (?, ?)";
76 | PreparedStatement pStmt1 = conn.prepareStatement(cateSql);
77 |
78 | // insert data into `category`
79 | for (int i = 0; i < 10; i++) {
80 | for (int j = 0; j < 10; j++) {
81 | pStmt1.setString(1, String.format("%03d", i * 10 + j));
82 | pStmt1.setString(2, String.format("%03d", i));
83 | pStmt1.addBatch();
84 | }
85 | }
86 | pStmt1.executeBatch();
87 |
88 | // order insert statement
89 | String orderSql = "INSERT INTO orders (order_id, product_id, cate_id, trans_amount, gmt_create)"
90 | + " VALUES (?, ?, ?, ?, ?)";
91 | PreparedStatement pStmt2 = conn.prepareStatement(orderSql);
92 |
93 | // create orders for last 5 days
94 | SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
95 | Random rnd = new Random();
96 | for (int i = 0; i < 5; i++) {
97 | // 1000 rows for each of the last 5 days
98 | for (int j = 0; j < 1000; j++) {
99 | pStmt2.setString(1, (++orderCnt) + ""); // order_id
100 | String productId = String.format("%03d", rnd.nextInt(500));
101 | pStmt2.setString(2, productId);
102 | String cateId = productCateInfo.computeIfAbsent(
103 | productId, s -> String.format("%03d", rnd.nextInt(100)));
104 | pStmt2.setString(3, cateId);
105 |
106 | // insert some invalid data: trans_amount < 0
107 | if (rnd.nextInt(10) < 1) {
108 | pStmt2.setLong(4, -10000 - rnd.nextInt(10000));
109 | } else {
110 | pStmt2.setLong(4, 10000 + rnd.nextInt(10000));
111 | }
112 |
113 | Date date = DateUtils.addDays(new Date(), -i-1);
114 | date.setHours(0);
115 | date.setMinutes(0);
116 | date = DateUtils.addMinutes(date, rnd.nextInt(1000));
117 | String ts = sdf.format(date);
118 | pStmt2.setString(5, ts);
119 | pStmt2.addBatch();
120 | }
121 | pStmt2.executeBatch();
122 | }
123 | conn.commit();
124 |
125 | conn.setAutoCommit(true);
126 | // continuously insert data for today...
127 | try {
128 | while (true) {
129 | pStmt2.setString(1, (++orderCnt) + ""); // order_id
130 | String productId = String.format("%03d", rnd.nextInt(500));
131 | pStmt2.setString(2, productId);
132 | String cateId = productCateInfo.computeIfAbsent(
133 | productId, s -> String.format("%03d", rnd.nextInt(100)));
134 | pStmt2.setString(3, cateId);
135 | pStmt2.setLong(4, 10000 + rnd.nextInt(10000));
136 |
137 | Date date = new Date();
138 | int elapsedMinutes = date.getHours() * 60 + date.getMinutes();
139 | date.setHours(0);
140 | date.setMinutes(0);
141 | date = DateUtils.addMinutes(date, rnd.nextInt(elapsedMinutes + 1)); // +1 avoids nextInt(0) right after midnight
142 | String ts = sdf.format(date);
143 | pStmt2.setString(5, ts);
144 | pStmt2.execute();
145 |
146 | System.out.println("insert with oder_id: " + orderCnt);
147 |
148 | Thread.sleep(5000);
149 | }
150 | } finally {
151 | conn.close();
152 | }
153 |
154 | }
155 | }
156 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2.1'
2 | services:
3 | zeppelin-flink:
4 | image: lzljs3620320/zeppelin-flink:1.0
5 | ports:
6 | - "8080:8080"
7 | - "8081:8081"
8 | depends_on:
9 | - mysql
10 | - datagen
11 | environment:
12 | MYSQL_HOST: mysql
13 | volumes:
14 | - shared-tmpfs:/tmp/store
15 | kafka:
16 | image: johnnypark/kafka-zookeeper
17 | ports:
18 | - "2181:2181"
19 | - "9092:9092"
20 | environment:
21 | - NUM_PARTITIONS=8
22 | mysql:
23 | image: debezium/example-mysql:1.1
24 | ports:
25 | - "3306:3306"
26 | environment:
27 | - MYSQL_ROOT_PASSWORD=123456
28 | - MYSQL_USER=mysqluser
29 | - MYSQL_PASSWORD=mysqlpw
30 | datagen:
31 | image: lzljs3620320/datagen:1.0
32 | depends_on:
33 | - mysql
34 | environment:
35 | - DB_HOST=mysql
36 | - DB_PORT=3306
37 |
38 | volumes:
39 | shared-tmpfs:
40 | driver: local
41 | driver_opts:
42 | type: "tmpfs"
43 | device: "tmpfs"
44 |
--------------------------------------------------------------------------------
/images/.Dockerfile.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JingsongLi/flink-dynamic-table-demo/15854b6e68d0f01c3293618aa063f1fcf687e3cb/images/.Dockerfile.swp
--------------------------------------------------------------------------------
/images/datagen/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM openjdk:11
2 | COPY . /usr/src/datagen
3 | WORKDIR /usr/src/datagen
4 | CMD java -cp datagen.jar org.apache.datagen.DataGen $DB_HOST $DB_PORT
5 |
--------------------------------------------------------------------------------
/images/datagen/datagen.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JingsongLi/flink-dynamic-table-demo/15854b6e68d0f01c3293618aa063f1fcf687e3cb/images/datagen/datagen.jar
--------------------------------------------------------------------------------
/images/zeppelin-flink/Dockerfile:
--------------------------------------------------------------------------------
1 | # Licensed to the Apache Software Foundation (ASF) under one or more
2 | # contributor license agreements. See the NOTICE file distributed with
3 | # this work for additional information regarding copyright ownership.
4 | # The ASF licenses this file to You under the Apache License, Version 2.0
5 | # (the "License"); you may not use this file except in compliance with
6 | # the License. You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | FROM ubuntu:20.04
17 | LABEL maintainer="Apache Software Foundation "
18 |
19 | RUN set -ex && \
20 | apt-get -y update && \
21 | # Install language and other base packages
22 | DEBIAN_FRONTEND=noninteractive apt-get install -y language-pack-en openjdk-8-jre-headless tini wget && \
23 | # Cleanup
24 | rm -rf /var/lib/apt/lists/* && \
25 | apt-get autoclean && \
26 | apt-get clean
27 |
28 | ARG version="0.10.0"
29 |
30 | ENV LANG=en_US.UTF-8 \
31 | LC_ALL=en_US.UTF-8 \
32 | JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
33 | VERSION="${version}" \
34 | HOME="/opt/zeppelin" \
35 | ZEPPELIN_HOME="/opt/zeppelin" \
36 | ZEPPELIN_ADDR="0.0.0.0" \
37 | ZEPPELIN_WAR_TEMPDIR="/tmp/webapps"
38 |
39 | # Copy Zeppelin related files
40 | COPY zeppelin-0.11.0-SNAPSHOT /opt/zeppelin/
41 | COPY flink-1.15-SNAPSHOT /opt/flink-1.15-SNAPSHOT
42 |
43 | RUN mkdir -p "${ZEPPELIN_HOME}/logs" "${ZEPPELIN_HOME}/run" "${ZEPPELIN_HOME}/notebook" "${ZEPPELIN_HOME}/local-repo" && \
44 | # Allow process to edit /etc/passwd, to create a user entry for zeppelin
45 | chgrp root /etc/passwd && chmod ug+rw /etc/passwd && \
46 | # Give access to some specific folders
47 | chmod -R 775 "${ZEPPELIN_HOME}/logs" "${ZEPPELIN_HOME}/run" "${ZEPPELIN_HOME}/conf" "${ZEPPELIN_HOME}/notebook" "${ZEPPELIN_HOME}/local-repo"
48 |
49 | USER 1000
50 |
51 | EXPOSE 8080
52 |
53 | ENTRYPOINT [ "/usr/bin/tini", "--" ]
54 | WORKDIR ${ZEPPELIN_HOME}
55 | CMD ["bin/zeppelin.sh"]
56 |
--------------------------------------------------------------------------------