├── .gitignore
├── Flink Sql Cookbook
├── Aggregations and Analytics
│   ├── 01 Aggregating Time Series Data (TVF).gif
│   ├── 01 Aggregating Time Series Data (TVF)_2GH45WG7A.zpln
│   ├── 01 Aggregating Time Series Data_2G18EQKYD.zpln
│   ├── 02 Watermarks.gif
│   ├── 02 Watermarks_2FXSSGYS2.zpln
│   ├── 03 Analyzing Sessions in Time Series Data_2FZCP849J.zpln
│   ├── 03 Analyzing Sessions.gif
│   ├── 04 Rolling Aggregations on Time Series Data_2G1KSXF73.zpln
│   ├── 04 Rolling Aggregations.gif
│   ├── 05 Continuous Top-N.gif
│   ├── 05 Continuous Top-N_2FYDMAK2S.zpln
│   ├── 06 Deduplication_2G21WQVZB.zpln
│   ├── 07 Chained (Event) Time Windows_2FZ88A3H2.zpln
│   ├── 08 Detecting patterns with MATCH_RECOGNIZE_2FYC4KQ45.zpln
│   ├── 10 Hopping Time Windows_2GGSZQZP4.zpln
│   ├── 11 Window Top-N.gif
│   ├── 11 Window Top-N_2GM5RDCWG.zpln
│   └── 12 Retrieve previous row value without self-join_2HGWZS5C6.zpln
├── Foundations
│   ├── 01 Creating Tables_2FXZ2C2NN.zpln
│   ├── 02 Inserting Into Tables_2G1GAHZY7.zpln
│   ├── 03 Working with Temporary Tables_2G19ENM4Q.zpln
│   ├── 04 Filtering Data_2FY1Q15QK.zpln
│   ├── 05 Aggregating Data_2FYNFVHG9.zpln
│   ├── 06 Sorting Tables_2G22Y5XQ4.zpln
│   ├── 07 Encapsulating Logic with (Temporary) Views_2FYJ4TZC6.zpln
│   ├── 08 Writing Results into Multiple Tables_2G1MEGYE2.zpln
│   └── 09 Convert timestamps with timezones_2HHBK28GB.zpln
├── Joins
│   ├── 01 Regular Joins_2FYA62DSS.zpln
│   ├── 01_Regular_Join.gif
│   ├── 02 Interval Joins_2FYMBTGSF.zpln
│   ├── 02_Interval_Join.gif
│   ├── 03 Temporal Table Join between a non-compacted and compacted Kafka Topic_2G1YY15ZB.zpln
│   ├── 04 Lookup Joins_2FYWYEW8C.zpln
│   ├── 05 Real Time Star Schema Denormalization (N-Way Join)_2G1ZCV2GP.zpln
│   ├── 06 Lateral Table Join_2G1VYGDFE.zpln
│   └── 06_Lateral_Table_Join.gif
├── Other Builtin Functions
│   ├── 01 Working with Dates and Timestamps_2G1WY9HY2.zpln
│   ├── 02 Building the Union of Multiple Streams_2GJJCR6H2.zpln
│   ├── 02 Union_Multiple_Stream.gif
│   ├── 03 Filter_Late_Data.gif
│   └── 03 Filtering out Late Data_2GJ18VM3X.zpln
└── UDF
│   └── 01 Extending SQL with Python UDFs_2G19AQ57T.zpln
├── README.md
└── docker-compose.yml
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | logs
3 | 
--------------------------------------------------------------------------------
/Flink Sql Cookbook/Aggregations and Analytics/01 Aggregating Time Series Data (TVF).gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjffdu/flink-sql-cookbook-on-zeppelin/6cb9a0a0b64ef9eb87b4f8ced63e447b4aab72b9/Flink Sql Cookbook/Aggregations and Analytics/01 Aggregating Time Series Data (TVF).gif
--------------------------------------------------------------------------------
/Flink Sql Cookbook/Aggregations and Analytics/02 Watermarks.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjffdu/flink-sql-cookbook-on-zeppelin/6cb9a0a0b64ef9eb87b4f8ced63e447b4aab72b9/Flink Sql Cookbook/Aggregations and Analytics/02 Watermarks.gif
--------------------------------------------------------------------------------
/Flink Sql Cookbook/Aggregations and Analytics/03 Analyzing Sessions.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zjffdu/flink-sql-cookbook-on-zeppelin/6cb9a0a0b64ef9eb87b4f8ced63e447b4aab72b9/Flink Sql Cookbook/Aggregations and Analytics/03 Analyzing Sessions.gif
-------------------------------------------------------------------------------- /Flink Sql Cookbook/Aggregations and Analytics/04 Rolling Aggregations.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjffdu/flink-sql-cookbook-on-zeppelin/6cb9a0a0b64ef9eb87b4f8ced63e447b4aab72b9/Flink Sql Cookbook/Aggregations and Analytics/04 Rolling Aggregations.gif -------------------------------------------------------------------------------- /Flink Sql Cookbook/Aggregations and Analytics/05 Continuous Top-N.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjffdu/flink-sql-cookbook-on-zeppelin/6cb9a0a0b64ef9eb87b4f8ced63e447b4aab72b9/Flink Sql Cookbook/Aggregations and Analytics/05 Continuous Top-N.gif -------------------------------------------------------------------------------- /Flink Sql Cookbook/Aggregations and Analytics/06 Deduplication_2G21WQVZB.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n\n\u003e :bulb: This example will show how you can identify and filter out duplicates in a stream of events.\n\nThere are different ways that duplicate events can end up in your data sources, from human error to application bugs. Regardless of the origin, unclean data can have a real impact in the quality (and correctness) of your results. Suppose that your order system occasionally generates duplicate events with the same `order_id`, and that you\u0027re only interested in keeping the most recent event for downstream processing.\n\nAs a first step, you can use a combination of the `COUNT` function and the `HAVING` clause to check if and which orders have more than one event; and then filter out these events using `ROW_NUMBER()`. In practice, deduplication is a special case of [Top-N aggregation](../05_top_n/05_top_n.md), where N is 1 (`rownum \u003d 1`) and the ordering column is either the processing or event time of events.", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-10-08 16:27:51.215", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This example will show how you can identify and filter out duplicates in a stream of events.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003eThere are different ways that duplicate events can end up in your data sources, from human error to application bugs. Regardless of the origin, unclean data can have a real impact in the quality (and correctness) of your results. 
Suppose that your order system occasionally generates duplicate events with the same \u003ccode\u003eorder_id\u003c/code\u003e, and that you\u0026rsquo;re only interested in keeping the most recent event for downstream processing.\u003c/p\u003e\n\u003cp\u003eAs a first step, you can use a combination of the \u003ccode\u003eCOUNT\u003c/code\u003e function and the \u003ccode\u003eHAVING\u003c/code\u003e clause to check if and which orders have more than one event; and then filter out these events using \u003ccode\u003eROW_NUMBER()\u003c/code\u003e. In practice, deduplication is a special case of \u003ca href\u003d\"../05_top_n/05_top_n.md\"\u003eTop-N aggregation\u003c/a\u003e, where N is 1 (\u003ccode\u003erownum \u003d 1\u003c/code\u003e) and the ordering column is either the processing or event time of events.\u003c/p\u003e\n\n\u003c/div\u003e" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1614313688051_1570601018", 40 | "id": "paragraph_1614313688051_1570601018", 41 | "dateCreated": "2021-02-26 12:28:08.054", 42 | "dateStarted": "2021-10-08 16:27:51.216", 43 | "dateFinished": "2021-10-08 16:27:51.221", 44 | "status": "FINISHED" 45 | }, 46 | { 47 | "text": "%md\n\n本例将展示如何在事件流中识别并筛选出重复的事件。\n\n从人为错误到应用程序错误,重复的事件最终会以不同的方式出现在数据源中。不管来源如何,不干净的数据都会对结果的质量(和正确性)产生实际影响。假设您的订单系统偶尔会生成具有相同 `order_id` 的重复事件,并且您只想保留最新的事件以供下游处理。\n\n首先,您可以组合使用 `COUNT` 函数和 `HAVING` 语句来检查有没有重复的事件以及哪些订单有重复事件;然后使用 `ROW_NUMBER()` 来过滤掉这些事件。实际上,去重复数据是Top-N聚合的一种特殊情况,其中N是1(`rownum\u003d1`),排序字段可以是事件的处理时间或事件时间。\n", 48 | "user": "anonymous", 49 | "dateUpdated": "2021-03-18 15:54:56.306", 50 | "progress": 0, 51 | "config": { 52 | "editorSetting": { 53 | "language": "markdown", 54 | "editOnDblClick": true, 55 | "completionKey": "TAB", 56 | "completionSupport": false 57 | }, 58 | "colWidth": 12.0, 59 | "editorMode": "ace/mode/markdown", 60 | "fontSize": 9.0, 61 | "results": {}, 62 | "enabled": true, 63 | "editorHide": true, 64 | "tableHide": false 65 | }, 66 | "settings": { 67 | "params": {}, 68 | "forms": {} 69 | }, 70 | "results": { 71 | "code": "SUCCESS", 72 | "msg": [ 73 | { 74 | "type": "HTML", 75 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e本例将展示如何在事件流中识别并筛选出重复的事件。\u003c/p\u003e\n\u003cp\u003e从人为错误到应用程序错误,重复的事件最终会以不同的方式出现在数据源中。不管来源如何,不干净的数据都会对结果的质量(和正确性)产生实际影响。假设您的订单系统偶尔会生成具有相同 \u003ccode\u003eorder_id\u003c/code\u003e 的重复事件,并且您只想保留最新的事件以供下游处理。\u003c/p\u003e\n\u003cp\u003e首先,您可以组合使用 \u003ccode\u003eCOUNT\u003c/code\u003e 函数和 \u003ccode\u003eHAVING\u003c/code\u003e 语句来检查有没有重复的事件以及哪些订单有重复事件;然后使用 \u003ccode\u003eROW_NUMBER()\u003c/code\u003e 来过滤掉这些事件。实际上,去重复数据是Top-N聚合的一种特殊情况,其中N是1(\u003ccode\u003erownum\u003d1\u003c/code\u003e),排序字段可以是事件的处理时间或事件时间。\u003c/p\u003e\n\n\u003c/div\u003e" 76 | } 77 | ] 78 | }, 79 | "apps": [], 80 | "runtimeInfos": {}, 81 | "progressUpdateIntervalMs": 500, 82 | "jobName": "paragraph_1615117539626_58897275", 83 | "id": "paragraph_1615117539626_58897275", 84 | "dateCreated": "2021-03-07 11:45:39.626", 85 | "dateStarted": "2021-03-18 15:54:56.310", 86 | "dateFinished": "2021-03-18 15:54:56.323", 87 | "status": "FINISHED" 88 | }, 89 | { 90 | "text": "%flink.ssql\n\nDROP TABLE IF EXISTS orders;\n\nCREATE TABLE orders (\n id INT,\n order_time AS CURRENT_TIMESTAMP,\n WATERMARK FOR order_time AS order_time - INTERVAL \u00275\u0027 SECONDS\n)\nWITH (\n \u0027connector\u0027 \u003d \u0027datagen\u0027,\n \u0027rows-per-second\u0027\u003d\u002710\u0027,\n \u0027fields.id.kind\u0027\u003d\u0027random\u0027,\n 
\u0027fields.id.min\u0027\u003d\u00271\u0027,\n \u0027fields.id.max\u0027\u003d\u0027100\u0027\n);\n", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-02-26 12:28:48.077", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "sql", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/sql", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | "progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614304246775_222173658", 115 | "id": "paragraph_1614304246775_222173658", 116 | "dateCreated": "2021-02-26 09:50:46.775", 117 | "dateStarted": "2021-02-26 12:28:48.088", 118 | "dateFinished": "2021-02-26 12:28:49.030", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql(type\u003dupdate)\n\n--Check for duplicates in the `orders` table\nSELECT id AS order_id,\n COUNT(*) AS order_cnt\nFROM orders o\nGROUP BY id\nHAVING COUNT(*) \u003e 1;\n\n", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-02-26 12:28:53.794", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": { 137 | "0": { 138 | "graph": { 139 | "mode": "table", 140 | "height": 300.0, 141 | "optionOpen": false, 142 | "setting": { 143 | "table": { 144 | "tableGridState": {}, 145 | "tableColumnTypeState": { 146 | "names": { 147 | "order_id": "string", 148 | "order_cnt": "string" 149 | }, 150 | "updated": false 151 | }, 152 | "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 153 | "tableOptionValue": { 154 | "useFilter": false, 155 | "showPagination": false, 156 | "showAggregationFooter": false 157 | }, 158 | "updated": false, 159 | "initialized": false 160 | } 161 | }, 162 | "commonSetting": {} 163 | } 164 | } 165 | }, 166 | "enabled": true 167 | }, 168 | "settings": { 169 | "params": {}, 170 | "forms": {} 171 | }, 172 | "apps": [], 173 | "runtimeInfos": { 174 | "jobUrl": { 175 | "propertyName": "jobUrl", 176 | "label": "FLINK JOB", 177 | "tooltip": "View in Flink web UI", 178 | "group": "flink", 179 | "values": [ 180 | { 181 | "jobUrl": "http://localhost:8081#/job/2138ecd297e0c870258386db23aae2f8" 182 | } 183 | ], 184 | "interpreterSettingId": "flink" 185 | } 186 | }, 187 | "progressUpdateIntervalMs": 500, 188 | "jobName": "paragraph_1614304259692_480547273", 189 | "id": "paragraph_1614304259692_480547273", 190 | "dateCreated": "2021-02-26 09:50:59.692", 191 | "dateStarted": "2021-02-26 12:28:53.807", 192 | "dateFinished": "2021-02-26 09:51:36.961", 193 | "status": "ABORT" 194 | }, 195 | { 196 | "text": "%flink.ssql(type\u003dupdate)\n\n--Use deduplication to keep only the latest record for each `order_id`\nSELECT\n order_id,\n order_time\nFROM (\n SELECT id AS 
order_id,\n order_time,\n ROW_NUMBER() OVER (PARTITION BY id ORDER BY order_time DESC) AS rownum\n FROM orders\n )\nWHERE rownum \u003d 1\nORDER BY order_time DESC\nLIMIT 10;\n\n", 197 | "user": "anonymous", 198 | "dateUpdated": "2021-02-26 12:29:37.756", 199 | "progress": 0, 200 | "config": { 201 | "editorSetting": { 202 | "language": "sql", 203 | "editOnDblClick": false, 204 | "completionKey": "TAB", 205 | "completionSupport": true 206 | }, 207 | "colWidth": 12.0, 208 | "editorMode": "ace/mode/sql", 209 | "fontSize": 9.0, 210 | "results": { 211 | "0": { 212 | "graph": { 213 | "mode": "table", 214 | "height": 300.0, 215 | "optionOpen": false, 216 | "setting": { 217 | "table": { 218 | "tableGridState": {}, 219 | "tableColumnTypeState": { 220 | "names": { 221 | "order_id": "string", 222 | "order_time": "string" 223 | }, 224 | "updated": false 225 | }, 226 | "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 227 | "tableOptionValue": { 228 | "useFilter": false, 229 | "showPagination": false, 230 | "showAggregationFooter": false 231 | }, 232 | "updated": false, 233 | "initialized": false 234 | } 235 | }, 236 | "commonSetting": {} 237 | } 238 | } 239 | }, 240 | "enabled": true 241 | }, 242 | "settings": { 243 | "params": {}, 244 | "forms": {} 245 | }, 246 | "apps": [], 247 | "runtimeInfos": { 248 | "jobUrl": { 249 | "propertyName": "jobUrl", 250 | "label": "FLINK JOB", 251 | "tooltip": "View in Flink web UI", 252 | "group": "flink", 253 | "values": [ 254 | { 255 | "jobUrl": "http://localhost:8081#/job/6bb5e0075b2a7a37b203ca5e8de665e3" 256 | } 257 | ], 258 | "interpreterSettingId": "flink" 259 | } 260 | }, 261 | "progressUpdateIntervalMs": 500, 262 | "jobName": "paragraph_1614304280591_269871547", 263 | "id": "paragraph_1614304280591_269871547", 264 | "dateCreated": "2021-02-26 09:51:20.591", 265 | "dateStarted": "2021-02-26 12:29:37.764", 266 | "dateFinished": "2021-02-26 12:29:42.058", 267 | "status": "ABORT" 268 | }, 269 | { 270 | "text": "%flink.ssql\n", 271 | "user": "anonymous", 272 | "dateUpdated": "2021-02-26 09:51:43.627", 273 | "progress": 0, 274 | "config": {}, 275 | "settings": { 276 | "params": {}, 277 | "forms": {} 278 | }, 279 | "apps": [], 280 | "runtimeInfos": {}, 281 | "progressUpdateIntervalMs": 500, 282 | "jobName": "paragraph_1614304303627_249077410", 283 | "id": "paragraph_1614304303627_249077410", 284 | "dateCreated": "2021-02-26 09:51:43.627", 285 | "status": "READY" 286 | } 287 | ], 288 | "name": "06 Deduplication", 289 | "id": "2G21WQVZB", 290 | "defaultInterpreterGroup": "flink", 291 | "version": "0.10.0-SNAPSHOT", 292 | "noteParams": {}, 293 | "noteForms": {}, 294 | "angularObjects": {}, 295 | "config": { 296 | "isZeppelinNotebookCronEnable": false 297 | }, 298 | "info": {} 299 | }
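A note on the ordering direction in the deduplication query above: with ROW_NUMBER() ordered ascending on order_time, rownum = 1 keeps the first event seen per order_id, while ordering descending keeps the most recent one, which is what the recipe's prose and the "keep only the latest record" comment ask for. A minimal sketch of both variants against the orders table defined in this notebook (an editorial illustration, not part of the original notebook):

-- Keep the FIRST event per order_id (ascending event time).
SELECT order_id, order_time
FROM (
  SELECT id AS order_id, order_time,
         ROW_NUMBER() OVER (PARTITION BY id ORDER BY order_time ASC) AS rownum
  FROM orders
)
WHERE rownum = 1;

-- Keep the LATEST event per order_id (descending event time).
SELECT order_id, order_time
FROM (
  SELECT id AS order_id, order_time,
         ROW_NUMBER() OVER (PARTITION BY id ORDER BY order_time DESC) AS rownum
  FROM orders
)
WHERE rownum = 1;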
-------------------------------------------------------------------------------- /Flink Sql Cookbook/Aggregations and Analytics/11 Window Top-N.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjffdu/flink-sql-cookbook-on-zeppelin/6cb9a0a0b64ef9eb87b4f8ced63e447b4aab72b9/Flink Sql Cookbook/Aggregations and Analytics/11 Window Top-N.gif -------------------------------------------------------------------------------- /Flink Sql Cookbook/Foundations/01 Creating Tables_2FXZ2C2NN.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n\u003e :bulb: This example will show how to create a table using SQL DDL.\n\nFlink SQL operates against logical tables, just like a traditional database. However, it does not maintain tables internally but always operates against external systems.\n\nTable definitions are in two parts: the logical schema and connector configuration. The logical schema defines the columns and types in the table and is what queries operate against. The connector configuration is contained in the `WITH` clause and defines the physical system that backs this table. This example uses the `datagen` connector which generates rows in memory and is convenient for testing queries.\n\nYou can test the table is properly created by running a simple `SELECT` statement. In Zeppelin you will see the results printed to the UI. In the example below you will see the latest 10 records.\n", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-10-08 16:32:34.920", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true, 22 | "title": false 23 | }, 24 | "settings": { 25 | "params": {}, 26 | "forms": {} 27 | }, 28 | "results": { 29 | "code": "SUCCESS", 30 | "msg": [ 31 | { 32 | "type": "HTML", 33 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This example will show how to create a table using SQL DDL.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003eFlink SQL operates against logical tables, just like a traditional database. However, it does not maintain tables internally but always operates against external systems.\u003c/p\u003e\n\u003cp\u003eTable definitions are in two parts: the logical schema and connector configuration. The logical schema defines the columns and types in the table and is what queries operate against. The connector configuration is contained in the \u003ccode\u003eWITH\u003c/code\u003e clause and defines the physical system that backs this table. This example uses the \u003ccode\u003edatagen\u003c/code\u003e connector which generates rows in memory and is convenient for testing queries.\u003c/p\u003e\n\u003cp\u003eYou can test the table is properly created by running a simple \u003ccode\u003eSELECT\u003c/code\u003e statement. In Zeppelin you will see the results printed to the UI. 
In the example below you will see the latest 10 records.\u003c/p\u003e\n\n\u003c/div\u003e" 34 | } 35 | ] 36 | }, 37 | "apps": [], 38 | "runtimeInfos": {}, 39 | "progressUpdateIntervalMs": 500, 40 | "jobName": "paragraph_1614267718812_631245959", 41 | "id": "paragraph_1614267718812_631245959", 42 | "dateCreated": "2021-02-25 23:41:58.812", 43 | "dateStarted": "2021-10-08 16:32:34.920", 44 | "dateFinished": "2021-10-08 16:32:34.928", 45 | "status": "FINISHED" 46 | }, 47 | { 48 | "text": "%md\n本例子将展示怎么使用 SQL DDL 来创建表。\n\nFlink SQL 能像操作传统数据库一样操作逻辑表。 但是他并不维护表本身,而是通过操作外部系统来变更表的数据。\n\n定义表分为2部分;分别是设计逻辑 schema 和 connector 的配置。逻辑 schema 定义了表中的字段及其类型,它也是查询操作的对象。连接器配置在 `WITH` 语句中,它定义了逻辑表依赖的物理储存。本例使用 `datagen` 连接器在内存中生成数据,便于测试查询。\n\n你可以通过运行一条简单的 `SELECT` 语句来测试这个表。在 `Zeppelin` 中你可以在UI上看到查询的结果。在下面的例子中你将看到最近的10条记录。", 49 | "user": "anonymous", 50 | "dateUpdated": "2021-03-18 15:07:01.050", 51 | "progress": 0, 52 | "config": { 53 | "editorSetting": { 54 | "language": "markdown", 55 | "editOnDblClick": true, 56 | "completionKey": "TAB", 57 | "completionSupport": false 58 | }, 59 | "colWidth": 12.0, 60 | "editorMode": "ace/mode/markdown", 61 | "fontSize": 9.0, 62 | "results": {}, 63 | "enabled": true, 64 | "editorHide": true, 65 | "tableHide": false 66 | }, 67 | "settings": { 68 | "params": {}, 69 | "forms": {} 70 | }, 71 | "results": { 72 | "code": "SUCCESS", 73 | "msg": [ 74 | { 75 | "type": "HTML", 76 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e本例子将展示怎么使用 SQL DDL 来创建表。\u003c/p\u003e\n\u003cp\u003eFlink SQL 能像操作传统数据库一样操作逻辑表。 但是他并不维护表本身,而是通过操作外部系统来变更表的数据。\u003c/p\u003e\n\u003cp\u003e定义表分为2部分;分别是设计逻辑 schema 和 connector 的配置。逻辑 schema 定义了表中的字段及其类型,它也是查询操作的对象。连接器配置在 \u003ccode\u003eWITH\u003c/code\u003e 语句中,它定义了逻辑表依赖的物理储存。本例使用 \u003ccode\u003edatagen\u003c/code\u003e 连接器在内存中生成数据,便于测试查询。\u003c/p\u003e\n\u003cp\u003e你可以通过运行一条简单的 \u003ccode\u003eSELECT\u003c/code\u003e 语句来测试这个表。在 \u003ccode\u003eZeppelin\u003c/code\u003e 中你可以在UI上看到查询的结果。在下面的例子中你将看到最近的10条记录。\u003c/p\u003e\n\n\u003c/div\u003e" 77 | } 78 | ] 79 | }, 80 | "apps": [], 81 | "runtimeInfos": {}, 82 | "progressUpdateIntervalMs": 500, 83 | "jobName": "paragraph_1615106915535_2015311282", 84 | "id": "paragraph_1615106915535_2015311282", 85 | "dateCreated": "2021-03-07 08:48:35.535", 86 | "dateStarted": "2021-03-18 15:07:01.049", 87 | "dateFinished": "2021-03-18 15:07:01.074", 88 | "status": "FINISHED" 89 | }, 90 | { 91 | "text": "%flink.ssql\n\n\nDROP TABLE IF EXISTS orders;\n\nCREATE TABLE orders (\n order_uid BIGINT,\n product_id BIGINT,\n price DOUBLE,\n order_time TIMESTAMP(3)\n) WITH (\n \u0027connector\u0027 \u003d \u0027datagen\u0027\n);\n\n", 92 | "user": "anonymous", 93 | "dateUpdated": "2021-03-04 10:53:48.061", 94 | "progress": 0, 95 | "config": { 96 | "editorSetting": { 97 | "language": "sql", 98 | "editOnDblClick": false, 99 | "completionKey": "TAB", 100 | "completionSupport": true 101 | }, 102 | "colWidth": 12.0, 103 | "editorMode": "ace/mode/sql", 104 | "fontSize": 9.0, 105 | "results": {}, 106 | "enabled": true 107 | }, 108 | "settings": { 109 | "params": {}, 110 | "forms": {} 111 | }, 112 | "apps": [], 113 | "runtimeInfos": {}, 114 | "progressUpdateIntervalMs": 500, 115 | "jobName": "paragraph_1614263971802_451683322", 116 | "id": "paragraph_1614263971802_451683322", 117 | "dateCreated": "2021-02-25 22:39:31.803", 118 | "dateStarted": "2021-03-04 10:53:48.142", 119 | "dateFinished": "2021-03-04 10:54:14.637", 120 | "status": "FINISHED" 121 | }, 122 | { 123 | "text": "%flink.ssql(type\u003dupdate)\n\nSELECT * FROM 
orders \nORDER BY order_time DESC LIMIT 10;\n", 124 | "user": "anonymous", 125 | "dateUpdated": "2021-02-26 11:16:27.636", 126 | "progress": 0, 127 | "config": { 128 | "editorSetting": { 129 | "language": "sql", 130 | "editOnDblClick": false, 131 | "completionKey": "TAB", 132 | "completionSupport": true 133 | }, 134 | "colWidth": 12.0, 135 | "editorMode": "ace/mode/sql", 136 | "fontSize": 9.0, 137 | "results": { 138 | "0": { 139 | "graph": { 140 | "mode": "table", 141 | "height": 300.0, 142 | "optionOpen": false, 143 | "setting": { 144 | "table": { 145 | "tableGridState": {}, 146 | "tableColumnTypeState": { 147 | "names": { 148 | "order_uid": "string", 149 | "product_id": "string", 150 | "price": "string", 151 | "order_time": "string" 152 | }, 153 | "updated": false 154 | }, 155 | "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 156 | "tableOptionValue": { 157 | "useFilter": false, 158 | "showPagination": false, 159 | "showAggregationFooter": false 160 | }, 161 | "updated": false, 162 | "initialized": false 163 | } 164 | }, 165 | "commonSetting": {} 166 | } 167 | } 168 | }, 169 | "enabled": true 170 | }, 171 | "settings": { 172 | "params": {}, 173 | "forms": {} 174 | }, 175 | "apps": [], 176 | "runtimeInfos": { 177 | "jobUrl": { 178 | "propertyName": "jobUrl", 179 | "label": "FLINK JOB", 180 | "tooltip": "View in Flink web UI", 181 | "group": "flink", 182 | "values": [ 183 | { 184 | "jobUrl": "http://localhost:8081#/job/9c231364f3c735c1d66d1477a4ac1135" 185 | } 186 | ], 187 | "interpreterSettingId": "flink" 188 | } 189 | }, 190 | "progressUpdateIntervalMs": 500, 191 | "jobName": "paragraph_1614264008246_345717910", 192 | "id": "paragraph_1614264008246_345717910", 193 | "dateCreated": "2021-02-25 22:40:08.246", 194 | "dateStarted": "2021-02-26 11:16:27.652", 195 | "dateFinished": "2021-02-26 11:16:48.392", 196 | "status": "ABORT" 197 | }, 198 | { 199 | "text": "%flink.ssql\n", 200 | "user": "anonymous", 201 | "dateUpdated": "2021-02-25 22:41:34.177", 202 | "progress": 0, 203 | "config": {}, 204 | "settings": { 205 | "params": {}, 206 | "forms": {} 207 | }, 208 | "apps": [], 209 | "runtimeInfos": {}, 210 | "progressUpdateIntervalMs": 500, 211 | "jobName": "paragraph_1614264094177_424155746", 212 | "id": "paragraph_1614264094177_424155746", 213 | "dateCreated": "2021-02-25 22:41:34.177", 214 | "status": "READY" 215 | } 216 | ], 217 | "name": "01 Creating Tables", 218 | "id": "2FXZ2C2NN", 219 | "defaultInterpreterGroup": "spark", 220 | "version": "0.10.0-SNAPSHOT", 221 | "noteParams": {}, 222 | "noteForms": {}, 223 | "angularObjects": {}, 224 | "config": { 225 | "isZeppelinNotebookCronEnable": false 226 | }, 227 | "info": {} 228 | }
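The orders table in this recipe keeps the schema deliberately minimal. As a sketch of how the same datagen-backed table could also carry an event-time attribute, using the computed-column-plus-watermark pattern that the Deduplication notebook above applies (the table name and the choice of LOCALTIMESTAMP are illustrative assumptions, not taken from the original notebook):

DROP TABLE IF EXISTS orders_with_time;

CREATE TABLE orders_with_time (
  order_uid BIGINT,
  price DOUBLE,
  -- computed column: stamps each generated row with the local wall-clock time
  order_time AS LOCALTIMESTAMP,
  -- declares order_time as the event-time attribute with a 5-second out-of-orderness bound
  WATERMARK FOR order_time AS order_time - INTERVAL '5' SECOND
) WITH (
  'connector' = 'datagen',
  'rows-per-second' = '10'
);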
-------------------------------------------------------------------------------- /Flink Sql Cookbook/Foundations/02 Inserting Into Tables_2G1GAHZY7.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n\n\u003e :bulb: This recipe shows how to insert rows into a table so that downstream applications can read them.\n\nAs outlined in the first recipe, Flink SQL operates on tables that are stored in external systems. To publish results of a query for consumption by downstream applications, you write the results of a query into a table. This table can be read by Flink SQL, or directly by connecting to the external system that is storing the data (e.g. an ElasticSearch index).\n\nThis example takes the `server_logs` table, filters for client errors, and writes these logs into another table called `client_errors`. Any number of external systems could back the result table, including Apache Kafka, Apache Hive, ElasticSearch, JDBC, among many others. To keep this example self-contained, `client_errors` is of type `blackhole`: instead of actually writing the data to an external system, the table discards any rows written to it.\n", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-10-08 16:33:04.654", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This recipe shows how to insert rows into a table so that downstream applications can read them.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003eAs outlined in the first recipe, Flink SQL operates on tables that are stored in external systems. To publish results of a query for consumption by downstream applications, you write the results of a query into a table. This table can be read by Flink SQL, or directly by connecting to the external system that is storing the data (e.g. an ElasticSearch index).\u003c/p\u003e\n\u003cp\u003eThis example takes the \u003ccode\u003eserver_logs\u003c/code\u003e table, filters for client errors, and writes these logs into another table called \u003ccode\u003eclient_errors\u003c/code\u003e. Any number of external systems could back the result table, including Apache Kafka, Apache Hive, ElasticSearch, JDBC, among many others. 
To keep this example self-contained, \u003ccode\u003eclient_errors\u003c/code\u003e is of type \u003ccode\u003eblackhole\u003c/code\u003e: instead of actually writing the data to an external system, the table discards any rows written to it.\u003c/p\u003e\n\n\u003c/div\u003e" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1614267802169_591840692", 40 | "id": "paragraph_1614267802169_591840692", 41 | "dateCreated": "2021-02-25 23:43:22.169", 42 | "dateStarted": "2021-10-08 16:33:04.655", 43 | "dateFinished": "2021-10-08 16:33:04.659", 44 | "status": "FINISHED" 45 | }, 46 | { 47 | "text": "%md\n这个事例展示了怎么向表中插入数据以便下游应用使用这些数据。\n\n如第一个例子所示,Flink SQL 操作的表数据都是存储在外部系统上。通过将查询的结果写入表中来将结果提供给后续应用消费。这个表可以被 Flink SQL 读取,也可以通过直接与外部系统来保存数据(例如 ElasticSearch)。\n\n本例从 `server_logs` 表中过滤出客户端错误,然后将过滤出来的日志写入另一个叫 `client_errors` 的表,结果表可以持久化到任意外部系统中,包括 Apache Kafka, Apache Hive, ElasticSearch,JDBC, \n为了让这个例子能够在没有任何依赖的情况下跑起来,`client_errors` 表是 `blackhole` 类型,这个表将任何写入的数据丢弃掉而不是写入外部系统中。\n", 48 | "user": "anonymous", 49 | "dateUpdated": "2021-03-18 15:07:40.048", 50 | "progress": 0, 51 | "config": { 52 | "editorSetting": { 53 | "language": "markdown", 54 | "editOnDblClick": true, 55 | "completionKey": "TAB", 56 | "completionSupport": false 57 | }, 58 | "colWidth": 12.0, 59 | "editorMode": "ace/mode/markdown", 60 | "fontSize": 9.0, 61 | "results": {}, 62 | "enabled": true, 63 | "editorHide": true, 64 | "tableHide": false 65 | }, 66 | "settings": { 67 | "params": {}, 68 | "forms": {} 69 | }, 70 | "results": { 71 | "code": "SUCCESS", 72 | "msg": [ 73 | { 74 | "type": "HTML", 75 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e这个事例展示了怎么向表中插入数据以便下游应用使用这些数据。\u003c/p\u003e\n\u003cp\u003e如第一个例子所示,Flink SQL 操作的表数据都是存储在外部系统上。通过将查询的结果写入表中来将结果提供给后续应用消费。这个表可以被 Flink SQL 读取,也可以通过直接与外部系统来保存数据(例如 ElasticSearch)。\u003c/p\u003e\n\u003cp\u003e本例从 \u003ccode\u003eserver_logs\u003c/code\u003e 表中过滤出客户端错误,然后将过滤出来的日志写入另一个叫 \u003ccode\u003eclient_errors\u003c/code\u003e 的表,结果表可以持久化到任意外部系统中,包括 Apache Kafka, Apache Hive, ElasticSearch,JDBC,\u003cbr /\u003e\n为了让这个例子能够在没有任何依赖的情况下跑起来,\u003ccode\u003eclient_errors\u003c/code\u003e 表是 \u003ccode\u003eblackhole\u003c/code\u003e 类型,这个表将任何写入的数据丢弃掉而不是写入外部系统中。\u003c/p\u003e\n\n\u003c/div\u003e" 76 | } 77 | ] 78 | }, 79 | "apps": [], 80 | "runtimeInfos": {}, 81 | "progressUpdateIntervalMs": 500, 82 | "jobName": "paragraph_1615106952557_787103027", 83 | "id": "paragraph_1615106952557_787103027", 84 | "dateCreated": "2021-03-07 08:49:12.557", 85 | "dateStarted": "2021-03-18 15:07:40.048", 86 | "dateFinished": "2021-03-18 15:07:40.072", 87 | "status": "FINISHED" 88 | }, 89 | { 90 | "text": "%flink.ssql\n\nDROP TABLE IF EXISTS server_logs;\n\nCREATE TABLE server_logs ( \n client_ip STRING,\n client_identity STRING, \n userid STRING, \n user_agent STRING,\n log_time TIMESTAMP(3),\n request_line STRING, \n status_code STRING, \n size INT\n) WITH (\n \u0027connector\u0027 \u003d \u0027faker\u0027, \n \u0027fields.client_ip.expression\u0027 \u003d \u0027#{Internet.publicIpV4Address}\u0027,\n \u0027fields.client_identity.expression\u0027 \u003d \u0027-\u0027,\n \u0027fields.userid.expression\u0027 \u003d \u0027-\u0027,\n \u0027fields.user_agent.expression\u0027 \u003d \u0027#{Internet.userAgentAny}\u0027,\n \u0027fields.log_time.expression\u0027 \u003d \u0027#{date.past \u0027\u002715\u0027\u0027,\u0027\u00275\u0027\u0027,\u0027\u0027SECONDS\u0027\u0027}\u0027,\n \u0027fields.request_line.expression\u0027 \u003d 
\u0027#{regexify \u0027\u0027(GET|POST|PUT|PATCH){1}\u0027\u0027} #{regexify \u0027\u0027(/search\\.html|/login\\.html|/prod\\.html|cart\\.html|/order\\.html){1}\u0027\u0027} #{regexify \u0027\u0027(HTTP/1\\.1|HTTP/2|/HTTP/1\\.0){1}\u0027\u0027}\u0027,\n \u0027fields.status_code.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(200|201|204|400|401|403|301){1}\u0027\u0027}\u0027,\n \u0027fields.size.expression\u0027 \u003d \u0027#{number.numberBetween \u0027\u0027100\u0027\u0027,\u0027\u002710000000\u0027\u0027}\u0027\n);\n\nDROP TABLE IF EXISTS client_errors;\n\nCREATE TABLE client_errors (\n log_time TIMESTAMP(3),\n request_line STRING,\n status_code STRING,\n size INT\n)\nWITH (\n \u0027connector\u0027 \u003d \u0027blackhole\u0027\n);\n\n\n", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-02-26 11:21:45.704", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "sql", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/sql", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | "progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614264261687_1276008773", 115 | "id": "paragraph_1614264261687_1276008773", 116 | "dateCreated": "2021-02-25 22:44:21.687", 117 | "dateStarted": "2021-02-26 11:21:45.713", 118 | "dateFinished": "2021-02-26 11:21:46.702", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql\n\nINSERT INTO client_errors\nSELECT \n log_time,\n request_line,\n status_code,\n size\nFROM server_logs\nWHERE \n status_code SIMILAR TO \u00274[0-9][0-9]\u0027\n ", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-02-26 11:21:49.117", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": {}, 137 | "enabled": true 138 | }, 139 | "settings": { 140 | "params": {}, 141 | "forms": {} 142 | }, 143 | "apps": [], 144 | "runtimeInfos": { 145 | "jobUrl": { 146 | "propertyName": "jobUrl", 147 | "label": "FLINK JOB", 148 | "tooltip": "View in Flink web UI", 149 | "group": "flink", 150 | "values": [ 151 | { 152 | "jobUrl": "http://localhost:8081#/job/35a2d342bc8e4a0423dbcc29c0ef362d" 153 | } 154 | ], 155 | "interpreterSettingId": "flink" 156 | } 157 | }, 158 | "progressUpdateIntervalMs": 500, 159 | "jobName": "paragraph_1614264274230_70413416", 160 | "id": "paragraph_1614264274230_70413416", 161 | "dateCreated": "2021-02-25 22:44:34.231", 162 | "dateStarted": "2021-02-26 11:21:49.125", 163 | "dateFinished": "2021-02-26 11:22:45.789", 164 | "status": "ABORT" 165 | }, 166 | { 167 | "text": "%flink.ssql(type\u003dupdate)\n\nSELECT \n log_time,\n request_line,\n status_code,\n size\nFROM server_logs\nWHERE \n status_code SIMILAR TO \u00274[0-9][0-9]\u0027 order by log_time desc limit 10", 168 | "user": "anonymous", 169 | "dateUpdated": "2021-02-25 22:55:03.460", 170 | "progress": 0, 171 | "config": { 172 | "editorSetting": { 173 | "language": "sql", 174 | "editOnDblClick": false, 175 | "completionKey": "TAB", 176 | "completionSupport": true 177 | }, 178 | "colWidth": 12.0, 179 | "editorMode": "ace/mode/sql", 180 | "fontSize": 9.0, 181 | "results": { 182 | "0": { 183 | 
"graph": { 184 | "mode": "table", 185 | "height": 300.0, 186 | "optionOpen": false, 187 | "setting": { 188 | "table": { 189 | "tableGridState": {}, 190 | "tableColumnTypeState": { 191 | "names": { 192 | "log_time": "string", 193 | "request_line": "string", 194 | "status_code": "string", 195 | "size": "string" 196 | }, 197 | "updated": false 198 | }, 199 | "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 200 | "tableOptionValue": { 201 | "useFilter": false, 202 | "showPagination": false, 203 | "showAggregationFooter": false 204 | }, 205 | "updated": false, 206 | "initialized": false 207 | } 208 | }, 209 | "commonSetting": {} 210 | } 211 | } 212 | }, 213 | "enabled": true 214 | }, 215 | "settings": { 216 | "params": {}, 217 | "forms": {} 218 | }, 219 | "apps": [], 220 | "runtimeInfos": { 221 | "jobUrl": { 222 | "propertyName": "jobUrl", 223 | "label": "FLINK JOB", 224 | "tooltip": "View in Flink web UI", 225 | "group": "flink", 226 | "values": [ 227 | { 228 | "jobUrl": "http://localhost:8081#/job/9415a68521cc97c502af147db472b0da" 229 | } 230 | ], 231 | "interpreterSettingId": "flink" 232 | } 233 | }, 234 | "progressUpdateIntervalMs": 500, 235 | "jobName": "paragraph_1614264279156_1925150514", 236 | "id": "paragraph_1614264279156_1925150514", 237 | "dateCreated": "2021-02-25 22:44:39.156", 238 | "dateStarted": "2021-02-25 22:52:52.486", 239 | "dateFinished": "2021-02-25 22:53:19.389", 240 | "status": "ABORT" 241 | }, 242 | { 243 | "text": "%flink.ssql\n", 244 | "user": "anonymous", 245 | "dateUpdated": "2021-02-25 22:52:46.246", 246 | "progress": 0, 247 | "config": {}, 248 | "settings": { 249 | "params": {}, 250 | "forms": {} 251 | }, 252 | "apps": [], 253 | "runtimeInfos": {}, 254 | "progressUpdateIntervalMs": 500, 255 | "jobName": "paragraph_1614264766246_456910806", 256 | "id": "paragraph_1614264766246_456910806", 257 | "dateCreated": "2021-02-25 22:52:46.246", 258 | "status": "READY" 259 | } 260 | ], 261 | "name": "02 Inserting Into Tables", 262 | "id": "2G1GAHZY7", 263 | "defaultInterpreterGroup": "spark", 264 | "version": "0.10.0-SNAPSHOT", 265 | "noteParams": {}, 266 | "noteForms": {}, 267 | "angularObjects": { 268 | "flink-shared_process": [ 269 | { 270 | "name": "duration", 271 | "object": "52 seconds", 272 | "noteId": "2G1GAHZY7", 273 | "paragraphId": "paragraph_1614264274230_70413416" 274 | } 275 | ] 276 | }, 277 | "config": { 278 | "isZeppelinNotebookCronEnable": false 279 | }, 280 | "info": {} 281 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/Foundations/03 Working with Temporary Tables_2G19ENM4Q.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n![Twitter Badge](https://img.shields.io/badge/Flink%20Version-1.11%2B-lightgrey)\n\n\u003e :bulb: This example will show how and why to create a temporary table using SQL DDL.\n\nNon-temporary tables in Flink SQL are stored in a catalog, while temporary tables only live within the current session. 
-------------------------------------------------------------------------------- /Flink Sql Cookbook/Foundations/03 Working with Temporary Tables_2G19ENM4Q.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n![Twitter Badge](https://img.shields.io/badge/Flink%20Version-1.11%2B-lightgrey)\n\n\u003e :bulb: This example will show how and why to create a temporary table using SQL DDL.\n\nNon-temporary tables in Flink SQL are stored in a catalog, while temporary tables only live within the current session. You can use a temporary table instead of a regular (catalog) table, if it is only meant to be used within the current session.\n\nThis example is exactly the same as Inserting Into Tables except that both `server_logs` and `client_errors` are created as temporary tables.\n\n## Why Temporary Tables?\nFor result tables like `client_errors` that no one can ever read from (because of its type `blackhole`) it makes a lot of sense to use a temporary table instead of publishing its metadata in a catalog.\n\nFurthermore, temporary tables allow you to create fully self-contained scripts, which is why we will mostly use those in the Flink SQL Cookbook.\n", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-10-08 16:34:36.979", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e\u003cimg src\u003d\"https://img.shields.io/badge/Flink%20Version-1.11%2B-lightgrey\" alt\u003d\"Twitter Badge\" /\u003e\u003c/p\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This example will show how and why to create a temporary table using SQL DDL.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003eNon-temporary tables in Flink SQL are stored in a catalog, while temporary tables only live within the current session. You can use a temporary table instead of a regular (catalog) table, if it is only meant to be used within the current session.\u003c/p\u003e\n\u003cp\u003eThis example is exactly the same as Inserting Into Tables except that both \u003ccode\u003eserver_logs\u003c/code\u003e and \u003ccode\u003eclient_errors\u003c/code\u003e are created as temporary tables.\u003c/p\u003e\n\u003ch2\u003eWhy Temporary Tables?\u003c/h2\u003e\n\u003cp\u003eFor result tables like \u003ccode\u003eclient_errors\u003c/code\u003e that no one can ever read from (because of its type \u003ccode\u003eblackhole\u003c/code\u003e) it makes a lot of sense to use a temporary table instead of publishing its metadata in a catalog.\u003c/p\u003e\n\u003cp\u003eFurthermore, temporary tables allow you to create fully self-contained scripts, which is why we will mostly use those in the Flink SQL Cookbook.\u003c/p\u003e\n\n\u003c/div\u003e" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1614267879471_225252437", 40 | "id": "paragraph_1614267879471_225252437", 41 | "dateCreated": "2021-02-25 23:44:39.472", 42 | "dateStarted": "2021-10-08 16:34:36.980", 43 | "dateFinished": "2021-10-08 16:34:36.984", 44 | "status": "FINISHED" 45 | }, 46 | { 47 | "text": "%md\n本例将展示怎么使用 SQL DDL 来创建临时表,以及为什么需要临时表。\n\nFlink SQL 中的非临时表存放于 catalog 中,而临时表的生命周期仅限于当前会话。如果你只需要在当前会话中使用某个表,这时使用临时表比常规catalog表更合适。\n\n本例除了将 `server_logs` 和 `client_errors` 创建为临时表以外,其他的都与上一个例子:表的插入(Inserting Into Tables)完全一样。\n\n## 为什么需要临时表\n对于像 `client_errors` 这样的结果表, 因为它的类型是 `blackhole` 所以我们不能从它那里读取结果,这时不将它的元数据保存到catalog 而是使用一个临时表更合适。\n\n另外,临时表允许你创建完全独立的脚本,这也是我们在 Flink SQL Cookbook 里面大量使用临时表的原因。\n", 48 | "user": "anonymous", 49 | "dateUpdated": "2021-03-18 15:08:48.254", 50 | "progress": 0, 51 | 
"config": { 52 | "editorSetting": { 53 | "language": "markdown", 54 | "editOnDblClick": true, 55 | "completionKey": "TAB", 56 | "completionSupport": false 57 | }, 58 | "colWidth": 12.0, 59 | "editorMode": "ace/mode/markdown", 60 | "fontSize": 9.0, 61 | "results": {}, 62 | "enabled": true, 63 | "editorHide": true, 64 | "tableHide": false 65 | }, 66 | "settings": { 67 | "params": {}, 68 | "forms": {} 69 | }, 70 | "results": { 71 | "code": "SUCCESS", 72 | "msg": [ 73 | { 74 | "type": "HTML", 75 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e本例将展示怎么使用 SQl DDL 来创建临时表,以及为什么需要临时表。\u003c/p\u003e\n\u003cp\u003eFlink SQL 中的非临时表存放与 catalog 中,而临时表的生命周期仅限于当前会话。如果你只需要在当前会话中使用某个表,这时使用临时表比常规catalog表更合适。\u003c/p\u003e\n\u003cp\u003e本例除了将 \u003ccode\u003eserver_logs\u003c/code\u003e 和 \u003ccode\u003eclient_errors\u003c/code\u003e 创建为临时表以外,其他的都与上一个例子:表的插入(Inserting Into Tables)完全一样\u003c/p\u003e\n\u003ch2\u003e为什么需要临时表\u003c/h2\u003e\n\u003cp\u003e对于像 \u003ccode\u003eclient_errors\u003c/code\u003e 这样的结果表, 因为它的类型是 \u003ccode\u003eblackhole\u003c/code\u003e 所以我们不能从它哪里读取结果,这时不将它的元数据保存到catalog 而是使用一个临时表更合适。\u003c/p\u003e\n\u003cp\u003e另外,临时表允许你创建完全独立的脚本,这也是我们在 Flink SQL Cookbook 里面大量使用临时表的原因。\u003c/p\u003e\n\n\u003c/div\u003e" 76 | } 77 | ] 78 | }, 79 | "apps": [], 80 | "runtimeInfos": {}, 81 | "progressUpdateIntervalMs": 500, 82 | "jobName": "paragraph_1615106975488_1767580088", 83 | "id": "paragraph_1615106975488_1767580088", 84 | "dateCreated": "2021-03-07 08:49:35.489", 85 | "dateStarted": "2021-03-18 15:08:48.254", 86 | "dateFinished": "2021-03-18 15:08:48.279", 87 | "status": "FINISHED" 88 | }, 89 | { 90 | "text": "%flink.ssql\n\nDROP TEMPORARY TABLE IF EXISTS server_logs_temp;\n\nCREATE TEMPORARY TABLE server_logs_temp ( \n client_ip STRING,\n client_identity STRING, \n userid STRING, \n user_agent STRING,\n log_time TIMESTAMP(3),\n request_line STRING, \n status_code STRING, \n size INT\n) WITH (\n \u0027connector\u0027 \u003d \u0027faker\u0027, \n \u0027fields.client_ip.expression\u0027 \u003d \u0027#{Internet.publicIpV4Address}\u0027,\n \u0027fields.client_identity.expression\u0027 \u003d \u0027-\u0027,\n \u0027fields.userid.expression\u0027 \u003d \u0027-\u0027,\n \u0027fields.user_agent.expression\u0027 \u003d \u0027#{Internet.userAgentAny}\u0027,\n \u0027fields.log_time.expression\u0027 \u003d \u0027#{date.past \u0027\u002715\u0027\u0027,\u0027\u00275\u0027\u0027,\u0027\u0027SECONDS\u0027\u0027}\u0027,\n \u0027fields.request_line.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(GET|POST|PUT|PATCH){1}\u0027\u0027} #{regexify \u0027\u0027(/search\\.html|/login\\.html|/prod\\.html|cart\\.html|/order\\.html){1}\u0027\u0027} #{regexify \u0027\u0027(HTTP/1\\.1|HTTP/2|/HTTP/1\\.0){1}\u0027\u0027}\u0027,\n \u0027fields.status_code.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(200|201|204|400|401|403|301){1}\u0027\u0027}\u0027,\n \u0027fields.size.expression\u0027 \u003d \u0027#{number.numberBetween \u0027\u0027100\u0027\u0027,\u0027\u002710000000\u0027\u0027}\u0027\n);", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-03-04 14:02:40.027", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "sql", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/sql", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | 
"progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614264882150_819187836", 115 | "id": "paragraph_1614264882150_819187836", 116 | "dateCreated": "2021-02-25 22:54:42.150", 117 | "dateStarted": "2021-03-04 14:02:40.087", 118 | "dateFinished": "2021-03-04 14:03:10.337", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql\n\nDROP TEMPORARY TABLE IF EXISTS client_errors_temp;\n\nCREATE TEMPORARY TABLE client_errors_temp (\n log_time TIMESTAMP(3),\n request_line STRING,\n status_code STRING,\n size INT\n)\nWITH (\n \u0027connector\u0027 \u003d \u0027blackhole\u0027\n);", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-02-26 11:50:18.337", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": {}, 137 | "enabled": true 138 | }, 139 | "settings": { 140 | "params": {}, 141 | "forms": {} 142 | }, 143 | "apps": [], 144 | "runtimeInfos": {}, 145 | "progressUpdateIntervalMs": 500, 146 | "jobName": "paragraph_1614264932930_876258121", 147 | "id": "paragraph_1614264932930_876258121", 148 | "dateCreated": "2021-02-25 22:55:32.930", 149 | "dateStarted": "2021-02-26 11:50:18.354", 150 | "dateFinished": "2021-02-26 11:50:19.213", 151 | "status": "FINISHED" 152 | }, 153 | { 154 | "text": "%flink.ssql\n\nINSERT INTO client_errors_temp\nSELECT \n log_time,\n request_line,\n status_code,\n size\nFROM server_logs_temp\nWHERE \n status_code SIMILAR TO \u00274[0-9][0-9]\u0027", 155 | "user": "anonymous", 156 | "dateUpdated": "2021-03-04 14:02:50.315", 157 | "progress": 0, 158 | "config": { 159 | "editorSetting": { 160 | "language": "sql", 161 | "editOnDblClick": false, 162 | "completionKey": "TAB", 163 | "completionSupport": true 164 | }, 165 | "colWidth": 12.0, 166 | "editorMode": "ace/mode/sql", 167 | "fontSize": 9.0, 168 | "results": {}, 169 | "enabled": true 170 | }, 171 | "settings": { 172 | "params": {}, 173 | "forms": {} 174 | }, 175 | "apps": [], 176 | "runtimeInfos": {}, 177 | "progressUpdateIntervalMs": 500, 178 | "jobName": "paragraph_1614264958394_1745571437", 179 | "id": "paragraph_1614264958394_1745571437", 180 | "dateCreated": "2021-02-25 22:55:58.394", 181 | "dateStarted": "2021-03-04 14:03:08.606", 182 | "dateFinished": "2021-03-04 14:03:10.333", 183 | "status": "ERROR" 184 | }, 185 | { 186 | "text": "%flink.ssql\n", 187 | "user": "anonymous", 188 | "dateUpdated": "2021-02-25 22:56:45.200", 189 | "progress": 0, 190 | "config": {}, 191 | "settings": { 192 | "params": {}, 193 | "forms": {} 194 | }, 195 | "apps": [], 196 | "runtimeInfos": {}, 197 | "progressUpdateIntervalMs": 500, 198 | "jobName": "paragraph_1614265005199_360947962", 199 | "id": "paragraph_1614265005199_360947962", 200 | "dateCreated": "2021-02-25 22:56:45.199", 201 | "status": "READY" 202 | } 203 | ], 204 | "name": "03 Working with Temporary Tables", 205 | "id": "2G19ENM4Q", 206 | "defaultInterpreterGroup": "flink", 207 | "version": "0.10.0-SNAPSHOT", 208 | "noteParams": {}, 209 | "noteForms": {}, 210 | "angularObjects": { 211 | "flink-shared_process": [ 212 | { 213 | "name": "duration", 214 | "object": "12 seconds", 215 | "noteId": "2G19ENM4Q", 216 | "paragraphId": "paragraph_1614264958394_1745571437" 217 | } 218 | ] 219 | }, 220 | "config": { 221 | "isZeppelinNotebookCronEnable": false 222 | }, 223 | "info": {} 224 | } 
-------------------------------------------------------------------------------- /Flink Sql Cookbook/Foundations/04 Filtering Data_2FY1Q15QK.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n\u003e :bulb: This example will show how to filter server logs in real-time using a standard `WHERE` clause.\n\nThe table it uses `server_logs` is backed by the [`faker` connector](https://github.com/knaufk/flink-faker) which continuously generates rows in memory based on Java Faker expressions and is convenient for testing queries. As such, it is an alternative to the built-in `datagen` connector used for example in the first recipe.\n\nYou can continuously filter these logs for those requests that experience authx issues with a simple `SELECT` statement with a `WHERE` clause filtering on the auth related HTTP status codes. \n", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-10-08 16:35:39.946", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This example will show how to filter server logs in real-time using a standard \u003ccode\u003eWHERE\u003c/code\u003e clause.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003eThe table it uses \u003ccode\u003eserver_logs\u003c/code\u003e is backed by the \u003ca href\u003d\"https://github.com/knaufk/flink-faker\"\u003e\u003ccode\u003efaker\u003c/code\u003e connector\u003c/a\u003e which continuously generates rows in memory based on Java Faker expressions and is convenient for testing queries. 
As such, it is an alternative to the built-in \u003ccode\u003edatagen\u003c/code\u003e connector used for example in the first recipe.\u003c/p\u003e\n\u003cp\u003eYou can continuously filter these logs for those requests that experience authx issues with a simple \u003ccode\u003eSELECT\u003c/code\u003e statement with a \u003ccode\u003eWHERE\u003c/code\u003e clause filtering on the auth related HTTP status codes.\u003c/p\u003e\n\n\u003c/div\u003e" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1614267953363_1697493143", 40 | "id": "paragraph_1614267953363_1697493143", 41 | "dateCreated": "2021-02-25 23:45:53.363", 42 | "dateStarted": "2021-10-08 16:35:12.770", 43 | "dateFinished": "2021-10-08 16:35:12.775", 44 | "status": "FINISHED" 45 | }, 46 | { 47 | "text": "%md\n本例将展示怎样使用 `WHERE` 语句实时过滤服务器日志。\n\n例子中使用的表`server_logs` 的数据是利用 [`faker` connector](https://github.com/knaufk/flink-faker) 产生的,它基于 Java Faker 表达式不断的在内存中生成数据,以方便查询测试。因此,它是在第一个例子中使用的内置连接器 `datagen` 的一个替代品。\n\n你可以不断的过滤出含有认证问题的请求日志,通过简单的 SELECT 语句 以及 WHERE 条件语句筛选认证相关的HTTP 状态码。\n", 48 | "user": "anonymous", 49 | "dateUpdated": "2021-03-18 15:26:06.029", 50 | "progress": 0, 51 | "config": { 52 | "editorSetting": { 53 | "language": "markdown", 54 | "editOnDblClick": true, 55 | "completionKey": "TAB", 56 | "completionSupport": false 57 | }, 58 | "colWidth": 12.0, 59 | "editorMode": "ace/mode/markdown", 60 | "fontSize": 9.0, 61 | "results": {}, 62 | "enabled": true, 63 | "editorHide": true, 64 | "tableHide": false 65 | }, 66 | "settings": { 67 | "params": {}, 68 | "forms": {} 69 | }, 70 | "results": { 71 | "code": "SUCCESS", 72 | "msg": [ 73 | { 74 | "type": "HTML", 75 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e本例将展示怎样使用 \u003ccode\u003eWHERE\u003c/code\u003e 语句实时过滤服务器日志。\u003c/p\u003e\n\u003cp\u003e例子中使用的表\u003ccode\u003eserver_logs\u003c/code\u003e 的数据是利用 \u003ca href\u003d\"https://github.com/knaufk/flink-faker\"\u003e\u003ccode\u003efaker\u003c/code\u003e connector\u003c/a\u003e 产生的,它基于 Java Faker 表达式不断的在内存中生成数据,以方便查询测试。因此,它是在第一个例子中使用的内置连接器 \u003ccode\u003edatagen\u003c/code\u003e 的一个替代品。\u003c/p\u003e\n\u003cp\u003e你可以不断的过滤出含有认证问题的请求日志,通过简单的 SELECT 语句 以及 WHERE 条件语句筛选认证相关的HTTP 状态码。\u003c/p\u003e\n\n\u003c/div\u003e" 76 | } 77 | ] 78 | }, 79 | "apps": [], 80 | "runtimeInfos": {}, 81 | "progressUpdateIntervalMs": 500, 82 | "jobName": "paragraph_1615106992290_704524151", 83 | "id": "paragraph_1615106992290_704524151", 84 | "dateCreated": "2021-03-07 08:49:52.290", 85 | "dateStarted": "2021-03-18 15:26:06.029", 86 | "dateFinished": "2021-03-18 15:26:06.054", 87 | "status": "FINISHED" 88 | }, 89 | { 90 | "text": "%flink.ssql\n\nDROP TABLE IF EXISTS server_logs;\n\nCREATE TABLE server_logs ( \n client_ip STRING,\n client_identity STRING, \n userid STRING, \n log_time TIMESTAMP(3),\n request_line STRING, \n status_code STRING, \n size INT\n) WITH (\n \u0027connector\u0027 \u003d \u0027faker\u0027, \n \u0027fields.client_ip.expression\u0027 \u003d \u0027#{Internet.publicIpV4Address}\u0027,\n \u0027fields.client_identity.expression\u0027 \u003d \u0027-\u0027,\n \u0027fields.userid.expression\u0027 \u003d \u0027-\u0027,\n \u0027fields.log_time.expression\u0027 \u003d \u0027#{date.past \u0027\u002715\u0027\u0027,\u0027\u00275\u0027\u0027,\u0027\u0027SECONDS\u0027\u0027}\u0027,\n \u0027fields.request_line.expression\u0027 \u003d \u0027#{regexify 
\u0027\u0027(/search\\.html|/login\\.html|/prod\\.html|cart\\.html|/order\\.html){1}\u0027\u0027} #{regexify \u0027\u0027(HTTP/1\\.1|HTTP/2|/HTTP/1\\.0){1}\u0027\u0027}\u0027,\n \u0027fields.status_code.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(200|201|204|400|401|403|301){1}\u0027\u0027}\u0027,\n \u0027fields.size.expression\u0027 \u003d \u0027#{number.numberBetween \u0027\u0027100\u0027\u0027,\u0027\u002710000000\u0027\u0027}\u0027\n);\n\n\n ", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-03-04 14:25:10.186", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "sql", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/sql", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | "progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614265116139_298210191", 115 | "id": "paragraph_1614265116139_298210191", 116 | "dateCreated": "2021-02-25 22:58:36.139", 117 | "dateStarted": "2021-03-04 14:25:10.221", 118 | "dateFinished": "2021-03-04 14:25:10.682", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql(type\u003dupdate)\n\nSELECT \n log_time, \n request_line,\n status_code \nFROM server_logs\nWHERE\n status_code IN (\u0027403\u0027, \u0027401\u0027) \nORDER BY log_time DESC LIMIT 10\n", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-03-04 14:27:20.396", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": { 137 | "0": { 138 | "graph": { 139 | "mode": "table", 140 | "height": 300.0, 141 | "optionOpen": false, 142 | "setting": { 143 | "table": { 144 | "tableGridState": {}, 145 | "tableColumnTypeState": { 146 | "names": { 147 | "log_time": "string", 148 | "request_line": "string", 149 | "status_code": "string" 150 | }, 151 | "updated": false 152 | }, 153 | "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 154 | "tableOptionValue": { 155 | "useFilter": false, 156 | "showPagination": false, 157 | "showAggregationFooter": false 158 | }, 159 | "updated": false, 160 | "initialized": false 161 | } 162 | }, 163 | "commonSetting": {} 164 | }, 165 | "helium": {} 166 | } 167 | }, 168 | "enabled": true 169 | }, 170 | "settings": { 171 | "params": {}, 172 | "forms": {} 173 | }, 174 | "apps": [], 175 | "runtimeInfos": { 176 | "jobUrl": { 177 | "propertyName": "jobUrl", 178 | "label": "FLINK JOB", 179 | "tooltip": "View in Flink web UI", 180 | "group": "flink", 181 | "values": [ 182 | { 183 | "jobUrl": "http://localhost:8081#/job/cf2cec11b7c9772131c8127e26c4b10a" 184 | } 185 | ], 186 | "interpreterSettingId": "flink" 187 | } 188 | }, 189 | "progressUpdateIntervalMs": 500, 190 | "jobName": 
"paragraph_1614265141086_666927857", 191 | "id": "paragraph_1614265141086_666927857", 192 | "dateCreated": "2021-02-25 22:59:01.086", 193 | "dateStarted": "2021-03-04 14:26:16.464", 194 | "dateFinished": "2021-03-04 14:27:37.612", 195 | "status": "ABORT" 196 | }, 197 | { 198 | "text": "%flink.ssql\n", 199 | "user": "anonymous", 200 | "dateUpdated": "2021-02-25 22:59:26.795", 201 | "progress": 0, 202 | "config": {}, 203 | "settings": { 204 | "params": {}, 205 | "forms": {} 206 | }, 207 | "apps": [], 208 | "runtimeInfos": {}, 209 | "progressUpdateIntervalMs": 500, 210 | "jobName": "paragraph_1614265166795_238623732", 211 | "id": "paragraph_1614265166795_238623732", 212 | "dateCreated": "2021-02-25 22:59:26.795", 213 | "status": "READY" 214 | } 215 | ], 216 | "name": "04 Filtering Data", 217 | "id": "2FY1Q15QK", 218 | "defaultInterpreterGroup": "flink", 219 | "version": "0.10.0-SNAPSHOT", 220 | "noteParams": {}, 221 | "noteForms": {}, 222 | "angularObjects": {}, 223 | "config": { 224 | "isZeppelinNotebookCronEnable": false 225 | }, 226 | "info": {} 227 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/Foundations/05 Aggregating Data_2FYNFVHG9.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n\u003e :bulb: This example will show how to aggregate server logs in real-time using the standard `GROUP BY` clause.\n\nThe source table (`server_logs`) is backed by the [`faker` connector](https://github.com/knaufk/flink-faker), which continuously generates rows in memory based on Java Faker expressions.\n\nTo count the number of logs received per browser for each status code _over time_, you can combine the `COUNT` aggregate function with a `GROUP BY` clause. Because the `user_agent` field contains a lot of information, you can extract the browser using the built-in [string function](https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/functions/systemFunctions.html#string-functions) `REGEXP_EXTRACT`.\n\nA `GROUP BY` on a streaming table produces an updating result, so you will see the aggregated count for each browser continuously changing as new rows flow in.\n\n\u003e As an exercise, you can play around with other standard SQL aggregate functions (e.g. 
`SUM`,`AVG`,`MIN`,`MAX`).\n", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-10-08 16:35:48.826", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This example will show how to aggregate server logs in real-time using the standard \u003ccode\u003eGROUP BY\u003c/code\u003e clause.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003eThe source table (\u003ccode\u003eserver_logs\u003c/code\u003e) is backed by the \u003ca href\u003d\"https://github.com/knaufk/flink-faker\"\u003e\u003ccode\u003efaker\u003c/code\u003e connector\u003c/a\u003e, which continuously generates rows in memory based on Java Faker expressions.\u003c/p\u003e\n\u003cp\u003eTo count the number of logs received per browser for each status code \u003cem\u003eover time\u003c/em\u003e, you can combine the \u003ccode\u003eCOUNT\u003c/code\u003e aggregate function with a \u003ccode\u003eGROUP BY\u003c/code\u003e clause. Because the \u003ccode\u003euser_agent\u003c/code\u003e field contains a lot of information, you can extract the browser using the built-in \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/functions/systemFunctions.html#string-functions\"\u003estring function\u003c/a\u003e \u003ccode\u003eREGEXP_EXTRACT\u003c/code\u003e.\u003c/p\u003e\n\u003cp\u003eA \u003ccode\u003eGROUP BY\u003c/code\u003e on a streaming table produces an updating result, so you will see the aggregated count for each browser continuously changing as new rows flow in.\u003c/p\u003e\n\u003cblockquote\u003e\n\u003cp\u003eAs an exercise, you can play around with other standard SQL aggregate functions (e.g. 
\u003ccode\u003eSUM\u003c/code\u003e,\u003ccode\u003eAVG\u003c/code\u003e,\u003ccode\u003eMIN\u003c/code\u003e,\u003ccode\u003eMAX\u003c/code\u003e).\u003c/p\u003e\n\u003c/blockquote\u003e\n\n\u003c/div\u003e" 33 |           } 34 |         ] 35 |       }, 36 |       "apps": [], 37 |       "runtimeInfos": {}, 38 |       "progressUpdateIntervalMs": 500, 39 |       "jobName": "paragraph_1614268004818_923961985", 40 |       "id": "paragraph_1614268004818_923961985", 41 |       "dateCreated": "2021-02-25 23:46:44.818", 42 |       "dateStarted": "2021-10-08 16:35:48.826", 43 |       "dateFinished": "2021-10-08 16:35:48.834", 44 |       "status": "FINISHED" 45 |     }, 46 |     { 47 |       "text": "%md\n本例将展示怎样使用标准的 `GROUP BY` 语句来实时聚合服务器日志。\n\n例子中使用的 source 表`server_logs` 的数据是利用 [`faker` connector](https://github.com/knaufk/flink-faker) 产生的,它基于 Java Faker 表达式不断的在内存中生成数据。\n\n为了计算每种状态码中每种浏览器发送的日志的数量,我们可以结合使用 `COUNT` 聚合函数与 `GROUP BY` 子句。由于 `user_agent` 字段包含很多信息,你可以使用内置[字符串函数](https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/functions/systemFunctions.html#string-functions) `REGEXP_EXTRACT` 来提取浏览器。\n\n对流表进行 `GROUP BY` 将产生不断更新的结果,所以你将看到随着数据的流入每种浏览器的聚合结果不断的改变。\n\n\u003e 作为练习,你可以试试其他的标准 SQL 聚合函数(例如: `SUM`,`AVG`,`MIN`,`MAX`)。\n", 48 |       "user": "anonymous", 49 |       "dateUpdated": "2021-03-18 15:26:45.797", 50 |       "progress": 0, 51 |       "config": { 52 |         "editorSetting": { 53 |           "language": "markdown", 54 |           "editOnDblClick": true, 55 |           "completionKey": "TAB", 56 |           "completionSupport": false 57 |         }, 58 |         "colWidth": 12.0, 59 |         "editorMode": "ace/mode/markdown", 60 |         "fontSize": 9.0, 61 |         "results": {}, 62 |         "enabled": true, 63 |         "editorHide": true, 64 |         "tableHide": false 65 |       }, 66 |       "settings": { 67 |         "params": {}, 68 |         "forms": {} 69 |       }, 70 |       "results": { 71 |         "code": "SUCCESS", 72 |         "msg": [ 73 |           { 74 |             "type": "HTML", 75 |             "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e本例将展示怎样使用标准的 \u003ccode\u003eGROUP BY\u003c/code\u003e 语句来实时聚合服务器日志。\u003c/p\u003e\n\u003cp\u003e例子中使用的 source 表\u003ccode\u003eserver_logs\u003c/code\u003e 的数据是利用 \u003ca href\u003d\"https://github.com/knaufk/flink-faker\"\u003e\u003ccode\u003efaker\u003c/code\u003e connector\u003c/a\u003e 产生的,它基于 Java Faker 表达式不断的在内存中生成数据。\u003c/p\u003e\n\u003cp\u003e为了计算每种状态码中每种浏览器发送的日志的数量,我们可以结合使用 \u003ccode\u003eCOUNT\u003c/code\u003e 聚合函数与 \u003ccode\u003eGROUP BY\u003c/code\u003e 子句。由于 \u003ccode\u003euser_agent\u003c/code\u003e 字段包含很多信息,你可以使用内置\u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/table/functions/systemFunctions.html#string-functions\"\u003e字符串函数\u003c/a\u003e \u003ccode\u003eREGEXP_EXTRACT\u003c/code\u003e 来提取浏览器。\u003c/p\u003e\n\u003cp\u003e对流表进行 \u003ccode\u003eGROUP BY\u003c/code\u003e 将产生不断更新的结果,所以你将看到随着数据的流入每种浏览器的聚合结果不断的改变。\u003c/p\u003e\n\u003cblockquote\u003e\n\u003cp\u003e作为练习,你可以试试其他的标准 SQL 聚合函数(例如: \u003ccode\u003eSUM\u003c/code\u003e,\u003ccode\u003eAVG\u003c/code\u003e,\u003ccode\u003eMIN\u003c/code\u003e,\u003ccode\u003eMAX\u003c/code\u003e)。\u003c/p\u003e\n\u003c/blockquote\u003e\n\n\u003c/div\u003e" 76 |           } 77 |         ] 78 |       }, 79 |       "apps": [], 80 |       "runtimeInfos": {}, 81 |       "progressUpdateIntervalMs": 500, 82 |       "jobName": "paragraph_1615107092001_168094460", 83 |       "id": "paragraph_1615107092001_168094460", 84 |       "dateCreated": "2021-03-07 08:51:32.002", 85 |       "dateStarted": "2021-03-18 15:26:45.796", 86 |       "dateFinished": "2021-03-18 15:26:45.813", 87 |       "status": "FINISHED" 88 |     }, 89 |     { 90 |       "text": "%flink.ssql\n\nDROP TABLE IF EXISTS server_logs;\n\nCREATE TABLE server_logs ( \n  client_ip STRING,\n  client_identity STRING, \n  userid STRING, \n  user_agent 
STRING,\n log_time TIMESTAMP(3),\n request_line STRING, \n status_code STRING, \n size INT\n) WITH (\n \u0027connector\u0027 \u003d \u0027faker\u0027, \n \u0027fields.client_ip.expression\u0027 \u003d \u0027#{Internet.publicIpV4Address}\u0027,\n \u0027fields.client_identity.expression\u0027 \u003d \u0027-\u0027,\n \u0027fields.userid.expression\u0027 \u003d \u0027-\u0027,\n \u0027fields.user_agent.expression\u0027 \u003d \u0027#{Internet.userAgentAny}\u0027,\n \u0027fields.log_time.expression\u0027 \u003d \u0027#{date.past \u0027\u002715\u0027\u0027,\u0027\u00275\u0027\u0027,\u0027\u0027SECONDS\u0027\u0027}\u0027,\n \u0027fields.request_line.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(GET|POST|PUT|PATCH){1}\u0027\u0027} #{regexify \u0027\u0027(/search\\.html|/login\\.html|/prod\\.html|cart\\.html|/order\\.html){1}\u0027\u0027} #{regexify \u0027\u0027(HTTP/1\\.1|HTTP/2|/HTTP/1\\.0){1}\u0027\u0027}\u0027,\n \u0027fields.status_code.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(200|201|204|400|401|403|301){1}\u0027\u0027}\u0027,\n \u0027fields.size.expression\u0027 \u003d \u0027#{number.numberBetween \u0027\u0027100\u0027\u0027,\u0027\u002710000000\u0027\u0027}\u0027\n);\n", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-08-25 05:30:00.641", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "sql", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/sql", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "results": { 112 | "code": "SUCCESS", 113 | "msg": [ 114 | { 115 | "type": "TEXT", 116 | "data": "Table has been dropped.\nTable has been created.\n" 117 | } 118 | ] 119 | }, 120 | "apps": [], 121 | "runtimeInfos": {}, 122 | "progressUpdateIntervalMs": 500, 123 | "jobName": "paragraph_1614265474939_121202527", 124 | "id": "paragraph_1614265474939_121202527", 125 | "dateCreated": "2021-02-25 23:04:34.939", 126 | "dateStarted": "2021-08-25 05:30:00.645", 127 | "dateFinished": "2021-08-25 05:30:00.859", 128 | "status": "FINISHED" 129 | }, 130 | { 131 | "text": "%flink.ssql(type\u003dupdate)\n\n-- Sample user_agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A\n-- Regex pattern: \u0027[^\\/]+\u0027 (Match everything before \u0027/\u0027)\nSELECT \n REGEXP_EXTRACT(user_agent,\u0027[^\\/]+\u0027) AS browser,\n status_code, \n COUNT(*) AS cnt_status\nFROM server_logs\nGROUP BY \n REGEXP_EXTRACT(user_agent,\u0027[^\\/]+\u0027),\n status_code;\n \n ", 132 | "user": "anonymous", 133 | "dateUpdated": "2021-08-25 05:30:02.963", 134 | "progress": 0, 135 | "config": { 136 | "editorSetting": { 137 | "language": "sql", 138 | "editOnDblClick": false, 139 | "completionKey": "TAB", 140 | "completionSupport": true 141 | }, 142 | "colWidth": 12.0, 143 | "editorMode": "ace/mode/sql", 144 | "fontSize": 9.0, 145 | "results": { 146 | "0": { 147 | "graph": { 148 | "mode": "table", 149 | "height": 300.0, 150 | "optionOpen": false, 151 | "setting": { 152 | "multiBarChart": { 153 | "xLabelStatus": "default", 154 | "rotate": { 155 | "degree": "-45" 156 | } 157 | }, 158 | "table": { 159 | "tableGridState": {}, 160 | "tableColumnTypeState": { 161 | "names": { 162 | "browser": "string", 163 | "status_code": "string", 164 | "cnt_status": "string" 165 | }, 166 | "updated": false 167 | }, 168 | 
"tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 169 | "tableOptionValue": { 170 | "useFilter": false, 171 | "showPagination": false, 172 | "showAggregationFooter": false 173 | }, 174 | "updated": false, 175 | "initialized": false 176 | } 177 | }, 178 | "commonSetting": {}, 179 | "keys": [ 180 | { 181 | "name": "browser", 182 | "index": 0.0, 183 | "aggr": "sum" 184 | } 185 | ], 186 | "groups": [ 187 | { 188 | "name": "status_code", 189 | "index": 1.0, 190 | "aggr": "sum" 191 | } 192 | ], 193 | "values": [ 194 | { 195 | "name": "cnt_status", 196 | "index": 2.0, 197 | "aggr": "sum" 198 | } 199 | ] 200 | }, 201 | "helium": {} 202 | } 203 | }, 204 | "enabled": true 205 | }, 206 | "settings": { 207 | "params": {}, 208 | "forms": {} 209 | }, 210 | "apps": [], 211 | "runtimeInfos": { 212 | "jobUrl": { 213 | "propertyName": "jobUrl", 214 | "label": "FLINK JOB", 215 | "tooltip": "View in Flink web UI", 216 | "group": "flink", 217 | "values": [ 218 | { 219 | "jobUrl": "http://localhost:8081#/job/7e103a837f0d410dbf676304a901a7e6" 220 | } 221 | ], 222 | "interpreterSettingId": "flink" 223 | } 224 | }, 225 | "progressUpdateIntervalMs": 500, 226 | "jobName": "paragraph_1614265500959_774321427", 227 | "id": "paragraph_1614265500959_774321427", 228 | "dateCreated": "2021-02-25 23:05:00.959", 229 | "dateStarted": "2021-08-25 05:30:02.966", 230 | "dateFinished": "2021-03-04 14:43:27.553", 231 | "status": "ABORT" 232 | }, 233 | { 234 | "text": "%flink.ssql\n", 235 | "user": "anonymous", 236 | "dateUpdated": "2021-02-25 23:05:23.045", 237 | "progress": 0, 238 | "config": {}, 239 | "settings": { 240 | "params": {}, 241 | "forms": {} 242 | }, 243 | "apps": [], 244 | "runtimeInfos": {}, 245 | "progressUpdateIntervalMs": 500, 246 | "jobName": "paragraph_1614265523044_544403182", 247 | "id": "paragraph_1614265523044_544403182", 248 | "dateCreated": "2021-02-25 23:05:23.045", 249 | "status": "READY" 250 | } 251 | ], 252 | "name": "05 Aggregating Data", 253 | "id": "2FYNFVHG9", 254 | "defaultInterpreterGroup": "flink", 255 | "version": "0.10.0-SNAPSHOT", 256 | "noteParams": {}, 257 | "noteForms": {}, 258 | "angularObjects": {}, 259 | "config": { 260 | "isZeppelinNotebookCronEnable": false 261 | }, 262 | "info": {} 263 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/Foundations/07 Encapsulating Logic with (Temporary) Views_2FYJ4TZC6.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n![Twitter Badge](https://img.shields.io/badge/Flink%20Version-1.11%2B-lightgrey)\n\n\u003e :bulb: This example will show how you can use (temporary) views to reuse code and to structure long queries and scripts. \n\n`CREATE (TEMPORARY) VIEW` defines a view from a query. 
**A view is not physically materialized.** Instead, the query is run every time the view is referenced in a query.\n\nTemporary views are very useful to structure and decompose more complicated queries and to re-use queries within a longer script. Non-temporary views - stored in a persistent [catalog](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/catalogs.html) - can also be used to share common queries within your organization, e.g. common filters or pre-processing steps. \n\nHere, we create a view on the `server_logs` that only contains successful requests. This view encapsulates the logic of filtering the logs based on certain `status_code`s. This logic can subsequently be used by any query or script that has access to the catalog. \n", 5 |       "user": "anonymous", 6 |       "dateUpdated": "2021-10-08 16:36:53.390", 7 |       "progress": 0, 8 |       "config": { 9 |         "tableHide": false, 10 |         "editorSetting": { 11 |           "language": "markdown", 12 |           "editOnDblClick": true, 13 |           "completionKey": "TAB", 14 |           "completionSupport": false 15 |         }, 16 |         "colWidth": 12.0, 17 |         "editorMode": "ace/mode/markdown", 18 |         "fontSize": 9.0, 19 |         "editorHide": true, 20 |         "results": {}, 21 |         "enabled": true 22 |       }, 23 |       "settings": { 24 |         "params": {}, 25 |         "forms": {} 26 |       }, 27 |       "results": { 28 |         "code": "SUCCESS", 29 |         "msg": [ 30 |           { 31 |             "type": "HTML", 32 |             "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e\u003cimg src\u003d\"https://img.shields.io/badge/Flink%20Version-1.11%2B-lightgrey\" alt\u003d\"Twitter Badge\" /\u003e\u003c/p\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This example will show how you can use (temporary) views to reuse code and to structure long queries and scripts.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003e\u003ccode\u003eCREATE (TEMPORARY) VIEW\u003c/code\u003e defines a view from a query. \u003cstrong\u003eA view is not physically materialized.\u003c/strong\u003e Instead, the query is run every time the view is referenced in a query.\u003c/p\u003e\n\u003cp\u003eTemporary views are very useful to structure and decompose more complicated queries and to re-use queries within a longer script. Non-temporary views - stored in a persistent \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/catalogs.html\"\u003ecatalog\u003c/a\u003e - can also be used to share common queries within your organization, e.g. common filters or pre-processing steps.\u003c/p\u003e\n\u003cp\u003eHere, we create a view on the \u003ccode\u003eserver_logs\u003c/code\u003e that only contains successful requests. This view encapsulates the logic of filtering the logs based on certain \u003ccode\u003estatus_code\u003c/code\u003es. 
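-- [Aside: a hypothetical follow-on query, shown only to illustrate the reuse
-- point made above. It assumes the successful_requests view created further
-- down in this notebook and borrows the browser-extraction pattern from the
-- aggregation recipe.]
SELECT
  REGEXP_EXTRACT(user_agent, '[^\/]+') AS browser,
  COUNT(*) AS ok_requests
FROM successful_requests
GROUP BY REGEXP_EXTRACT(user_agent, '[^\/]+');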
This logic can subsequently be used by any query or script that has access to the catalog.\u003c/p\u003e\n\n\u003c/div\u003e" 33 |           } 34 |         ] 35 |       }, 36 |       "apps": [], 37 |       "runtimeInfos": {}, 38 |       "progressUpdateIntervalMs": 500, 39 |       "jobName": "paragraph_1614268082780_1390435132", 40 |       "id": "paragraph_1614268082780_1390435132", 41 |       "dateCreated": "2021-02-25 23:48:02.780", 42 |       "dateStarted": "2021-10-08 16:36:53.390", 43 |       "dateFinished": "2021-10-08 16:36:53.396", 44 |       "status": "FINISHED" 45 |     }, 46 |     { 47 |       "text": "%md\n本例将展示怎么使用(临时)视图来复用代码以及构造长查询和脚本。\n\n`CREATE (TEMPORARY) VIEW` 根据查询来定义视图。**视图不是物理上的实体。** 相反,每当某个查询引用视图时这个查询都会执行。\n\n临时视图对构造和解构更复杂的查询以及在长脚本中复用查询很有用。非临时视图持久化在[catalog](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/catalogs.html) 中, 可以被用来在组织内共享公共的查询,例如公共的过滤或者预处理步骤。\n\n这里,我们在 `server_logs` 上创建一个只包含成功请求的视图。这个视图封装了基于特定 `status_code` 筛选日志的逻辑。这个逻辑可用于后续的任何拥有查询这个 catalog 权限的查询和脚本。\n", 48 |       "user": "anonymous", 49 |       "dateUpdated": "2021-03-18 15:28:03.231", 50 |       "progress": 0, 51 |       "config": { 52 |         "editorSetting": { 53 |           "language": "markdown", 54 |           "editOnDblClick": true, 55 |           "completionKey": "TAB", 56 |           "completionSupport": false 57 |         }, 58 |         "colWidth": 12.0, 59 |         "editorMode": "ace/mode/markdown", 60 |         "fontSize": 9.0, 61 |         "results": {}, 62 |         "enabled": true, 63 |         "editorHide": true, 64 |         "tableHide": false 65 |       }, 66 |       "settings": { 67 |         "params": {}, 68 |         "forms": {} 69 |       }, 70 |       "results": { 71 |         "code": "SUCCESS", 72 |         "msg": [ 73 |           { 74 |             "type": "HTML", 75 |             "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e本例将展示怎么使用(临时)视图来复用代码以及构造长查询和脚本。\u003c/p\u003e\n\u003cp\u003e\u003ccode\u003eCREATE (TEMPORARY) VIEW\u003c/code\u003e 根据查询来定义视图。\u003cstrong\u003e视图不是物理上的实体。\u003c/strong\u003e 相反,每当某个查询引用视图时这个查询都会执行。\u003c/p\u003e\n\u003cp\u003e临时视图对构造和解构更复杂的查询以及在长脚本中复用查询很有用。非临时视图持久化在\u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/catalogs.html\"\u003ecatalog\u003c/a\u003e 中, 可以被用来在组织内共享公共的查询,例如公共的过滤或者预处理步骤。\u003c/p\u003e\n\u003cp\u003e这里,我们在 \u003ccode\u003eserver_logs\u003c/code\u003e 上创建一个只包含成功请求的视图。这个视图封装了基于特定 \u003ccode\u003estatus_code\u003c/code\u003e 筛选日志的逻辑。这个逻辑可用于后续的任何拥有查询这个 catalog 权限的查询和脚本。\u003c/p\u003e\n\n\u003c/div\u003e" 76 |           } 77 |         ] 78 |       }, 79 |       "apps": [], 80 |       "runtimeInfos": {}, 81 |       "progressUpdateIntervalMs": 500, 82 |       "jobName": "paragraph_1615107136845_1987078896", 83 |       "id": "paragraph_1615107136845_1987078896", 84 |       "dateCreated": "2021-03-07 08:52:16.847", 85 |       "dateStarted": "2021-03-18 15:28:03.232", 86 |       "dateFinished": "2021-03-18 15:28:03.248", 87 |       "status": "FINISHED" 88 |     }, 89 |     { 90 |       "text": "%flink.ssql\n\n\nDROP TABLE IF EXISTS server_logs;\n\nCREATE TABLE server_logs ( \n  client_ip STRING,\n  client_identity STRING, \n  userid STRING, \n  user_agent STRING,\n  log_time TIMESTAMP(3),\n  request_line STRING, \n  status_code STRING, \n  size INT\n) WITH (\n  \u0027connector\u0027 \u003d \u0027faker\u0027, \n  \u0027fields.client_ip.expression\u0027 \u003d \u0027#{Internet.publicIpV4Address}\u0027,\n  \u0027fields.client_identity.expression\u0027 \u003d \u0027-\u0027,\n  \u0027fields.userid.expression\u0027 \u003d \u0027-\u0027,\n  \u0027fields.user_agent.expression\u0027 \u003d \u0027#{Internet.userAgentAny}\u0027,\n  \u0027fields.log_time.expression\u0027 \u003d \u0027#{date.past \u0027\u002715\u0027\u0027,\u0027\u00275\u0027\u0027,\u0027\u0027SECONDS\u0027\u0027}\u0027,\n  \u0027fields.request_line.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(GET|POST|PUT|PATCH){1}\u0027\u0027} #{regexify 
\u0027\u0027(/search\\.html|/login\\.html|/prod\\.html|cart\\.html|/order\\.html){1}\u0027\u0027} #{regexify \u0027\u0027(HTTP/1\\.1|HTTP/2|/HTTP/1\\.0){1}\u0027\u0027}\u0027,\n \u0027fields.status_code.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(200|201|204|400|401|403|301){1}\u0027\u0027}\u0027,\n \u0027fields.size.expression\u0027 \u003d \u0027#{number.numberBetween \u0027\u0027100\u0027\u0027,\u0027\u002710000000\u0027\u0027}\u0027\n);\n\nCREATE VIEW successful_requests AS \nSELECT * \nFROM server_logs\nWHERE status_code SIMILAR TO \u0027[2,3][0-9][0-9]\u0027", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-02-26 11:52:38.766", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "sql", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/sql", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | "progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614265884772_820912582", 115 | "id": "paragraph_1614265884772_820912582", 116 | "dateCreated": "2021-02-25 23:11:24.772", 117 | "dateStarted": "2021-02-26 11:52:38.773", 118 | "dateFinished": "2021-02-26 11:53:00.513", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql(type\u003dupdate)\n\nSELECT * FROM successful_requests \nORDER BY log_time DESC\nLIMIT 10;\n\n", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-02-26 11:53:44.163", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": { 137 | "0": { 138 | "graph": { 139 | "mode": "table", 140 | "height": 300.0, 141 | "optionOpen": false, 142 | "setting": { 143 | "table": { 144 | "tableGridState": {}, 145 | "tableColumnTypeState": { 146 | "names": { 147 | "client_ip": "string", 148 | "client_identity": "string", 149 | "userid": "string", 150 | "user_agent": "string", 151 | "log_time": "string", 152 | "request_line": "string", 153 | "status_code": "string", 154 | "size": "string" 155 | }, 156 | "updated": false 157 | }, 158 | "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 159 | "tableOptionValue": { 160 | "useFilter": false, 161 | "showPagination": false, 162 | "showAggregationFooter": false 163 | }, 164 | "updated": false, 165 | "initialized": false 166 | } 167 | }, 168 | "commonSetting": {} 169 | } 170 | } 171 | }, 172 | "enabled": true 173 | }, 174 | "settings": { 175 | "params": {}, 176 | "forms": {} 177 | }, 178 | "apps": [], 179 | "runtimeInfos": { 180 | "jobUrl": { 181 | "propertyName": "jobUrl", 182 | "label": "FLINK JOB", 183 | "tooltip": "View in Flink web UI", 184 | "group": "flink", 185 | "values": [ 186 | { 187 | "jobUrl": 
"http://localhost:8081#/job/4ff64342a29886547f5c4a3c1e8ef830" 188 | } 189 | ], 190 | "interpreterSettingId": "flink" 191 | } 192 | }, 193 | "progressUpdateIntervalMs": 500, 194 | "jobName": "paragraph_1614265919071_1139674991", 195 | "id": "paragraph_1614265919071_1139674991", 196 | "dateCreated": "2021-02-25 23:11:59.071", 197 | "dateStarted": "2021-02-26 11:53:44.170", 198 | "dateFinished": "2021-02-26 11:54:13.932", 199 | "status": "ABORT" 200 | }, 201 | { 202 | "text": "%flink.ssql\n", 203 | "user": "anonymous", 204 | "dateUpdated": "2021-02-25 23:12:27.804", 205 | "progress": 0, 206 | "config": {}, 207 | "settings": { 208 | "params": {}, 209 | "forms": {} 210 | }, 211 | "apps": [], 212 | "runtimeInfos": {}, 213 | "progressUpdateIntervalMs": 500, 214 | "jobName": "paragraph_1614265947803_931245766", 215 | "id": "paragraph_1614265947803_931245766", 216 | "dateCreated": "2021-02-25 23:12:27.804", 217 | "status": "READY" 218 | } 219 | ], 220 | "name": "07 Encapsulating Logic with (Temporary) Views", 221 | "id": "2FYJ4TZC6", 222 | "defaultInterpreterGroup": "flink", 223 | "version": "0.10.0-SNAPSHOT", 224 | "noteParams": {}, 225 | "noteForms": {}, 226 | "angularObjects": {}, 227 | "config": { 228 | "isZeppelinNotebookCronEnable": false 229 | }, 230 | "info": {} 231 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/Foundations/08 Writing Results into Multiple Tables_2G1MEGYE2.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n![Twitter Badge](https://img.shields.io/badge/Flink%20Version-1.13%2B-lightgrey)\n\n\n\u003e :bulb: In this recipe, you will learn how to use `runAsOne` which is only supported by Zeppelin to run multiple `INSERT INTO` statements in a single, optimized Flink Job.\n\nMany product requirements involve outputting the results of a streaming application to two or more sinks, such as Apache Kafka for real-time use cases, or a Filesystem for offline ones. Other times, two queries are not the same but share some extensive intermediate operations.\n\nWhen working with server logs, the support team would like to see the number of status codes per browser every 5 minutes to have real-time insights into a web pages\u0027 status. Additionally, they would like the same information on an hourly basis made available as partitioned Apache Parquet files so they can perform historical analysis.\n\nWe could quickly write two Flink SQL queries to solve both these requirements, but that would not be efficient. 
These queries have a lot of duplicated work, like reading the source logs Kafka topic and cleansing the data.\n\nZeppelin includes a feature called `runAsOne`, which allows for multiplexing `INSERT INTO` statements into a single query holistically optimized by Apache Flink and deployed as a single application.\n", 5 |       "user": "anonymous", 6 |       "dateUpdated": "2021-10-08 16:37:50.240", 7 |       "progress": 0, 8 |       "config": { 9 |         "tableHide": false, 10 |         "editorSetting": { 11 |           "language": "markdown", 12 |           "editOnDblClick": true, 13 |           "completionKey": "TAB", 14 |           "completionSupport": false 15 |         }, 16 |         "colWidth": 12.0, 17 |         "editorMode": "ace/mode/markdown", 18 |         "fontSize": 9.0, 19 |         "editorHide": true, 20 |         "results": {}, 21 |         "enabled": true 22 |       }, 23 |       "settings": { 24 |         "params": {}, 25 |         "forms": {} 26 |       }, 27 |       "results": { 28 |         "code": "SUCCESS", 29 |         "msg": [ 30 |           { 31 |             "type": "HTML", 32 |             "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e\u003cimg src\u003d\"https://img.shields.io/badge/Flink%20Version-1.13%2B-lightgrey\" alt\u003d\"Twitter Badge\" /\u003e\u003c/p\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 In this recipe, you will learn how to use \u003ccode\u003erunAsOne\u003c/code\u003e, which is only supported by Zeppelin, to run multiple \u003ccode\u003eINSERT INTO\u003c/code\u003e statements in a single, optimized Flink Job.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003eMany product requirements involve outputting the results of a streaming application to two or more sinks, such as Apache Kafka for real-time use cases, or a Filesystem for offline ones. Other times, two queries are not the same but share some extensive intermediate operations.\u003c/p\u003e\n\u003cp\u003eWhen working with server logs, the support team would like to see the number of status codes per browser every 5 minutes to have real-time insights into a web page\u0026rsquo;s status. Additionally, they would like the same information on an hourly basis made available as partitioned Apache Parquet files so they can perform historical analysis.\u003c/p\u003e\n\u003cp\u003eWe could quickly write two Flink SQL queries to solve both these requirements, but that would not be efficient. 
These queries have a lot of duplicated work, like reading the source logs Kafka topic and cleansing the data.\u003c/p\u003e\n\u003cp\u003eZeppelin includes a feature called \u003ccode\u003erunAsOne\u003c/code\u003e, which allows for multiplexing \u003ccode\u003eINSERT INTO\u003c/code\u003e statements into a single query holistically optimized by Apache Flink and deployed as a single application.\u003c/p\u003e\n\n\u003c/div\u003e" 33 |           } 34 |         ] 35 |       }, 36 |       "apps": [], 37 |       "runtimeInfos": {}, 38 |       "progressUpdateIntervalMs": 500, 39 |       "jobName": "paragraph_1614268141191_633824219", 40 |       "id": "paragraph_1614268141191_633824219", 41 |       "dateCreated": "2021-02-25 23:49:01.191", 42 |       "dateStarted": "2021-10-08 16:37:50.240", 43 |       "dateFinished": "2021-10-08 16:37:50.246", 44 |       "status": "FINISHED" 45 |     }, 46 |     { 47 |       "text": "%md\n在本例中,你将学会怎么使用 `runAsOne` 来在单个优化的Flink任务中运行多个 INSERT INTO 语句。\n\n很多产品的需求包含将流程序的结果输出到多个接收器,例如实时场景的 Apache Kafka 或者 离线场景的文件系统。有时,2个查询并不一样但是共享很多中间操作。\n\n在使用服务器日志时,支持团队希望每5分钟查看每种浏览器的每种状态代码数量,以便实时了解web页面的状态。此外,他们还希望每小时都能以分区的 Apache Parquet 文件的形式提供相同的信息,以便执行历史分析。\n\n我们可以马上写 2 个 Flink SQL 查询来解决这两个需求,但是这种方式效率不高。因为这些查询有很多重复的工作,例如从 Kafka topic 中读取原始日志和清理数据。\n\nZeppelin 包括一个称为 `runAsOne` 的特性,它允许在单个查询中多路复用多个 INSERT INTO 语句,这个查询被 Apache Flink 整体优化,并作为单个应用程序部署。\n", 48 |       "user": "anonymous", 49 |       "dateUpdated": "2021-03-18 15:42:36.890", 50 |       "progress": 0, 51 |       "config": { 52 |         "editorSetting": { 53 |           "language": "markdown", 54 |           "editOnDblClick": true, 55 |           "completionKey": "TAB", 56 |           "completionSupport": false 57 |         }, 58 |         "colWidth": 12.0, 59 |         "editorMode": "ace/mode/markdown", 60 |         "fontSize": 9.0, 61 |         "results": {}, 62 |         "enabled": true, 63 |         "editorHide": true, 64 |         "tableHide": false 65 |       }, 66 |       "settings": { 67 |         "params": {}, 68 |         "forms": {} 69 |       }, 70 |       "results": { 71 |         "code": "SUCCESS", 72 |         "msg": [ 73 |           { 74 |             "type": "HTML", 75 |             "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e在本例中,你将学会怎么使用 \u003ccode\u003erunAsOne\u003c/code\u003e 来在单个优化的Flink任务中运行多个 INSERT INTO 语句。\u003c/p\u003e\n\u003cp\u003e很多产品的需求包含将流程序的结果输出到多个接收器,例如实时场景的 Apache Kafka 或者 离线场景的文件系统。有时,2个查询并不一样但是共享很多中间操作。\u003c/p\u003e\n\u003cp\u003e在使用服务器日志时,支持团队希望每5分钟查看每种浏览器的每种状态代码数量,以便实时了解web页面的状态。此外,他们还希望每小时都能以分区的 Apache Parquet 文件的形式提供相同的信息,以便执行历史分析。\u003c/p\u003e\n\u003cp\u003e我们可以马上写 2 个 Flink SQL 查询来解决这两个需求,但是这种方式效率不高。因为这些查询有很多重复的工作,例如从 Kafka topic 中读取原始日志和清理数据。\u003c/p\u003e\n\u003cp\u003eZeppelin 包括一个称为 \u003ccode\u003erunAsOne\u003c/code\u003e 的特性,它允许在单个查询中多路复用多个 INSERT INTO 语句,这个查询被 Apache Flink 整体优化,并作为单个应用程序部署。\u003c/p\u003e\n\n\u003c/div\u003e" 76 |           } 77 |         ] 78 |       }, 79 |       "apps": [], 80 |       "runtimeInfos": {}, 81 |       "progressUpdateIntervalMs": 500, 82 |       "jobName": "paragraph_1615107156917_227458529", 83 |       "id": "paragraph_1615107156917_227458529", 84 |       "dateCreated": "2021-03-07 08:52:36.917", 85 |       "dateStarted": "2021-03-18 15:42:36.890", 86 |       "dateFinished": "2021-03-18 15:42:36.904", 87 |       "status": "FINISHED" 88 |     }, 89 |     { 90 |       "text": "%flink.ssql\n\nDROP TEMPORARY TABLE IF EXISTS server_logs_temp;\n\nCREATE TEMPORARY TABLE server_logs_temp ( \n  client_ip STRING,\n  client_identity STRING, \n  userid STRING, \n  user_agent STRING,\n  log_time TIMESTAMP(3),\n  request_line STRING, \n  status_code STRING, \n  size INT,\n  WATERMARK FOR log_time AS log_time - INTERVAL \u002730\u0027 SECONDS\n) WITH (\n  \u0027connector\u0027 \u003d \u0027faker\u0027, \n  \u0027fields.client_ip.expression\u0027 \u003d \u0027#{Internet.publicIpV4Address}\u0027,\n  \u0027fields.client_identity.expression\u0027 
\u003d \u0027-\u0027,\n \u0027fields.userid.expression\u0027 \u003d \u0027-\u0027,\n \u0027fields.user_agent.expression\u0027 \u003d \u0027#{Internet.userAgentAny}\u0027,\n \u0027fields.log_time.expression\u0027 \u003d \u0027#{date.past \u0027\u002715\u0027\u0027,\u0027\u00275\u0027\u0027,\u0027\u0027SECONDS\u0027\u0027}\u0027,\n \u0027fields.request_line.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(GET|POST|PUT|PATCH){1}\u0027\u0027} #{regexify \u0027\u0027(/search\\.html|/login\\.html|/prod\\.html|cart\\.html|/order\\.html){1}\u0027\u0027} #{regexify \u0027\u0027(HTTP/1\\.1|HTTP/2|/HTTP/1\\.0){1}\u0027\u0027}\u0027,\n \u0027fields.status_code.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(200|201|204|400|401|403|301){1}\u0027\u0027}\u0027,\n \u0027fields.size.expression\u0027 \u003d \u0027#{number.numberBetween \u0027\u0027100\u0027\u0027,\u0027\u002710000000\u0027\u0027}\u0027\n);", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-03-06 12:37:48.310", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "sql", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/sql", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | "progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614266965392_2093520683", 115 | "id": "paragraph_1614266965392_2093520683", 116 | "dateCreated": "2021-02-25 23:29:25.392", 117 | "dateStarted": "2021-03-06 12:37:48.455", 118 | "dateFinished": "2021-03-06 12:37:50.413", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql\n\n\nDROP TEMPORARY TABLE IF EXISTS realtime_aggregations_temp;\n\nCREATE TEMPORARY TABLE realtime_aggregations_temp (\n `browser` STRING,\n `status_code` STRING,\n `end_time` TIMESTAMP(3),\n `requests` BIGINT NOT NULL\n) WITH (\n \u0027connector\u0027 \u003d \u0027filesystem\u0027,\n \u0027path\u0027 \u003d \u0027file:///tmp/realtime_aggregations\u0027,\n \u0027sink.partition-commit.trigger\u0027 \u003d \u0027partition-time\u0027, \n \u0027format\u0027 \u003d \u0027csv\u0027 \n);\n", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-03-06 12:51:56.712", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": {}, 137 | "enabled": true 138 | }, 139 | "settings": { 140 | "params": {}, 141 | "forms": {} 142 | }, 143 | "apps": [], 144 | "runtimeInfos": {}, 145 | "progressUpdateIntervalMs": 500, 146 | "jobName": "paragraph_1614266979082_1973127487", 147 | "id": "paragraph_1614266979082_1973127487", 148 | "dateCreated": "2021-02-25 23:29:39.083", 149 | "dateStarted": "2021-03-06 12:51:56.767", 150 | "dateFinished": "2021-03-06 12:51:57.217", 151 | "status": "FINISHED" 152 | }, 153 | { 154 | "text": "%flink.ssql\n\n\nDROP TEMPORARY TABLE IF EXISTS offline_datawarehouse_temp;\n\nCREATE TEMPORARY TABLE offline_datawarehouse_temp (\n `browser` STRING,\n `status_code` STRING,\n `dt` STRING,\n `hour` STRING,\n `requests` BIGINT NOT NULL\n) PARTITIONED BY (`dt`, `hour`) WITH (\n \u0027connector\u0027 \u003d \u0027filesystem\u0027,\n \u0027path\u0027 \u003d \u0027file:///tmp/offline_datawarehouse\u0027,\n 
\u0027sink.partition-commit.trigger\u0027 \u003d \u0027partition-time\u0027, \n \u0027format\u0027 \u003d \u0027csv\u0027 \n);", 155 | "user": "anonymous", 156 | "dateUpdated": "2021-03-06 12:52:00.115", 157 | "progress": 0, 158 | "config": { 159 | "editorSetting": { 160 | "language": "sql", 161 | "editOnDblClick": false, 162 | "completionKey": "TAB", 163 | "completionSupport": true 164 | }, 165 | "colWidth": 12.0, 166 | "editorMode": "ace/mode/sql", 167 | "fontSize": 9.0, 168 | "results": {}, 169 | "enabled": true 170 | }, 171 | "settings": { 172 | "params": {}, 173 | "forms": {} 174 | }, 175 | "apps": [], 176 | "runtimeInfos": {}, 177 | "progressUpdateIntervalMs": 500, 178 | "jobName": "paragraph_1614267087710_1500021598", 179 | "id": "paragraph_1614267087710_1500021598", 180 | "dateCreated": "2021-02-25 23:31:27.710", 181 | "dateStarted": "2021-03-06 12:52:00.171", 182 | "dateFinished": "2021-03-06 12:52:00.587", 183 | "status": "FINISHED" 184 | }, 185 | { 186 | "text": "%flink.ssql\n\n-- This is a shared view that will be used by both \n-- insert into statements\nDROP TEMPORARY VIEW IF EXISTS browsers_temp;\n\nCREATE TEMPORARY VIEW browsers_temp AS \nSELECT \n REGEXP_EXTRACT(user_agent,\u0027[^\\/]+\u0027) AS browser,\n status_code,\n log_time\nFROM server_logs_temp;", 187 | "user": "anonymous", 188 | "dateUpdated": "2021-03-06 12:52:05.194", 189 | "progress": 0, 190 | "config": { 191 | "editorSetting": { 192 | "language": "sql", 193 | "editOnDblClick": false, 194 | "completionKey": "TAB", 195 | "completionSupport": true 196 | }, 197 | "colWidth": 12.0, 198 | "editorMode": "ace/mode/sql", 199 | "fontSize": 9.0, 200 | "results": {}, 201 | "enabled": true 202 | }, 203 | "settings": { 204 | "params": {}, 205 | "forms": {} 206 | }, 207 | "apps": [], 208 | "runtimeInfos": {}, 209 | "progressUpdateIntervalMs": 500, 210 | "jobName": "paragraph_1614267108142_227560020", 211 | "id": "paragraph_1614267108142_227560020", 212 | "dateCreated": "2021-02-25 23:31:48.148", 213 | "dateStarted": "2021-03-06 12:52:05.244", 214 | "dateFinished": "2021-03-06 12:52:06.166", 215 | "status": "FINISHED" 216 | }, 217 | { 218 | "text": "%flink.ssql(runAsOne\u003dtrue)\n\nINSERT INTO realtime_aggregations_temp\nSELECT\n browser,\n status_code,\n TUMBLE_ROWTIME(log_time, INTERVAL \u00275\u0027 MINUTE) AS end_time,\n COUNT(*) requests\nFROM browsers_temp\nGROUP BY \n browser,\n status_code,\n TUMBLE(log_time, INTERVAL \u00275\u0027 MINUTE);\n \nINSERT INTO offline_datawarehouse_temp\nSELECT\n browser,\n status_code,\n DATE_FORMAT(TUMBLE_ROWTIME(log_time, INTERVAL \u00271\u0027 HOUR), \u0027yyyy-MM-dd\u0027) AS `dt`,\n DATE_FORMAT(TUMBLE_ROWTIME(log_time, INTERVAL \u00271\u0027 HOUR), \u0027HH\u0027) AS `hour`,\n COUNT(*) requests\nFROM browsers_temp\nGROUP BY \n browser,\n status_code,\n TUMBLE(log_time, INTERVAL \u00271\u0027 HOUR);\n", 219 | "user": "anonymous", 220 | "dateUpdated": "2021-03-06 12:53:27.697", 221 | "progress": 0, 222 | "config": { 223 | "editorSetting": { 224 | "language": "sql", 225 | "editOnDblClick": false, 226 | "completionKey": "TAB", 227 | "completionSupport": true 228 | }, 229 | "colWidth": 12.0, 230 | "editorMode": "ace/mode/sql", 231 | "fontSize": 9.0, 232 | "results": {}, 233 | "enabled": true 234 | }, 235 | "settings": { 236 | "params": {}, 237 | "forms": {} 238 | }, 239 | "apps": [], 240 | "runtimeInfos": { 241 | "jobUrl": { 242 | "propertyName": "jobUrl", 243 | "label": "FLINK JOB", 244 | "tooltip": "View in Flink web UI", 245 | "group": "flink", 246 | "values": [ 247 | { 248 | 
"jobUrl": "http://localhost:8081#/job/90ffa75345c515c217dce8ae51859fbd" 249 | } 250 | ], 251 | "interpreterSettingId": "flink" 252 | } 253 | }, 254 | "progressUpdateIntervalMs": 500, 255 | "jobName": "paragraph_1614267128467_2138149264", 256 | "id": "paragraph_1614267128467_2138149264", 257 | "dateCreated": "2021-02-25 23:32:08.467", 258 | "dateStarted": "2021-03-06 12:53:27.733", 259 | "dateFinished": "2021-03-06 12:53:34.465", 260 | "status": "ABORT" 261 | }, 262 | { 263 | "text": "%flink.ssql\n", 264 | "user": "anonymous", 265 | "dateUpdated": "2021-02-25 23:33:34.659", 266 | "progress": 0, 267 | "config": {}, 268 | "settings": { 269 | "params": {}, 270 | "forms": {} 271 | }, 272 | "apps": [], 273 | "runtimeInfos": {}, 274 | "progressUpdateIntervalMs": 500, 275 | "jobName": "paragraph_1614267214658_1658872109", 276 | "id": "paragraph_1614267214658_1658872109", 277 | "dateCreated": "2021-02-25 23:33:34.658", 278 | "status": "READY" 279 | } 280 | ], 281 | "name": "08 Writing Results into Multiple Tables", 282 | "id": "2G1MEGYE2", 283 | "defaultInterpreterGroup": "flink", 284 | "version": "0.10.0-SNAPSHOT", 285 | "noteParams": {}, 286 | "noteForms": {}, 287 | "angularObjects": {}, 288 | "config": { 289 | "isZeppelinNotebookCronEnable": false 290 | }, 291 | "info": {} 292 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/Foundations/09 Convert timestamps with timezones_2HHBK28GB.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n\n# 09 Convert timestamps with timezones\n\n![Twitter Badge](https://img.shields.io/badge/Flink%20Version-1.15%2B-lightgrey)\n\n\u003e :bulb: In this recipe, you will learn how to consolidate timestamps with different time zones to UTC. \n\nTimestamps in incoming data can refer to different time zones and consolidating them to the same time zone (e.g. UTC) is a prerequisite to ensure correctness in temporal analysis.\n\nThe source table (`iot_status`) is backed by the [`faker` connector](https://flink-packages.org/packages/flink-faker), which continuously generates fake IoT status messages in memory based on Java Faker expressions.\n\nIn this recipe we create a table which contains IoT devices status updates including timestamp and device time zone, which we\u0027ll convert to UTC. \n\nWe create the table first, then use the [`CONVERT_TZ`](https://nightlies.apache.org/flink/flink-docs-stable/docs/dev/table/functions/systemfunctions/#temporal-functions) function to convert the timestamp to UTC. 
The `CONVERT_TZ` function requires the input timestamp to be passed as string, thus we apply the cast function to `iot_timestamp`.", 5 | "user": "anonymous", 6 | "dateUpdated": "2022-11-06 19:33:48.416", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003ch1\u003e09 Convert timestamps with timezones\u003c/h1\u003e\n\u003cp\u003e\u003cimg src\u003d\"https://img.shields.io/badge/Flink%20Version-1.15%2B-lightgrey\" alt\u003d\"Twitter Badge\" /\u003e\u003c/p\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 In this recipe, you will learn how to consolidate timestamps with different time zones to UTC.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003eTimestamps in incoming data can refer to different time zones and consolidating them to the same time zone (e.g. UTC) is a prerequisite to ensure correctness in temporal analysis.\u003c/p\u003e\n\u003cp\u003eThe source table (\u003ccode\u003eiot_status\u003c/code\u003e) is backed by the \u003ca href\u003d\"https://flink-packages.org/packages/flink-faker\"\u003e\u003ccode\u003efaker\u003c/code\u003e connector\u003c/a\u003e, which continuously generates fake IoT status messages in memory based on Java Faker expressions.\u003c/p\u003e\n\u003cp\u003eIn this recipe we create a table which contains IoT devices status updates including timestamp and device time zone, which we\u0026rsquo;ll convert to UTC.\u003c/p\u003e\n\u003cp\u003eWe create the table first, then use the \u003ca href\u003d\"https://nightlies.apache.org/flink/flink-docs-stable/docs/dev/table/functions/systemfunctions/#temporal-functions\"\u003e\u003ccode\u003eCONVERT_TZ\u003c/code\u003e\u003c/a\u003e function to convert the timestamp to UTC. 
The \u003ccode\u003eCONVERT_TZ\u003c/code\u003e function requires the input timestamp to be passed as string, thus we apply the cast function to \u003ccode\u003eiot_timestamp\u003c/code\u003e.\u003c/p\u003e\n\n\u003c/div\u003e" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1667734218713_107345213", 40 | "id": "paragraph_1667734218713_107345213", 41 | "dateCreated": "2022-11-06 19:30:18.713", 42 | "dateStarted": "2022-11-06 19:33:48.416", 43 | "dateFinished": "2022-11-06 19:33:48.428", 44 | "status": "FINISHED" 45 | }, 46 | { 47 | "text": "%flink.ssql\n\nCREATE TABLE iot_status ( \n device_ip STRING,\n device_timezone STRING,\n iot_timestamp TIMESTAMP(3),\n status_code STRING\n) WITH (\n \u0027connector\u0027 \u003d \u0027faker\u0027, \n \u0027fields.device_ip.expression\u0027 \u003d \u0027#{Internet.publicIpV4Address}\u0027,\n \u0027fields.device_timezone.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(America\\/Los_Angeles|Europe\\/Rome|Europe\\/London|Australia\\/Sydney){1}\u0027\u0027}\u0027,\n \u0027fields.iot_timestamp.expression\u0027 \u003d \u0027#{date.past \u0027\u002715\u0027\u0027,\u0027\u00275\u0027\u0027,\u0027\u0027SECONDS\u0027\u0027}\u0027,\n \u0027fields.status_code.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(OK|KO|WARNING){1}\u0027\u0027}\u0027,\n \u0027rows-per-second\u0027 \u003d \u00273\u0027\n);\n", 48 | "user": "anonymous", 49 | "dateUpdated": "2022-11-06 19:31:41.867", 50 | "progress": 0, 51 | "config": { 52 | "editorSetting": { 53 | "language": "sql", 54 | "editOnDblClick": false, 55 | "completionKey": "TAB", 56 | "completionSupport": false 57 | }, 58 | "colWidth": 12.0, 59 | "editorMode": "ace/mode/sql", 60 | "fontSize": 9.0, 61 | "results": {}, 62 | "enabled": true 63 | }, 64 | "settings": { 65 | "params": {}, 66 | "forms": {} 67 | }, 68 | "results": { 69 | "code": "SUCCESS", 70 | "msg": [ 71 | { 72 | "type": "TEXT", 73 | "data": "Table has been created.\n" 74 | } 75 | ] 76 | }, 77 | "apps": [], 78 | "runtimeInfos": {}, 79 | "progressUpdateIntervalMs": 500, 80 | "jobName": "paragraph_1667734238536_868089923", 81 | "id": "paragraph_1667734238536_868089923", 82 | "dateCreated": "2022-11-06 19:30:38.536", 83 | "dateStarted": "2022-11-06 19:31:41.879", 84 | "dateFinished": "2022-11-06 19:31:56.405", 85 | "status": "FINISHED" 86 | }, 87 | { 88 | "text": "%flink.ssql\n\nSELECT \n device_ip, \n device_timezone,\n iot_timestamp,\n convert_tz(cast(iot_timestamp as string), device_timezone, \u0027UTC\u0027) iot_timestamp_utc,\n status_code\nFROM iot_status order by iot_timestamp desc limit 10;\n", 89 | "user": "anonymous", 90 | "dateUpdated": "2022-11-06 19:32:48.642", 91 | "progress": 0, 92 | "config": { 93 | "editorSetting": { 94 | "language": "sql", 95 | "editOnDblClick": false, 96 | "completionKey": "TAB", 97 | "completionSupport": true 98 | }, 99 | "colWidth": 12.0, 100 | "editorMode": "ace/mode/sql", 101 | "fontSize": 9.0, 102 | "results": { 103 | "0": { 104 | "graph": { 105 | "mode": "table", 106 | "height": 300.0, 107 | "optionOpen": false, 108 | "setting": { 109 | "table": { 110 | "tableGridState": {}, 111 | "tableColumnTypeState": { 112 | "names": { 113 | "device_ip": "string", 114 | "device_timezone": "string", 115 | "iot_timestamp": "string", 116 | "iot_timestamp_utc": "string", 117 | "status_code": "string" 118 | }, 119 | "updated": false 120 | }, 121 | "tableOptionSpecHash": 
"[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 122 | "tableOptionValue": { 123 | "useFilter": false, 124 | "showPagination": false, 125 | "showAggregationFooter": false 126 | }, 127 | "updated": false, 128 | "initialized": false 129 | } 130 | }, 131 | "commonSetting": {} 132 | } 133 | } 134 | }, 135 | "enabled": true 136 | }, 137 | "settings": { 138 | "params": {}, 139 | "forms": {} 140 | }, 141 | "apps": [], 142 | "runtimeInfos": {}, 143 | "progressUpdateIntervalMs": 500, 144 | "jobName": "paragraph_1667734263336_1919898471", 145 | "id": "paragraph_1667734263336_1919898471", 146 | "dateCreated": "2022-11-06 19:31:03.336", 147 | "dateStarted": "2022-11-06 19:32:29.497", 148 | "dateFinished": "2022-11-06 19:32:45.417", 149 | "status": "ABORT" 150 | }, 151 | { 152 | "text": "%flink.ssql\n", 153 | "user": "anonymous", 154 | "dateUpdated": "2022-11-06 19:31:57.713", 155 | "progress": 0, 156 | "config": {}, 157 | "settings": { 158 | "params": {}, 159 | "forms": {} 160 | }, 161 | "apps": [], 162 | "runtimeInfos": {}, 163 | "progressUpdateIntervalMs": 500, 164 | "jobName": "paragraph_1667734317713_1294235244", 165 | "id": "paragraph_1667734317713_1294235244", 166 | "dateCreated": "2022-11-06 19:31:57.713", 167 | "status": "READY" 168 | } 169 | ], 170 | "name": "09 Convert timestamps with timezones", 171 | "id": "2HHBK28GB", 172 | "defaultInterpreterGroup": "flink", 173 | "version": "0.11.0-SNAPSHOT", 174 | "noteParams": {}, 175 | "noteForms": {}, 176 | "angularObjects": {}, 177 | "config": { 178 | "isZeppelinNotebookCronEnable": false 179 | }, 180 | "info": {} 181 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/Joins/01 Regular Joins_2FYA62DSS.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n\u003e :bulb: This example will show how you can use joins to correlate rows across multiple tables.\n\nFlink SQL supports complex and flexible join operations over continuous tables. There are several different types of joins to account for the wide variety of semantics queries may require.\n\nRegular joins are the most generic and flexible type of join. These include the standard `INNER` and `[FULL|LEFT|RIGHT] OUTER` joins that are available in most modern databases. \n\nSuppose we have a [NOC list](https://en.wikipedia.org/wiki/Non-official_cover) of secret agents all over the world. Your mission if you choose to accept it, is to join this table with another containin the agents real name.\n\nIn Flink SQL, this can be achieved using a simple `INNER JOIN`. Flink will join the tables using an equi-join predicate on the `agent_id` and output a new row everytime there is a match.\n\nHowever, there is something to be careful of. Flink must retain every input row as part of the join to potentially join it with the other table in the future. This means the queries resource requirements will grow indefinitely and will eventually fail. 
While this type of join is useful in some scenarios, other joins are more powerful in a streaming context and significantly more space-efficient.\n\nIn this example, both tables are bounded to remain space efficient.\n", 5 |       "user": "anonymous", 6 |       "dateUpdated": "2021-10-08 22:56:11.153", 7 |       "progress": 0, 8 |       "config": { 9 |         "tableHide": false, 10 |         "editorSetting": { 11 |           "language": "markdown", 12 |           "editOnDblClick": true, 13 |           "completionKey": "TAB", 14 |           "completionSupport": false 15 |         }, 16 |         "colWidth": 12.0, 17 |         "editorMode": "ace/mode/markdown", 18 |         "fontSize": 9.0, 19 |         "editorHide": true, 20 |         "results": {}, 21 |         "enabled": true 22 |       }, 23 |       "settings": { 24 |         "params": {}, 25 |         "forms": {} 26 |       }, 27 |       "results": { 28 |         "code": "SUCCESS", 29 |         "msg": [ 30 |           { 31 |             "type": "HTML", 32 |             "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This example will show how you can use joins to correlate rows across multiple tables.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003eFlink SQL supports complex and flexible join operations over continuous tables. There are several different types of joins to account for the wide variety of semantics queries may require.\u003c/p\u003e\n\u003cp\u003eRegular joins are the most generic and flexible type of join. These include the standard \u003ccode\u003eINNER\u003c/code\u003e and \u003ccode\u003e[FULL|LEFT|RIGHT] OUTER\u003c/code\u003e joins that are available in most modern databases.\u003c/p\u003e\n\u003cp\u003eSuppose we have a \u003ca href\u003d\"https://en.wikipedia.org/wiki/Non-official_cover\"\u003eNOC list\u003c/a\u003e of secret agents all over the world. Your mission, if you choose to accept it, is to join this table with another containing the agents\u0026rsquo; real names.\u003c/p\u003e\n\u003cp\u003eIn Flink SQL, this can be achieved using a simple \u003ccode\u003eINNER JOIN\u003c/code\u003e. Flink will join the tables using an equi-join predicate on the \u003ccode\u003eagent_id\u003c/code\u003e and output a new row every time there is a match.\u003c/p\u003e\n\u003cp\u003eHowever, there is something to be careful of. Flink must retain every input row as part of the join to potentially join it with the other table in the future. This means the query\u0026rsquo;s resource requirements will grow indefinitely and will eventually fail. 
While this type of join is useful in some scenarios, other joins are more powerful in a streaming context and significantly more space-efficient.\u003c/p\u003e\n\u003cp\u003eIn this example, both tables are bounded to remain space efficient.\u003c/p\u003e\n\n\u003c/div\u003e" 33 |           } 34 |         ] 35 |       }, 36 |       "apps": [], 37 |       "runtimeInfos": {}, 38 |       "progressUpdateIntervalMs": 500, 39 |       "jobName": "paragraph_1614316653435_771824735", 40 |       "id": "paragraph_1614316653435_771824735", 41 |       "dateCreated": "2021-02-26 13:17:33.438", 42 |       "dateStarted": "2021-10-08 22:56:11.154", 43 |       "dateFinished": "2021-10-08 22:56:11.159", 44 |       "status": "FINISHED" 45 |     }, 46 |     { 47 |       "text": "%md\n\n本例将展示如何使用 joins 来将多个表中的行关联起来。\n\nFlink SQL 支持对连续表的灵活且复杂的join操作。有几种不同类型的连接用来应对广泛的查询语义的要求。\n\nRegular joins(普通连接)是最通用且最灵活的连接类型,它包括大多数现代数据库提供的标准的 `INNER` 和 `[FULL|LEFT|RIGHT] OUTER` 连接。\n\n假设我们有一个全世界范围内的秘密特工的[NOC 列表](https://en.wikipedia.org/wiki/Non-official_cover)。如果你选择接受,你的任务是将这个表与另一个包含特工真实名称的表关联。\n\n这个任务使用 Flink SQL 的 `INNER JOIN` 可以轻松实现。 Flink 将在 `agent_id` 上使用 equi-join 谓词来连接表,然后在每当有匹配时输出一个新的行记录。\n\n但是,这里有一些地方需要注意。Flink 为了能够在将来能让表中的行与另一张表中潜在的行进行关联,它必须保留输入的每一行。这意味着查询资源的需求将无限增长并最终失败。虽然这种类型的连接在某些场景下很有用,但其他类型的连接在流处理环境下更为强大,并且空间效率显著更高。\n\n本例中,2 个表都是有界的以节省空间。\n", 48 |       "user": "anonymous", 49 |       "dateUpdated": "2021-03-18 15:56:01.436", 50 |       "progress": 0, 51 |       "config": { 52 |         "editorSetting": { 53 |           "language": "markdown", 54 |           "editOnDblClick": true, 55 |           "completionKey": "TAB", 56 |           "completionSupport": false 57 |         }, 58 |         "colWidth": 12.0, 59 |         "editorMode": "ace/mode/markdown", 60 |         "fontSize": 9.0, 61 |         "results": {}, 62 |         "enabled": true, 63 |         "editorHide": true, 64 |         "tableHide": false 65 |       }, 66 |       "settings": { 67 |         "params": {}, 68 |         "forms": {} 69 |       }, 70 |       "results": { 71 |         "code": "SUCCESS", 72 |         "msg": [ 73 |           { 74 |             "type": "HTML", 75 |             "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e本例将展示如何使用 joins 来将多个表中的行关联起来。\u003c/p\u003e\n\u003cp\u003eFlink SQL 支持对连续表的灵活且复杂的join操作。有几种不同类型的连接用来应对广泛的查询语义的要求。\u003c/p\u003e\n\u003cp\u003eRegular joins(普通连接)是最通用且最灵活的连接类型,它包括大多数现代数据库提供的标准的 \u003ccode\u003eINNER\u003c/code\u003e 和 \u003ccode\u003e[FULL|LEFT|RIGHT] OUTER\u003c/code\u003e 连接。\u003c/p\u003e\n\u003cp\u003e假设我们有一个全世界范围内的秘密特工的\u003ca href\u003d\"https://en.wikipedia.org/wiki/Non-official_cover\"\u003eNOC 列表\u003c/a\u003e。如果你选择接受,你的任务是将这个表与另一个包含特工真实名称的表关联。\u003c/p\u003e\n\u003cp\u003e这个任务使用 Flink SQL 的 \u003ccode\u003eINNER JOIN\u003c/code\u003e 可以轻松实现。 Flink 将在 \u003ccode\u003eagent_id\u003c/code\u003e 上使用 equi-join 谓词来连接表,然后在每当有匹配时输出一个新的行记录。\u003c/p\u003e\n\u003cp\u003e但是,这里有一些地方需要注意。Flink 为了能够在将来能让表中的行与另一张表中潜在的行进行关联,它必须保留输入的每一行。这意味着查询资源的需求将无限增长并最终失败。虽然这种类型的连接在某些场景下很有用,但其他类型的连接在流处理环境下更为强大,并且空间效率显著更高。\u003c/p\u003e\n\u003cp\u003e本例中,2 个表都是有界的以节省空间。\u003c/p\u003e\n\n\u003c/div\u003e" 76 |           } 77 |         ] 78 |       }, 79 |       "apps": [], 80 |       "runtimeInfos": {}, 81 |       "progressUpdateIntervalMs": 500, 82 |       "jobName": "paragraph_1615185616790_975194353", 83 |       "id": "paragraph_1615185616790_975194353", 84 |       "dateCreated": "2021-03-08 06:40:16.790", 85 |       "dateStarted": "2021-03-18 15:56:01.434", 86 |       "dateFinished": "2021-03-18 15:56:01.459", 87 |       "status": "FINISHED" 88 |     }, 89 |     { 90 |       "text": "%flink.ssql\n\nDROP TABLE IF EXISTS NOC;\n\nCREATE TABLE NOC (\n  agent_id STRING,\n  codename STRING\n)\nWITH (\n  \u0027connector\u0027 \u003d \u0027faker\u0027,\n  \u0027fields.agent_id.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(1|2|3|4|5){1}\u0027\u0027}\u0027,\n  \u0027fields.codename.expression\u0027 \u003d 
\u0027#{superhero.name}\u0027,\n \u0027number-of-rows\u0027 \u003d \u002710\u0027\n);\n\nDROP TABLE IF EXISTS RealNames;\n\nCREATE TABLE RealNames (\n agent_id STRING,\n name STRING\n)\nWITH (\n \u0027connector\u0027 \u003d \u0027faker\u0027,\n \u0027fields.agent_id.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(1|2|3|4|5){1}\u0027\u0027}\u0027,\n \u0027fields.name.expression\u0027 \u003d \u0027#{Name.full_name}\u0027,\n \u0027number-of-rows\u0027 \u003d \u002710\u0027\n);\n", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-02-26 10:06:52.684", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "sql", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/sql", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | "progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614305028935_1653205277", 115 | "id": "paragraph_1614305028935_1653205277", 116 | "dateCreated": "2021-02-26 10:03:48.936", 117 | "dateStarted": "2021-02-26 10:06:52.691", 118 | "dateFinished": "2021-02-26 10:06:53.732", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql(type\u003dupdate)\n\nSELECT\n name,\n codename\nFROM NOC\nINNER JOIN RealNames ON NOC.agent_id \u003d RealNames.agent_id;\n", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-02-26 13:17:10.651", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": { 137 | "0": { 138 | "graph": { 139 | "mode": "table", 140 | "height": 300.0, 141 | "optionOpen": false, 142 | "setting": { 143 | "table": { 144 | "tableGridState": {}, 145 | "tableColumnTypeState": { 146 | "names": { 147 | "name": "string", 148 | "codename": "string" 149 | }, 150 | "updated": false 151 | }, 152 | "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 153 | "tableOptionValue": { 154 | "useFilter": false, 155 | "showPagination": false, 156 | "showAggregationFooter": false 157 | }, 158 | "updated": false, 159 | "initialized": false 160 | } 161 | }, 162 | "commonSetting": {} 163 | } 164 | } 165 | }, 166 | "enabled": true 167 | }, 168 | "settings": { 169 | "params": {}, 170 | "forms": {} 171 | }, 172 | "apps": [], 173 | "runtimeInfos": { 174 | "jobUrl": { 175 | "propertyName": "jobUrl", 176 | "label": "FLINK JOB", 177 | "tooltip": "View in Flink web UI", 178 | "group": "flink", 179 | "values": [ 180 | { 181 | "jobUrl": "http://localhost:8081#/job/8b568fc439d250170c3c0684e536efd1" 182 | } 183 | ], 184 | "interpreterSettingId": "flink" 185 | } 186 | }, 187 | "progressUpdateIntervalMs": 500, 188 | "jobName": "paragraph_1614305202536_606838204", 189 | "id": "paragraph_1614305202536_606838204", 190 | "dateCreated": 
"2021-02-26 10:06:42.536", 191 | "dateStarted": "2021-02-26 10:07:05.548", 192 | "dateFinished": "2021-02-26 10:07:10.928", 193 | "status": "FINISHED" 194 | }, 195 | { 196 | "text": "%flink.ssql\n", 197 | "user": "anonymous", 198 | "dateUpdated": "2021-02-26 10:07:05.546", 199 | "progress": 0, 200 | "config": {}, 201 | "settings": { 202 | "params": {}, 203 | "forms": {} 204 | }, 205 | "apps": [], 206 | "runtimeInfos": {}, 207 | "progressUpdateIntervalMs": 500, 208 | "jobName": "paragraph_1614305225546_2102469394", 209 | "id": "paragraph_1614305225546_2102469394", 210 | "dateCreated": "2021-02-26 10:07:05.546", 211 | "status": "READY" 212 | } 213 | ], 214 | "name": "01 Regular Joins", 215 | "id": "2FYA62DSS", 216 | "defaultInterpreterGroup": "flink", 217 | "version": "0.10.0-SNAPSHOT", 218 | "noteParams": {}, 219 | "noteForms": {}, 220 | "angularObjects": {}, 221 | "config": { 222 | "isZeppelinNotebookCronEnable": false 223 | }, 224 | "info": {} 225 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/Joins/01_Regular_Join.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjffdu/flink-sql-cookbook-on-zeppelin/6cb9a0a0b64ef9eb87b4f8ced63e447b4aab72b9/Flink Sql Cookbook/Joins/01_Regular_Join.gif -------------------------------------------------------------------------------- /Flink Sql Cookbook/Joins/02 Interval Joins_2FYMBTGSF.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n\u003e :bulb: This example will show how you can perform joins between tables with events that are related in a temporal context.\n\n## Why Interval Joins?\n\nIn a previous recipe, you learned about using _regular joins_ in Flink SQL. This kind of join works well for some scenarios, but for others a more efficient type of join is required to keep resource utilization from growing indefinitely.\n\nOne of the ways to optimize joining operations in Flink SQL is to use [_interval joins_](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/streaming/joins.html#interval-joins). An interval join is defined by a join predicate that checks if the time attributes of the input events are within certain time constraints (i.e. a time window).\n\n## Using Interval Joins\n\nSuppose you want to join events of two tables that correlate to each other in the [order fulfillment lifecycle](https://en.wikipedia.org/wiki/Order_fulfillment) (`orders` and `shipments`) and that are under a Service-level Aggreement (SLA) Service-level Aggreement (SLA) of **3 days**. 
To reduce the amount of input rows Flink has to retain and optimize the join operation, you can define a time constraint in the `WHERE` clause to bound the time on both sides to that specific interval using a `BETWEEN` predicate.\n\n", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-10-08 22:56:58.356", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This example will show how you can perform joins between tables with events that are related in a temporal context.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003ch2\u003eWhy Interval Joins?\u003c/h2\u003e\n\u003cp\u003eIn a previous recipe, you learned about using \u003cem\u003eregular joins\u003c/em\u003e in Flink SQL. This kind of join works well for some scenarios, but for others a more efficient type of join is required to keep resource utilization from growing indefinitely.\u003c/p\u003e\n\u003cp\u003eOne of the ways to optimize joining operations in Flink SQL is to use \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/streaming/joins.html#interval-joins\"\u003e\u003cem\u003einterval joins\u003c/em\u003e\u003c/a\u003e. An interval join is defined by a join predicate that checks if the time attributes of the input events are within certain time constraints (i.e. a time window).\u003c/p\u003e\n\u003ch2\u003eUsing Interval Joins\u003c/h2\u003e\n\u003cp\u003eSuppose you want to join events of two tables that correlate to each other in the \u003ca href\u003d\"https://en.wikipedia.org/wiki/Order_fulfillment\"\u003eorder fulfillment lifecycle\u003c/a\u003e (\u003ccode\u003eorders\u003c/code\u003e and \u003ccode\u003eshipments\u003c/code\u003e) and that are under a Service-Level Agreement (SLA) of \u003cstrong\u003e3 days\u003c/strong\u003e.
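As a minimal sketch of the pattern just described (using the `orders` and `shipments` tables defined later in this notebook), the `BETWEEN` predicate is what lets Flink expire rows that fall outside the 3-day interval:

```sql
-- Sketch: the BETWEEN time constraint bounds state retention on both sides,
-- so neither input has to be kept forever.
SELECT
  o.id,
  o.order_time,
  s.shipment_time
FROM orders o
JOIN shipments s ON o.id = s.order_id
WHERE o.order_time BETWEEN s.shipment_time - INTERVAL '3' DAY AND s.shipment_time;
```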
To reduce the amount of input rows Flink has to retain and optimize the join operation, you can define a time constraint in the \u003ccode\u003eWHERE\u003c/code\u003e clause to bound the time on both sides to that specific interval using a \u003ccode\u003eBETWEEN\u003c/code\u003e predicate.\u003c/p\u003e\n\n\u003c/div\u003e" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1614316707292_565283423", 40 | "id": "paragraph_1614316707292_565283423", 41 | "dateCreated": "2021-02-26 13:18:27.292", 42 | "dateStarted": "2021-10-08 22:56:58.357", 43 | "dateFinished": "2021-10-08 22:56:58.363", 44 | "status": "FINISHED" 45 | }, 46 | { 47 | "text": "%md\n\n本例将展示如何对时间上下文相关联的事件表执行连接操作。\n\n## 为什么使用 Interval Joins?\n在之前的例子中,我们已经学习了使用 Flink SQL 的 _regular joins_。这种类型的join适合某些场景,但是有些场景下需要一个更有效的连接类型以防止资源利用无限增长。\n\nFlink SQL 中其中一个优化连接操作的方式是使用 [_interval joins_](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/streaming/joins.html#interval-joins)。interval join(事件区间连接) 使用连接 predicate 定义用来检查输入的事件的事件属性是否在一个固定的时间约束内(即时间窗口).\n\n## 使用 Interval Joins\n\n假设你希望连接2张在 [order fulfillment lifecycle](https://en.wikipedia.org/wiki/Order_fulfillment) (`orders` and `shipments`) 中互相联系的表的事件。并且具有 **3 天** 的 服务级别协定 (SLA)。为了减少Flink要保留的输入行的数量以及优化join操作,我们可以定义一个时间约束在 `WHERE` 语句中来将两边的数据限定在使用 `BETWEEN` 表述的特定时间区间内。\n", 48 | "user": "anonymous", 49 | "dateUpdated": "2021-03-18 15:56:15.693", 50 | "progress": 0, 51 | "config": { 52 | "editorSetting": { 53 | "language": "markdown", 54 | "editOnDblClick": true, 55 | "completionKey": "TAB", 56 | "completionSupport": false 57 | }, 58 | "colWidth": 12.0, 59 | "editorMode": "ace/mode/markdown", 60 | "fontSize": 9.0, 61 | "results": {}, 62 | "enabled": true, 63 | "editorHide": true, 64 | "tableHide": false 65 | }, 66 | "settings": { 67 | "params": {}, 68 | "forms": {} 69 | }, 70 | "results": { 71 | "code": "SUCCESS", 72 | "msg": [ 73 | { 74 | "type": "HTML", 75 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e本例将展示如何对时间上下文相关联的事件表执行连接操作。\u003c/p\u003e\n\u003ch2\u003e为什么使用 Interval Joins?\u003c/h2\u003e\n\u003cp\u003e在之前的例子中,我们已经学习了使用 Flink SQL 的 \u003cem\u003eregular joins\u003c/em\u003e。这种类型的join适合某些场景,但是有些场景下需要一个更有效的连接类型以防止资源利用无限增长。\u003c/p\u003e\n\u003cp\u003eFlink SQL 中其中一个优化连接操作的方式是使用 \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/streaming/joins.html#interval-joins\"\u003e\u003cem\u003einterval joins\u003c/em\u003e\u003c/a\u003e。interval join(事件区间连接) 使用连接 predicate 定义用来检查输入的事件的事件属性是否在一个固定的时间约束内(即时间窗口).\u003c/p\u003e\n\u003ch2\u003e使用 Interval Joins\u003c/h2\u003e\n\u003cp\u003e假设你希望连接2张在 \u003ca href\u003d\"https://en.wikipedia.org/wiki/Order_fulfillment\"\u003eorder fulfillment lifecycle\u003c/a\u003e (\u003ccode\u003eorders\u003c/code\u003e and \u003ccode\u003eshipments\u003c/code\u003e) 中互相联系的表的事件。并且具有 \u003cstrong\u003e3 天\u003c/strong\u003e 的 服务级别协定 (SLA)。为了减少Flink要保留的输入行的数量以及优化join操作,我们可以定义一个时间约束在 \u003ccode\u003eWHERE\u003c/code\u003e 语句中来将两边的数据限定在使用 \u003ccode\u003eBETWEEN\u003c/code\u003e 表述的特定时间区间内。\u003c/p\u003e\n\n\u003c/div\u003e" 76 | } 77 | ] 78 | }, 79 | "apps": [], 80 | "runtimeInfos": {}, 81 | "progressUpdateIntervalMs": 500, 82 | "jobName": "paragraph_1615187074212_175017653", 83 | "id": "paragraph_1615187074212_175017653", 84 | "dateCreated": "2021-03-08 07:04:34.213", 85 | "dateStarted": "2021-03-18 15:56:15.694", 86 | "dateFinished": "2021-03-18 15:56:15.707", 87 | "status": "FINISHED" 88 | }, 89 | { 90 | "text":
"%flink.ssql\n\nDROP TABLE IF EXISTS orders;\n\nCREATE TABLE orders (\n id INT,\n order_time AS TIMESTAMPADD(DAY, CAST(FLOOR(RAND()*(1-5+1)+5)*(-1) AS INT), CURRENT_TIMESTAMP)\n)\nWITH (\n \u0027connector\u0027 \u003d \u0027datagen\u0027,\n \u0027rows-per-second\u0027\u003d\u002710\u0027,\n \u0027fields.id.kind\u0027\u003d\u0027sequence\u0027,\n \u0027fields.id.start\u0027\u003d\u00271\u0027,\n \u0027fields.id.end\u0027\u003d\u00271000\u0027\n);\n\nDROP TABLE IF EXISTS shipments;\n\nCREATE TABLE shipments (\n id INT,\n order_id INT,\n shipment_time AS TIMESTAMPADD(DAY, CAST(FLOOR(RAND()*(1-5+1)) AS INT), CURRENT_TIMESTAMP)\n)\nWITH (\n \u0027connector\u0027 \u003d \u0027datagen\u0027,\n \u0027rows-per-second\u0027\u003d\u00275\u0027,\n \u0027fields.id.kind\u0027\u003d\u0027random\u0027,\n \u0027fields.id.min\u0027\u003d\u00270\u0027,\n \u0027fields.order_id.kind\u0027\u003d\u0027sequence\u0027,\n \u0027fields.order_id.start\u0027\u003d\u00271\u0027,\n \u0027fields.order_id.end\u0027\u003d\u00271000\u0027\n);\n", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-02-26 10:08:43.655", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "sql", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/sql", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | "progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614305279491_945824539", 115 | "id": "paragraph_1614305279491_945824539", 116 | "dateCreated": "2021-02-26 10:07:59.491", 117 | "dateStarted": "2021-02-26 10:08:43.664", 118 | "dateFinished": "2021-02-26 10:08:44.705", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql(type\u003dupdate)\n\nSELECT\n o.id AS order_id,\n o.order_time,\n s.shipment_time,\n TIMESTAMPDIFF(DAY,o.order_time,s.shipment_time) AS day_diff\nFROM orders o\nJOIN shipments s ON o.id \u003d s.order_id\nWHERE \n o.order_time BETWEEN s.shipment_time - INTERVAL \u00273\u0027 DAY AND s.shipment_time\nORDER BY order_time DESC\nLIMIT 10;\n", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-02-26 13:18:26.703", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": { 137 | "0": { 138 | "graph": { 139 | "mode": "table", 140 | "height": 300.0, 141 | "optionOpen": false, 142 | "setting": { 143 | "table": { 144 | "tableGridState": {}, 145 | "tableColumnTypeState": { 146 | "names": { 147 | "order_id": "string", 148 | "order_time": "string", 149 | "shipment_time": "string", 150 | "day_diff": "string" 151 | }, 152 | "updated": false 153 | }, 154 | "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 155 | "tableOptionValue": { 156 | "useFilter": false, 157 | 
"showPagination": false, 158 | "showAggregationFooter": false 159 | }, 160 | "updated": false, 161 | "initialized": false 162 | } 163 | }, 164 | "commonSetting": {} 165 | } 166 | } 167 | }, 168 | "enabled": true 169 | }, 170 | "settings": { 171 | "params": {}, 172 | "forms": {} 173 | }, 174 | "apps": [], 175 | "runtimeInfos": { 176 | "jobUrl": { 177 | "propertyName": "jobUrl", 178 | "label": "FLINK JOB", 179 | "tooltip": "View in Flink web UI", 180 | "group": "flink", 181 | "values": [ 182 | { 183 | "jobUrl": "http://localhost:8081#/job/86a3dc8e59db03052aa439a29a4eba23" 184 | } 185 | ], 186 | "interpreterSettingId": "flink" 187 | } 188 | }, 189 | "progressUpdateIntervalMs": 500, 190 | "jobName": "paragraph_1614305302690_1560304565", 191 | "id": "paragraph_1614305302690_1560304565", 192 | "dateCreated": "2021-02-26 10:08:22.690", 193 | "dateStarted": "2021-02-26 10:10:38.764", 194 | "dateFinished": "2021-02-26 10:10:51.156", 195 | "status": "ABORT" 196 | }, 197 | { 198 | "text": "%flink.ssql\n", 199 | "user": "anonymous", 200 | "dateUpdated": "2021-02-26 10:09:02.042", 201 | "progress": 0, 202 | "config": {}, 203 | "settings": { 204 | "params": {}, 205 | "forms": {} 206 | }, 207 | "apps": [], 208 | "runtimeInfos": {}, 209 | "progressUpdateIntervalMs": 500, 210 | "jobName": "paragraph_1614305342041_1071723300", 211 | "id": "paragraph_1614305342041_1071723300", 212 | "dateCreated": "2021-02-26 10:09:02.042", 213 | "status": "READY" 214 | } 215 | ], 216 | "name": "02 Interval Joins", 217 | "id": "2FYMBTGSF", 218 | "defaultInterpreterGroup": "flink", 219 | "version": "0.10.0-SNAPSHOT", 220 | "noteParams": {}, 221 | "noteForms": {}, 222 | "angularObjects": {}, 223 | "config": { 224 | "isZeppelinNotebookCronEnable": false 225 | }, 226 | "info": {} 227 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/Joins/02_Interval_Join.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjffdu/flink-sql-cookbook-on-zeppelin/6cb9a0a0b64ef9eb87b4f8ced63e447b4aab72b9/Flink Sql Cookbook/Joins/02_Interval_Join.gif -------------------------------------------------------------------------------- /Flink Sql Cookbook/Joins/04 Lookup Joins_2FYWYEW8C.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n\u003e :bulb: This example will show how you can enrich a stream with an external table of reference data (i.e. a _lookup_ table).\n\n## Data Enrichment\n\nNot all data changes frequently, even when working in real-time: in some cases, you might need to enrich streaming data with static — or _reference_ — data that is stored externally. For example, `user` metadata may be stored in a relational database that Flink needs to join against directly.\nFlink SQL allows you to look up reference data and join it with a stream using a _lookup join_. The join requires one table to have a [processing time attribute](https://docs.ververica.com/user_guide/sql_development/table_view.html#processing-time-attributes) and the other table to be backed by a [lookup source connector](https://docs.ververica.com/user_guide/sql_development/connectors.html#id1), like the JDBC connector.\n\n## Using Lookup Joins\n\nIn this example, you will look up reference user data stored in MySQL to flag subscription events for users that are minors (`age \u003c 18`). 
The `FOR SYSTEM_TIME AS OF` clause uses the processing time attribute to ensure that each row of the `subscriptions` table is joined with the `users` rows that match the join predicate at the point in time when the `subscriptions` row is processed by the join operator. The lookup join also requires an equality join predicate based on the `PRIMARY KEY` of the lookup table (`usub.user_id \u003d u.user_id`). Here, the source does not have to read the entire table and can lazily fetch individual values from the external table when necessary.\n\n## Script\n\nThe source table (`subscriptions`) is backed by the [`faker` connector](https://github.com/knaufk/flink-faker), which continuously generates rows in memory based on Java Faker expressions. The `users` table is backed by an existing MySQL reference table using the [JDBC connector](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/jdbc.html).\n", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-10-08 22:58:34.507", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This example will show how you can enrich a stream with an external table of reference data (i.e. a \u003cem\u003elookup\u003c/em\u003e table).\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003ch2\u003eData Enrichment\u003c/h2\u003e\n\u003cp\u003eNot all data changes frequently, even when working in real-time: in some cases, you might need to enrich streaming data with static — or \u003cem\u003ereference\u003c/em\u003e — data that is stored externally. For example, \u003ccode\u003euser\u003c/code\u003e metadata may be stored in a relational database that Flink needs to join against directly.\u003cbr /\u003e\nFlink SQL allows you to look up reference data and join it with a stream using a \u003cem\u003elookup join\u003c/em\u003e. The join requires one table to have a \u003ca href\u003d\"https://docs.ververica.com/user_guide/sql_development/table_view.html#processing-time-attributes\"\u003eprocessing time attribute\u003c/a\u003e and the other table to be backed by a \u003ca href\u003d\"https://docs.ververica.com/user_guide/sql_development/connectors.html#id1\"\u003elookup source connector\u003c/a\u003e, like the JDBC connector.\u003c/p\u003e\n\u003ch2\u003eUsing Lookup Joins\u003c/h2\u003e\n\u003cp\u003eIn this example, you will look up reference user data stored in MySQL to flag subscription events for users that are minors (\u003ccode\u003eage \u0026lt; 18\u003c/code\u003e). The \u003ccode\u003eFOR SYSTEM_TIME AS OF\u003c/code\u003e clause uses the processing time attribute to ensure that each row of the \u003ccode\u003esubscriptions\u003c/code\u003e table is joined with the \u003ccode\u003eusers\u003c/code\u003e rows that match the join predicate at the point in time when the \u003ccode\u003esubscriptions\u003c/code\u003e row is processed by the join operator. 
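A minimal sketch of that shape, using the `subscriptions` and `users` tables defined below:

```sql
-- Sketch: the probe side's processing time attribute (proc_time) drives the
-- lookup, and the ON clause must cover the lookup table's PRIMARY KEY.
SELECT
  usub.id,
  u.age
FROM subscriptions AS usub
JOIN users FOR SYSTEM_TIME AS OF usub.proc_time AS u
  ON usub.user_id = u.user_id;
```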
The lookup join also requires an equality join predicate based on the \u003ccode\u003ePRIMARY KEY\u003c/code\u003e of the lookup table (\u003ccode\u003eusub.user_id \u003d u.user_id\u003c/code\u003e). Here, the source does not have to read the entire table and can lazily fetch individual values from the external table when necessary.\u003c/p\u003e\n\u003ch2\u003eScript\u003c/h2\u003e\n\u003cp\u003eThe source table (\u003ccode\u003esubscriptions\u003c/code\u003e) is backed by the \u003ca href\u003d\"https://github.com/knaufk/flink-faker\"\u003e\u003ccode\u003efaker\u003c/code\u003e connector\u003c/a\u003e, which continuously generates rows in memory based on Java Faker expressions. The \u003ccode\u003eusers\u003c/code\u003e table is backed by an existing MySQL reference table using the \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/jdbc.html\"\u003eJDBC connector\u003c/a\u003e.\u003c/p\u003e\n\n\u003c/div\u003e" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1614316800709_621606447", 40 | "id": "paragraph_1614316800709_621606447", 41 | "dateCreated": "2021-02-26 13:20:00.710", 42 | "dateStarted": "2021-10-08 22:58:34.509", 43 | "dateFinished": "2021-10-08 22:58:34.515", 44 | "status": "FINISHED" 45 | }, 46 | { 47 | "text": "%md\n\n本文将展示如何使用外部表中的 reference 数据来充实(enrich)一个流(即 _lookup_ 表)。\n\n## 数据充实(data enrichment)\n\n即使在实时场景下,也并不是所有的数据都频繁的改变:在一些场景中,我们可能会需要使用储存在外部的静态或者 _reference_ 数据来扩展一个流。例如,`user`表的元信息可能储存在关系型数据库中,Flink 需要直接 join 这个表。\nFLINK SQL 允许我们查询reference数据,并使用 _lookup join_ 来将它与一个流 join。这个 join 操作需要一个表有 [processing time attribute](https://docs.ververica.com/user_guide/sql_development/table_view.html#processing-time-attributes) 另一个表使用像 JDBC connector 这样的 [lookup source connector](https://docs.ververica.com/user_guide/sql_development/connectors.html#id1) 来提供数据。\n\n## 使用 Lookup Joins\n\n在这个例子中,我们使用储存在 MYSQL 中的用户 reference 数据来标记未成年用户 (`age \u003c 18`). `FOR SYSTEM_TIME AS OF` 语句使用处理时间属性来确保 `subscriptions` 表的每一个行在join操作处理这个 `subscriptions` 行的时间点与 `users` 表中 `users` 记录连接。lookup join 也需要一个 基于 lookup 表 `PRIMARY KEY` 的 相等 join 谓词 (`usub.user_id \u003d u.user_id`).
这里,数据源不需要读取整个表,可以在需要的时候从外部表中获取各自需要的值。\n\n\n## 脚本\n\n数据源 `subscriptions` 表使用 [`faker` connector](https://github.com/knaufk/flink-faker) 提供数据,它可以基于 Java Faker 表达式持续的在内存中生成数据。`users` 表使用 [JDBC connector](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/jdbc.html) 从一个已存在的 MYSQL reference 表中获取数据。\n", 48 | "user": "anonymous", 49 | "dateUpdated": "2021-03-18 15:57:00.333", 50 | "progress": 0, 51 | "config": { 52 | "tableHide": false, 53 | "editorSetting": { 54 | "language": "markdown", 55 | "editOnDblClick": true, 56 | "completionKey": "TAB", 57 | "completionSupport": false 58 | }, 59 | "colWidth": 12.0, 60 | "editorMode": "ace/mode/markdown", 61 | "fontSize": 9.0, 62 | "editorHide": true, 63 | "results": {}, 64 | "enabled": true 65 | }, 66 | "settings": { 67 | "params": {}, 68 | "forms": {} 69 | }, 70 | "results": { 71 | "code": "SUCCESS", 72 | "msg": [ 73 | { 74 | "type": "HTML", 75 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e本文将展示如何使用外部表中的 reference 数据来充实(enrich)一个流(即 \u003cem\u003elookup\u003c/em\u003e 表)。\u003c/p\u003e\n\u003ch2\u003e数据充实(data enrichment)\u003c/h2\u003e\n\u003cp\u003e即使在实时场景下,也并不是所有的数据都频繁的改变:在一些场景中,我们可能会需要使用储存在外部的静态或者 \u003cem\u003ereference\u003c/em\u003e 数据来扩展一个流。例如,\u003ccode\u003euser\u003c/code\u003e表的元信息可能储存在关系型数据库中,Flink 需要直接 join 这个表。\u003cbr /\u003e\nFLINK SQL 允许我们查询reference数据,并使用 \u003cem\u003elookup join\u003c/em\u003e 来将它与一个流 join。这个 join 操作需要一个表有 \u003ca href\u003d\"https://docs.ververica.com/user_guide/sql_development/table_view.html#processing-time-attributes\"\u003eprocessing time attribute\u003c/a\u003e 另一个表使用像 JDBC connector 这样的 \u003ca href\u003d\"https://docs.ververica.com/user_guide/sql_development/connectors.html#id1\"\u003elookup source connector\u003c/a\u003e 来提供数据。\u003c/p\u003e\n\u003ch2\u003e使用 Lookup Joins\u003c/h2\u003e\n\u003cp\u003e在这个例子中,我们使用储存在 MYSQL 中的用户 reference 数据来标记未成年用户 (\u003ccode\u003eage \u0026lt; 18\u003c/code\u003e). \u003ccode\u003eFOR SYSTEM_TIME AS OF\u003c/code\u003e 语句使用处理时间属性来确保 \u003ccode\u003esubscriptions\u003c/code\u003e 表的每一个行在join操作处理这个 \u003ccode\u003esubscriptions\u003c/code\u003e 行的时间点与 \u003ccode\u003eusers\u003c/code\u003e 表中 \u003ccode\u003eusers\u003c/code\u003e 记录连接。lookup join 也需要一个 基于 lookup 表 \u003ccode\u003ePRIMARY KEY\u003c/code\u003e 的 相等 join 谓词 (\u003ccode\u003eusub.user_id \u003d u.user_id\u003c/code\u003e).
这里,数据源不需要读取整个表,可以在需要的时候从外部表中获取各自需要的值。\u003c/p\u003e\n\u003ch2\u003e脚本\u003c/h2\u003e\n\u003cp\u003e数据源 \u003ccode\u003esubscriptions\u003c/code\u003e 表使用 \u003ca href\u003d\"https://github.com/knaufk/flink-faker\"\u003e\u003ccode\u003efaker\u003c/code\u003e connector\u003c/a\u003e 提供数据,它可以基于 Java Faker 表达式持续的在内存中生成数据。\u003ccode\u003eusers\u003c/code\u003e 表使用 \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/jdbc.html\"\u003eJDBC connector\u003c/a\u003e 从一个已存在的 MYSQL reference 表中获取数据。\u003c/p\u003e\n\n\u003c/div\u003e" 76 | } 77 | ] 78 | }, 79 | "apps": [], 80 | "runtimeInfos": {}, 81 | "progressUpdateIntervalMs": 500, 82 | "jobName": "paragraph_1615520569624_169709885", 83 | "id": "paragraph_1615520569624_169709885", 84 | "dateCreated": "2021-03-12 03:42:49.625", 85 | "dateStarted": "2021-03-18 15:57:00.333", 86 | "dateFinished": "2021-03-18 15:57:00.348", 87 | "status": "FINISHED" 88 | }, 89 | { 90 | "text": "%flink.ssql\n\nDROP TABLE IF EXISTS subscriptions;\n\nCREATE TABLE subscriptions ( \n id STRING,\n user_id INT,\n type STRING,\n start_date TIMESTAMP(3),\n end_date TIMESTAMP(3),\n payment_expiration TIMESTAMP(3),\n proc_time AS PROCTIME()\n) WITH (\n \u0027connector\u0027 \u003d \u0027faker\u0027,\n \u0027fields.id.expression\u0027 \u003d \u0027#{Internet.uuid}\u0027, \n \u0027fields.user_id.expression\u0027 \u003d \u0027#{number.numberBetween \u0027\u00271\u0027\u0027,\u0027\u002750\u0027\u0027}\u0027,\n \u0027fields.type.expression\u0027\u003d \u0027#{regexify \u0027\u0027(basic|premium|platinum){1}\u0027\u0027}\u0027,\n \u0027fields.start_date.expression\u0027 \u003d \u0027#{date.past \u0027\u002730\u0027\u0027,\u0027\u0027DAYS\u0027\u0027}\u0027,\n \u0027fields.end_date.expression\u0027 \u003d \u0027#{date.future \u0027\u0027365\u0027\u0027,\u0027\u0027DAYS\u0027\u0027}\u0027,\n \u0027fields.payment_expiration.expression\u0027 \u003d \u0027#{date.future \u0027\u0027365\u0027\u0027,\u0027\u0027DAYS\u0027\u0027}\u0027\n);\n\nDROP TABLE IF EXISTS users;\n\nCREATE TABLE users (\n user_id INT PRIMARY KEY,\n user_name VARCHAR(255) NOT NULL, \n age INT NOT NULL\n)\nWITH (\n \u0027connector\u0027 \u003d \u0027jdbc\u0027, \n \u0027url\u0027 \u003d \u0027jdbc:mysql://localhost:3306/mysql-database\u0027, \n \u0027table-name\u0027 \u003d \u0027users\u0027, \n \u0027username\u0027 \u003d \u0027mysql-user\u0027, \n \u0027password\u0027 \u003d \u0027mysql-password\u0027\n);", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-02-26 10:13:13.775", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "sql", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/sql", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | "progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614305528439_1240137011", 115 | "id": "paragraph_1614305528439_1240137011", 116 | "dateCreated": "2021-02-26 10:12:08.439", 117 | "dateStarted": "2021-02-26 10:13:13.787", 118 | "dateFinished": "2021-02-26 10:13:14.718", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql(type\u003dupdate)\n\nSELECT \n id AS subscription_id,\n type AS subscription_type,\n age AS user_age,\n CASE \n WHEN age \u003c 18 THEN 1\n ELSE 0\n END AS is_minor\nFROM subscriptions usub\nJOIN users FOR SYSTEM_TIME AS OF 
usub.proc_time AS u\n ON usub.user_id \u003d u.user_id;\n", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-02-26 10:13:21.804", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": {}, 137 | "enabled": true 138 | }, 139 | "settings": { 140 | "params": {}, 141 | "forms": {} 142 | }, 143 | "apps": [], 144 | "runtimeInfos": {}, 145 | "progressUpdateIntervalMs": 500, 146 | "jobName": "paragraph_1614305593778_1681665940", 147 | "id": "paragraph_1614305593778_1681665940", 148 | "dateCreated": "2021-02-26 10:13:13.779", 149 | "dateStarted": "2021-02-26 10:13:21.809", 150 | "dateFinished": "2021-02-26 10:13:22.703", 151 | "status": "ERROR" 152 | }, 153 | { 154 | "text": "%flink.ssql\n", 155 | "user": "anonymous", 156 | "dateUpdated": "2021-02-26 10:13:21.807", 157 | "progress": 0, 158 | "config": {}, 159 | "settings": { 160 | "params": {}, 161 | "forms": {} 162 | }, 163 | "apps": [], 164 | "runtimeInfos": {}, 165 | "progressUpdateIntervalMs": 500, 166 | "jobName": "paragraph_1614305601807_1982709287", 167 | "id": "paragraph_1614305601807_1982709287", 168 | "dateCreated": "2021-02-26 10:13:21.807", 169 | "status": "READY" 170 | } 171 | ], 172 | "name": "04 Lookup Joins", 173 | "id": "2FYWYEW8C", 174 | "defaultInterpreterGroup": "flink", 175 | "version": "0.10.0-SNAPSHOT", 176 | "noteParams": {}, 177 | "noteForms": {}, 178 | "angularObjects": {}, 179 | "config": { 180 | "isZeppelinNotebookCronEnable": false 181 | }, 182 | "info": {} 183 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/Joins/05 Real Time Star Schema Denormalization (N-Way Join)_2G1ZCV2GP.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n\u003e :bulb: In this recipe, we will de-normalize a simple star schema with an n-way temporal table join. \t \n\n[Star schemas](https://en.wikipedia.org/wiki/Star_schema) are a popular way of normalizing data within a data warehouse. At the center of a star schema is a **fact table** whose rows contain metrics, measurements, and other facts about the world. Surrounding fact tables are one or more **dimension tables** which have metadata useful for enriching facts when computing queries. \nYou are running a small data warehouse for a railroad company which consists of a fact table (`train_activities`) and three dimension tables (`stations`, `booking_channels`, and `passengers`). All inserts to the fact table, and all updates to the dimension tables, are mirrored to Apache Kafka. Records in the fact table are interpreted as inserts only, and so the table is backed by the [standard Kafka connector](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/kafka.html) (`connector` \u003d `kafka`). In contrast, the records in the dimensional tables are upserts based on a primary key, which requires the [Upsert Kafka connector](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/upsert-kafka.html) (`connector` \u003d `upsert-kafka`).\t \n\nWith Flink SQL you can now easily join all dimensions to our fact table using a 5-way temporal table join.
Temporal table joins take an arbitrary table (left input/probe side) and correlate each row to the corresponding row’s relevant version in a versioned table (right input/build side). Flink uses the SQL syntax of ``FOR SYSTEM_TIME AS OF`` to perform this operation. Using a temporal table join leads to consistent, reproducible results when joining a fact table with more (slowly) changing dimensional tables. Every event (row in the fact table) is joined to its corresponding value of each dimension based on when the event occurred in the real world. \n", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-10-08 22:59:14.588", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 In this recipe, we will de-normalize a simple star schema with an n-way temporal table join.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003e\u003ca href\u003d\"https://en.wikipedia.org/wiki/Star_schema\"\u003eStar schemas\u003c/a\u003e are a popular way of normalizing data within a data warehouse. At the center of a star schema is a \u003cstrong\u003efact table\u003c/strong\u003e whose rows contain metrics, measurements, and other facts about the world. Surrounding fact tables are one or more \u003cstrong\u003edimension tables\u003c/strong\u003e which have metadata useful for enriching facts when computing queries.\u003cbr /\u003e\nYou are running a small data warehouse for a railroad company which consists of a fact table (\u003ccode\u003etrain_activities\u003c/code\u003e) and three dimension tables (\u003ccode\u003estations\u003c/code\u003e, \u003ccode\u003ebooking_channels\u003c/code\u003e, and \u003ccode\u003epassengers\u003c/code\u003e). All inserts to the fact table, and all updates to the dimension tables, are mirrored to Apache Kafka. Records in the fact table are interpreted as inserts only, and so the table is backed by the \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/kafka.html\"\u003estandard Kafka connector\u003c/a\u003e (\u003ccode\u003econnector\u003c/code\u003e \u003d \u003ccode\u003ekafka\u003c/code\u003e). In contrast, the records in the dimensional tables are upserts based on a primary key, which requires the \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/upsert-kafka.html\"\u003eUpsert Kafka connector\u003c/a\u003e (\u003ccode\u003econnector\u003c/code\u003e \u003d \u003ccode\u003eupsert-kafka\u003c/code\u003e).\u003c/p\u003e\n\u003cp\u003eWith Flink SQL you can now easily join all dimensions to our fact table using a 5-way temporal table join. Temporal table joins take an arbitrary table (left input/probe side) and correlate each row to the corresponding row’s relevant version in a versioned table (right input/build side). Flink uses the SQL syntax of \u003ccode\u003eFOR SYSTEM_TIME AS OF\u003c/code\u003e to perform this operation.
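Reduced to two tables, the pattern looks like this (a sketch based on the `train_activities` and `booking_channels` tables defined below):

```sql
-- Sketch: each fact row is joined against the dimension row version that was
-- valid at the fact's event time (actual_departure_date).
SELECT
  t.actual_departure_date,
  b.channel
FROM train_activities t
LEFT JOIN booking_channels FOR SYSTEM_TIME AS OF t.actual_departure_date AS b
  ON t.booking_channel_key = b.booking_channel_key;
```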
Using a temporal table join leads to consistent, reproducible results when joining a fact table with more (slowly) changing dimensional tables. Every event (row in the fact table) is joined to its corresponding value of each dimension based on when the event occurred in the real world.\u003c/p\u003e\n\n\u003c/div\u003e" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1614305636615_1009654891", 40 | "id": "paragraph_1614305636615_1009654891", 41 | "dateCreated": "2021-02-26 10:13:56.615", 42 | "dateStarted": "2021-10-08 22:59:14.588", 43 | "dateFinished": "2021-10-08 22:59:14.593", 44 | "status": "FINISHED" 45 | }, 46 | { 47 | "text": "%md\n \n在本例中,我们将使用 n-way temporal table join 来 de-normalize 一个简单的星座模型。\n\n[Star schemas](https://en.wikipedia.org/wiki/Star_schema) 是一个流行的在数据仓库中 normalizing 数据的方式。星座模型的中心是含有指标、度量和其他关于世界的事实的 **fact table**(事实表)。\n\n围绕事实表的是一张或者多张 **dimension tables**(维度表),维度表的元数据在查询计算时是对事实表的扩展。\n\n你在一个小型的铁路公司运行了小型数据仓库,它包含一张事实表 (`train_activity`) 和3张 维度表 (`stations`, `booking_channels`, and `passengers`)。所有对事实表的插入和对维度表的更新都镜像到 Apache Kafka 中。事实表中的记录表示为只插入,所以它使用 [standard Kafka connector](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/kafka.html) (`connector` \u003d `kafka`) 提供数据。相反,维度表的记录基于主键更新或者插入,它需要 [Upsert Kafka connector](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/upsert-kafka.html) (`connector` \u003d `upsert-kafka`)。\n\n通过 Flink SQL 使用 5-way temporal 表连接, 你可以轻易的将所有的维度表连接到事实表上。Temporal table joins 采用任意一张表(左边 输入/探针 地址),将每一行记录与版本表(右边 输入/构建 侧)中对应行记录的相关版本关联。使用 `FOR SYSTEM_TIME AS OF` SQL 语法来执行这个操作。当将事实表与多个缓慢变化的维度表连接是使用一个 temporal table join 将产出一致的、可复现的结果。 每个事件(事实表中的一行)与每个维度表基于事件在真实世界发送的事件来连接对应的值。\n", 48 | "user": "anonymous", 49 | "dateUpdated": "2021-03-18 15:57:11.460", 50 | "progress": 0, 51 | "config": { 52 | "editorSetting": { 53 | "language": "markdown", 54 | "editOnDblClick": true, 55 | "completionKey": "TAB", 56 | "completionSupport": false 57 | }, 58 | "colWidth": 12.0, 59 | "editorMode": "ace/mode/markdown", 60 | "fontSize": 9.0, 61 | "results": {}, 62 | "enabled": true, 63 | "editorHide": true, 64 | "tableHide": false 65 | }, 66 | "settings": { 67 | "params": {}, 68 | "forms": {} 69 | }, 70 | "results": { 71 | "code": "SUCCESS", 72 | "msg": [ 73 | { 74 | "type": "HTML", 75 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e在本例中,我们将使用 n-way temporal table join 来 de-normalize 一个简单的星座模型。\u003c/p\u003e\n\u003cp\u003e\u003ca href\u003d\"https://en.wikipedia.org/wiki/Star_schema\"\u003eStar schemas\u003c/a\u003e 是一个流行的在数据仓库中 normalizing 数据的方式。星座模型的中心是含有指标、度量和其他关于世界的事实的 \u003cstrong\u003efact table\u003c/strong\u003e(事实表)。\u003c/p\u003e\n\u003cp\u003e围绕事实表的是一张或者多张 \u003cstrong\u003edimension tables\u003c/strong\u003e(维度表),维度表的元数据在查询计算时是对事实表的扩展。\u003c/p\u003e\n\u003cp\u003e你在一个小型的铁路公司运行了小型数据仓库,它包含一张事实表 (\u003ccode\u003etrain_activity\u003c/code\u003e) 和3张 维度表 (\u003ccode\u003estations\u003c/code\u003e, \u003ccode\u003ebooking_channels\u003c/code\u003e, and \u003ccode\u003epassengers\u003c/code\u003e)。所有对事实表的插入和对维度表的更新都镜像到 Apache Kafka 中。事实表中的记录表示为只插入,所以它使用 \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/kafka.html\"\u003estandard Kafka connector\u003c/a\u003e (\u003ccode\u003econnector\u003c/code\u003e \u003d \u003ccode\u003ekafka\u003c/code\u003e) 提供数据。相反,维度表的记录基于主键更新或者插入,它需要 \u003ca 
href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/connectors/upsert-kafka.html\"\u003eUpsert Kafka connector\u003c/a\u003e (\u003ccode\u003econnector\u003c/code\u003e \u003d \u003ccode\u003eupsert-kafka\u003c/code\u003e)。\u003c/p\u003e\n\u003cp\u003e通过 Flink SQL 使用 5-way temporal 表连接, 你可以轻易的将所有的维度表连接到事实表上。Temporal table joins 采用任意一张表(左边 输入/探针 地址),将每一行记录与版本表(右边 输入/构建 侧)中对应行记录的相关版本关联。使用 \u003ccode\u003eFOR SYSTEM_TIME AS OF\u003c/code\u003e SQL 语法来执行这个操作。当将事实表与多个缓慢变化的维度表连接是使用一个 temporal table join 将产出一致的、可复现的结果。 每个事件(事实表中的一行)与每个维度表基于事件在真实世界发送的事件来连接对应的值。\u003c/p\u003e\n\n\u003c/div\u003e" 76 | } 77 | ] 78 | }, 79 | "apps": [], 80 | "runtimeInfos": {}, 81 | "progressUpdateIntervalMs": 500, 82 | "jobName": "paragraph_1615523318129_1784896943", 83 | "id": "paragraph_1615523318129_1784896943", 84 | "dateCreated": "2021-03-12 04:28:38.129", 85 | "dateStarted": "2021-03-18 15:57:11.460", 86 | "dateFinished": "2021-03-18 15:57:11.471", 87 | "status": "FINISHED" 88 | }, 89 | { 90 | "text": "%flink.ssql\n\n\nDROP TABLE IF EXISTS passengers;\n\nCREATE TABLE passengers (\n passenger_key STRING, \n first_name STRING, \n last_name STRING,\n update_time TIMESTAMP(3),\n WATERMARK FOR update_time AS update_time - INTERVAL \u002710\u0027 SECONDS,\n PRIMARY KEY (passenger_key) NOT ENFORCED\n) WITH (\n \u0027connector\u0027 \u003d \u0027upsert-kafka\u0027,\n \u0027topic\u0027 \u003d \u0027passengers\u0027,\n \u0027properties.bootstrap.servers\u0027 \u003d \u0027localhost:9092\u0027,\n \u0027key.format\u0027 \u003d \u0027raw\u0027,\n \u0027value.format\u0027 \u003d \u0027json\u0027\n);\n\nDROP TABLE IF EXISTS stations;\n\nCREATE TABLE stations (\n station_key STRING, \n update_time TIMESTAMP(3),\n city STRING,\n WATERMARK FOR update_time AS update_time - INTERVAL \u002710\u0027 SECONDS,\n PRIMARY KEY (station_key) NOT ENFORCED\n) WITH (\n \u0027connector\u0027 \u003d \u0027upsert-kafka\u0027,\n \u0027topic\u0027 \u003d \u0027stations\u0027,\n \u0027properties.bootstrap.servers\u0027 \u003d \u0027localhost:9092\u0027,\n \u0027key.format\u0027 \u003d \u0027raw\u0027,\n \u0027value.format\u0027 \u003d \u0027json\u0027\n);\n\nDROP TABLE IF EXISTS booking_channels;\n\nCREATE TABLE booking_channels (\n booking_channel_key STRING, \n update_time TIMESTAMP(3),\n channel STRING,\n WATERMARK FOR update_time AS update_time - INTERVAL \u002710\u0027 SECONDS,\n PRIMARY KEY (booking_channel_key) NOT ENFORCED\n) WITH (\n \u0027connector\u0027 \u003d \u0027upsert-kafka\u0027,\n \u0027topic\u0027 \u003d \u0027booking_channels\u0027,\n \u0027properties.bootstrap.servers\u0027 \u003d \u0027localhost:9092\u0027,\n \u0027key.format\u0027 \u003d \u0027raw\u0027,\n \u0027value.format\u0027 \u003d \u0027json\u0027\n);\n\nDROP TABLE IF EXISTS train_activities;\n\nCREATE TABLE train_activities (\n scheduled_departure_time TIMESTAMP(3),\n actual_departure_date TIMESTAMP(3),\n passenger_key STRING, \n origin_station_key STRING, \n destination_station_key STRING,\n booking_channel_key STRING,\n WATERMARK FOR actual_departure_date AS actual_departure_date - INTERVAL \u002710\u0027 SECONDS\n) WITH (\n \u0027connector\u0027 \u003d \u0027kafka\u0027,\n \u0027topic\u0027 \u003d \u0027train_activities\u0027,\n \u0027properties.bootstrap.servers\u0027 \u003d \u0027localhost:9092\u0027,\n \u0027value.format\u0027 \u003d \u0027json\u0027,\n \u0027value.fields-include\u0027 \u003d \u0027ALL\u0027\n);\n\n", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-02-27 15:20:14.742", 93 | "progress": 0, 94 | "config": { 95 | 
"colWidth": 12.0, 96 | "fontSize": 9.0, 97 | "enabled": true, 98 | "results": {}, 99 | "editorSetting": { 100 | "language": "sql", 101 | "editOnDblClick": false, 102 | "completionKey": "TAB", 103 | "completionSupport": true 104 | }, 105 | "editorMode": "ace/mode/sql" 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | "progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614316870006_1898527227", 115 | "id": "paragraph_1614316870006_1898527227", 116 | "dateCreated": "2021-02-26 13:21:10.006", 117 | "dateStarted": "2021-02-27 15:20:14.746", 118 | "dateFinished": "2021-02-27 15:20:15.750", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql(type\u003dupdate)\n\nSELECT \n t.actual_departure_date, \n p.first_name,\n p.last_name,\n b.channel, \n os.city AS origin_station,\n ds.city AS destination_station\nFROM train_activities t\nLEFT JOIN booking_channels FOR SYSTEM_TIME AS OF t.actual_departure_date AS b \nON t.booking_channel_key \u003d b.booking_channel_key;\nLEFT JOIN passengers FOR SYSTEM_TIME AS OF t.actual_departure_date AS p\nON t.passenger_key \u003d p.passenger_key\nLEFT JOIN stations FOR SYSTEM_TIME AS OF t.actual_departure_date AS os\nON t.origin_station_key \u003d os.station_key\nLEFT JOIN stations FOR SYSTEM_TIME AS OF t.actual_departure_date AS ds\nON t.destination_station_key \u003d ds.station_key\nORDER BY t.actual_departure_date DESC\nLIMIT 10;\n", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-02-27 15:21:03.980", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": {}, 137 | "enabled": true 138 | }, 139 | "settings": { 140 | "params": {}, 141 | "forms": {} 142 | }, 143 | "apps": [], 144 | "runtimeInfos": {}, 145 | "progressUpdateIntervalMs": 500, 146 | "jobName": "paragraph_1614410414745_969940744", 147 | "id": "paragraph_1614410414745_969940744", 148 | "dateCreated": "2021-02-27 15:20:14.745", 149 | "dateStarted": "2021-02-27 15:21:03.984", 150 | "dateFinished": "2021-02-27 15:21:04.881", 151 | "status": "ERROR" 152 | }, 153 | { 154 | "text": "%flink.ssql\n", 155 | "user": "anonymous", 156 | "dateUpdated": "2021-02-27 15:20:55.778", 157 | "progress": 0, 158 | "config": {}, 159 | "settings": { 160 | "params": {}, 161 | "forms": {} 162 | }, 163 | "apps": [], 164 | "runtimeInfos": {}, 165 | "progressUpdateIntervalMs": 500, 166 | "jobName": "paragraph_1614410455778_1166301104", 167 | "id": "paragraph_1614410455778_1166301104", 168 | "dateCreated": "2021-02-27 15:20:55.778", 169 | "status": "READY" 170 | } 171 | ], 172 | "name": "05 Real Time Star Schema Denormalization (N-Way Join)", 173 | "id": "2G1ZCV2GP", 174 | "defaultInterpreterGroup": "flink", 175 | "version": "0.10.0-SNAPSHOT", 176 | "noteParams": {}, 177 | "noteForms": {}, 178 | "angularObjects": {}, 179 | "config": { 180 | "isZeppelinNotebookCronEnable": false 181 | }, 182 | "info": {} 183 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/Joins/06 Lateral Table Join_2G1VYGDFE.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n\u003e :bulb: This example will show how you can correlate events using a `LATERAL` join.\n\nA recent addition to 
the SQL standard is the `LATERAL` join, which allows you to combine the power of a correlated subquery with the expressiveness of a join. \n\nGiven a table with people\u0027s addresses, you need to find the two most populous cities for each state and continuously update those rankings as people move. The input table of `People` contains a uid for each person and their address and when they moved there.\n\nThe first step is to calculate each city\u0027s population using a continuous aggregation. While this is simple enough, the real power of Flink SQL comes when people move. By using deduplication, Flink will automatically issue a retraction for a person\u0027s old city when they move. So if John moves from New York to Los Angeles, the population for New York will automatically go down by 1. This gives us the power of change data capture without having to invest in the actual infrastructure of setting it up!\n\nWith this dynamic population table at hand, you are ready to solve the original problem using a `LATERAL` table join. Unlike a normal join, lateral joins allow the subquery to correlate with columns from other arguments in the `FROM` clause. And unlike a regular subquery, as a join, the lateral can return multiple rows. You can now have a sub-query correlated with every individual state, and for every state it ranks by population and returns the top 2 cities.\n", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-10-08 22:59:50.002", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This example will show how you can correlate events using a \u003ccode\u003eLATERAL\u003c/code\u003e join.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003eA recent addition to the SQL standard is the \u003ccode\u003eLATERAL\u003c/code\u003e join, which allows you to combine the power of a correlated subquery with the expressiveness of a join.\u003c/p\u003e\n\u003cp\u003eGiven a table with people\u0026rsquo;s addresses, you need to find the two most populous cities for each state and continuously update those rankings as people move. The input table of \u003ccode\u003ePeople\u003c/code\u003e contains a uid for each person and their address and when they moved there.\u003c/p\u003e\n\u003cp\u003eThe first step is to calculate each city\u0026rsquo;s population using a continuous aggregation. While this is simple enough, the real power of Flink SQL comes when people move. By using deduplication, Flink will automatically issue a retraction for a person\u0026rsquo;s old city when they move. So if John moves from New York to Los Angeles, the population for New York will automatically go down by 1. This gives us the power of change data capture without having to invest in the actual infrastructure of setting it up!\u003c/p\u003e\n\u003cp\u003eWith this dynamic population table at hand, you are ready to solve the original problem using a \u003ccode\u003eLATERAL\u003c/code\u003e table join.
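The deduplication step mentioned above is worth seeing on its own; a sketch over the `People` table defined below shows what makes a move retract the old city's count:

```sql
-- Sketch: keep only each person's latest address; when someone moves, Flink
-- retracts the old (city, state) count and emits an insert for the new one.
SELECT
  city,
  state,
  COUNT(*) AS population
FROM (
  SELECT
    city,
    state,
    ROW_NUMBER() OVER (PARTITION BY id ORDER BY arrival_time DESC) AS rownum
  FROM People
)
WHERE rownum = 1
GROUP BY city, state;
```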
Unlike a normal join, lateral joins allow the subquery to correlate with columns from other arguments in the \u003ccode\u003eFROM\u003c/code\u003e clause. And unlike a regular subquery, as a join, the lateral can return multiple rows. You can now have a sub-query correlated with every individual state, and for every state it ranks by population and returns the top 2 cities.\u003c/p\u003e\n\n\u003c/div\u003e" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1614316927145_278384972", 40 | "id": "paragraph_1614316927145_278384972", 41 | "dateCreated": "2021-02-26 13:22:07.145", 42 | "dateStarted": "2021-10-08 22:59:50.003", 43 | "dateFinished": "2021-10-08 22:59:50.007", 44 | "status": "FINISHED" 45 | }, 46 | { 47 | "text": "%md\n\n本例将展示如何使用 `LATERAL` join 来关联事件。\n\n `LATERAL` join 是最近添加到标准 SQL 的,它允许我们在 join 的时候结合关联子查询的能力。\n \n 给定一个居民地址的表,我们需要找出对于每个州人口最多的城市。尽管这个例子很简单,当居民移动的时候就能体现 Flink SQL 的价值了。 通过使用去重,当居民移动时 Flink 会自动发送一个 retraction给 居民以前的城市。所以如果 John 从纽约迁移到了洛杉矶,纽约的人口自动减去1。 这个能力能给我们提供捕获数据改变的能力而不需要花费额外的精力来搭建实际的基础设施!\n \n 借助动态人口表,我们可以使用 `LATERAL` table join 来解决之前的问题。与普通的 join 不同,lateral 连接允许子查询关联`FROM` 语句中的其他参数的字段。并且与传统的子查询不同,作为一个连接,lateral 可以返回多行。现在我们可以使用一个与每个州关联的子查询并对每个州的城市的人口排序返回最多的2个城市。\n\n", 48 | "user": "anonymous", 49 | "dateUpdated": "2021-03-18 15:57:24.097", 50 | "progress": 0, 51 | "config": { 52 | "editorSetting": { 53 | "language": "markdown", 54 | "editOnDblClick": true, 55 | "completionKey": "TAB", 56 | "completionSupport": false 57 | }, 58 | "colWidth": 12.0, 59 | "editorMode": "ace/mode/markdown", 60 | "fontSize": 9.0, 61 | "results": {}, 62 | "enabled": true, 63 | "editorHide": true, 64 | "tableHide": false 65 | }, 66 | "settings": { 67 | "params": {}, 68 | "forms": {} 69 | }, 70 | "results": { 71 | "code": "SUCCESS", 72 | "msg": [ 73 | { 74 | "type": "HTML", 75 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e本例将展示如何使用 \u003ccode\u003eLATERAL\u003c/code\u003e join 来关联事件。\u003c/p\u003e\n\u003cp\u003e\u003ccode\u003eLATERAL\u003c/code\u003e join 是最近添加到标准 SQL 的,它允许我们在 join 的时候结合关联子查询的能力。\u003c/p\u003e\n\u003cp\u003e给定一个居民地址的表,我们需要找出对于每个州人口最多的城市。尽管这个例子很简单,当居民移动的时候就能体现 Flink SQL 的价值了。 通过使用去重,当居民移动时 Flink 会自动发送一个 retraction给 居民以前的城市。所以如果 John 从纽约迁移到了洛杉矶,纽约的人口自动减去1。 这个能力能给我们提供捕获数据改变的能力而不需要花费额外的精力来搭建实际的基础设施!\u003c/p\u003e\n\u003cp\u003e借助动态人口表,我们可以使用 \u003ccode\u003eLATERAL\u003c/code\u003e table join 来解决之前的问题。与普通的 join 不同,lateral 连接允许子查询关联\u003ccode\u003eFROM\u003c/code\u003e 语句中的其他参数的字段。并且与传统的子查询不同,作为一个连接,lateral 可以返回多行。现在我们可以使用一个与每个州关联的子查询并对每个州的城市的人口排序返回最多的2个城市。\u003c/p\u003e\n\n\u003c/div\u003e" 76 | } 77 | ] 78 | }, 79 | "apps": [], 80 | "runtimeInfos": {}, 81 | "progressUpdateIntervalMs": 500, 82 | "jobName": "paragraph_1615535173871_1879361334", 83 | "id": "paragraph_1615535173871_1879361334", 84 | "dateCreated": "2021-03-12 07:46:13.871", 85 | "dateStarted": "2021-03-18 15:57:24.097", 86 | "dateFinished": "2021-03-18 15:57:24.107", 87 | "status": "FINISHED" 88 | }, 89 | { 90 | "text": "%flink.ssql\n\n\nDROP TABLE IF EXISTS People;\n\nCREATE TABLE People (\n id INT,\n city STRING,\n state STRING,\n arrival_time TIMESTAMP(3),\n WATERMARK FOR arrival_time AS arrival_time - INTERVAL \u00271\u0027 MINUTE \n) WITH (\n \u0027connector\u0027 \u003d \u0027faker\u0027,\n \u0027fields.id.expression\u0027 \u003d \u0027#{number.numberBetween \u0027\u00271\u0027\u0027,\u0027\u0027100\u0027\u0027}\u0027,\n \u0027fields.city.expression\u0027 \u003d \u0027#{regexify
\u0027\u0027(Newmouth|Newburgh|Portport|Southfort|Springfield){1}\u0027\u0027}\u0027,\n \u0027fields.state.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(New York|Illinois|California|Washington){1}\u0027\u0027}\u0027,\n \u0027fields.arrival_time.expression\u0027 \u003d \u0027#{date.past \u0027\u002715\u0027\u0027,\u0027\u0027SECONDS\u0027\u0027}\u0027,\n \u0027rows-per-second\u0027 \u003d \u002710\u0027\n); \n\n\nDROP VIEW IF EXISTS CurrentPopulation;\n\nCREATE VIEW CurrentPopulation AS\nSELECT \n city,\n state,\n COUNT(*) as population\nFROM (\n SELECT\n city,\n state,\n ROW_NUMBER() OVER (PARTITION BY id ORDER BY arrival_time DESC) AS rownum\n FROM People\n)\nWHERE rownum \u003d 1\nGROUP BY city, state;", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-02-26 10:15:33.014", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "sql", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/sql", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | "progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614305676693_319393607", 115 | "id": "paragraph_1614305676693_319393607", 116 | "dateCreated": "2021-02-26 10:14:36.693", 117 | "dateStarted": "2021-02-26 10:15:33.023", 118 | "dateFinished": "2021-02-26 10:15:53.304", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql(type\u003dupdate)\n\nSELECT\n state,\n city,\n population\nFROM \n (SELECT DISTINCT state FROM CurrentPopulation) States,\n LATERAL (\n SELECT city, population\n FROM CurrentPopulation\n WHERE state \u003d States.state\n ORDER BY population DESC\n LIMIT 2\n);\n", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-02-26 13:22:04.824", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": { 137 | "0": { 138 | "graph": { 139 | "mode": "table", 140 | "height": 300.0, 141 | "optionOpen": false, 142 | "setting": { 143 | "table": { 144 | "tableGridState": {}, 145 | "tableColumnTypeState": { 146 | "names": { 147 | "state": "string", 148 | "city": "string", 149 | "population": "string" 150 | }, 151 | "updated": false 152 | }, 153 | "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 154 | "tableOptionValue": { 155 | "useFilter": false, 156 | "showPagination": false, 157 | "showAggregationFooter": false 158 | }, 159 | "updated": false, 160 | "initialized": false 161 | } 162 | }, 163 | "commonSetting": {} 164 | } 165 | } 166 | }, 167 | "enabled": true 168 | }, 169 | "settings": { 170 | "params": {}, 171 | "forms": {} 172 | }, 173 | "apps": [], 174 | "runtimeInfos": { 175 | "jobUrl": { 176 | "propertyName": "jobUrl", 177 | "label": "FLINK JOB", 178 | 
"tooltip": "View in Flink web UI", 179 | "group": "flink", 180 | "values": [ 181 | { 182 | "jobUrl": "http://localhost:8081#/job/202e01312f9dc27c71d8cb37529a8b5f" 183 | } 184 | ], 185 | "interpreterSettingId": "flink" 186 | } 187 | }, 188 | "progressUpdateIntervalMs": 500, 189 | "jobName": "paragraph_1614305708940_1594333975", 190 | "id": "paragraph_1614305708940_1594333975", 191 | "dateCreated": "2021-02-26 10:15:08.940", 192 | "dateStarted": "2021-02-26 10:16:08.740", 193 | "dateFinished": "2021-02-26 10:16:24.488", 194 | "status": "ABORT" 195 | }, 196 | { 197 | "text": "%flink.ssql\n", 198 | "user": "anonymous", 199 | "dateUpdated": "2021-02-26 10:16:08.725", 200 | "progress": 0, 201 | "config": {}, 202 | "settings": { 203 | "params": {}, 204 | "forms": {} 205 | }, 206 | "apps": [], 207 | "runtimeInfos": {}, 208 | "progressUpdateIntervalMs": 500, 209 | "jobName": "paragraph_1614305768725_733914734", 210 | "id": "paragraph_1614305768725_733914734", 211 | "dateCreated": "2021-02-26 10:16:08.725", 212 | "status": "READY" 213 | } 214 | ], 215 | "name": "06 Lateral Table Join", 216 | "id": "2G1VYGDFE", 217 | "defaultInterpreterGroup": "flink", 218 | "version": "0.10.0-SNAPSHOT", 219 | "noteParams": {}, 220 | "noteForms": {}, 221 | "angularObjects": {}, 222 | "config": { 223 | "isZeppelinNotebookCronEnable": false 224 | }, 225 | "info": {} 226 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/Joins/06_Lateral_Table_Join.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjffdu/flink-sql-cookbook-on-zeppelin/6cb9a0a0b64ef9eb87b4f8ced63e447b4aab72b9/Flink Sql Cookbook/Joins/06_Lateral_Table_Join.gif -------------------------------------------------------------------------------- /Flink Sql Cookbook/Other Builtin Functions/02 Union_Multiple_Stream.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjffdu/flink-sql-cookbook-on-zeppelin/6cb9a0a0b64ef9eb87b4f8ced63e447b4aab72b9/Flink Sql Cookbook/Other Builtin Functions/02 Union_Multiple_Stream.gif -------------------------------------------------------------------------------- /Flink Sql Cookbook/Other Builtin Functions/03 Filter_Late_Data.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjffdu/flink-sql-cookbook-on-zeppelin/6cb9a0a0b64ef9eb87b4f8ced63e447b4aab72b9/Flink Sql Cookbook/Other Builtin Functions/03 Filter_Late_Data.gif -------------------------------------------------------------------------------- /Flink Sql Cookbook/Other Builtin Functions/03 Filtering out Late Data_2GJ18VM3X.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\n![Twitter Badge](https://img.shields.io/badge/Flink%20Version-1.14%2B-lightgrey)\n\n\u003e :bulb: This example will show how to filter out late data using the `CURRENT_WATERMARK` function.\nThe source table (`mobile_usage`) is backed by the [`faker` connector](https://flink-packages.org/packages/flink-faker), which continuously generates rows in memory based on Java Faker expressions.\n\nAs explained before in the [watermarks recipe](../../aggregations-and-analytics/02_watermarks/02_watermarks.md), Flink uses watermarks to measure progress in event time. 
By using a `WATERMARK` attribute in a table\u0027s DDL, we signify a column as the table\u0027s event time attribute and tell Flink how out of order we expect our data to arrive. \n\nThere are many cases in which rows arrive even more out of order than anticipated, i.e. after the watermark. This data is called *late*. An example could be someone using a mobile app while offline, because of a lack of mobile coverage or because flight mode is enabled. When Internet access is restored, previously tracked activities would then be sent.\n\nIn this recipe, we\u0027ll filter out this late data using the [`CURRENT_WATERMARK`](https://ci.apache.org/projects/flink/flink-docs-release-1.14/docs/dev/table/functions/systemfunctions/) function. In the first statement, we\u0027ll use the non-late data combined with the [`TUMBLE`](../../aggregations-and-analytics/01_group_by_window/01_group_by_window_tvf.md) function to send the unique IP addresses per minute to a downstream consumer (like a BI tool). In addition, we\u0027re sending the late data to a different sink. For example, you might want to use these rows to change the results of your product recommender for offline mobile app users. \n\nThis table DDL contains both an event time and a processing time definition. `ingest_time` is defined as processing time, while `log_time` is defined as event time and will contain timestamps between 45 and 10 seconds ago. \n", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-10-08 11:53:33.372", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003e\u003cimg src\u003d\"https://img.shields.io/badge/Flink%20Version-1.14%2B-lightgrey\" alt\u003d\"Twitter Badge\" /\u003e\u003c/p\u003e\n\u003cblockquote\u003e\n\u003cp\u003e💡 This example will show how to filter out late data using the \u003ccode\u003eCURRENT_WATERMARK\u003c/code\u003e function.\u003cbr /\u003e\nThe source table (\u003ccode\u003emobile_usage\u003c/code\u003e) is backed by the \u003ca href\u003d\"https://flink-packages.org/packages/flink-faker\"\u003e\u003ccode\u003efaker\u003c/code\u003e connector\u003c/a\u003e, which continuously generates rows in memory based on Java Faker expressions.\u003c/p\u003e\n\u003c/blockquote\u003e\n\u003cp\u003eAs explained before in the \u003ca href\u003d\"../../aggregations-and-analytics/02_watermarks/02_watermarks.md\"\u003ewatermarks recipe\u003c/a\u003e, Flink uses watermarks to measure progress in event time. By using a \u003ccode\u003eWATERMARK\u003c/code\u003e attribute in a table\u0026rsquo;s DDL, we signify a column as the table\u0026rsquo;s event time attribute and tell Flink how out of order we expect our data to arrive.\u003c/p\u003e\n\u003cp\u003eThere are many cases in which rows arrive even more out of order than anticipated, i.e. after the watermark. This data is called \u003cem\u003elate\u003c/em\u003e. An example could be someone using a mobile app while offline, because of a lack of mobile coverage or because flight mode is enabled. When Internet access is restored, previously tracked activities would then be sent.\u003c/p\u003e\n\u003cp\u003eIn this recipe, we\u0026rsquo;ll filter out this late data using the \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-release-1.14/docs/dev/table/functions/systemfunctions/\"\u003e\u003ccode\u003eCURRENT_WATERMARK\u003c/code\u003e\u003c/a\u003e function. In the first statement, we\u0026rsquo;ll use the non-late data combined with the \u003ca href\u003d\"../../aggregations-and-analytics/01_group_by_window/01_group_by_window_tvf.md\"\u003e\u003ccode\u003eTUMBLE\u003c/code\u003e\u003c/a\u003e function to send the unique IP addresses per minute to a downstream consumer (like a BI tool). In addition, we\u0026rsquo;re sending the late data to a different sink. For example, you might want to use these rows to change the results of your product recommender for offline mobile app users.\u003c/p\u003e\n\u003cp\u003eThis table DDL contains both an event time and a processing time definition. \u003ccode\u003eingest_time\u003c/code\u003e is defined as processing time, while \u003ccode\u003elog_time\u003c/code\u003e is defined as event time and will contain timestamps between 45 and 10 seconds ago.\u003c/p\u003e\n\n\u003c/div\u003e" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1633665177413_1398758132", 40 | "id": "paragraph_1633665177413_1398758132", 41 | "dateCreated": "2021-10-08 11:52:57.413", 42 | "dateStarted": "2021-10-08 11:53:33.376", 43 | "dateFinished": "2021-10-08 11:53:35.864", 44 | "status": "FINISHED" 45 | }, 46 | { 47 | "text": "%flink.ssql\n\n-- Create source table\nCREATE TABLE IF NOT EXISTS `mobile_usage` ( \n `activity` STRING, \n `client_ip` STRING,\n `ingest_time` AS PROCTIME(),\n `log_time` TIMESTAMP_LTZ(3), \n WATERMARK FOR log_time AS log_time - INTERVAL \u002715\u0027 SECONDS\n) WITH (\n \u0027connector\u0027 \u003d \u0027faker\u0027, \n \u0027rows-per-second\u0027 \u003d \u002750\u0027,\n \u0027fields.activity.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(open_push_message|discard_push_message|open_app|display_overview|change_settings)\u0027\u0027}\u0027,\n \u0027fields.client_ip.expression\u0027 \u003d \u0027#{Internet.publicIpV4Address}\u0027,\n \u0027fields.log_time.expression\u0027 \u003d \u0027#{date.past \u0027\u002745\u0027\u0027,\u0027\u002710\u0027\u0027,\u0027\u0027SECONDS\u0027\u0027}\u0027\n);\n\n-- Create sink table for rows that are non-late\nCREATE TABLE IF NOT EXISTS `unique_users_per_window` ( \n `window_start` TIMESTAMP(3), \n `window_end` TIMESTAMP(3),\n `ip_addresses` BIGINT\n) WITH (\n \u0027connector\u0027 \u003d \u0027blackhole\u0027\n);\n\n-- Create sink table for rows that are late\nCREATE TABLE IF NOT EXISTS `late_usage_events` ( \n `activity` STRING, \n `client_ip` STRING,\n `ingest_time` TIMESTAMP_LTZ(3),\n `log_time` TIMESTAMP_LTZ(3), \n `current_watermark` TIMESTAMP_LTZ(3) \n) WITH (\n \u0027connector\u0027 \u003d \u0027blackhole\u0027\n);\n\n-- Create a view with non-late data: a row is on time if no watermark\n-- has been emitted yet, or if its timestamp is still ahead of the current watermark\nCREATE TEMPORARY VIEW IF NOT EXISTS `mobile_data` AS\n SELECT * FROM mobile_usage\n WHERE CURRENT_WATERMARK(log_time) IS NULL\n OR log_time \u003e CURRENT_WATERMARK(log_time);\n\n-- Create a view with late data: a row is late if a watermark exists\n-- and the row\u0027s timestamp is at or behind it\nCREATE TEMPORARY VIEW IF NOT EXISTS `late_mobile_data` AS \n SELECT * FROM mobile_usage\n WHERE CURRENT_WATERMARK(log_time) IS NOT NULL\n AND log_time \u003c\u003d CURRENT_WATERMARK(log_time);\n\n", 48 | "user": "anonymous", 49 |
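A quick way to sanity-check the late/non-late split above is to look at each row next to the watermark Flink has reached so far. This probe query is an illustrative sketch, assuming the `mobile_usage` table from the DDL above:

```sql
-- Probe sketch: emit each row with the current watermark and a lateness flag.
-- CURRENT_WATERMARK(log_time) is NULL until Flink has emitted a first watermark.
SELECT
  log_time,
  CURRENT_WATERMARK(log_time) AS watermark_now,
  CURRENT_WATERMARK(log_time) IS NOT NULL
    AND log_time <= CURRENT_WATERMARK(log_time) AS is_late
FROM mobile_usage;
```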
"dateUpdated": "2021-10-08 12:09:44.850", 50 | "progress": 0, 51 | "config": { 52 | "editorSetting": { 53 | "language": "sql", 54 | "editOnDblClick": false, 55 | "completionKey": "TAB", 56 | "completionSupport": true 57 | }, 58 | "colWidth": 12.0, 59 | "editorMode": "ace/mode/sql", 60 | "fontSize": 9.0, 61 | "results": {}, 62 | "enabled": true 63 | }, 64 | "settings": { 65 | "params": {}, 66 | "forms": {} 67 | }, 68 | "results": { 69 | "code": "SUCCESS", 70 | "msg": [ 71 | { 72 | "type": "TEXT", 73 | "data": "Table has been created.\nTable has been created.\nTable has been created.\nView has been created.\nView has been created.\n" 74 | } 75 | ] 76 | }, 77 | "apps": [], 78 | "runtimeInfos": {}, 79 | "progressUpdateIntervalMs": 500, 80 | "jobName": "paragraph_1633665213368_836109418", 81 | "id": "paragraph_1633665213368_836109418", 82 | "dateCreated": "2021-10-08 11:53:33.368", 83 | "dateStarted": "2021-10-08 12:09:44.851", 84 | "dateFinished": "2021-10-08 12:09:45.152", 85 | "status": "FINISHED" 86 | }, 87 | { 88 | "text": "%flink.ssql(runAsOne\u003dtrue)\n\nINSERT INTO `unique_users_per_window`\n SELECT `window_start`, `window_end`, COUNT(DISTINCT client_ip) AS `ip_addresses`\n FROM TABLE(\n TUMBLE(TABLE mobile_data, DESCRIPTOR(log_time), INTERVAL \u00271\u0027 MINUTE))\n GROUP BY window_start, window_end;\n\n-- Send all rows that are late to the sink for late data\nINSERT INTO `late_usage_events`\n SELECT *, CURRENT_WATERMARK(log_time) as `current_watermark` from `late_mobile_data`;\n", 89 | "user": "anonymous", 90 | "dateUpdated": "2021-10-08 12:09:54.241", 91 | "progress": 0, 92 | "config": { 93 | "editorSetting": { 94 | "language": "sql", 95 | "editOnDblClick": false, 96 | "completionKey": "TAB", 97 | "completionSupport": true 98 | }, 99 | "colWidth": 12.0, 100 | "editorMode": "ace/mode/sql", 101 | "fontSize": 9.0, 102 | "results": {}, 103 | "enabled": true 104 | }, 105 | "settings": { 106 | "params": {}, 107 | "forms": {} 108 | }, 109 | "results": { 110 | "code": "SUCCESS", 111 | "msg": [ 112 | { 113 | "type": "ANGULAR", 114 | "data": "\u003ch1\u003eDuration: {{duration}} \u003c/h1\u003e\n" 115 | }, 116 | { 117 | "type": "TEXT", 118 | "data": "Job is cancelled.\n" 119 | } 120 | ] 121 | }, 122 | "apps": [], 123 | "runtimeInfos": { 124 | "jobUrl": { 125 | "propertyName": "jobUrl", 126 | "label": "FLINK JOB", 127 | "tooltip": "View in Flink web UI", 128 | "group": "flink", 129 | "values": [ 130 | { 131 | "jobUrl": "https://knox.c-fa375384f1f481e0.cn-hongkong.emr.aliyuncs.com:8443/gateway/cluster-topo/yarn/proxy/application_1628498781174_4089/#/job/61aacdf53b079f872641211d0d6542ed" 132 | } 133 | ], 134 | "interpreterSettingId": "flink" 135 | } 136 | }, 137 | "progressUpdateIntervalMs": 500, 138 | "jobName": "paragraph_1633665300488_421025159", 139 | "id": "paragraph_1633665300488_421025159", 140 | "dateCreated": "2021-10-08 11:55:00.488", 141 | "dateStarted": "2021-10-08 12:09:54.242", 142 | "dateFinished": "2021-10-08 12:11:19.868", 143 | "status": "ABORT" 144 | }, 145 | { 146 | "text": "%flink.ssql\n", 147 | "user": "anonymous", 148 | "dateUpdated": "2021-10-08 11:58:02.012", 149 | "progress": 0, 150 | "config": {}, 151 | "settings": { 152 | "params": {}, 153 | "forms": {} 154 | }, 155 | "apps": [], 156 | "runtimeInfos": {}, 157 | "progressUpdateIntervalMs": 500, 158 | "jobName": "paragraph_1633665482012_1897319436", 159 | "id": "paragraph_1633665482012_1897319436", 160 | "dateCreated": "2021-10-08 11:58:02.012", 161 | "status": "READY" 162 | } 163 | ], 164 | "name": "03 Filtering out 
Late Data", 165 | "id": "2GJ18VM3X", 166 | "defaultInterpreterGroup": "flink", 167 | "version": "0.10.0-SNAPSHOT", 168 | "noteParams": {}, 169 | "noteForms": {}, 170 | "angularObjects": { 171 | "flink-2GJ18VM3X": [ 172 | { 173 | "name": "duration", 174 | "object": "1 minutes 24 seconds", 175 | "noteId": "2GJ18VM3X", 176 | "paragraphId": "paragraph_1633665300488_421025159" 177 | } 178 | ] 179 | }, 180 | "config": { 181 | "isZeppelinNotebookCronEnable": false 182 | }, 183 | "info": {} 184 | } -------------------------------------------------------------------------------- /Flink Sql Cookbook/UDF/01 Extending SQL with Python UDFs_2G19AQ57T.zpln: -------------------------------------------------------------------------------- 1 | { 2 | "paragraphs": [ 3 | { 4 | "text": "%md\n\nInstall apache-flink\n\n```\npip install apache-flink\u003d\u003d1.12.1\n```\n\nThis example will show how to extend Flink SQL with custom functions written in Python.\n\nFlink SQL provides a wide range of [built-in functions](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/functions/systemFunctions.html) that cover most SQL day-to-day work. Sometimes, you need more flexibility to express custom business logic or transformations that aren\u0027t easily translatable to SQL: this can be achieved with [User-Defined Functions](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/functions/udfs.html) (UDFs).\n\nIn this example, you\u0027ll focus on [Python UDFs](https://ci.apache.org/projects/flink/flink-docs-stable/dev/python/table-api-users-guide/udfs/python_udfs.html) and implement a custom function (`to_fahr`) to convert temperature readings that are continuously generated for different EU and US cities. The Celsius-\u003eFahrenheit conversion should only happen if the city associated with the reading is in the US.", 5 | "user": "anonymous", 6 | "dateUpdated": "2021-03-18 16:01:45.310", 7 | "progress": 0, 8 | "config": { 9 | "tableHide": false, 10 | "editorSetting": { 11 | "language": "markdown", 12 | "editOnDblClick": true, 13 | "completionKey": "TAB", 14 | "completionSupport": false 15 | }, 16 | "colWidth": 12.0, 17 | "editorMode": "ace/mode/markdown", 18 | "fontSize": 9.0, 19 | "editorHide": true, 20 | "results": {}, 21 | "enabled": true 22 | }, 23 | "settings": { 24 | "params": {}, 25 | "forms": {} 26 | }, 27 | "results": { 28 | "code": "SUCCESS", 29 | "msg": [ 30 | { 31 | "type": "HTML", 32 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eInstall apache-flink\u003c/p\u003e\n\u003cpre\u003e\u003ccode\u003epip install apache-flink\u003d\u003d1.12.1\n\u003c/code\u003e\u003c/pre\u003e\n\u003cp\u003eThis example will show how to extend Flink SQL with custom functions written in Python.\u003c/p\u003e\n\u003cp\u003eFlink SQL provides a wide range of \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/functions/systemFunctions.html\"\u003ebuilt-in functions\u003c/a\u003e that cover most SQL day-to-day work. 
Sometimes, you need more flexibility to express custom business logic or transformations that aren\u0026rsquo;t easily translatable to SQL: this can be achieved with \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/functions/udfs.html\"\u003eUser-Defined Functions\u003c/a\u003e (UDFs).\u003c/p\u003e\n\u003cp\u003eIn this example, you\u0026rsquo;ll focus on \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/python/table-api-users-guide/udfs/python_udfs.html\"\u003ePython UDFs\u003c/a\u003e and implement a custom function (\u003ccode\u003eto_fahr\u003c/code\u003e) to convert temperature readings that are continuously generated for different EU and US cities. The Celsius-\u0026gt;Fahrenheit conversion should only happen if the city associated with the reading is in the US.\u003c/p\u003e\n\n\u003c/div\u003e" 33 | } 34 | ] 35 | }, 36 | "apps": [], 37 | "runtimeInfos": {}, 38 | "progressUpdateIntervalMs": 500, 39 | "jobName": "paragraph_1614306308381_842100307", 40 | "id": "paragraph_1614306308381_842100307", 41 | "dateCreated": "2021-02-26 10:25:08.387", 42 | "dateStarted": "2021-03-18 16:01:45.311", 43 | "dateFinished": "2021-03-18 16:01:45.328", 44 | "status": "FINISHED" 45 | }, 46 | { 47 | "text": "%md\n\nThis example will show how to extend Flink SQL with custom functions written in Python.\n\nFlink SQL provides a large set of [built-in functions](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/functions/systemFunctions.html) that cover almost everything needed in day-to-day work. Sometimes, we need more flexibility to express custom business logic or transformations that are hard to translate into SQL: we can achieve this with [User-Defined Functions](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/functions/udfs.html) (UDFs).\n\nIn this example, we focus on [Python UDFs](https://ci.apache.org/projects/flink/flink-docs-stable/dev/python/table-api-users-guide/udfs/python_udfs.html) and implement a custom function (`to_fahr`) to convert temperature readings that are continuously generated for different EU and US cities. The Celsius-to-Fahrenheit conversion should only happen if the city associated with the reading is in the US.\n", 48 | "user": "anonymous", 49 | "dateUpdated": "2021-03-18 16:01:48.650", 50 | "progress": 0, 51 | "config": { 52 | "editorSetting": { 53 | "language": "markdown", 54 | "editOnDblClick": true, 55 | "completionKey": "TAB", 56 | "completionSupport": false 57 | }, 58 | "colWidth": 12.0, 59 | "editorMode": "ace/mode/markdown", 60 | "fontSize": 9.0, 61 | "results": {}, 62 | "enabled": true, 63 | "editorHide": true, 64 | "tableHide": false 65 | }, 66 | "settings": { 67 | "params": {}, 68 | "forms": {} 69 | }, 70 | "results": { 71 | "code": "SUCCESS", 72 | "msg": [ 73 | { 74 | "type": "HTML", 75 | "data": "\u003cdiv class\u003d\"markdown-body\"\u003e\n\u003cp\u003eThis example will show how to extend Flink SQL with custom functions written in Python.\u003c/p\u003e\n\u003cp\u003eFlink SQL provides a large set of \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/functions/systemFunctions.html\"\u003ebuilt-in functions\u003c/a\u003e that cover almost everything needed in day-to-day work. Sometimes, we need more flexibility to express custom business logic or transformations that are hard to translate into SQL: we can achieve this with \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/functions/udfs.html\"\u003eUser-Defined Functions\u003c/a\u003e (UDFs).\u003c/p\u003e\n\u003cp\u003eIn this example, we focus on \u003ca href\u003d\"https://ci.apache.org/projects/flink/flink-docs-stable/dev/python/table-api-users-guide/udfs/python_udfs.html\"\u003ePython UDFs\u003c/a\u003e and implement a custom function (\u003ccode\u003eto_fahr\u003c/code\u003e) to convert temperature readings that are continuously generated for different EU and US cities. The Celsius-to-Fahrenheit conversion should only happen if the city associated with the reading is in the US.\u003c/p\u003e\n\n\u003c/div\u003e" 76 | } 77 | ] 78 | }, 79 | "apps": [], 80 | "runtimeInfos": {}, 81 | "progressUpdateIntervalMs": 500, 82 | "jobName": "paragraph_1615539187631_995640762", 83 | "id": 
"paragraph_1615539187631_995640762", 84 | "dateCreated": "2021-03-12 08:53:07.631", 85 | "dateStarted": "2021-03-18 16:01:48.645", 86 | "dateFinished": "2021-03-18 16:01:48.659", 87 | "status": "FINISHED" 88 | }, 89 | { 90 | "text": "%flink.pyflink\n\nfrom pyflink.table import DataTypes\nfrom pyflink.table.udf import udf\n\nus_cities \u003d {\"Chicago\",\"Portland\",\"Seattle\",\"New York\"}\n\n@udf(input_types\u003d[DataTypes.STRING(), DataTypes.FLOAT()],\n result_type\u003dDataTypes.FLOAT())\ndef to_fahr(city, temperature):\n if city in us_cities:\n fahr \u003d ((temperature * 9.0 / 5.0) + 32.0)\n return fahr\n else:\n return temperature\n \nst_env.register_function(\"to_fahr\", to_fahr)\n", 91 | "user": "anonymous", 92 | "dateUpdated": "2021-02-26 10:33:10.938", 93 | "progress": 0, 94 | "config": { 95 | "editorSetting": { 96 | "language": "python", 97 | "editOnDblClick": false, 98 | "completionKey": "TAB", 99 | "completionSupport": true 100 | }, 101 | "colWidth": 12.0, 102 | "editorMode": "ace/mode/python", 103 | "fontSize": 9.0, 104 | "results": {}, 105 | "enabled": true 106 | }, 107 | "settings": { 108 | "params": {}, 109 | "forms": {} 110 | }, 111 | "apps": [], 112 | "runtimeInfos": {}, 113 | "progressUpdateIntervalMs": 500, 114 | "jobName": "paragraph_1614306085122_1142579790", 115 | "id": "paragraph_1614306085122_1142579790", 116 | "dateCreated": "2021-02-26 10:21:25.122", 117 | "dateStarted": "2021-02-26 10:23:50.582", 118 | "dateFinished": "2021-02-26 10:23:52.718", 119 | "status": "FINISHED" 120 | }, 121 | { 122 | "text": "%flink.ssql\n\n\nDROP TABLE IF EXISTS temperature_measurements;\n\nCREATE TABLE temperature_measurements (\n city STRING,\n temperature FLOAT,\n measurement_time TIMESTAMP(3),\n WATERMARK FOR measurement_time AS measurement_time - INTERVAL \u002715\u0027 SECONDS\n)\nWITH (\n \u0027connector\u0027 \u003d \u0027faker\u0027,\n \u0027fields.temperature.expression\u0027 \u003d \u0027#{number.numberBetween \u0027\u00270\u0027\u0027,\u0027\u002742\u0027\u0027}\u0027,\n \u0027fields.measurement_time.expression\u0027 \u003d \u0027#{date.past \u0027\u002715\u0027\u0027,\u0027\u0027SECONDS\u0027\u0027}\u0027,\n \u0027fields.city.expression\u0027 \u003d \u0027#{regexify \u0027\u0027(Copenhagen|Berlin|Chicago|Portland|Seattle|New York){1}\u0027\u0027}\u0027\n);\n", 123 | "user": "anonymous", 124 | "dateUpdated": "2021-02-26 10:23:17.099", 125 | "progress": 0, 126 | "config": { 127 | "editorSetting": { 128 | "language": "sql", 129 | "editOnDblClick": false, 130 | "completionKey": "TAB", 131 | "completionSupport": true 132 | }, 133 | "colWidth": 12.0, 134 | "editorMode": "ace/mode/sql", 135 | "fontSize": 9.0, 136 | "results": {}, 137 | "enabled": true 138 | }, 139 | "settings": { 140 | "params": {}, 141 | "forms": {} 142 | }, 143 | "apps": [], 144 | "runtimeInfos": {}, 145 | "progressUpdateIntervalMs": 500, 146 | "jobName": "paragraph_1614306097701_1279422606", 147 | "id": "paragraph_1614306097701_1279422606", 148 | "dateCreated": "2021-02-26 10:21:37.701", 149 | "dateStarted": "2021-02-26 10:23:17.107", 150 | "dateFinished": "2021-02-26 10:23:17.960", 151 | "status": "FINISHED" 152 | }, 153 | { 154 | "text": "%flink.ssql(type\u003dupdate)\n\n--Use to_fahr() to convert temperatures in US cities from C to F\nSELECT city,\n temperature AS tmp,\n to_fahr(city,temperature) AS tmp_conv,\n measurement_time\nFROM temperature_measurements\nORDER BY measurement_time DESC\nLIMIT 10;\n\n", 155 | "user": "anonymous", 156 | "dateUpdated": "2021-02-26 13:26:02.808", 157 | "progress": 0, 
158 | "config": { 159 | "editorSetting": { 160 | "language": "sql", 161 | "editOnDblClick": false, 162 | "completionKey": "TAB", 163 | "completionSupport": true 164 | }, 165 | "colWidth": 12.0, 166 | "editorMode": "ace/mode/sql", 167 | "fontSize": 9.0, 168 | "results": { 169 | "0": { 170 | "graph": { 171 | "mode": "table", 172 | "height": 300.0, 173 | "optionOpen": false, 174 | "setting": { 175 | "table": { 176 | "tableGridState": {}, 177 | "tableColumnTypeState": { 178 | "names": { 179 | "city": "string", 180 | "tmp": "string", 181 | "tmp_conv": "string", 182 | "measurement_time": "string" 183 | }, 184 | "updated": true 185 | }, 186 | "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", 187 | "tableOptionValue": { 188 | "useFilter": false, 189 | "showPagination": false, 190 | "showAggregationFooter": false 191 | }, 192 | "updated": false, 193 | "initialized": false 194 | } 195 | }, 196 | "commonSetting": {} 197 | } 198 | } 199 | }, 200 | "enabled": true 201 | }, 202 | "settings": { 203 | "params": {}, 204 | "forms": {} 205 | }, 206 | "apps": [], 207 | "runtimeInfos": { 208 | "jobUrl": { 209 | "propertyName": "jobUrl", 210 | "label": "FLINK JOB", 211 | "tooltip": "View in Flink web UI", 212 | "group": "flink", 213 | "values": [ 214 | { 215 | "jobUrl": "http://localhost:8081#/job/8b3669937956e8b8950352dd940ff3f9" 216 | } 217 | ], 218 | "interpreterSettingId": "flink" 219 | } 220 | }, 221 | "progressUpdateIntervalMs": 500, 222 | "jobName": "paragraph_1614306197104_1373037918", 223 | "id": "paragraph_1614306197104_1373037918", 224 | "dateCreated": "2021-02-26 10:23:17.104", 225 | "dateStarted": "2021-02-26 10:27:06.707", 226 | "dateFinished": "2021-02-26 10:28:36.691", 227 | "status": "ABORT" 228 | }, 229 | { 230 | "text": "%flink.ssql\n", 231 | "user": "anonymous", 232 | "dateUpdated": "2021-02-26 10:23:39.052", 233 | "progress": 0, 234 | "config": {}, 235 | "settings": { 236 | "params": {}, 237 | "forms": {} 238 | }, 239 | "apps": [], 240 | "runtimeInfos": {}, 241 | "progressUpdateIntervalMs": 500, 242 | "jobName": "paragraph_1614306219052_1345346806", 243 | "id": "paragraph_1614306219052_1345346806", 244 | "dateCreated": "2021-02-26 10:23:39.052", 245 | "status": "READY" 246 | } 247 | ], 248 | "name": "01 Extending SQL with Python UDFs", 249 | "id": "2G19AQ57T", 250 | "defaultInterpreterGroup": "flink", 251 | "version": "0.10.0-SNAPSHOT", 252 | "noteParams": {}, 253 | "noteForms": {}, 254 | "angularObjects": {}, 255 | "config": { 256 | "isZeppelinNotebookCronEnable": false 257 | }, 258 | "info": {} 259 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This repository is for running the Flink SQL Cookbook on Zeppelin. 
Read this article to learn how to use it: 4 | 5 | https://medium.com/analytics-vidhya/learn-flink-sql-the-easy-way-d9d48a95ae57 6 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.4" 2 | 3 | services: 4 | zeppelin: 5 | image: apache/zeppelin:0.10.1 6 | volumes: 7 | - ./logs:/logs 8 | - .:/notebook 9 | - $FLINK_HOME:/flink 10 | environment: 11 | ZEPPELIN_LOG_DIR: /logs 12 | ZEPPELIN_NOTEBOOK_DIR: /notebook 13 | ports: 14 | - 8080:8080 15 | - 8081:8081 16 | --------------------------------------------------------------------------------