├── .gitignore ├── README.md ├── requirements.txt ├── table ├── __init__.py ├── batch │ ├── add_columns.py │ ├── add_or_replace_columns.py │ ├── alias.py │ ├── distinct_agg.py │ ├── drop_columns.py │ ├── filter.py │ ├── full_outer_join.py │ ├── group_by_agg.py │ ├── group_by_window_agg.py │ ├── in.py │ ├── inner_join.py │ ├── intersect.py │ ├── intersect_all.py │ ├── left_outer_join.py │ ├── minus.py │ ├── minus_all.py │ ├── offset_and_fetch.py │ ├── order_by.py │ ├── rename_columns.py │ ├── right_outer_join.py │ ├── scan.py │ ├── session_window.py │ ├── slide_window.py │ ├── table_select.py │ ├── tumble_window.py │ ├── union.py │ ├── union_all.py │ └── where.py ├── javaudf │ ├── README.md │ ├── aggregate-function │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── pyflink │ │ │ └── table │ │ │ └── WeightedAvg.java │ ├── aggregate_func_demo.py │ ├── scalar-function │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── pyflink │ │ │ └── table │ │ │ └── HashCode.java │ ├── scalar_func_demo.py │ ├── table-function │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── pyflink │ │ │ └── table │ │ │ └── Split.java │ └── table_func_demo.py ├── prepare_environment.py ├── resources │ └── table_orders.csv ├── streaming │ ├── add_columns.py │ ├── add_or_replace_columns.py │ ├── alias.py │ ├── distinct.py │ ├── distinct_agg.py │ ├── drop_columns.py │ ├── filter.py │ ├── full_outer_join.py │ ├── group_by_agg.py │ ├── group_by_window_agg.py │ ├── in.py │ ├── inner_join.py │ ├── left_outer_join.py │ ├── over_window_agg.py │ ├── rename_columns.py │ ├── right_outer_join.py │ ├── scan.py │ ├── session_window.py │ ├── slide_window.py │ ├── table_select.py │ ├── tumble_window.py │ ├── union_all.py │ └── where.py ├── user_case │ ├── __init__.py │ └── pv_uv │ │ ├── README.md │ │ ├── __init__.py │ │ ├── create_data.sh │ │ ├── env.sh │ │ ├── pv_uv_example.py │ │ └── user_behavior.log └── user_defined_sources_and_sinks │ ├── CustomTableSourceDemo.py │ ├── README.md │ ├── __init__.py │ ├── pom.xml │ └── src │ └── main │ ├── java │ └── com │ │ └── pyflink │ │ └── table │ │ ├── factory │ │ └── TestTableFactory.java │ │ ├── sinks │ │ └── TestRetractSink.java │ │ └── sources │ │ └── TestSource.java │ └── resources │ └── META-INF │ └── services │ └── org.apache.flink.table.factories.TableFactory └── utils ├── __init__.py ├── elastic_search_utils.py └── kafka_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | table/result 3 | target 4 | .DS_Store 5 | *.iml 6 | *dependency-reduced-pom.xml 7 | *__pycache__* 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyflink-demo 2 | This project is to help users easier to write their pyflink job. 
3 | 
4 | **contents**
5 | 
6 | - [Quick Start](#quick-start)
7 |   + [Setup](#setup)
8 |     + [Requirements](#requirements)
9 |     + [Install python2](#install-python2)
10 |     + [Install pip](#install-pip)
11 |     + [Install java 8](#install-java-8)
12 |     + [Install maven](#install-maven)
13 |   + [Build PyFlink](#build-pyflink)
14 |   + [Prepare Kafka](#prepare-kafka)
15 |   + [Prepare ElasticSearch](#prepare-elasticsearch)
16 |   + [Install Dependency](#install-dependency)
17 |   + [Run Demo](#run-demo)
18 |     + [[optional] Importing the project on PyCharm](#optionalimporting-the-project-on-pycharm)
19 |     + [Run pyflink table api example](#run-pyflink-table-api-example)
20 | 
21 | ## Quick Start
22 | 
23 | ### Setup
24 | 
25 | #### Requirements
26 | 1. python2.7 or python3
27 | 2. pip
28 | 3. java 1.8
29 | 4. maven version >=3.3.0
30 | 
31 | #### Install python2
32 | 
33 | macOS
34 | ```shell
35 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
36 | export PATH="/usr/local/bin:/usr/local/sbin:$PATH"
37 | brew install python@2
38 | ```
39 | Ubuntu
40 | ```shell
41 | sudo apt install python-dev
42 | ```
43 | 
44 | #### Install pip
45 | 
46 | macOS
47 | 
48 | ```shell
49 | curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
50 | python get-pip.py
51 | ```
52 | 
53 | Ubuntu
54 | ```shell
55 | sudo apt install python-pip
56 | ```
57 | 
58 | #### Install java 8
59 | 
60 | [java download page](http://www.oracle.com/technetwork/java/javase/downloads/index.html)
61 | 
62 | #### Install maven
63 | 
64 | maven version >=3.3.0
65 | 
66 | [download maven page](http://maven.apache.org/download.cgi)
67 | 
68 | ```shell
69 | tar -xvf apache-maven-3.6.1-bin.tar.gz
70 | mv apache-maven-3.6.1 /usr/local/
71 | ```
72 | Configure the environment variables:
73 | ```shell
74 | MAVEN_HOME=/usr/local/apache-maven-3.6.1
75 | export MAVEN_HOME
76 | export PATH=${PATH}:${MAVEN_HOME}/bin
77 | ```
78 | 
79 | 
80 | ### Build PyFlink
81 | 
82 | If you want to build a PyFlink package that can be used for pip installation, you need to build the Flink jars first, as described in https://ci.apache.org/projects/flink/flink-docs-master/flinkDev/building.html
83 | 
84 | ```shell
85 | mvn clean install -DskipTests -Dfast
86 | ```
87 | 
88 | Then copy flink-sql-connector-kafka-0.11_*-SNAPSHOT.jar from the directory flink-connectors/flink-sql-connector-kafka-0.11 into build-target/lib:
89 | 
90 | ```shell
91 | cp flink-connectors/flink-sql-connector-kafka-0.11/target/flink-sql-connector-kafka-0.11_*-SNAPSHOT.jar build-target/lib
92 | ```
93 | 
94 | Then copy flink-connector-elasticsearch6_*-SNAPSHOT.jar from the directory flink-connectors/flink-connector-elasticsearch6:
95 | 
96 | ```shell
97 | cp flink-connectors/flink-connector-elasticsearch6/target/flink-connector-elasticsearch6_*-SNAPSHOT.jar build-target/lib
98 | ```
99 | 
100 | Next, copy flink-json-*-SNAPSHOT-sql-jar.jar from the directory flink-formats/flink-json:
101 | 
102 | ```shell
103 | cp flink-formats/flink-json/target/flink-json-*-SNAPSHOT-sql-jar.jar build-target/lib
104 | ```
105 | 
106 | Next, go to the root directory of the Flink source code and run this command to build the sdist and wheel packages:
107 | 
108 | ```shell
109 | cd flink-python; python3 setup.py sdist bdist_wheel
110 | ```
111 | 
112 | The sdist and wheel packages can be found under `./flink-python/dist/`.
Either of them can be used for pip installation, for example:
113 | 
114 | ```shell
115 | pip install dist/*.tar.gz
116 | ```
117 | 
118 | ### Prepare Kafka
119 | Some demos use Kafka as a source, so you need to install and run Kafka on your local host. The version used here is kafka_2.11-0.11 (https://archive.apache.org/dist/kafka/0.11.0.3/kafka_2.11-0.11.0.3.tgz).
120 | Download it with the following command:
121 | 
122 | ```shell
123 | wget https://archive.apache.org/dist/kafka/0.11.0.3/kafka_2.11-0.11.0.3.tgz
124 | ```
125 | 
126 | Then extract the tar package:
127 | 
128 | ```shell
129 | tar zxvf kafka_2.11-0.11.0.3.tgz
130 | ```
131 | Next, start ZooKeeper:
132 | 
133 | ```shell
134 | cd kafka_2.11-0.11.0.3; bin/zookeeper-server-start.sh config/zookeeper.properties
135 | ```
136 | 
137 | Finally, start the Kafka server:
138 | 
139 | ```shell
140 | bin/kafka-server-start.sh config/server.properties
141 | ```
142 | 
143 | ### Prepare ElasticSearch
144 | Some demos use Elasticsearch as a sink, so you need to install and run Elasticsearch on your local host. The version used here is elasticsearch-6.0.1 (https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.0.1.tar.gz).
145 | Download it with the following command:
146 | 
147 | ```shell
148 | wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.0.1.tar.gz
149 | ```
150 | 
151 | Then extract the tar package:
152 | 
153 | ```shell
154 | tar zxvf elasticsearch-6.0.1.tar.gz
155 | ```
156 | 
157 | Finally, start Elasticsearch:
158 | 
159 | ```shell
160 | cd elasticsearch-6.0.1; ./bin/elasticsearch
161 | ```
162 | 
163 | ### Install Dependency
164 | Install the project's Python dependencies:
165 | 
166 | ```shell
167 | pip install -r requirements.txt
168 | ```
169 | 
170 | ### Run demo
171 | #### [optional]Importing the project on PyCharm
172 | You can open the project in PyCharm and select, as the project interpreter, the Python installation whose pip was used to install pyflink and the dependencies in requirements.txt.
173 | The following describes the setup steps for PyCharm 2019.1.3 ([https://www.jetbrains.com/pycharm/download/](https://www.jetbrains.com/pycharm/download/)).
174 | 
175 | If you are on the PyCharm startup screen:
176 | 1. Start PyCharm and choose "Open".
177 | 2. Select the cloned pyflink-demo repository.
178 | 3. Click on "System Interpreter" in the Python interpreter options (PyCharm -> Preferences -> Python Interpreter).
179 | 4. Choose the Python installation that has pyflink and the dependencies from requirements.txt installed.
180 | 
181 | If you already have a project open in PyCharm:
182 | 1. Select "File -> Open".
183 | 2. Select the cloned pyflink-demo repository.
184 | 3. Click on "System Interpreter" in the Python interpreter options (PyCharm -> Preferences -> Python Interpreter).
185 | 4. Choose the Python installation that has pyflink and the dependencies from requirements.txt installed.
186 | #### Run pyflink table api example
187 | The Table API demos live in the pyflink-demo/table/batch and pyflink-demo/table/streaming directories; an example of running one is shown below.
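For example, a batch demo can be run directly with the Python interpreter that has pyflink installed (a minimal sketch; the demo scripts resolve ../resources/table_orders.csv relative to the working directory, so run them from inside table/batch or table/streaming):

```shell
# run one of the batch Table API demos; any script in table/batch works the same way
cd table/batch
python table_select.py
# the demo writes its result to /tmp/table_select_batch.csv
cat /tmp/table_select_batch.csv
```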
188 | Demos about udf is in the pyflink-demo/table/javaudf 189 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | elasticsearch>=6.0.0,<7.0.0 2 | kafka-python 3 | py4j==0.10.8.1 -------------------------------------------------------------------------------- /table/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukess/pyflink-demo/7e8ae37b3cfb4a6a5560cae2a0d44cddee25e036/table/__init__.py -------------------------------------------------------------------------------- /table/batch/add_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def add_columns_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_add_columns_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime", "d"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP(), 28 | DataTypes.STRING()], 29 | result_file)) 30 | orders = bt_env.scan("Orders") 31 | result = orders.add_columns("concat(a, '_sunny') as d") 32 | result.insert_into("result") 33 | bt_env.execute("add columns batch") 34 | # cat /tmp/table_add_columns_batch.csv 35 | # a,1,1,2013-01-01 00:14:13.0,a_sunny 36 | # b,2,2,2013-01-01 00:24:13.0,b_sunny 37 | # a,3,3,2013-01-01 00:34:13.0,a_sunny 38 | # a,4,4,2013-01-01 01:14:13.0,a_sunny 39 | # b,4,5,2013-01-01 01:24:13.0,b_sunny 40 | # a,5,2,2013-01-01 01:34:13.0,a_sunny 41 | 42 | 43 | if __name__ == '__main__': 44 | add_columns_batch() 45 | -------------------------------------------------------------------------------- /table/batch/add_or_replace_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def add_or_replace_columns_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_add_or_replace_columns_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = bt_env.scan("Orders") 30 | result = 
orders.add_or_replace_columns("concat(a, '_sunny') as a") 31 | result.insert_into("result") 32 | bt_env.execute("add or replace columns batch") 33 | # cat /tmp/table_add_or_replace_columns_batch.csv 34 | # a_sunny,1,1,2013-01-01 00:14:13.0 35 | # b_sunny,2,2,2013-01-01 00:24:13.0 36 | # a_sunny,3,3,2013-01-01 00:34:13.0 37 | # a_sunny,4,4,2013-01-01 01:14:13.0 38 | # b_sunny,4,5,2013-01-01 01:24:13.0 39 | # a_sunny,5,2,2013-01-01 01:34:13.0 40 | 41 | 42 | if __name__ == '__main__': 43 | add_or_replace_columns_batch() -------------------------------------------------------------------------------- /table/batch/alias.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def alias_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_alias_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = bt_env.scan("Orders") 30 | result = orders.alias("x, y, z, t").select("x, y, z, t") 31 | result.insert_into("result") 32 | bt_env.execute("alias batch") 33 | # cat table/result/table_alias_batch.csv 34 | # a,1,1,2013-01-01 00:14:13.0 35 | # b,2,2,2013-01-01 00:24:13.0 36 | # a,3,3,2013-01-01 00:34:13.0 37 | # a,4,4,2013-01-01 01:14:13.0 38 | # b,4,5,2013-01-01 01:24:13.0 39 | # a,5,2,2013-01-01 01:34:13.0 40 | 41 | 42 | if __name__ == '__main__': 43 | alias_batch() 44 | -------------------------------------------------------------------------------- /table/batch/distinct_agg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | # DISTINCT window aggregates are currently not supported in Batch mode. 
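# Worked example with the bundled table_orders.csv (what this demo computes):
# the group with a='a' has b values 1, 3, 4, 5, so the sum of distinct values is 13;
# the group with a='b' has b values 2, 4, so the sum of distinct values is 6.
# In other words, b.sum.distinct adds each distinct value of b only once per group.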
8 | def distinct_agg_batch(): 9 | b_env = ExecutionEnvironment.get_execution_environment() 10 | b_env.set_parallelism(1) 11 | bt_env = BatchTableEnvironment.create(b_env) 12 | source_file = os.getcwd() + "/../resources/table_orders.csv" 13 | result_file = "/tmp/table_distinct_agg_batch.csv" 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["b"], 25 | [DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.group_by("a") \ 29 | .select("b.sum.distinct as d") 30 | result.insert_into("result") 31 | bt_env.execute("distinct agg batch") 32 | # cat table/result/table_distinct_batch.csv 33 | # 13 34 | # 6 35 | 36 | 37 | if __name__ == '__main__': 38 | distinct_agg_batch() 39 | -------------------------------------------------------------------------------- /table/batch/drop_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def drop_columns_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_drop_columns_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.TIMESTAMP()], 27 | result_file)) 28 | orders = bt_env.scan("Orders") 29 | result = orders.drop_columns("c") 30 | result.insert_into("result") 31 | bt_env.execute("drop columns batch") 32 | # cat table/result/table_drop_columns_batch.csv 33 | # a,1,2013-01-01 00:14:13.0 34 | # b,2,2013-01-01 00:24:13.0 35 | # a,3,2013-01-01 00:34:13.0 36 | # a,4,2013-01-01 01:14:13.0 37 | # b,4,2013-01-01 01:24:13.0 38 | # a,5,2013-01-01 01:34:13.0 39 | 40 | 41 | if __name__ == '__main__': 42 | drop_columns_batch() 43 | -------------------------------------------------------------------------------- /table/batch/filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def filter_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_filter_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | 
CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = bt_env.scan("Orders") 30 | result = orders.filter("b % 2 === 0") 31 | result.insert_into("result") 32 | bt_env.execute("filter batch") 33 | # cat /tmp/table_filter_batch.csv 34 | # b,2,2,2013-01-01 00:24:13.0 35 | # a,4,4,2013-01-01 01:14:13.0 36 | # b,4,5,2013-01-01 01:24:13.0 37 | 38 | 39 | if __name__ == '__main__': 40 | filter_batch() 41 | -------------------------------------------------------------------------------- /table/batch/full_outer_join.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def full_outer_join_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_full_outer_join_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["d", "e", "f"]).select("d, e, f") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.full_outer_join(right, "a = d").select("a, b, e") 27 | result.insert_into("result") 28 | bt_env.execute("full outer join batch") 29 | # cat /tmp/table_full_outer_join_batch.csv 30 | # 1,1a,1b 31 | # 1,1a,3b 32 | # 2,2a, 33 | # 2,4b, 34 | # 3,, 35 | # 5,5a, 36 | # 4b 37 | 38 | 39 | if __name__ == '__main__': 40 | full_outer_join_batch() 41 | -------------------------------------------------------------------------------- /table/batch/group_by_agg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def group_by_agg_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_group_by_agg_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.group_by("a").select("a, b.sum as d") 29 | result.insert_into("result") 30 | bt_env.execute("group by agg batch") 31 | # cat /tmp/table_group_by_agg_batch.csv 32 | # a,13 33 | # b,6 34 | 35 | 36 | if __name__ == '__main__': 37 | group_by_agg_batch() 38 | -------------------------------------------------------------------------------- /table/batch/group_by_window_agg.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | from pyflink.table.window import Tumble 6 | 7 | 8 | def group_by_window_agg_batch(): 9 | b_env = ExecutionEnvironment.get_execution_environment() 10 | b_env.set_parallelism(1) 11 | bt_env = BatchTableEnvironment.create(b_env) 12 | source_file = os.getcwd() + "/../resources/table_orders.csv" 13 | result_file = "/tmp/table_group_by_window_agg_batch.csv" 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a", "start", "end", "rowtime", "d"], 25 | [DataTypes.STRING(), 26 | DataTypes.TIMESTAMP(), 27 | DataTypes.TIMESTAMP(), 28 | DataTypes.TIMESTAMP(), 29 | DataTypes.INT()], 30 | result_file)) 31 | orders = bt_env.scan("Orders") 32 | result = orders.window(Tumble.over("1.hours").on("rowtime").alias("w")) \ 33 | .group_by("a, w") \ 34 | .select("a, w.start, w.end, w.rowtime, b.sum as d") 35 | result.insert_into("result") 36 | bt_env.execute("group by agg batch") 37 | # cat /tmp/table_group_by_window_agg_batch.csv 38 | # a,2013-01-01 00:00:00.0,2013-01-01 01:00:00.0,2013-01-01 00:59:59.999,4 39 | # a,2013-01-01 01:00:00.0,2013-01-01 02:00:00.0,2013-01-01 01:59:59.999,9 40 | # b,2013-01-01 00:00:00.0,2013-01-01 01:00:00.0,2013-01-01 00:59:59.999,2 41 | # b,2013-01-01 01:00:00.0,2013-01-01 02:00:00.0,2013-01-01 01:59:59.999,4 42 | 43 | 44 | if __name__ == '__main__': 45 | group_by_window_agg_batch() 46 | -------------------------------------------------------------------------------- /table/batch/in.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def in_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_in_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 18 | ["a", "b", "c"]).select("a") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.where("a.in(%s)" % right) 27 | result.insert_into("result") 28 | # another way 29 | # bt_env.register_table("RightTable", right) 30 | # result = left.where("a.in(RightTable)") 31 | bt_env.execute("in batch") 32 | 33 | # cat /tmp/table_in_batch.csv 34 | # 1,ra,raa 35 | # 2,lb,lbb 36 | # 2,lb,lbb 37 | # 3,,lcc 38 | 39 | 40 | if __name__ == '__main__': 41 | in_batch() 42 | -------------------------------------------------------------------------------- /table/batch/inner_join.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from 
pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def inner_join_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_inner_join_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["d", "e", "f"]).select("d, e, f") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.join(right).where("a = d").select("a, b, e") 27 | result.insert_into("result") 28 | bt_env.execute("inner join batch") 29 | # cat table/result/table_inner_join_batch.csv 30 | # 1,1a,1b 31 | # 2,2a, 32 | # 2,4b, 33 | # 1,1a,3b 34 | 35 | 36 | if __name__ == '__main__': 37 | inner_join_batch() 38 | -------------------------------------------------------------------------------- /table/batch/intersect.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def intersect_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_intersect_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements([(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (1, "ra", "raa")], 15 | ["a", "b", "c"]).select("a, b, c") 16 | right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 17 | ["a", "b", "c"]).select("a, b, c") 18 | bt_env.register_table_sink("result", 19 | CsvTableSink(["a", "b", "c"], 20 | [DataTypes.BIGINT(), 21 | DataTypes.STRING(), 22 | DataTypes.STRING()], 23 | result_file)) 24 | 25 | result = left.intersect(right) 26 | result.insert_into("result") 27 | bt_env.execute("intersect batch") 28 | # cat /tmp/table_intersect_batch.csv 29 | # 1,ra,raa 30 | 31 | 32 | if __name__ == '__main__': 33 | intersect_batch() 34 | -------------------------------------------------------------------------------- /table/batch/intersect_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def intersect_all_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_intersect_all_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements([(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (1, "ra", "raa")], 15 | ["a", "b", "c"]).select("a, b, c") 16 | right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 17 | ["a", "b", "c"]).select("a, b, c") 18 | bt_env.register_table_sink("result", 19 | 
CsvTableSink(["a", "b", "c"], 20 | [DataTypes.BIGINT(), 21 | DataTypes.STRING(), 22 | DataTypes.STRING()], 23 | result_file)) 24 | 25 | result = left.intersect_all(right) 26 | result.insert_into("result") 27 | bt_env.execute("intersect all batch") 28 | # cat /tmp/table_intersect_all_batch.csv 29 | # 1,ra,raa 30 | # 1,ra,raa 31 | 32 | 33 | if __name__ == '__main__': 34 | intersect_all_batch() 35 | -------------------------------------------------------------------------------- /table/batch/left_outer_join.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def left_outer_join_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_left_outer_join_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["d", "e", "f"]).select("d, e, f") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.left_outer_join(right, "a = d").select("a, b, e") 27 | result.insert_into("result") 28 | bt_env.execute("left outer join batch") 29 | # cat /tmp/table_left_outer_join_batch.csv 30 | # 1,1a,1b 31 | # 1,1a,3b 32 | # 2,2a, 33 | # 2,4b, 34 | # 3,, 35 | # 5,5a, 36 | 37 | 38 | if __name__ == '__main__': 39 | left_outer_join_batch() 40 | -------------------------------------------------------------------------------- /table/batch/minus.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def minus_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_minus_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (1, "ra", "raa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 18 | ["a", "b", "c"]).select("a, b, c") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.minus(right) 27 | result.insert_into("result") 28 | bt_env.execute("minus batch") 29 | # cat /tmp/table_minus_batch.csv 30 | # 2,lb,lbb 31 | # 3,,lcc 32 | 33 | 34 | if __name__ == '__main__': 35 | minus_batch() 36 | -------------------------------------------------------------------------------- /table/batch/minus_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, 
CsvTableSink, DataTypes 5 | 6 | 7 | def minus_all_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_minus_all_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (1, "ra", "raa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 18 | ["a", "b", "c"]).select("a, b, c") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.minus_all(right) 27 | result.insert_into("result") 28 | bt_env.execute("minus all batch") 29 | # cat /tmp/table_minus_all_batch.csv 30 | # 2,lb,lbb 31 | # 2,lb,lbb 32 | # 3,,lcc 33 | 34 | 35 | if __name__ == '__main__': 36 | minus_all_batch() 37 | -------------------------------------------------------------------------------- /table/batch/offset_and_fetch.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def offset_and_fetch_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file_1 = "/tmp/table_offset_and_fetch_batch_1.csv" 12 | result_file_2 = "/tmp/table_offset_and_fetch_batch_2.csv" 13 | result_file_3 = "/tmp/table_offset_and_fetch_batch_3.csv" 14 | if os.path.exists(result_file_1): 15 | os.remove(result_file_1) 16 | if os.path.exists(result_file_2): 17 | os.remove(result_file_2) 18 | if os.path.exists(result_file_3): 19 | os.remove(result_file_3) 20 | 21 | bt_env.register_table_sink("result1", 22 | CsvTableSink(["a", "b", "c"], 23 | [DataTypes.BIGINT(), 24 | DataTypes.STRING(), 25 | DataTypes.STRING()], 26 | result_file_1)) 27 | 28 | bt_env.register_table_sink("result2", 29 | CsvTableSink(["a", "b", "c"], 30 | [DataTypes.BIGINT(), 31 | DataTypes.STRING(), 32 | DataTypes.STRING()], 33 | result_file_2)) 34 | 35 | bt_env.register_table_sink("result3", 36 | CsvTableSink(["a", "b", "c"], 37 | [DataTypes.BIGINT(), 38 | DataTypes.STRING(), 39 | DataTypes.STRING()], 40 | result_file_3)) 41 | 42 | left = bt_env.from_elements( 43 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")], 44 | ["a", "b", "c"]).select("a, b, c") 45 | 46 | ordered_table = left.order_by("a.asc") 47 | 48 | ordered_table.fetch(5).insert_into("result1") 49 | ordered_table.offset(1).insert_into("result2") 50 | ordered_table.offset(1).fetch(2).insert_into("result3") 51 | 52 | bt_env.execute("offset and fetch batch") 53 | # cat /tmp/able_offset_and_fetch_batch_1.csv 54 | # 1,ra,raa 55 | # 2,lb,lbb 56 | # 2,lb,lbb 57 | # 3,,lcc 58 | # 4,ra,raa 59 | 60 | # cat /tmp/table_offset_and_fetch_batch_2.csv 61 | # 2,lb,lbb 62 | # 2,lb,lbb 63 | # 3,,lcc 64 | # 4,ra,raa 65 | 66 | # cat /tmp/table_offset_and_fetch_batch_3.csv 67 | # 2,lb,lbb 68 | # 2,lb,lbb 69 | 70 | 71 | if __name__ == '__main__': 72 | offset_and_fetch_batch() 73 | -------------------------------------------------------------------------------- /table/batch/order_by.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def order_by_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_order_by_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | 15 | left = bt_env.from_elements( 16 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")], 17 | ["a", "b", "c"]).select("a, b, c") 18 | bt_env.register_table_sink("result", 19 | CsvTableSink(["a", "b", "c"], 20 | [DataTypes.BIGINT(), 21 | DataTypes.STRING(), 22 | DataTypes.STRING()], 23 | result_file)) 24 | 25 | result = left.order_by("a.asc") 26 | result.insert_into("result") 27 | bt_env.execute("order by batch") 28 | 29 | # cat /tmp/table_order_by_batch.csv 30 | # 1,ra,raa 31 | # 2,lb,lbb 32 | # 2,lb,lbb 33 | # 3,,lcc 34 | # 4,ra,raa 35 | 36 | 37 | if __name__ == '__main__': 38 | order_by_batch() 39 | -------------------------------------------------------------------------------- /table/batch/rename_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def rename_columns_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_rename_columns_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.rename_columns("a as a2, b as b2").select("a2, b2") 29 | result.insert_into("result") 30 | bt_env.execute("rename columns batch") 31 | # cat /tmp/table_rename_columns_batch.csv 32 | # a,1 33 | # b,2 34 | # a,3 35 | # a,4 36 | # b,4 37 | # a,5 38 | 39 | 40 | if __name__ == '__main__': 41 | rename_columns_batch() 42 | -------------------------------------------------------------------------------- /table/batch/right_outer_join.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def right_outer_join_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_right_outer_join_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 
18 | ["d", "e", "f"]).select("d, e, f") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.right_outer_join(right, "a = d").select("a, b, e") 27 | result.insert_into("result") 28 | bt_env.execute("right outer join batch") 29 | # cat /tmp/table_right_outer_join_batch.csv 30 | # 1,1a,1b 31 | # 1,1a,3b 32 | # 2,2a, 33 | # 2,4b, 34 | # 4b 35 | 36 | 37 | if __name__ == '__main__': 38 | right_outer_join_batch() 39 | -------------------------------------------------------------------------------- /table/batch/scan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def scan_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_scan_batch.csv" 13 | 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a", "b", "c", "rowtime"], 25 | [DataTypes.STRING(), 26 | DataTypes.INT(), 27 | DataTypes.INT(), 28 | DataTypes.TIMESTAMP()], 29 | result_file)) 30 | orders = bt_env.scan("Orders") 31 | orders.insert_into("result") 32 | bt_env.execute("scan batch") 33 | # cat /tmp/table_scan_batch.csv 34 | # a,1,1,2013-01-01 00:14:13.0 35 | # b,2,2,2013-01-01 00:24:13.0 36 | # a,3,3,2013-01-01 00:34:13.0 37 | # a,4,4,2013-01-01 01:14:13.0 38 | # b,4,5,2013-01-01 01:24:13.0 39 | # a,5,2,2013-01-01 01:34:13.0 40 | 41 | 42 | if __name__ == '__main__': 43 | scan_batch() 44 | -------------------------------------------------------------------------------- /table/batch/session_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | from pyflink.table.window import Session 6 | 7 | 8 | def session_time_window_batch(): 9 | b_env = ExecutionEnvironment.get_execution_environment() 10 | b_env.set_parallelism(1) 11 | bt_env = BatchTableEnvironment.create(b_env) 12 | source_file = os.getcwd() + "/../resources/table_orders.csv" 13 | result_file = "/tmp/table_session_time_window_batch.csv" 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a"], 25 | [DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.window(Session.with_gap("10.minutes").on("rowtime").alias("w")) \ 29 | .group_by("w").select("b.sum") 30 | result.insert_into("result") 31 | bt_env.execute("session time window batch") 32 | # cat /tmp/table_session_time_window_batch.csv 33 | # 6 34 | # 13 35 | 36 | 37 | if __name__ == '__main__': 38 | 
session_time_window_batch() 39 | -------------------------------------------------------------------------------- /table/batch/slide_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | from pyflink.table.window import Slide 6 | 7 | 8 | def slide_time_window_batch(): 9 | b_env = ExecutionEnvironment.get_execution_environment() 10 | b_env.set_parallelism(1) 11 | bt_env = BatchTableEnvironment.create(b_env) 12 | source_file = os.getcwd() + "/../resources/table_orders.csv" 13 | result_file = "/tmp/table_slide_time_window_batch.csv" 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a"], 25 | [DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.window(Slide.over("60.minutes").every("10.minutes").on("rowtime").alias("w")) \ 29 | .group_by("w").select("b.sum") 30 | result.insert_into("result") 31 | bt_env.execute("slide time window batch") 32 | # cat /tmp/table_slide_time_window_batch.csv 33 | # 1 34 | # 3 35 | # 6 36 | # 6 37 | # 6 38 | # 6 39 | # 9 40 | # 11 41 | # 13 42 | # 13 43 | # 13 44 | # 13 45 | # 9 46 | # 5 47 | 48 | 49 | if __name__ == '__main__': 50 | slide_time_window_batch() 51 | -------------------------------------------------------------------------------- /table/batch/table_select.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def select_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_select_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a", "c"], 25 | [DataTypes.STRING(), 26 | DataTypes.INT()], 27 | result_file)) 28 | orders = bt_env.scan("Orders") 29 | result = orders.select("a, b") 30 | result.insert_into("result") 31 | bt_env.execute("select batch") 32 | # cat /tmp/table_select_batch.csv 33 | # a,1 34 | # b,2 35 | # a,3 36 | # a,4 37 | # b,4 38 | # a,5 39 | 40 | 41 | if __name__ == '__main__': 42 | select_batch() 43 | -------------------------------------------------------------------------------- /table/batch/tumble_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | from pyflink.table.window import Tumble 6 | 7 | 8 | def tumble_row_window_batch(): 9 | b_env = ExecutionEnvironment.get_execution_environment() 10 | b_env.set_parallelism(1) 11 
| bt_env = BatchTableEnvironment.create(b_env) 12 | source_file = os.getcwd() + "/../resources/table_orders.csv" 13 | result_file = "/tmp/table_tumble_row_window_batch.csv" 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a"], 25 | [DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.window(Tumble.over("2.rows").on("rowtime").alias("w")) \ 29 | .group_by("w, a").select("b.sum") 30 | result.insert_into("result") 31 | bt_env.execute("tumble row window batch") 32 | # cat /tmp/table_tumble_row_window_batch.csv 33 | # 4 34 | # 9 35 | # 6 36 | 37 | 38 | def tumble_time_window_batch(): 39 | b_env = ExecutionEnvironment.get_execution_environment() 40 | b_env.set_parallelism(1) 41 | bt_env = BatchTableEnvironment.create(b_env) 42 | source_file = os.getcwd() + "/../resources/table_orders.csv" 43 | result_file = "/tmp/table_tumble_time_window_batch.csv" 44 | if os.path.exists(result_file): 45 | os.remove(result_file) 46 | bt_env.register_table_source("Orders", 47 | CsvTableSource(source_file, 48 | ["a", "b", "c", "rowtime"], 49 | [DataTypes.STRING(), 50 | DataTypes.INT(), 51 | DataTypes.INT(), 52 | DataTypes.TIMESTAMP()])) 53 | bt_env.register_table_sink("result", 54 | CsvTableSink(["a"], 55 | [DataTypes.INT()], 56 | result_file)) 57 | orders = bt_env.scan("Orders") 58 | result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \ 59 | .group_by("w, a").select("b.sum") 60 | result.insert_into("result") 61 | bt_env.execute("tumble time window batch") 62 | # cat /tmp/table_tumble_time_window_batch.csv 63 | # 1 64 | # 3 65 | # 4 66 | # 5 67 | # 2 68 | # 4 69 | 70 | 71 | if __name__ == '__main__': 72 | tumble_row_window_batch() 73 | # tumble_time_window_batch() 74 | -------------------------------------------------------------------------------- /table/batch/union.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def union_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = os.getcwd() + "/tmp/table_union_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (1, "1a", "1laa"), (1, "1b", "1bb")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["a", "b", "c"]).select("a, b, c") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.union(right) 27 | result.insert_into("result") 28 | bt_env.execute("union batch") 29 | # cat /tmp/table_union_batch.csv 30 | # 1,1a,1laa 31 | # 1,1b,1bb 32 | # 1,3b,3bb 33 | # 2,,2bb 34 | # 2,2a,2aa 35 | # 3,,3aa 36 | # 4,4b,4bb 37 | # note : Unions two tables with duplicate records removed whatever the duplicate record from 38 | # the same table or the other. 
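    # For example, (1, '1a', '1laa') appears twice in the left table and (1, '1b', '1bb') appears
    # in both tables, yet each of them shows up only once in the result above; compare
    # union_all.py, which keeps all 9 input rows including the duplicates.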
39 | 40 | 41 | if __name__ == '__main__': 42 | union_batch() 43 | -------------------------------------------------------------------------------- /table/batch/union_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def union_all_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = os.getcwd() + "/tmp/table_union_all_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (1, "1a", "1laa"), (1, "1b", "1bb")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["a", "b", "c"]).select("a, b, c") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.union_all(right) 27 | result.insert_into("result") 28 | bt_env.execute("union all batch") 29 | # cat /tmp/table_union_all_batch.csv 30 | # 1,1a,1laa 31 | # 2,2a,2aa 32 | # 3,,3aa 33 | # 1,1a,1laa 34 | # 1,1b,1bb 35 | # 1,1b,1bb 36 | # 2,,2bb 37 | # 1,3b,3bb 38 | # 4,4b,4bb 39 | 40 | 41 | if __name__ == '__main__': 42 | union_all_batch() 43 | -------------------------------------------------------------------------------- /table/batch/where.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def where_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = os.getcwd() + "/../result/table_where_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = bt_env.scan("Orders") 30 | result = orders.where("a === 'b'") 31 | result.insert_into("result") 32 | bt_env.execute("where batch") 33 | # cat table/result/table_where_batch.csv 34 | # b,2,2,2013-01-01 00:24:13.0 35 | # b,4,5,2013-01-01 01:24:13.0 36 | 37 | 38 | if __name__ == '__main__': 39 | where_batch() 40 | -------------------------------------------------------------------------------- /table/javaudf/README.md: -------------------------------------------------------------------------------- 1 | # UDF 2 | This page helps users to use udf in pyflink 3 | 4 | ## Build UDF 5 | 6 | ### Scalar Function 7 | The example of Scalar Function lives in scalar-function. 
You need to build this code: 8 | 9 | ```shell 10 | cd scalar-function; mvn clean package 11 | ``` 12 | 13 | ### Table Function 14 | The example of Scalar Function lives in scalar-function. You need to build this code: 15 | 16 | ```shell 17 | cd table-function; mvn clean package 18 | ``` 19 | 20 | ### Aggregate Function 21 | The example of Scalar Function lives in scalar-function. You need to build this code: 22 | 23 | ```shell 24 | cd aggregate-function; mvn clean package 25 | ``` 26 | 27 | ## Run Java UDF In PyFlink 28 | 29 | ### [optional] Run In Local PVM(Python Virtual Machine) 30 | 1. put udf jar(scalar-function-1.0.jar, table-function-1.0.jar, aggregate-function-1.0.jar) in Python site-packages/pyflink/lib directory 31 | 2. use python interpreter to run the code in scalar_func_demo.py or table_func_demo.py or aggregate_func_demo.py 32 | 33 | ### [optional] Run Job In Flink Cluster 34 | 35 | 1. start flink cluster. You can start the standard alone flink cluster: 36 | 37 | ```shell 38 | bin/start-cluster.sh 39 | ``` 40 | 41 | you need to cd to directory of build-target in flink source code. 42 | 43 | 2. submit the python job: 44 | 45 | ```shell 46 | bin/flink run -py /table/javaudf/scalar_func_demo.py -jar 47 | ``` 48 | 49 | ```shell 50 | bin/flink run -py /table/javaudf/table_func_demo.py -jar 51 | ``` 52 | 53 | ```shell 54 | bin/flink run -py /table/javaudf/aggregate_func_demo.py.py -jar 55 | ``` 56 | -------------------------------------------------------------------------------- /table/javaudf/aggregate-function/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 20 | 22 | 23 | 4.0.0 24 | 25 | org.apache.flink.table 26 | aggregate-function 27 | 1.0 28 | 29 | jar 30 | 31 | 32 | 1.9.0 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | org.apache.flink 41 | flink-core 42 | ${table.version} 43 | provided 44 | 45 | 46 | org.apache.flink 47 | flink-java 48 | ${table.version} 49 | provided 50 | 51 | 52 | org.apache.flink 53 | flink-streaming-java_2.11 54 | ${table.version} 55 | provided 56 | 57 | 58 | org.apache.flink 59 | flink-table-common 60 | ${table.version} 61 | provided 62 | 63 | 64 | org.apache.flink 65 | flink-table-planner_2.11 66 | ${table.version} 67 | provided 68 | 69 | 70 | 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-compiler-plugin 75 | 3.1 76 | 77 | 1.8 78 | 1.8 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /table/javaudf/aggregate-function/src/main/java/com/pyflink/table/WeightedAvg.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table; 2 | 3 | import org.apache.flink.table.functions.AggregateFunction; 4 | 5 | import java.util.Iterator; 6 | 7 | /** 8 | * Weighted Average user-defined aggregate function. 
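 *
 * <p>The accumulator tracks sum(value * weight) and sum(weight); getValue returns
 * sum(value * weight) / sum(weight) (integer division), or null if no weight has been accumulated.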
9 | */ 10 | public class WeightedAvg extends AggregateFunction<Long, WeightedAvgAccum> { 11 | @Override 12 | public Long getValue(WeightedAvgAccum weightedAvgAccum) { 13 | if (weightedAvgAccum.count == 0) { 14 | return null; 15 | } else { 16 | return weightedAvgAccum.sum / weightedAvgAccum.count; 17 | } 18 | } 19 | 20 | @Override 21 | public WeightedAvgAccum createAccumulator() { 22 | return new WeightedAvgAccum(); 23 | } 24 | 25 | public void accumulate(WeightedAvgAccum acc, long iValue, int iWeight) { 26 | acc.sum += iValue * iWeight; 27 | acc.count += iWeight; 28 | } 29 | 30 | public void accumulate(WeightedAvgAccum acc, long iValue, long iWeight) { 31 | acc.sum += iValue * iWeight; 32 | acc.count += iWeight; 33 | } 34 | 35 | public void retract(WeightedAvgAccum acc, long iValue, int iWeight) { 36 | acc.sum -= iValue * iWeight; 37 | acc.count -= iWeight; 38 | } 39 | 40 | public void merge(WeightedAvgAccum acc, Iterable<WeightedAvgAccum> it) { 41 | Iterator<WeightedAvgAccum> iter = it.iterator(); 42 | while (iter.hasNext()) { 43 | WeightedAvgAccum other = iter.next(); 44 | acc.count += other.count; 45 | acc.sum += other.sum; 46 | } 47 | } 48 | 49 | public void resetAccumulator(WeightedAvgAccum acc) { 50 | acc.count = 0; 51 | acc.sum = 0L; 52 | } 53 | 54 | /** 55 | * Accumulator for WeightedAvg. 56 | */ 57 | public static class WeightedAvgAccum { 58 | public long sum = 0; 59 | public int count = 0; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /table/javaudf/aggregate_func_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def aggregate_func_python_table_api(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_table = bt_env.from_elements([("a", 1, 1), ("a", 2, 2), ("b", 3, 2), ("a", 5, 2)], 12 | ["user", "points", "level"]) 13 | 14 | result_file = "/tmp/aggregate_func_python_table_api.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | bt_env.register_table_sink("result", 18 | CsvTableSink(["a", "b"], 19 | [DataTypes.STRING(), 20 | DataTypes.BIGINT()], 21 | result_file)) 22 | bt_env.register_java_function("wAvg", "com.pyflink.table.WeightedAvg") 23 | result = source_table.group_by("user").select("user, wAvg(points, level) as avgPoints") 24 | result.insert_into("result") 25 | bt_env.execute("aggregate func python table api") 26 | # cat /tmp/aggregate_func_python_table_api.csv 27 | # a,3 28 | # b,3 29 | 30 | 31 | def aggregate_func_python_sql_api(): 32 | b_env = ExecutionEnvironment.get_execution_environment() 33 | b_env.set_parallelism(1) 34 | bt_env = BatchTableEnvironment.create(b_env) 35 | source_table = bt_env.from_elements([("a", 1, 1), ("a", 2, 2), ("b", 3, 2), ("a", 5, 2)], 36 | ["user", "points", "level"]) 37 | 38 | result_file = "/tmp/aggregate_func_python_sql_api.csv" 39 | if os.path.exists(result_file): 40 | os.remove(result_file) 41 | bt_env.register_table_sink("result", 42 | CsvTableSink(["a", "b"], 43 | [DataTypes.STRING(), 44 | DataTypes.BIGINT()], 45 | result_file)) 46 | 47 | bt_env.register_java_function("wAvg", "com.pyflink.table.WeightedAvg") 48 | bt_env.register_table("userScores", source_table) 49 | result = bt_env.sql_query("SELECT user, wAvg(points, level) AS avgPoints FROM userScores GROUP BY user") 50 | result.insert_into("result") 51 |
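    # insert_into only declares the write to the registered "result" sink; nothing runs until
    # execute() is called below, and the WeightedAvg jar built from aggregate-function has to be
    # on the classpath (see the javaudf README).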
bt_env.execute("aggregate func python sql api") 52 | # cat /tmp/aggregate_func_python_sql_api.csv 53 | # a,3 54 | # b,3 55 | 56 | 57 | if __name__ == '__main__': 58 | aggregate_func_python_table_api() 59 | # aggregate_func_python_sql_api() 60 | -------------------------------------------------------------------------------- /table/javaudf/scalar-function/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 20 | 22 | 23 | 4.0.0 24 | 25 | org.apache.flink.table 26 | scalar-function 27 | 1.0 28 | 29 | jar 30 | 31 | 32 | 1.9.0 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | org.apache.flink 41 | flink-core 42 | ${table.version} 43 | provided 44 | 45 | 46 | org.apache.flink 47 | flink-java 48 | ${table.version} 49 | provided 50 | 51 | 52 | org.apache.flink 53 | flink-streaming-java_2.11 54 | ${table.version} 55 | provided 56 | 57 | 58 | org.apache.flink 59 | flink-table-common 60 | ${table.version} 61 | provided 62 | 63 | 64 | org.apache.flink 65 | flink-table-planner_2.11 66 | ${table.version} 67 | provided 68 | 69 | 70 | 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-compiler-plugin 75 | 3.1 76 | 77 | 1.8 78 | 1.8 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /table/javaudf/scalar-function/src/main/java/com/pyflink/table/HashCode.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table; 2 | 3 | import org.apache.flink.table.functions.ScalarFunction; 4 | 5 | public class HashCode extends ScalarFunction { 6 | private int factor = 12; 7 | 8 | public int eval(String s) { 9 | return s.hashCode() * factor; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /table/javaudf/scalar_func_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def scalar_func_python_table_api(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | 12 | source_table = bt_env.from_elements([("a", "aa"), ("b", "bb"), ("c", "cc")], ["a", "b"]).select("a, b") 13 | 14 | result_file = "/tmp/scalar_func_python_table_api.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | bt_env.register_table_sink("result", 18 | CsvTableSink(["a", "b", "c"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT()], 22 | result_file)) 23 | 24 | # register the java scalar function 25 | bt_env.register_java_function("hashCode", "com.pyflink.table.HashCode") 26 | 27 | # use the java scalar function in Python Table API 28 | result = source_table.select("a, a.hashCode(), hashCode(a)") 29 | result.insert_into("result") 30 | bt_env.execute("scalar func python table api") 31 | # cat /tmp/scalar_func_python_table_api.csv 32 | # a,1164,1164 33 | # b,1176,1176 34 | # c,1188,1188 35 | 36 | 37 | def scalar_func_python_sql(): 38 | b_env = ExecutionEnvironment.get_execution_environment() 39 | b_env.set_parallelism(1) 40 | bt_env = BatchTableEnvironment.create(b_env) 41 | 42 | source_table = bt_env.from_elements([("a", 1), ("b", 2), ("c", 3)], ["a", "b"]).select("a, b") 43 | 44 | result_file = "/tmp/scalar_func_python_sql.csv" 45 | if os.path.exists(result_file): 46 | os.remove(result_file) 47 | bt_env.register_table_sink("result", 48 
| CsvTableSink(["a", "b"], 49 | [DataTypes.STRING(), 50 | DataTypes.INT()], 51 | result_file)) 52 | 53 | # register the java scalar function 54 | bt_env.register_java_function("hashCode", "com.pyflink.table.HashCode") 55 | 56 | # register the table for using in the sql query 57 | bt_env.register_table("MyTable", source_table) 58 | 59 | result = bt_env.sql_query("SELECT a, hashCode(a) FROM MyTable") 60 | result.insert_into("result") 61 | bt_env.execute("scalar func python sql") 62 | # cat /tmp/scalar_func_python_sql.csv 63 | # a,1164 64 | # b,1176 65 | # c,1188 66 | 67 | 68 | if __name__ == '__main__': 69 | scalar_func_python_table_api() 70 | # scalar_func_python_sql() 71 | -------------------------------------------------------------------------------- /table/javaudf/table-function/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 20 | 22 | 23 | 4.0.0 24 | 25 | org.apache.flink.table 26 | table-function 27 | 1.0 28 | 29 | jar 30 | 31 | 32 | 1.9.0 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | org.apache.flink 41 | flink-core 42 | ${table.version} 43 | provided 44 | 45 | 46 | org.apache.flink 47 | flink-java 48 | ${table.version} 49 | provided 50 | 51 | 52 | org.apache.flink 53 | flink-streaming-java_2.11 54 | ${table.version} 55 | provided 56 | 57 | 58 | org.apache.flink 59 | flink-table-common 60 | ${table.version} 61 | provided 62 | 63 | 64 | org.apache.flink 65 | flink-table-planner_2.11 66 | ${table.version} 67 | provided 68 | 69 | 70 | 71 | 72 | 73 | 74 | org.apache.maven.plugins 75 | maven-compiler-plugin 76 | 3.1 77 | 78 | 1.8 79 | 1.8 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /table/javaudf/table-function/src/main/java/com/pyflink/table/Split.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table; 2 | 3 | import org.apache.flink.api.java.tuple.Tuple2; 4 | import org.apache.flink.table.functions.TableFunction; 5 | 6 | public class Split extends TableFunction> { 7 | private String separator = " "; 8 | 9 | public void eval(String str) { 10 | for (String s : str.split(separator)) { 11 | collect(new Tuple2(s, s.length())); 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /table/javaudf/table_func_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def table_func_python_table_join_lateral_api(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | 12 | source_table = bt_env.from_elements([("a aa aaa", "aa"), ("b bb bbb", "bb"), ("c cc ccc", "cc")], 13 | ["a", "b"]).select("a, b") 14 | 15 | result_file = "/tmp/table_func_python_table_join_lateral_api.csv" 16 | if os.path.exists(result_file): 17 | os.remove(result_file) 18 | bt_env.register_table_sink("result", 19 | CsvTableSink(["a", "b", "c"], 20 | [DataTypes.STRING(), 21 | DataTypes.STRING(), 22 | DataTypes.INT()], 23 | result_file)) 24 | 25 | bt_env.register_java_function("split", "com.pyflink.table.Split") 26 | 27 | result = source_table.join_lateral("Split(a) as (word, length)").select("a, word, length") 28 | 29 | result.insert_into("result") 30 | 31 | bt_env.execute("table func python table join lateral api") 32 | # cat 
/tmp/table_func_python_table_join_lateral_api.csv 33 | # a aa aaa,a,1 34 | # a aa aaa,aa,2 35 | # a aa aaa,aaa,3 36 | # b bb bbb,b,1 37 | # b bb bbb,bb,2 38 | # b bb bbb,bbb,3 39 | # c cc ccc,c,1 40 | # c cc ccc,cc,2 41 | # c cc ccc,ccc,3 42 | 43 | 44 | def table_func_python_table_left_outer_join_lateral_api(): 45 | b_env = ExecutionEnvironment.get_execution_environment() 46 | b_env.set_parallelism(1) 47 | bt_env = BatchTableEnvironment.create(b_env) 48 | 49 | source_table = bt_env.from_elements([("a aa aaa", "aa"), ("b bb bbb", "bb"), ("c cc ccc", "cc")], 50 | ["a", "b"]).select("a, b") 51 | 52 | result_file = "/tmp/table_func_python_table_left_outer_join_lateral_api.csv" 53 | if os.path.exists(result_file): 54 | os.remove(result_file) 55 | bt_env.register_table_sink("result", 56 | CsvTableSink(["a", "b", "c"], 57 | [DataTypes.STRING(), 58 | DataTypes.STRING(), 59 | DataTypes.INT()], 60 | result_file)) 61 | 62 | bt_env.register_java_function("split", "com.pyflink.table.Split") 63 | 64 | result = source_table.left_outer_join_lateral("Split(a) as (word, length)").select("a, word, length") 65 | 66 | result.insert_into("result") 67 | 68 | bt_env.execute("table func python table left outer join lateral api") 69 | # cat /tmp/table_func_python_table_left_outer_join_lateral_api.csv 70 | # a aa aaa,a,1 71 | # a aa aaa,aa,2 72 | # a aa aaa,aaa,3 73 | # b bb bbb,b,1 74 | # b bb bbb,bb,2 75 | # b bb bbb,bbb,3 76 | # c cc ccc,c,1 77 | # c cc ccc,cc,2 78 | # c cc ccc,ccc,3 79 | 80 | 81 | def table_func_python_sql_join_lateral_api(): 82 | b_env = ExecutionEnvironment.get_execution_environment() 83 | b_env.set_parallelism(1) 84 | bt_env = BatchTableEnvironment.create(b_env) 85 | 86 | source_table = bt_env.from_elements([("a aa aaa", "aa"), ("b bb bbb", "bb"), ("c cc ccc", "cc")], 87 | ["a", "b"]).select("a, b") 88 | 89 | result_file = "/tmp/table_func_python_sql_join_lateral_api.csv" 90 | if os.path.exists(result_file): 91 | os.remove(result_file) 92 | bt_env.register_table_sink("result", 93 | CsvTableSink(["a", "b", "c"], 94 | [DataTypes.STRING(), 95 | DataTypes.STRING(), 96 | DataTypes.INT()], 97 | result_file)) 98 | 99 | bt_env.register_java_function("split", "com.pyflink.table.Split") 100 | bt_env.register_table("MyTable", source_table) 101 | 102 | result = bt_env.sql_query("SELECT a, word, length FROM MyTable, LATERAL TABLE(split(a)) as T(word, length)") 103 | 104 | result.insert_into("result") 105 | 106 | bt_env.execute("table func python sql join lateral api") 107 | # cat /tmp/table_func_python_sql_join_lateral_api.csv 108 | # a aa aaa,a,1 109 | # a aa aaa,aa,2 110 | # a aa aaa,aaa,3 111 | # b bb bbb,b,1 112 | # b bb bbb,bb,2 113 | # b bb bbb,bbb,3 114 | # c cc ccc,c,1 115 | # c cc ccc,cc,2 116 | # c cc ccc,ccc,3 117 | 118 | 119 | def table_func_python_sql_left_outer_join_lateral_api(): 120 | b_env = ExecutionEnvironment.get_execution_environment() 121 | b_env.set_parallelism(1) 122 | bt_env = BatchTableEnvironment.create(b_env) 123 | 124 | source_table = bt_env.from_elements([("a aa aaa", "aa"), ("b bb bbb", "bb"), ("c cc ccc", "cc")], 125 | ["a", "b"]).select("a, b") 126 | 127 | result_file = "/tmp/table_func_python_sql_left_outer_join_lateral_api.csv" 128 | if os.path.exists(result_file): 129 | os.remove(result_file) 130 | bt_env.register_table_sink("result", 131 | CsvTableSink(["a", "b", "c"], 132 | [DataTypes.STRING(), 133 | DataTypes.STRING(), 134 | DataTypes.INT()], 135 | result_file)) 136 | 137 | bt_env.register_java_function("split", "com.pyflink.table.Split") 138 | 
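    # the source table has to be registered under a name before SQL can reference it; the
    # LEFT JOIN LATERAL ... ON TRUE form below keeps every left row even if split(a) emits nothing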
bt_env.register_table("MyTable", source_table) 139 | 140 | result = bt_env.sql_query( 141 | "SELECT a, word, length FROM MyTable LEFT JOIN LATERAL TABLE(split(a)) as T(word, length) ON TRUE") 142 | 143 | result.insert_into("result") 144 | 145 | bt_env.execute("table func python sql left outer join lateral api") 146 | # cat /tmp/table_func_python_sql_left_outer_join_lateral_api.csv 147 | # a aa aaa,a,1 148 | # a aa aaa,aa,2 149 | # a aa aaa,aaa,3 150 | # b bb bbb,b,1 151 | # b bb bbb,bb,2 152 | # b bb bbb,bbb,3 153 | # c cc ccc,c,1 154 | # c cc ccc,cc,2 155 | # c cc ccc,ccc,3 156 | 157 | 158 | if __name__ == '__main__': 159 | table_func_python_table_join_lateral_api() 160 | # table_func_python_table_left_outer_join_lateral_api() 161 | # table_func_python_sql_join_lateral_api() 162 | # table_func_python_sql_left_outer_join_lateral_api() 163 | -------------------------------------------------------------------------------- /table/prepare_environment.py: -------------------------------------------------------------------------------- 1 | from utils import kafka_utils, elastic_search_utils 2 | 3 | 4 | def prepare_env(need_stream_source=False, need_upsert_sink=False): 5 | elastic_search_used_method = ['group_by_agg_streaming', 'distinct_agg_streaming'] 6 | 7 | if need_stream_source: 8 | topics = kafka_utils.list_topics() 9 | if 'user' not in topics: 10 | kafka_utils.create_topic('user') 11 | msgs = [{'a': 'a', 'b': 1, 'c': 1, 'time': '2013-01-01T00:14:13Z'}, 12 | {'a': 'b', 'b': 2, 'c': 2, 'time': '2013-01-01T00:24:13Z'}, 13 | {'a': 'a', 'b': 3, 'c': 3, 'time': '2013-01-01T00:34:13Z'}, 14 | {'a': 'a', 'b': 4, 'c': 4, 'time': '2013-01-01T01:14:13Z'}, 15 | {'a': 'b', 'b': 4, 'c': 5, 'time': '2013-01-01T01:24:13Z'}, 16 | {'a': 'a', 'b': 5, 'c': 2, 'time': '2013-01-01T01:34:13Z'}] 17 | for msg in msgs: 18 | kafka_utils.send_msg('user', msg) 19 | 20 | if need_upsert_sink: 21 | mapping = ''' 22 | { 23 | "mappings" : { 24 | "pyflink" : { 25 | "properties" : { 26 | "a" : { 27 | "type" : "text", 28 | "fields" : { 29 | "keyword" : { 30 | "type" : "keyword", 31 | "ignore_above" : 256 32 | } 33 | } 34 | }, 35 | "b" : { 36 | "type" : "text", 37 | "fields" : { 38 | "keyword" : { 39 | "type" : "keyword", 40 | "ignore_above" : 256 41 | } 42 | } 43 | } 44 | } 45 | } 46 | } 47 | } 48 | ''' 49 | for method in elastic_search_used_method: 50 | elastic_search_utils.delete_index(method) 51 | elastic_search_utils.create_index(method, mapping) 52 | -------------------------------------------------------------------------------- /table/resources/table_orders.csv: -------------------------------------------------------------------------------- 1 | a,1,1,2013-01-01 00:14:13 2 | b,2,2,2013-01-01 00:24:13 3 | a,3,3,2013-01-01 00:34:13 4 | a,4,4,2013-01-01 01:14:13 5 | b,4,5,2013-01-01 01:24:13 6 | a,5,2,2013-01-01 01:34:13 -------------------------------------------------------------------------------- /table/streaming/add_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes, EnvironmentSettings 5 | 6 | 7 | def add_columns_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | # use blink table planner 11 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 12 | 
.in_streaming_mode().use_blink_planner().build()) 13 | # use flink table planner 14 | # st_env = StreamTableEnvironment.create(s_env) 15 | source_file = os.getcwd() + "/../resources/table_orders.csv" 16 | result_file = "/tmp/table_add_columns_streaming.csv" 17 | if os.path.exists(result_file): 18 | os.remove(result_file) 19 | st_env.register_table_source("Orders", 20 | CsvTableSource(source_file, 21 | ["a", "b", "c", "rowtime"], 22 | [DataTypes.STRING(), 23 | DataTypes.INT(), 24 | DataTypes.INT(), 25 | DataTypes.TIMESTAMP()])) 26 | st_env.register_table_sink("result", 27 | CsvTableSink(["a", "b", "c", "rowtime", "d"], 28 | [DataTypes.STRING(), 29 | DataTypes.INT(), 30 | DataTypes.INT(), 31 | DataTypes.TIMESTAMP(), 32 | DataTypes.STRING()], 33 | result_file)) 34 | orders = st_env.scan("Orders") 35 | result = orders.add_columns("concat(a, '_sunny') as d") 36 | result.insert_into("result") 37 | st_env.execute("add columns streaming") 38 | # cat /tmp/table_add_columns_streaming.csv 39 | # a,1,1,2013-01-01 00:14:13.0,a_sunny 40 | # b,2,2,2013-01-01 00:24:13.0,b_sunny 41 | # a,3,3,2013-01-01 00:34:13.0,a_sunny 42 | # a,4,4,2013-01-01 01:14:13.0,a_sunny 43 | # b,4,5,2013-01-01 01:24:13.0,b_sunny 44 | # a,5,2,2013-01-01 01:34:13.0,a_sunny 45 | 46 | 47 | if __name__ == '__main__': 48 | add_columns_streaming() 49 | -------------------------------------------------------------------------------- /table/streaming/add_or_replace_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes, EnvironmentSettings 5 | 6 | 7 | def add_or_replace_columns_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | # use blink table planner 11 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 12 | .in_streaming_mode().use_blink_planner().build()) 13 | # use flink table planner 14 | # st_env = StreamTableEnvironment.create(s_env) source_file = os.getcwd() + "/../resources/table_orders.csv" 15 | result_file = "/tmp/table_add_or_replace_columns_streaming.csv" 16 | source_file = os.getcwd() + "/../resources/table_orders.csv" 17 | if os.path.exists(result_file): 18 | os.remove(result_file) 19 | st_env.register_table_source("Orders", 20 | CsvTableSource(source_file, 21 | ["a", "b", "c", "rowtime"], 22 | [DataTypes.STRING(), 23 | DataTypes.INT(), 24 | DataTypes.INT(), 25 | DataTypes.TIMESTAMP()])) 26 | st_env.register_table_sink("result", 27 | CsvTableSink(["a", "b", "c", "rowtime"], 28 | [DataTypes.STRING(), 29 | DataTypes.INT(), 30 | DataTypes.INT(), 31 | DataTypes.TIMESTAMP()], 32 | result_file)) 33 | orders = st_env.scan("Orders") 34 | result = orders.add_or_replace_columns("concat(a, '_sunny') as a") 35 | result.insert_into("result") 36 | st_env.execute("add or replace columns streaming") 37 | # cat /tmp/table_add_or_replace_columns_streaming.csv 38 | # a_sunny,1,1,2013-01-01 00:14:13.0 39 | # b_sunny,2,2,2013-01-01 00:24:13.0 40 | # a_sunny,3,3,2013-01-01 00:34:13.0 41 | # a_sunny,4,4,2013-01-01 01:14:13.0 42 | # b_sunny,4,5,2013-01-01 01:24:13.0 43 | # a_sunny,5,2,2013-01-01 01:34:13.0 44 | 45 | 46 | if __name__ == '__main__': 47 | add_or_replace_columns_streaming() 48 | -------------------------------------------------------------------------------- /table/streaming/alias.py: 
-------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes, EnvironmentSettings 3 | import os 4 | 5 | 6 | def alias_streaming(): 7 | s_env = StreamExecutionEnvironment.get_execution_environment() 8 | s_env.set_parallelism(1) 9 | # use blink table planner 10 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 11 | .in_streaming_mode().use_blink_planner().build()) 12 | # use flink table planner 13 | # st_env = StreamTableEnvironment.create(s_env) 14 | source_file = os.getcwd() + "/../resources/table_orders.csv" 15 | result_file = "/tmp/table_alias_streaming.csv" 16 | if os.path.exists(result_file): 17 | os.remove(result_file) 18 | st_env.register_table_source("Orders", 19 | CsvTableSource(source_file, 20 | ["a", "b", "c", "rowtime"], 21 | [DataTypes.STRING(), 22 | DataTypes.INT(), 23 | DataTypes.INT(), 24 | DataTypes.TIMESTAMP()])) 25 | st_env.register_table_sink("result", 26 | CsvTableSink(["a", "b", "c", "rowtime"], 27 | [DataTypes.STRING(), 28 | DataTypes.INT(), 29 | DataTypes.INT(), 30 | DataTypes.TIMESTAMP()], 31 | result_file)) 32 | orders = st_env.scan("Orders") 33 | result = orders.alias("x, y, z, t").select("x, y, z, t") 34 | result.insert_into("result") 35 | st_env.execute("alias streaming") 36 | # cat /tmp/table_alias_streaming.csv 37 | # a,1,1,2013-01-01 00:14:13.0 38 | # b,2,2,2013-01-01 00:24:13.0 39 | # a,3,3,2013-01-01 00:34:13.0 40 | # a,4,4,2013-01-01 01:14:13.0 41 | # b,4,5,2013-01-01 01:24:13.0 42 | # a,5,2,2013-01-01 01:34:13.0 43 | 44 | 45 | if __name__ == '__main__': 46 | alias_streaming() 47 | -------------------------------------------------------------------------------- /table/streaming/distinct.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, DataTypes 5 | 6 | from table.user_defined_sources_and_sinks.TestRetractSink import TestRetractSink 7 | 8 | 9 | def distinct_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | st_env = StreamTableEnvironment.create(s_env) 13 | source_file = os.getcwd() + "/../resources/table_orders.csv" 14 | st_env.register_table_source("Orders", 15 | CsvTableSource(source_file, 16 | ["a", "b", "c", "rowtime"], 17 | [DataTypes.STRING(), 18 | DataTypes.INT(), 19 | DataTypes.INT(), 20 | DataTypes.TIMESTAMP()])) 21 | 22 | orders = st_env.scan("Orders") 23 | result = orders.select("a, b").distinct() 24 | # use custom retract sink connector 25 | sink = TestRetractSink(["a", "b"], 26 | [DataTypes.STRING(), 27 | DataTypes.INT()]) 28 | st_env.register_table_sink("sink", sink) 29 | result.insert_into("sink") 30 | st_env.execute("distinct streaming") 31 | # (true, a, 1) 32 | # (true, b, 2) 33 | # (true, a, 3) 34 | # (true, a, 4) 35 | # (true, b, 4) 36 | # (true, a, 5) 37 | 38 | 39 | if __name__ == '__main__': 40 | distinct_streaming() 41 | -------------------------------------------------------------------------------- /table/streaming/distinct_agg.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 2 | from pyflink.table import StreamTableEnvironment, DataTypes, 
EnvironmentSettings 3 | from pyflink.table.descriptors import Schema, Rowtime, Elasticsearch, Json, Kafka 4 | from pyflink.table.window import Tumble 5 | 6 | 7 | def distinct_agg_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 11 | # use blink table planner 12 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 13 | .in_streaming_mode().use_blink_planner().build()) 14 | # use flink table planner 15 | # st_env = StreamTableEnvironment.create(s_env) 16 | st_env \ 17 | .connect( # declare the external system to connect to 18 | Kafka() 19 | .version("0.11") 20 | .topic("user") 21 | .start_from_earliest() 22 | .property("zookeeper.connect", "localhost:2181") 23 | .property("bootstrap.servers", "localhost:9092") 24 | ) \ 25 | .with_format( # declare a format for this system 26 | Json() 27 | .fail_on_missing_field(True) 28 | .json_schema( 29 | "{" 30 | " type: 'object'," 31 | " properties: {" 32 | " a: {" 33 | " type: 'string'" 34 | " }," 35 | " b: {" 36 | " type: 'string'" 37 | " }," 38 | " c: {" 39 | " type: 'string'" 40 | " }," 41 | " time: {" 42 | " type: 'string'," 43 | " format: 'date-time'" 44 | " }" 45 | " }" 46 | "}" 47 | ) 48 | ) \ 49 | .with_schema( # declare the schema of the table 50 | Schema() 51 | .field("rowtime", DataTypes.TIMESTAMP()) 52 | .rowtime( 53 | Rowtime() 54 | .timestamps_from_field("time") 55 | .watermarks_periodic_bounded(60000)) 56 | .field("a", DataTypes.STRING()) 57 | .field("b", DataTypes.STRING()) 58 | .field("c", DataTypes.STRING()) 59 | ) \ 60 | .in_append_mode() \ 61 | .register_table_source("Orders") 62 | st_env.connect( 63 | Elasticsearch() 64 | .version("6") 65 | .host("localhost", 9200, "http") 66 | .index("distinct_agg_streaming") 67 | .document_type('pyflink') 68 | .key_delimiter("_") 69 | .key_null_literal("null") 70 | .failure_handler_ignore() 71 | .disable_flush_on_checkpoint() 72 | .bulk_flush_max_actions(2) 73 | .bulk_flush_max_size("1 mb") 74 | .bulk_flush_interval(5000) 75 | ) \ 76 | .with_schema( 77 | Schema() 78 | .field("a", DataTypes.STRING()) 79 | .field("b", DataTypes.STRING()) 80 | ) \ 81 | .with_format( 82 | Json() 83 | .derive_schema() 84 | ) \ 85 | .in_upsert_mode() \ 86 | .register_table_sink("result") 87 | orders = st_env.scan("Orders") 88 | result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \ 89 | .group_by("a, w").select("a, b.max.distinct as d") 90 | result.insert_into("result") 91 | st_env.execute("distinct agg streaming") 92 | # curl -X GET 'http://localhost:9200/distinct_agg_streaming/_search' 93 | # { 94 | # "took": 3, 95 | # "timed_out": false, 96 | # "_shards": { 97 | # "total": 5, 98 | # "successful": 5, 99 | # "skipped": 0, 100 | # "failed": 0 101 | # }, 102 | # "hits": { 103 | # "total": 5, 104 | # "max_score": 1, 105 | # "hits": [ 106 | # { 107 | # "_index": "distinct_agg_streaming", 108 | # "_type": "pyflink", 109 | # "_id": "3zfsHWwBHRafi3KHm2Ve", 110 | # "_score": 1, 111 | # "_source": { 112 | # "a": "a", 113 | # "b": "3" 114 | # } 115 | # }, 116 | # { 117 | # "_index": "distinct_agg_streaming", 118 | # "_type": "pyflink", 119 | # "_id": "4TfsHWwBHRafi3KHrmU-", 120 | # "_score": 1, 121 | # "_source": { 122 | # "a": "b", 123 | # "b": "4" 124 | # } 125 | # }, 126 | # { 127 | # "_index": "distinct_agg_streaming", 128 | # "_type": "pyflink", 129 | # "_id": "4DfsHWwBHRafi3KHm2Ve", 130 | # "_score": 1, 131 | # "_source": 
{ 132 | # "a": "a", 133 | # "b": "4" 134 | # } 135 | # }, 136 | # { 137 | # "_index": "distinct_agg_streaming", 138 | # "_type": "pyflink", 139 | # "_id": "3TfsHWwBHRafi3KHm2Uf", 140 | # "_score": 1, 141 | # "_source": { 142 | # "a": "a", 143 | # "b": "1" 144 | # } 145 | # }, 146 | # { 147 | # "_index": "distinct_agg_streaming", 148 | # "_type": "pyflink", 149 | # "_id": "3jfsHWwBHRafi3KHm2Uf", 150 | # "_score": 1, 151 | # "_source": { 152 | # "a": "b", 153 | # "b": "2" 154 | # } 155 | # } 156 | # ] 157 | # } 158 | # } 159 | 160 | 161 | if __name__ == '__main__': 162 | from table.prepare_environment import prepare_env 163 | prepare_env(need_stream_source=True, need_upsert_sink=True) 164 | distinct_agg_streaming() 165 | -------------------------------------------------------------------------------- /table/streaming/drop_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes, EnvironmentSettings 5 | 6 | 7 | def drop_columns_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | # use blink table planner 11 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 12 | .in_streaming_mode().use_blink_planner().build()) 13 | # use flink table planner 14 | # st_env = StreamTableEnvironment.create(s_env) 15 | source_file = os.getcwd() + "/../resources/table_orders.csv" 16 | result_file = "/tmp/table_drop_columns_streaming.csv" 17 | if os.path.exists(result_file): 18 | os.remove(result_file) 19 | st_env.register_table_source("Orders", 20 | CsvTableSource(source_file, 21 | ["a", "b", "c", "rowtime"], 22 | [DataTypes.STRING(), 23 | DataTypes.INT(), 24 | DataTypes.INT(), 25 | DataTypes.TIMESTAMP()])) 26 | st_env.register_table_sink("result", 27 | CsvTableSink(["a", "b", "rowtime"], 28 | [DataTypes.STRING(), 29 | DataTypes.INT(), 30 | DataTypes.TIMESTAMP()], 31 | result_file)) 32 | orders = st_env.scan("Orders") 33 | result = orders.drop_columns("c") 34 | result.insert_into("result") 35 | st_env.execute("drop columns streaming") 36 | # cat /tmp/table_drop_columns_streaming.csv 37 | # a,1,2013-01-01 00:14:13.0 38 | # b,2,2013-01-01 00:24:13.0 39 | # a,3,2013-01-01 00:34:13.0 40 | # a,4,2013-01-01 01:14:13.0 41 | # b,4,2013-01-01 01:24:13.0 42 | # a,5,2013-01-01 01:34:13.0 43 | 44 | 45 | if __name__ == '__main__': 46 | drop_columns_streaming() 47 | -------------------------------------------------------------------------------- /table/streaming/filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes, EnvironmentSettings 5 | 6 | 7 | def filter_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | # use blink table planner 11 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 12 | .in_streaming_mode().use_blink_planner().build()) 13 | # use flink table planner 14 | # st_env = StreamTableEnvironment.create(s_env) 15 | source_file = os.getcwd() + "/../resources/table_orders.csv" 16 | result_file = "/tmp/table_filter_streaming.csv" 17 | if os.path.exists(result_file): 18 | 
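        # remove the result of any previous run so the CsvTableSink can write a fresh file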
os.remove(result_file) 19 | st_env.register_table_source("Orders", 20 | CsvTableSource(source_file, 21 | ["a", "b", "c", "rowtime"], 22 | [DataTypes.STRING(), 23 | DataTypes.INT(), 24 | DataTypes.INT(), 25 | DataTypes.TIMESTAMP()])) 26 | st_env.register_table_sink("result", 27 | CsvTableSink(["a", "b", "c", "rowtime"], 28 | [DataTypes.STRING(), 29 | DataTypes.INT(), 30 | DataTypes.INT(), 31 | DataTypes.TIMESTAMP()], 32 | result_file)) 33 | orders = st_env.scan("Orders") 34 | result = orders.filter("b % 2 === 0") 35 | result.insert_into("result") 36 | st_env.execute("filter streaming") 37 | # cat /tmp/table_filter_streaming.csv 38 | # b,2,2,2013-01-01 00:24:13.0 39 | # a,4,4,2013-01-01 01:14:13.0 40 | # b,4,5,2013-01-01 01:24:13.0 41 | 42 | 43 | if __name__ == '__main__': 44 | filter_streaming() 45 | -------------------------------------------------------------------------------- /table/streaming/full_outer_join.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import StreamTableEnvironment, EnvironmentSettings, DataTypes 3 | 4 | from table.user_defined_sources_and_sinks.TestRetractSink import TestRetractSink 5 | 6 | 7 | def full_outer_join_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | # use blink table planner 11 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 12 | .in_streaming_mode().use_blink_planner().build()) 13 | # use flink table planner 14 | # st_env = StreamTableEnvironment.create(s_env) 15 | left = st_env.from_elements( 16 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 17 | ["a", "b", "c"]).select("a, b, c") 18 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 19 | ["d", "e", "f"]).select("d, e, f") 20 | 21 | result = left.full_outer_join(right, "a = d").select("a, b, e") 22 | # use custom retract sink connector 23 | sink = TestRetractSink(["a", "b", "c"], 24 | [DataTypes.BIGINT(), 25 | DataTypes.STRING(), 26 | DataTypes.STRING()]) 27 | st_env.register_table_sink("sink", sink) 28 | result.insert_into("sink") 29 | st_env.execute("full outer join streaming") 30 | # (true, 1, 1a, null) 31 | # (true, 2, 2a, null) 32 | # (true, 3, null, null) 33 | # (true, 2, 4b, null) 34 | # (true, 5, 5a, null) 35 | # (false, 1, 1a, null) 36 | # (true, 1, 1a, 1b) 37 | # (false, 2, 2a, null) 38 | # (false, 2, 4b, null) 39 | # (true, 2, 2a, null) 40 | # (true, 2, 4b, null) 41 | # (true, 1, 1a, 3b) 42 | # (true, null, null, 4b) 43 | 44 | 45 | if __name__ == '__main__': 46 | full_outer_join_streaming() 47 | -------------------------------------------------------------------------------- /table/streaming/group_by_agg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, DataTypes, EnvironmentSettings 5 | from pyflink.table.descriptors import Elasticsearch, Schema, Json 6 | 7 | 8 | def group_by_agg_streaming(): 9 | s_env = StreamExecutionEnvironment.get_execution_environment() 10 | s_env.set_parallelism(1) 11 | # use blink table planner 12 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 13 | 
.in_streaming_mode().use_blink_planner().build()) 14 | # use flink table planner 15 | # st_env = StreamTableEnvironment.create(s_env) 16 | source_file = os.getcwd() + "/../resources/table_orders.csv" 17 | st_env.register_table_source("Orders", 18 | CsvTableSource(source_file, 19 | ["a", "b", "c", "rowtime"], 20 | [DataTypes.STRING(), 21 | DataTypes.INT(), 22 | DataTypes.INT(), 23 | DataTypes.TIMESTAMP()])) 24 | st_env.connect( 25 | Elasticsearch() 26 | .version("6") 27 | .host("localhost", 9200, "http") 28 | .index("group_by_agg_streaming") 29 | .document_type('pyflink') 30 | .key_delimiter("_") 31 | .key_null_literal("null") 32 | .failure_handler_ignore() 33 | .disable_flush_on_checkpoint() 34 | .bulk_flush_max_actions(2) 35 | .bulk_flush_max_size("1 mb") 36 | .bulk_flush_interval(5000) 37 | ) \ 38 | .with_schema( 39 | Schema() 40 | .field("a", DataTypes.STRING()) 41 | .field("b", DataTypes.STRING()) 42 | ) \ 43 | .with_format( 44 | Json() 45 | .derive_schema() 46 | ) \ 47 | .in_upsert_mode() \ 48 | .register_table_sink("result") 49 | 50 | orders = st_env.scan("Orders") 51 | groub_by_table = orders.group_by("a").select("a, b.sum as d") 52 | # Because the schema of index user in elasticsearch is 53 | # {"a":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}, 54 | # "b":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}} 55 | # so we need to cast the type in our demo. 56 | st_env.register_table("group_table", groub_by_table) 57 | result = st_env.sql_query("SELECT a, CAST(d AS VARCHAR) from group_table") 58 | result.insert_into("result") 59 | st_env.execute("group by agg streaming") 60 | # curl -X GET 'http://localhost:9200/group_by_agg_streaming/_search' 61 | # { 62 | # "took": 2, 63 | # "timed_out": false, 64 | # "_shards": { 65 | # "total": 5, 66 | # "successful": 5, 67 | # "skipped": 0, 68 | # "failed": 0 69 | # }, 70 | # "hits": { 71 | # "total": 2, 72 | # "max_score": 1, 73 | # "hits": [ 74 | # { 75 | # "_index": "group_by_agg_streaming", 76 | # "_type": "group_by_agg_streaming", 77 | # "_id": "b", 78 | # "_score": 1, 79 | # "_source": { 80 | # "a": "b", 81 | # "b": "6" 82 | # } 83 | # }, 84 | # { 85 | # "_index": "group_by_agg_streaming", 86 | # "_type": "group_by_agg_streaming", 87 | # "_id": "a", 88 | # "_score": 1, 89 | # "_source": { 90 | # "a": "a", 91 | # "b": "13" 92 | # } 93 | # } 94 | # ] 95 | # } 96 | # } 97 | 98 | 99 | if __name__ == '__main__': 100 | from table.prepare_environment import prepare_env 101 | prepare_env(need_upsert_sink=True) 102 | group_by_agg_streaming() 103 | -------------------------------------------------------------------------------- /table/streaming/group_by_window_agg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes, EnvironmentSettings 5 | from pyflink.table.descriptors import Schema, Rowtime, Json, Kafka 6 | from pyflink.table.window import Tumble 7 | 8 | 9 | def group_by_window_agg_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 13 | # use blink table planner 14 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 15 | .in_streaming_mode().use_blink_planner().build()) 16 | # use flink table planner 17 | # 
st_env = StreamTableEnvironment.create(s_env) 18 | result_file = "/tmp/table_group_by_window_agg_streaming.csv" 19 | if os.path.exists(result_file): 20 | os.remove(result_file) 21 | st_env \ 22 | .connect( # declare the external system to connect to 23 | Kafka() 24 | .version("0.11") 25 | .topic("user") 26 | .start_from_earliest() 27 | .property("zookeeper.connect", "localhost:2181") 28 | .property("bootstrap.servers", "localhost:9092") 29 | ) \ 30 | .with_format( # declare a format for this system 31 | Json() 32 | .fail_on_missing_field(True) 33 | .json_schema( 34 | "{" 35 | " type: 'object'," 36 | " properties: {" 37 | " a: {" 38 | " type: 'string'" 39 | " }," 40 | " b: {" 41 | " type: 'string'" 42 | " }," 43 | " c: {" 44 | " type: 'string'" 45 | " }," 46 | " time: {" 47 | " type: 'string'," 48 | " format: 'date-time'" 49 | " }" 50 | " }" 51 | "}" 52 | ) 53 | ) \ 54 | .with_schema( # declare the schema of the table 55 | Schema() 56 | .field("rowtime", DataTypes.TIMESTAMP()) 57 | .rowtime( 58 | Rowtime() 59 | .timestamps_from_field("time") 60 | .watermarks_periodic_bounded(60000)) 61 | .field("a", DataTypes.STRING()) 62 | .field("b", DataTypes.STRING()) 63 | .field("c", DataTypes.STRING()) 64 | ) \ 65 | .in_append_mode() \ 66 | .register_table_source("source") 67 | 68 | st_env.register_table_sink("result", 69 | CsvTableSink(["a", "b"], 70 | [DataTypes.STRING(), 71 | DataTypes.STRING()], 72 | result_file)) 73 | 74 | st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \ 75 | .group_by("w, a") \ 76 | .select("a, max(b)").insert_into("result") 77 | 78 | st_env.execute("group by window agg streaming") 79 | # cat /tmp/table_group_by_window_agg_streaming.csv 80 | # a,3 81 | # b,2 82 | 83 | 84 | if __name__ == '__main__': 85 | from table.prepare_environment import prepare_env 86 | prepare_env(need_stream_source=True) 87 | group_by_window_agg_streaming() 88 | -------------------------------------------------------------------------------- /table/streaming/in.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import StreamTableEnvironment, DataTypes 3 | 4 | from table.user_defined_sources_and_sinks.TestRetractSink import TestRetractSink 5 | 6 | 7 | def in_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | left = st_env.from_elements( 12 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")], 13 | ["a", "b", "c"]).select("a, b, c") 14 | right = st_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 15 | ["a", "b", "c"]).select("a") 16 | 17 | result = left.where("a.in(%s)" % right).select("b, c") 18 | # another way 19 | # st_env.register_table("RightTable", right) 20 | # result = left.where("a.in(RightTable)") 21 | 22 | # use custom retract sink connector 23 | sink = TestRetractSink(["a", "b"], 24 | [DataTypes.STRING(), 25 | DataTypes.STRING()]) 26 | st_env.register_table_sink("sink", sink) 27 | result.insert_into("sink") 28 | st_env.execute("in streaming") 29 | # (true, ra, raa) 30 | # (true, lb, lbb) 31 | # (true, lb, lbb) 32 | # (true,, lcc) 33 | 34 | 35 | if __name__ == '__main__': 36 | in_streaming() 37 | -------------------------------------------------------------------------------- /table/streaming/inner_join.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def inner_join_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | result_file = "/tmp/table_inner_join_streaming.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = st_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["d", "e", "f"]).select("d, e, f") 19 | st_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.join(right).where("a = d").select("a, b, e") 27 | result.insert_into("result") 28 | st_env.execute("inner join streaming") 29 | # cat /tmp/table_inner_join_streaming.csv 30 | # 1,1a,1b 31 | # 2,4b, 32 | # 2,2a, 33 | # 1,1a,3b 34 | 35 | 36 | if __name__ == '__main__': 37 | inner_join_streaming() 38 | -------------------------------------------------------------------------------- /table/streaming/left_outer_join.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import StreamTableEnvironment, DataTypes 3 | 4 | from table.user_defined_sources_and_sinks.TestRetractSink import TestRetractSink 5 | 6 | 7 | def left_outer_join_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | left = st_env.from_elements( 12 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 13 | ["a", "b", "c"]).select("a, b, c") 14 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 15 | ["d", "e", "f"]).select("d, e, f") 16 | 17 | result = left.left_outer_join(right, "a = d").select("a, b, e") 18 | # use custom retract sink connector 19 | sink = TestRetractSink(["a", "b", "c"], 20 | [DataTypes.BIGINT(), 21 | DataTypes.STRING(), 22 | DataTypes.STRING()]) 23 | st_env.register_table_sink("sink", sink) 24 | result.insert_into("sink") 25 | st_env.execute("left outer join streaming") 26 | # (true, 1, 1a, null) 27 | # (true, 2, 2a, null) 28 | # (true, 3, null, null) 29 | # (true, 2, 4b, null) 30 | # (true, 5, 5a, null) 31 | # (false, 1, 1a, null) 32 | # (true, 1, 1a, 1b) 33 | # (false, 2, 4b, null) 34 | # (true, 2, 4b, null) 35 | # (false, 2, 2a, null) 36 | # (true, 2, 2a, null) 37 | # (true, 1, 1a, 3b) 38 | 39 | 40 | if __name__ == '__main__': 41 | left_outer_join_streaming() 42 | -------------------------------------------------------------------------------- /table/streaming/over_window_agg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | from pyflink.table.descriptors import Kafka, Json, Schema, Rowtime 6 | from pyflink.table.window import Over 7 | 
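# Unlike a group window, an over window emits one aggregated value per input row, computed over
# a range of neighbouring rows (below: rows with the same key "a" within the preceding 30 minutes
# of event time).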
8 | 9 | def over_window_agg_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 13 | st_env = StreamTableEnvironment.create(s_env) 14 | result_file = "/tmp/table_over_window_agg_streaming.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | st_env \ 18 | .connect( # declare the external system to connect to 19 | Kafka() 20 | .version("0.11") 21 | .topic("user") 22 | .start_from_earliest() 23 | .property("zookeeper.connect", "localhost:2181") 24 | .property("bootstrap.servers", "localhost:9092") 25 | ) \ 26 | .with_format( # declare a format for this system 27 | Json() 28 | .fail_on_missing_field(True) 29 | .json_schema( 30 | "{" 31 | " type: 'object'," 32 | " properties: {" 33 | " a: {" 34 | " type: 'string'" 35 | " }," 36 | " b: {" 37 | " type: 'string'" 38 | " }," 39 | " c: {" 40 | " type: 'string'" 41 | " }," 42 | " time: {" 43 | " type: 'string'," 44 | " format: 'date-time'" 45 | " }" 46 | " }" 47 | "}" 48 | ) 49 | ) \ 50 | .with_schema( # declare the schema of the table 51 | Schema() 52 | .field("rowtime", DataTypes.TIMESTAMP()) 53 | .rowtime( 54 | Rowtime() 55 | .timestamps_from_field("time") 56 | .watermarks_periodic_bounded(60000)) 57 | .field("a", DataTypes.STRING()) 58 | .field("b", DataTypes.STRING()) 59 | .field("c", DataTypes.STRING()) 60 | ) \ 61 | .in_append_mode() \ 62 | .register_table_source("source") 63 | 64 | st_env.register_table_sink("result", 65 | CsvTableSink(["a", "b", "c"], 66 | [DataTypes.STRING(), 67 | DataTypes.STRING(), 68 | DataTypes.STRING()], 69 | result_file)) 70 | 71 | st_env.scan("source").over_window(Over.partition_by("a") 72 | .order_by("rowtime").preceding("30.minutes").alias("w")) \ 73 | .select("a, max(b) over w, min(c) over w").insert_into("result") 74 | 75 | st_env.execute("over window agg streaming") 76 | # cat /tmp/table_over_window_agg_streaming.csv 77 | # a,1,1 78 | # b,2,2 79 | # a,3,1 80 | # a,4,4 81 | # b,4,5 82 | 83 | # if preceding("unbounded_ranges") e.g: 84 | # st_env.scan("source").over_window(Over.partition_by("a") 85 | # .order_by("rowtime").preceding("unbounded_range").alias("w")) \ 86 | # .select("a, max(b) over w, min(c) over w").insert_into("result") 87 | # the result is 88 | # a,1,1 89 | # a,3,1 90 | # a,4,1 91 | # b,2,2 92 | # b,4,2 93 | # rows is similar to time, you can refer to the doc. 
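    # A count-based variant (a sketch, not executed by this demo) could look like:
    # st_env.scan("source").over_window(Over.partition_by("a")
    #                                   .order_by("rowtime").preceding("2.rows").alias("w")) \
    #     .select("a, max(b) over w, min(c) over w").insert_into("result")
    # which aggregates each row together with the two earlier rows of the same key.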
94 | 95 | 96 | if __name__ == '__main__': 97 | from table.prepare_environment import prepare_env 98 | prepare_env(need_stream_source=True) 99 | over_window_agg_streaming() 100 | -------------------------------------------------------------------------------- /table/streaming/rename_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def rename_columns_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_rename_columns_streaming.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | st_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | st_env.register_table_sink("result", 23 | CsvTableSink(["a", "b"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT()], 26 | result_file)) 27 | orders = st_env.scan("Orders") 28 | result = orders.rename_columns("a as a2, b as b2").select("a2, b2") 29 | result.insert_into("result") 30 | st_env.execute("rename columns streaming") 31 | # cat /tmp/table_rename_columns_streaming.csv 32 | # a,1 33 | # b,2 34 | # a,3 35 | # a,4 36 | # b,4 37 | # a,5 38 | 39 | 40 | if __name__ == '__main__': 41 | rename_columns_streaming() 42 | -------------------------------------------------------------------------------- /table/streaming/right_outer_join.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import StreamTableEnvironment, DataTypes 3 | 4 | from table.user_defined_sources_and_sinks.TestRetractSink import TestRetractSink 5 | 6 | 7 | def right_outer_join_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | left = st_env.from_elements( 12 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 13 | ["a", "b", "c"]).select("a, b, c") 14 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 15 | ["d", "e", "f"]).select("d, e, f") 16 | 17 | result = left.right_outer_join(right, "a = d").select("b, e") 18 | # use custom retract sink connector 19 | sink = TestRetractSink(["a", "b"], 20 | [DataTypes.STRING(), 21 | DataTypes.STRING()]) 22 | st_env.register_table_sink("sink", sink) 23 | result.insert_into("sink") 24 | st_env.execute("right outer join streaming") 25 | # (true, null, null) 26 | # (true, null, 3b) 27 | # (true, null, 4b) 28 | # (false, null, 1b) 29 | # (true, 1a, 1b) 30 | # (false, null, 3b) 31 | # (true, 1a, 3b) 32 | # (false, null, null) 33 | # (true, 2a, null) 34 | # (true, 4b, null) 35 | 36 | 37 | if __name__ == '__main__': 38 | right_outer_join_streaming() 39 | -------------------------------------------------------------------------------- /table/streaming/scan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import 
StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def scan_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_scan_streaming.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | st_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | st_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = st_env.scan("Orders") 30 | orders.insert_into("result") 31 | st_env.execute("scan streaming") 32 | # cat /tmp/table_scan_streaming.csv 33 | # a,1,1,2013-01-01 00:14:13.0 34 | # b,2,2,2013-01-01 00:24:13.0 35 | # a,3,3,2013-01-01 00:34:13.0 36 | # a,4,4,2013-01-01 01:14:13.0 37 | # b,4,5,2013-01-01 01:24:13.0 38 | # a,5,2,2013-01-01 01:34:13.0 39 | 40 | 41 | if __name__ == '__main__': 42 | scan_streaming() 43 | -------------------------------------------------------------------------------- /table/streaming/session_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | from pyflink.table.descriptors import Schema, Rowtime, Json, Kafka 6 | from pyflink.table.window import Session 7 | 8 | 9 | def session_time_window_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 13 | st_env = StreamTableEnvironment.create(s_env) 14 | result_file = "/tmp/session_time_window_streaming.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | st_env \ 18 | .connect( # declare the external system to connect to 19 | Kafka() 20 | .version("0.11") 21 | .topic("user") 22 | .start_from_earliest() 23 | .property("zookeeper.connect", "localhost:2181") 24 | .property("bootstrap.servers", "localhost:9092") 25 | ) \ 26 | .with_format( # declare a format for this system 27 | Json() 28 | .fail_on_missing_field(True) 29 | .json_schema( 30 | "{" 31 | " type: 'object'," 32 | " properties: {" 33 | " a: {" 34 | " type: 'string'" 35 | " }," 36 | " b: {" 37 | " type: 'string'" 38 | " }," 39 | " c: {" 40 | " type: 'string'" 41 | " }," 42 | " time: {" 43 | " type: 'string'," 44 | " format: 'date-time'" 45 | " }" 46 | " }" 47 | "}" 48 | ) 49 | ) \ 50 | .with_schema( # declare the schema of the table 51 | Schema() 52 | .field("rowtime", DataTypes.TIMESTAMP()) 53 | .rowtime( 54 | Rowtime() 55 | .timestamps_from_field("time") 56 | .watermarks_periodic_bounded(60000)) 57 | .field("a", DataTypes.STRING()) 58 | .field("b", DataTypes.STRING()) 59 | .field("c", DataTypes.STRING()) 60 | ) \ 61 | .in_append_mode() \ 62 | .register_table_source("source") 63 | 64 | st_env.register_table_sink("result", 65 | CsvTableSink(["a", "b"], 66 | [DataTypes.STRING(), 67 | DataTypes.STRING()], 68 | result_file)) 69 | 70 | st_env.scan("source").window(Session.with_gap("10.minutes").on("rowtime").alias("w")) \ 71 | .group_by("w, a") \ 72 | .select("a, 
max(b)").insert_into("result") 73 | 74 | st_env.execute("session time window streaming") 75 | # cat /tmp/session_time_window_streaming.csv 76 | # a,1 77 | # b,2 78 | # a,3 79 | # a,4 80 | 81 | 82 | if __name__ == '__main__': 83 | from table.prepare_environment import prepare_env 84 | prepare_env(need_stream_source=True) 85 | session_time_window_streaming() 86 | -------------------------------------------------------------------------------- /table/streaming/slide_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | from pyflink.table.descriptors import Schema, Rowtime, Json, Kafka 6 | from pyflink.table.window import Slide 7 | 8 | 9 | def slide_time_window_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 13 | st_env = StreamTableEnvironment.create(s_env) 14 | result_file = "/tmp/slide_time_window_streaming.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | st_env \ 18 | .connect( # declare the external system to connect to 19 | Kafka() 20 | .version("0.11") 21 | .topic("user") 22 | .start_from_earliest() 23 | .property("zookeeper.connect", "localhost:2181") 24 | .property("bootstrap.servers", "localhost:9092") 25 | ) \ 26 | .with_format( # declare a format for this system 27 | Json() 28 | .fail_on_missing_field(True) 29 | .json_schema( 30 | "{" 31 | " type: 'object'," 32 | " properties: {" 33 | " a: {" 34 | " type: 'string'" 35 | " }," 36 | " b: {" 37 | " type: 'string'" 38 | " }," 39 | " c: {" 40 | " type: 'string'" 41 | " }," 42 | " time: {" 43 | " type: 'string'," 44 | " format: 'date-time'" 45 | " }" 46 | " }" 47 | "}" 48 | ) 49 | ) \ 50 | .with_schema( # declare the schema of the table 51 | Schema() 52 | .field("rowtime", DataTypes.TIMESTAMP()) 53 | .rowtime( 54 | Rowtime() 55 | .timestamps_from_field("time") 56 | .watermarks_periodic_bounded(60000)) 57 | .field("a", DataTypes.STRING()) 58 | .field("b", DataTypes.STRING()) 59 | .field("c", DataTypes.STRING()) 60 | ) \ 61 | .in_append_mode() \ 62 | .register_table_source("source") 63 | 64 | st_env.register_table_sink("result", 65 | CsvTableSink(["a", "b"], 66 | [DataTypes.STRING(), 67 | DataTypes.STRING()], 68 | result_file)) 69 | 70 | st_env.scan("source").window(Slide.over("1.hours").every("10.minutes").on("rowtime").alias("w")) \ 71 | .group_by("w, a") \ 72 | .select("a, max(b)").insert_into("result") 73 | 74 | st_env.execute("slide time window streaming") 75 | # cat /tmp/slide_time_window_streaming.csv 76 | # a,1 77 | # a,1 78 | # b,2 79 | # a,3 80 | # b,2 81 | # b,2 82 | # a,3 83 | # b,2 84 | # a,3 85 | # b,2 86 | # a,3 87 | # b,2 88 | # a,4 89 | # b,4 90 | # a,4 91 | 92 | 93 | def slide_row_window_streaming(): 94 | s_env = StreamExecutionEnvironment.get_execution_environment() 95 | s_env.set_parallelism(1) 96 | s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime) 97 | st_env = StreamTableEnvironment.create(s_env) 98 | result_file = "/tmp/slide_row_window_streaming.csv" 99 | if os.path.exists(result_file): 100 | os.remove(result_file) 101 | st_env \ 102 | .connect( # declare the external system to connect to 103 | Kafka() 104 | .version("0.11") 105 | .topic("user") 106 | .start_from_earliest() 107 | .property("zookeeper.connect", 
"localhost:2181") 108 | .property("bootstrap.servers", "localhost:9092") 109 | ) \ 110 | .with_format( # declare a format for this system 111 | Json() 112 | .fail_on_missing_field(True) 113 | .json_schema( 114 | "{" 115 | " type: 'object'," 116 | " properties: {" 117 | " a: {" 118 | " type: 'string'" 119 | " }," 120 | " b: {" 121 | " type: 'string'" 122 | " }," 123 | " c: {" 124 | " type: 'string'" 125 | " }," 126 | " time: {" 127 | " type: 'string'," 128 | " format: 'date-time'" 129 | " }" 130 | " }" 131 | "}" 132 | ) 133 | ) \ 134 | .with_schema( # declare the schema of the table 135 | Schema() 136 | .field("proctime", DataTypes.TIMESTAMP()) 137 | .proctime() 138 | .field("a", DataTypes.STRING()) 139 | .field("b", DataTypes.STRING()) 140 | .field("c", DataTypes.STRING()) 141 | ) \ 142 | .in_append_mode() \ 143 | .register_table_source("source") 144 | 145 | st_env.register_table_sink("result", 146 | CsvTableSink(["a", "b"], 147 | [DataTypes.STRING(), 148 | DataTypes.STRING()], 149 | result_file)) 150 | 151 | st_env.scan("source").window(Slide.over("2.rows").every("1.rows").on("proctime").alias("w")) \ 152 | .group_by("w, a") \ 153 | .select("a, max(b)").insert_into("result") 154 | 155 | st_env.execute("slide row window streaming") 156 | # cat /tmp/slide_row_window_streaming.csv 157 | # a,1 158 | # b,2 159 | # a,3 160 | # a,4 161 | # b,4 162 | # a,5 163 | 164 | 165 | if __name__ == '__main__': 166 | from table.prepare_environment import prepare_env 167 | prepare_env(need_stream_source=True) 168 | # slide_time_window_streaming() 169 | slide_row_window_streaming() 170 | -------------------------------------------------------------------------------- /table/streaming/table_select.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def select_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_select_streaming.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | 16 | st_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | st_env.register_table_sink("result", 24 | CsvTableSink(["a", "c"], 25 | [DataTypes.STRING(), 26 | DataTypes.INT()], 27 | result_file)) 28 | orders = st_env.scan("Orders") 29 | result = orders.select("a, b") 30 | result.insert_into("result") 31 | st_env.execute("select streaming") 32 | 33 | # cat /tmp/table_select_streaming.csv 34 | # a,1 35 | # b,2 36 | # a,3 37 | # a,4 38 | # b,4 39 | # a,5 40 | 41 | 42 | if __name__ == '__main__': 43 | select_streaming() 44 | -------------------------------------------------------------------------------- /table/streaming/tumble_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | from pyflink.table.descriptors import Schema, Rowtime, Json, Kafka 6 | from pyflink.table.window import Tumble 7 | 8 | 9 | def 
tumble_time_window_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 13 | st_env = StreamTableEnvironment.create(s_env) 14 | result_file = "/tmp/tumble_time_window_streaming.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | st_env \ 18 | .connect( # declare the external system to connect to 19 | Kafka() 20 | .version("0.11") 21 | .topic("user") 22 | .start_from_earliest() 23 | .property("zookeeper.connect", "localhost:2181") 24 | .property("bootstrap.servers", "localhost:9092") 25 | ) \ 26 | .with_format( # declare a format for this system 27 | Json() 28 | .fail_on_missing_field(True) 29 | .json_schema( 30 | "{" 31 | " type: 'object'," 32 | " properties: {" 33 | " a: {" 34 | " type: 'string'" 35 | " }," 36 | " b: {" 37 | " type: 'string'" 38 | " }," 39 | " c: {" 40 | " type: 'string'" 41 | " }," 42 | " time: {" 43 | " type: 'string'," 44 | " format: 'date-time'" 45 | " }" 46 | " }" 47 | "}" 48 | ) 49 | ) \ 50 | .with_schema( # declare the schema of the table 51 | Schema() 52 | .field("rowtime", DataTypes.TIMESTAMP()) 53 | .rowtime( 54 | Rowtime() 55 | .timestamps_from_field("time") 56 | .watermarks_periodic_bounded(60000)) 57 | .field("a", DataTypes.STRING()) 58 | .field("b", DataTypes.STRING()) 59 | .field("c", DataTypes.STRING()) 60 | ) \ 61 | .in_append_mode() \ 62 | .register_table_source("source") 63 | 64 | st_env.register_table_sink("result", 65 | CsvTableSink(["a", "b"], 66 | [DataTypes.STRING(), 67 | DataTypes.STRING()], 68 | result_file)) 69 | 70 | st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \ 71 | .group_by("w, a") \ 72 | .select("a, max(b)").insert_into("result") 73 | 74 | st_env.execute("tumble time window streaming") 75 | # cat /tmp/tumble_time_window_streaming.csv 76 | # a,3 77 | # b,2 78 | 79 | 80 | def tumble_row_window_streaming(): 81 | s_env = StreamExecutionEnvironment.get_execution_environment() 82 | s_env.set_parallelism(1) 83 | s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime) 84 | st_env = StreamTableEnvironment.create(s_env) 85 | result_file = "/tmp/tumble_row_window_streaming.csv" 86 | if os.path.exists(result_file): 87 | os.remove(result_file) 88 | st_env \ 89 | .connect( # declare the external system to connect to 90 | Kafka() 91 | .version("0.11") 92 | .topic("user") 93 | .start_from_earliest() 94 | .property("zookeeper.connect", "localhost:2181") 95 | .property("bootstrap.servers", "localhost:9092") 96 | ) \ 97 | .with_format( # declare a format for this system 98 | Json() 99 | .fail_on_missing_field(True) 100 | .json_schema( 101 | "{" 102 | " type: 'object'," 103 | " properties: {" 104 | " a: {" 105 | " type: 'string'" 106 | " }," 107 | " b: {" 108 | " type: 'string'" 109 | " }," 110 | " c: {" 111 | " type: 'string'" 112 | " }," 113 | " time: {" 114 | " type: 'string'," 115 | " format: 'date-time'" 116 | " }" 117 | " }" 118 | "}" 119 | ) 120 | ) \ 121 | .with_schema( # declare the schema of the table 122 | Schema() 123 | .field("proctime", DataTypes.TIMESTAMP()) 124 | .proctime() 125 | .field("a", DataTypes.STRING()) 126 | .field("b", DataTypes.STRING()) 127 | .field("c", DataTypes.STRING()) 128 | ) \ 129 | .in_append_mode() \ 130 | .register_table_source("source") 131 | 132 | st_env.register_table_sink("result", 133 | CsvTableSink(["a", "b"], 134 | [DataTypes.STRING(), 135 | DataTypes.STRING()], 136 | result_file)) 137 | 138 | 
st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \ 139 | .group_by("w, a") \ 140 | .select("a, max(b)").insert_into("result") 141 | 142 | st_env.execute("tumble row window streaming") 143 | # cat /tmp/tumble_row_window_streaming.csv 144 | # a,3 145 | # b,4 146 | # a 5 147 | 148 | 149 | if __name__ == '__main__': 150 | from table.prepare_environment import prepare_env 151 | prepare_env(need_stream_source=True) 152 | # tumble_time_window_streaming() 153 | tumble_row_window_streaming() 154 | -------------------------------------------------------------------------------- /table/streaming/union_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def union_all_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | result_file = "/tmp/table_union_all_streaming.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = st_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (1, "1a", "1laa"), (1, "1b", "1bb")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["a", "b", "c"]).select("a, b, c") 19 | st_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.union_all(right) 27 | result.insert_into("result") 28 | st_env.execute("union all streaming") 29 | # cat /tmp/table_union_all_streaming.csv 30 | # 1,1b,1bb 31 | # 2,,2bb 32 | # 1,3b,3bb 33 | # 4,4b,4bb 34 | # 1,1a,1laa 35 | # 2,2a,2aa 36 | # 3,,3aa 37 | # 1,1a,1laa 38 | # 1,1b,1bb 39 | 40 | 41 | if __name__ == '__main__': 42 | union_all_streaming() 43 | -------------------------------------------------------------------------------- /table/streaming/where.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def where_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_where_streaming.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | st_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | st_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = st_env.scan("Orders") 30 | result = orders.where("a === 'b'") 31 | result.insert_into("result") 32 | st_env.execute("where streaming") 33 | # cat /tmp/table_where_streaming.csv 34 | # b,2,2,2013-01-01 00:24:13.0 35 | # b,4,5,2013-01-01 01:24:13.0 36 | 37 | 38 | if __name__ == '__main__': 39 | where_streaming() 40 | 
-------------------------------------------------------------------------------- /table/user_case/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukess/pyflink-demo/7e8ae37b3cfb4a6a5560cae2a0d44cddee25e036/table/user_case/__init__.py -------------------------------------------------------------------------------- /table/user_case/pv_uv/README.md: -------------------------------------------------------------------------------- 1 | # pv_uv_demo 2 | This demo is to help users to use pyflink api to develop a pv/uv demo 3 | 4 | **contents** 5 | 6 | - [Quick Start](#quick-start) 7 | + [Setup](#setup) 8 | + [Requirements](#requirements) 9 | + [Install python2](#install-python2) 10 | + [Install pip](#install-pip) 11 | + [Install java 8](#install-java-8) 12 | + [Install maven](#install-maven) 13 | + [Build PyFlink](#build-pyflink) 14 | + [Prepare Kafka](#prepare-kafka) 15 | + [Prepare Derby](#prepare-derby) 16 | + [Install Dependency](#install-dependency) 17 | + [Prepare Data](#prepare-data) 18 | + [Run Demo](#run-the-demo) 19 | + [See the result](#see-the-result) 20 | 21 | ## Quick Start 22 | 23 | ### Setup 24 | 25 | #### Requirements 26 | 1. python2.7 or python3 27 | 2. pip 28 | 3. java 1.8 29 | 4. maven version >=3.3.0 30 | 31 | #### Install python2 32 | 33 | macOS 34 | ```shell 35 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" 36 | export PATH="/usr/local/bin:/usr/local/sbin:$PATH" 37 | brew install python@2 38 | ``` 39 | Ubuntu 40 | ```shell 41 | sudo apt install python-dev 42 | ``` 43 | 44 | #### Install pip 45 | 46 | macOS 47 | 48 | ```shell 49 | curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py 50 | python get-pip.py 51 | ``` 52 | 53 | Ubuntu 54 | ```shell 55 | sudo apt install python-pip 56 | ``` 57 | 58 | #### Install java 8 59 | 60 | [java download page](http://www.oracle.com/technetwork/java/javase/downloads/index.html) 61 | 62 | #### Install maven 63 | 64 | maven version >=3.3.0 65 | 66 | [download maven page](http://maven.apache.org/download.cgi) 67 | 68 | ```shell 69 | tar -xvf apache-maven-3.6.1-bin.tar.gz 70 | mv -rf apache-maven-3.6.1 /usr/local/ 71 | ``` 72 | configuration environment variables 73 | ```shell 74 | MAVEN_HOME=/usr/local/apache-maven-3.6.1 75 | export MAVEN_HOME 76 | export PATH=${PATH}:${MAVEN_HOME}/bin 77 | ``` 78 | 79 | 80 | ### Build PyFlink 81 | 82 | If you want to build a PyFlink package that can be used for pip installation, you need to build Flink jars first, as described in https://ci.apache.org/projects/flink/flink-docs-master/flinkDev/building.html 83 | 84 | ```shell 85 | mvn clean install -DskipTests -Dfast 86 | ``` 87 | 88 | Then you need to copy the jar package flink-sql-connector-kafka-0.11_*-SNAPSHOT.jar in the directory of flink-connectors/flink-sql-connector-kafka-0.11 89 | 90 | ```shell 91 | cp flink-connectors/flink-sql-connector-kafka-0.11/target/flink-sql-connector-kafka-0.11_*-SNAPSHOT.jar build-target/lib 92 | ``` 93 | 94 | Then you need to copy the jar package flink-jdbc_*-SNAPSHOT.jar in the directory of flink-connectors/flink-jdbc 95 | 96 | ```shell 97 | cp flink-connectors/flink-jdbc/target/flink-jdbc_*-SNAPSHOT.jar build-target/lib 98 | ``` 99 | 100 | Next you need to copy the jar package flink-json-*-SNAPSHOT-sql-jar.jar in the directory of flink-formats/flink-json 101 | 102 | ```shell 103 | cp flink-formats/flink-json/target/flink-json-*-SNAPSHOT-sql-jar.jar build-target/lib 104 | ``` 105 | 106 | 
Next, go to the root directory of the Flink source code and run this command to build the sdist and wheel packages: 107 | 108 | ```shell 109 | cd flink-python; python3 setup.py sdist bdist_wheel 110 | ``` 111 | 112 | The sdist and wheel packages can be found under `./flink-python/dist/`. Either of them can be used for pip installation, for example: 113 | 114 | ```shell 115 | pip install dist/*.tar.gz 116 | ``` 117 | 118 | ### Prepare Kafka 119 | Some demos use Kafka as the source, so you need to install and run Kafka on localhost. The version we use is kafka_2.11-0.11.0.3 (https://archive.apache.org/dist/kafka/0.11.0.3/kafka_2.11-0.11.0.3.tgz). 120 | You can use the following command to download it: 121 | 122 | ```shell 123 | wget https://archive.apache.org/dist/kafka/0.11.0.3/kafka_2.11-0.11.0.3.tgz 124 | ``` 125 | 126 | Then extract the tar package: 127 | 128 | ```shell 129 | tar zxvf kafka_2.11-0.11.0.3.tgz 130 | ``` 131 | 132 | ### Prepare Derby 133 | The pv/uv demo needs an upsert sink connector, and we choose Derby, so you need to install and run Derby on localhost. The version we use is db-derby-10.14.2.0-lib (http://apache.mirrors.pair.com//db/derby/db-derby-10.14.2.0/db-derby-10.14.2.0-lib.tar.gz). 134 | You can use the following command to download it: 135 | 136 | ```shell 137 | wget http://apache.mirrors.pair.com//db/derby/db-derby-10.14.2.0/db-derby-10.14.2.0-lib.tar.gz 138 | ``` 139 | 140 | Then extract the tar package: 141 | 142 | ```shell 143 | tar zxvf db-derby-10.14.2.0-lib.tar.gz 144 | ``` 145 | 146 | Next, start the Derby server: 147 | 148 | ```shell 149 | ./bin/startNetworkServer -h 0.0.0.0 150 | ``` 151 | 152 | Next, run ij in another terminal: 153 | 154 | ```shell 155 | ./bin/ij 156 | ``` 157 | 158 | Next, connect to the server from the ij interactive prompt: 159 | 160 | ```shell 161 | ij> connect 'jdbc:derby://localhost:1527/firstdb;create=true'; 162 | ``` 163 | 164 | Next, create the result table pv_uv_table in the ij terminal: 165 | 166 | ```shell 167 | ij> create table pv_uv_table(startTime TIMESTAMP,endTime TIMESTAMP,pv bigint,uv bigint); 168 | ``` 169 | 170 | Finally, put derby.jar, derbyclient.jar and derbytools.jar from the lib directory of the extracted Derby package into the Python directory site-packages/pyflink/lib 171 | 172 | ### Install Dependency 173 | Install the environment dependencies: 174 | 175 | ```shell 176 | pip install -r requirements.txt 177 | ``` 178 | 179 | ### Prepare Data 180 | First, replace the variable KAFKA_DIR in env.sh with your installed Kafka binary directory, for example: 181 | 182 | ```shell 183 | KAFKA_DIR=/Users/duanchen/Applications/kafka_2.11-0.11.0.3 184 | ``` 185 | 186 | Next, source create_data.sh: 187 | 188 | ```shell 189 | source create_data.sh 190 | ``` 191 | 192 | Next, start Kafka: 193 | 194 | ```shell 195 | start_kafka 196 | ``` 197 | 198 | Next, create the topic that will be used in the demo: 199 | 200 | ```shell 201 | create_kafka_topic 1 1 user_behavior 202 | ``` 203 | 204 | Finally, send the messages to the topic user_behavior: 205 | 206 | ```shell 207 | send_message user_behavior user_behavior.log 208 | ``` 209 | 210 | ## Run The Demo 211 | The demo code is in pv_uv_example.py; you can run it directly. 212 | 213 | ### See the result 214 | You can see the result in the ij terminal: 215 | 216 | ```shell 217 | ij> select * from pv_uv_table; 218 | STARTTIME |ENDTIME |PV |UV 219 | 
----------------------------------------------------------------------------------------------------- 220 | 2017-11-26 01:00:00.0 |2017-11-26 02:00:00.0 |47244 |30837 221 | 2017-11-26 02:00:00.0 |2017-11-26 03:00:00.0 |53902 |35261 222 | 2017-11-26 03:00:00.0 |2017-11-26 04:00:00.0 |53135 |35302 223 | 2017-11-26 04:00:00.0 |2017-11-26 05:00:00.0 |49863 |33537 224 | 2017-11-26 05:00:00.0 |2017-11-26 06:00:00.0 |54305 |35748 225 | 2017-11-26 06:00:00.0 |2017-11-26 07:00:00.0 |56718 |36934 226 | 2017-11-26 07:00:00.0 |2017-11-26 08:00:00.0 |58324 |37763 227 | 2017-11-26 08:00:00.0 |2017-11-26 09:00:00.0 |58672 |37961 228 | 229 | 8 rows selected 230 | ``` 231 | -------------------------------------------------------------------------------- /table/user_case/pv_uv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukess/pyflink-demo/7e8ae37b3cfb4a6a5560cae2a0d44cddee25e036/table/user_case/pv_uv/__init__.py -------------------------------------------------------------------------------- /table/user_case/pv_uv/create_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ################################################################################ 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | ################################################################################ 19 | source "$(dirname "$0")"/env.sh 20 | 21 | function check_kafka_dir_set { 22 | if [[ -z $KAFKA_DIR ]]; then 23 | echo "Failed to set KAFKA_DIR, you can check the code in env.sh" 24 | exit 1 25 | fi 26 | } 27 | 28 | function start_zookeeper { 29 | check_kafka_dir_set 30 | $KAFKA_DIR/bin/zookeeper-server-start.sh $KAFKA_DIR/config/zookeeper.properties & 31 | } 32 | 33 | function stop_zookeeper { 34 | check_kafka_dir_set 35 | $KAFKA_DIR/bin/zookeeper-server-stop.sh 36 | } 37 | 38 | function start_kafka_server { 39 | check_kafka_dir_set 40 | $KAFKA_DIR/bin/kafka-server-start.sh $KAFKA_DIR/config/server.properties & 41 | } 42 | 43 | function stop_kafka_server { 44 | check_kafka_dir_set 45 | $KAFKA_DIR/bin/kafka-server-stop.sh 46 | } 47 | 48 | function check_start { 49 | # zookeeper outputs the "Node does not exist" bit to stderr 50 | while [[ $($KAFKA_DIR/bin/zookeeper-shell.sh localhost:2181 get /brokers/ids/0 2>&1) =~ .*Node\ does\ not\ exist.* ]]; do 51 | echo "Waiting for broker..." 
52 | sleep 1 53 | done 54 | } 55 | 56 | function start_kafka { 57 | start_zookeeper 58 | start_kafka_server 59 | check_start 60 | } 61 | 62 | function stop_kafka { 63 | check_kafka_dir_set 64 | stop_kafka_server 65 | stop_zookeeper 66 | 67 | # Terminate Kafka process if it still exists 68 | PIDS=$(jps -vl | grep -i 'kafka\.Kafka' | grep java | grep -v grep | awk '{print $1}'|| echo "") 69 | 70 | if [ ! -z "$PIDS" ]; then 71 | kill -s TERM $PIDS || true 72 | fi 73 | 74 | # Terminate QuorumPeerMain process if it still exists 75 | PIDS=$(jps -vl | grep java | grep -i QuorumPeerMain | grep -v grep | awk '{print $1}'|| echo "") 76 | 77 | if [ ! -z "$PIDS" ]; then 78 | kill -s TERM $PIDS || true 79 | fi 80 | } 81 | 82 | function create_kafka_topic { 83 | check_kafka_dir_set 84 | $KAFKA_DIR/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor $1 --partitions $2 --topic $3 2>&1 >/dev/null 85 | } 86 | 87 | function drop_kafka_topic { 88 | check_kafka_dir_set 89 | $KAFKA_DIR/bin/kafka-topics.sh --delete --zookeeper localhost:2181 --topic $1 2>&1 >/dev/null 90 | sleep 1 91 | } 92 | 93 | function send_message { 94 | check_kafka_dir_set 95 | # batch produce to kafka 96 | $KAFKA_DIR/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic $1 < $2 97 | } 98 | 99 | function send_demo_message { 100 | check_kafka_dir_set 101 | send_messages_to_kafka '{"user_id": "543462", "item_id":"1715", "category_id": "1464116", "behavior": "pv", "ts": "2017-11-26T01:00:00Z"}' $1 102 | send_messages_to_kafka '{"user_id": "662867", "item_id":"2244074", "category_id": "1575622", "behavior": "pv", "ts": "2017-11-26T01:00:00Z"}' $1 103 | send_messages_to_kafka '{"user_id": "561558", "item_id":"3611281", "category_id": "965809", "behavior": "pv", "ts": "2017-11-26T01:00:00Z"}' $1 104 | } 105 | 106 | function send_messages_to_kafka { 107 | echo -e $1 | $KAFKA_DIR/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic $2 108 | } 109 | 110 | # stop_kafka 111 | # start_kafka 112 | # drop_kafka_topic user_behavior 113 | # create_kafka_topic 1 1 user_behavior 114 | # send_message user_behavior user_behavior.log 115 | # send_demo_message user_behavior 116 | # stop_kafka 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /table/user_case/pv_uv/env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ################################################################################ 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | ################################################################################ 19 | 20 | KAFKA_DIR=/Users/duanchen/Applications/kafka_2.11-0.11.0.3 21 | -------------------------------------------------------------------------------- /table/user_case/pv_uv/pv_uv_example.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 2 | from pyflink.table import StreamTableEnvironment, DataTypes, EnvironmentSettings 3 | from pyflink.table.descriptors import CustomConnectorDescriptor, Schema, Kafka, Json, Rowtime 4 | from pyflink.table.window import Tumble 5 | 6 | 7 | def pv_uv_demo(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 10 | s_env.set_parallelism(1) 11 | # use blink table planner 12 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 13 | .in_streaming_mode().use_blink_planner().build()) 14 | # use flink table planner 15 | # st_env = StreamTableEnvironment.create(s_env) 16 | st_env \ 17 | .connect( # declare the external system to connect to 18 | Kafka() 19 | .version("0.11") 20 | .topic("user_behavior") 21 | .start_from_earliest() 22 | .property("zookeeper.connect", "localhost:2181") 23 | .property("bootstrap.servers", "localhost:9092") 24 | ) \ 25 | .with_format( # declare a format for this system 26 | Json() 27 | .fail_on_missing_field(True) 28 | .json_schema( 29 | "{" 30 | " type: 'object'," 31 | " properties: {" 32 | " user_id: {" 33 | " type: 'string'" 34 | " }," 35 | " item_id: {" 36 | " type: 'string'" 37 | " }," 38 | " category_id: {" 39 | " type: 'string'" 40 | " }," 41 | " behavior: {" 42 | " type: 'string'" 43 | " }," 44 | " ts: {" 45 | " type: 'string'," 46 | " format: 'date-time'" 47 | " }" 48 | " }" 49 | "}" 50 | ) 51 | ) \ 52 | .with_schema( # declare the schema of the table 53 | Schema() 54 | .field("user_id", DataTypes.STRING()) 55 | .field("item_id", DataTypes.STRING()) 56 | .field("category_id", DataTypes.STRING()) 57 | .field("behavior", DataTypes.STRING()) 58 | .field("rowtime", DataTypes.TIMESTAMP()) 59 | .rowtime( 60 | Rowtime() 61 | .timestamps_from_field("ts") 62 | .watermarks_periodic_bounded(60000)) 63 | ) \ 64 | .in_append_mode() \ 65 | .register_table_source("source") 66 | 67 | # use custom retract sink connector 68 | custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \ 69 | .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \ 70 | .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \ 71 | .property("connector.table", "pv_uv_table") \ 72 | .property("connector.write.flush.max-rows", "1") 73 | st_env.connect(custom_connector) \ 74 | .with_schema( 75 | Schema() 76 | .field("startTime", DataTypes.TIMESTAMP()) 77 | .field("endTime", DataTypes.TIMESTAMP()) 78 | .field("pv", DataTypes.BIGINT()) 79 | .field("uv", DataTypes.BIGINT()) 80 | ).register_table_sink("sink") 81 | 82 | st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \ 83 | .group_by("w") \ 84 | .select("w.start as startTime, w.end as endTime, COUNT(1) as pv, user_id.count.distinct as uv").insert_into("sink") 85 | 86 | st_env.execute("table pv uv") 87 | 88 | 89 | if __name__ == '__main__': 90 | pv_uv_demo() 91 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/CustomTableSourceDemo.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, DataTypes, CsvTableSink 5 | from pyflink.table.descriptors import Schema, CustomFormatDescriptor, CustomConnectorDescriptor, Json 6 | from pyflink.table.window import Tumble 7 | 8 | 9 | def custom_kafka_source_demo(): 10 | custom_connector = CustomConnectorDescriptor('kafka', 1, True) \ 11 | .property('connector.topic', 'user') \ 12 | .property('connector.properties.0.key', 'zookeeper.connect') \ 13 | .property('connector.properties.0.value', 'localhost:2181') \ 14 | .property('connector.properties.1.key', 'bootstrap.servers') \ 15 | .property('connector.properties.1.value', 'localhost:9092') \ 16 | .properties({'connector.version': '0.11', 'connector.startup-mode': 'earliest-offset'}) 17 | 18 | # the key is 'format.json-schema' 19 | custom_format = CustomFormatDescriptor('json', 1) \ 20 | .property('format.json-schema', 21 | "{" 22 | " type: 'object'," 23 | " properties: {" 24 | " a: {" 25 | " type: 'string'" 26 | " }," 27 | " b: {" 28 | " type: 'string'" 29 | " }," 30 | " c: {" 31 | " type: 'string'" 32 | " }," 33 | " time: {" 34 | " type: 'string'," 35 | " format: 'date-time'" 36 | " }" 37 | " }" 38 | "}") \ 39 | .properties({'format.fail-on-missing-field': 'true'}) 40 | 41 | s_env = StreamExecutionEnvironment.get_execution_environment() 42 | s_env.set_parallelism(1) 43 | s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime) 44 | st_env = StreamTableEnvironment.create(s_env) 45 | result_file = "/tmp/custom_kafka_source_demo.csv" 46 | if os.path.exists(result_file): 47 | os.remove(result_file) 48 | st_env \ 49 | .connect(custom_connector) \ 50 | .with_format( 51 | custom_format 52 | ) \ 53 | .with_schema( # declare the schema of the table 54 | Schema() 55 | .field("proctime", DataTypes.TIMESTAMP()) 56 | .proctime() 57 | .field("a", DataTypes.STRING()) 58 | .field("b", DataTypes.STRING()) 59 | .field("c", DataTypes.STRING()) 60 | ) \ 61 | .in_append_mode() \ 62 | .register_table_source("source") 63 | 64 | st_env.register_table_sink("result", 65 | CsvTableSink(["a", "b"], 66 | [DataTypes.STRING(), 67 | DataTypes.STRING()], 68 | result_file)) 69 | 70 | st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \ 71 | .group_by("w, a") \ 72 | .select("a, max(b)").insert_into("result") 73 | 74 | st_env.execute("custom kafka source demo") 75 | # cat /tmp/custom_kafka_source_demo.csv 76 | # a,3 77 | # b,4 78 | # a 5 79 | 80 | 81 | def custom_test_source_demo(): 82 | s_env = StreamExecutionEnvironment.get_execution_environment() 83 | s_env.set_parallelism(1) 84 | st_env = StreamTableEnvironment.create(s_env) 85 | result_file = "/tmp/custom_test_source_demo.csv" 86 | if os.path.exists(result_file): 87 | os.remove(result_file) 88 | custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False) 89 | st_env.connect(custom_connector) \ 90 | .with_schema( 91 | Schema() 92 | .field("a", DataTypes.STRING()) 93 | ).register_table_source("source") 94 | 95 | st_env.register_table_sink("result", 96 | CsvTableSink(["a"], 97 | [DataTypes.STRING()], 98 | result_file)) 99 | orders = st_env.scan("source") 100 | orders.insert_into("result") 101 | st_env.execute("custom test source demo") 102 | # cat /tmp/custom_test_source_demo.csv 103 | # haha 104 | # haha 105 | # haha 106 | 107 | 108 | def custom_test_sink_demo(): 109 | s_env = 
StreamExecutionEnvironment.get_execution_environment() 110 | s_env.set_parallelism(1) 111 | st_env = StreamTableEnvironment.create(s_env) 112 | left = st_env.from_elements( 113 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 114 | ["a", "b", "c"]).select("a, b, c") 115 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 116 | ["d", "e", "f"]).select("d, e, f") 117 | 118 | result = left.left_outer_join(right, "a = d").select("a, b, e") 119 | # use custom retract sink connector 120 | custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False) 121 | st_env.connect(custom_connector) \ 122 | .with_schema( 123 | Schema() 124 | .field("a", DataTypes.BIGINT()) 125 | .field("b", DataTypes.STRING()) 126 | .field("c", DataTypes.STRING()) 127 | ).register_table_sink("sink") 128 | result.insert_into("sink") 129 | st_env.execute("custom test sink demo") 130 | # (true, 1, 1a, null) 131 | # (true, 2, 2a, null) 132 | # (true, 3, null, null) 133 | # (true, 2, 4b, null) 134 | # (true, 5, 5a, null) 135 | # (false, 1, 1a, null) 136 | # (true, 1, 1a, 1b) 137 | # (false, 2, 4b, null) 138 | # (true, 2, 4b, null) 139 | # (false, 2, 2a, null) 140 | # (true, 2, 2a, null) 141 | # (true, 1, 1a, 3b) 142 | 143 | 144 | if __name__ == '__main__': 145 | # from table.prepare_environment import prepare_env 146 | # prepare_env(need_stream_source=True) 147 | # custom_kafka_source_demo() 148 | # custom_test_sink_demo() 149 | custom_test_source_demo() 150 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/README.md: -------------------------------------------------------------------------------- 1 | # User-defined Sources & Sinks 2 | This page helps users to custom create sources & sinks 3 | 4 | ## Build Sources & Sinks 5 | 6 | ### Custom Sink 7 | The example of custom restract table sink lives in sinks module. You need to build this code: 8 | 9 | ```shell 10 | cd sinks; mvn clean package 11 | ``` 12 | 13 | 1. put jar(source or sink jar) into Python site-packages/pyflink/lib directory 14 | 15 | 2. 
create your python code wrapped the java class(you can refer to TestRetractSink.py) 16 | 17 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukess/pyflink-demo/7e8ae37b3cfb4a6a5560cae2a0d44cddee25e036/table/user_defined_sources_and_sinks/__init__.py -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 20 | 22 | 23 | 4.0.0 24 | 25 | org.apache.flink.table 26 | user-defined-connectors 27 | 1.0 28 | 29 | jar 30 | 31 | 32 | 1.9.0 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | org.apache.flink 41 | flink-core 42 | ${table.version} 43 | provided 44 | 45 | 46 | org.apache.flink 47 | flink-java 48 | ${table.version} 49 | provided 50 | 51 | 52 | org.apache.flink 53 | flink-streaming-java_2.11 54 | ${table.version} 55 | provided 56 | 57 | 58 | org.apache.flink 59 | flink-table-common 60 | ${table.version} 61 | provided 62 | 63 | 64 | org.apache.flink 65 | flink-table-planner_2.11 66 | ${table.version} 67 | provided 68 | 69 | 70 | 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-compiler-plugin 75 | 3.1 76 | 77 | 1.8 78 | 1.8 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/src/main/java/com/pyflink/table/factory/TestTableFactory.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table.factory; 2 | 3 | import com.pyflink.table.sinks.TestRetractSink; 4 | import com.pyflink.table.sources.TestSource; 5 | import org.apache.flink.api.java.tuple.Tuple2; 6 | import org.apache.flink.api.java.typeutils.RowTypeInfo; 7 | import org.apache.flink.table.api.TableSchema; 8 | import org.apache.flink.table.descriptors.DescriptorProperties; 9 | import org.apache.flink.table.descriptors.SchemaValidator; 10 | import org.apache.flink.table.factories.StreamTableSinkFactory; 11 | import org.apache.flink.table.factories.StreamTableSourceFactory; 12 | import org.apache.flink.table.sinks.StreamTableSink; 13 | import org.apache.flink.table.sources.StreamTableSource; 14 | import org.apache.flink.types.Row; 15 | 16 | import java.util.ArrayList; 17 | import java.util.HashMap; 18 | import java.util.List; 19 | import java.util.Map; 20 | 21 | import static org.apache.flink.table.descriptors.ConnectorDescriptorValidator.CONNECTOR_PROPERTY_VERSION; 22 | import static org.apache.flink.table.descriptors.ConnectorDescriptorValidator.CONNECTOR_TYPE; 23 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_TIMESTAMPS_CLASS; 24 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_TIMESTAMPS_FROM; 25 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_TIMESTAMPS_SERIALIZED; 26 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_TIMESTAMPS_TYPE; 27 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_WATERMARKS_CLASS; 28 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_WATERMARKS_DELAY; 29 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_WATERMARKS_SERIALIZED; 30 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_WATERMARKS_TYPE; 31 | import static org.apache.flink.table.descriptors.Schema.SCHEMA; 32 | import 
static org.apache.flink.table.descriptors.Schema.SCHEMA_FROM; 33 | import static org.apache.flink.table.descriptors.Schema.SCHEMA_NAME; 34 | import static org.apache.flink.table.descriptors.Schema.SCHEMA_PROCTIME; 35 | import static org.apache.flink.table.descriptors.Schema.SCHEMA_TYPE; 36 | 37 | public class TestTableFactory implements StreamTableSourceFactory, StreamTableSinkFactory> { 38 | @Override 39 | public StreamTableSink> createStreamTableSink(Map map) { 40 | DescriptorProperties params = new DescriptorProperties(true); 41 | params.putProperties(map); 42 | new SchemaValidator(true, true, true).validate(params); 43 | TableSchema tableSchema = params.getTableSchema(SCHEMA); 44 | TestRetractSink sink = new TestRetractSink(); 45 | return (StreamTableSink>) sink.configure(tableSchema.getFieldNames(), tableSchema.getFieldTypes()); 46 | } 47 | 48 | @Override 49 | public StreamTableSource createStreamTableSource(Map map) { 50 | DescriptorProperties params = new DescriptorProperties(true); 51 | params.putProperties(map); 52 | new SchemaValidator(true, true, true).validate(params); 53 | TableSchema tableSchema = params.getTableSchema(SCHEMA); 54 | return new TestSource(tableSchema, new RowTypeInfo(tableSchema.getFieldTypes(), tableSchema.getFieldNames())); 55 | } 56 | 57 | @Override 58 | public Map requiredContext() { 59 | Map context = new HashMap<>(); 60 | context.put(CONNECTOR_TYPE, "pyflink-test"); 61 | context.put(CONNECTOR_PROPERTY_VERSION, "1"); 62 | return context; 63 | } 64 | 65 | @Override 66 | public List supportedProperties() { 67 | List properties = new ArrayList<>(); 68 | 69 | // schema 70 | properties.add(SCHEMA + ".#." + SCHEMA_TYPE); 71 | properties.add(SCHEMA + ".#." + SCHEMA_NAME); 72 | properties.add(SCHEMA + ".#." + SCHEMA_FROM); 73 | 74 | // time attributes 75 | properties.add(SCHEMA + ".#." + SCHEMA_PROCTIME); 76 | properties.add(SCHEMA + ".#." + ROWTIME_TIMESTAMPS_TYPE); 77 | properties.add(SCHEMA + ".#." + ROWTIME_TIMESTAMPS_FROM); 78 | properties.add(SCHEMA + ".#." + ROWTIME_TIMESTAMPS_CLASS); 79 | properties.add(SCHEMA + ".#." + ROWTIME_TIMESTAMPS_SERIALIZED); 80 | properties.add(SCHEMA + ".#." + ROWTIME_WATERMARKS_TYPE); 81 | properties.add(SCHEMA + ".#." + ROWTIME_WATERMARKS_CLASS); 82 | properties.add(SCHEMA + ".#." + ROWTIME_WATERMARKS_SERIALIZED); 83 | properties.add(SCHEMA + ".#." 
+ ROWTIME_WATERMARKS_DELAY); 84 | return properties; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/src/main/java/com/pyflink/table/sinks/TestRetractSink.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table.sinks; 2 | 3 | import org.apache.flink.api.common.typeinfo.TypeInformation; 4 | import org.apache.flink.api.java.tuple.Tuple2; 5 | import org.apache.flink.api.java.typeutils.RowTypeInfo; 6 | import org.apache.flink.streaming.api.datastream.DataStream; 7 | import org.apache.flink.streaming.api.datastream.DataStreamSink; 8 | import org.apache.flink.streaming.api.functions.sink.SinkFunction; 9 | import org.apache.flink.table.sinks.RetractStreamTableSink; 10 | import org.apache.flink.table.sinks.TableSink; 11 | import org.apache.flink.types.Row; 12 | 13 | public class TestRetractSink implements RetractStreamTableSink { 14 | 15 | String[] fNames; 16 | TypeInformation[] fTypes; 17 | 18 | @Override 19 | public TypeInformation getRecordType() { 20 | return new RowTypeInfo(fTypes, fNames); 21 | } 22 | 23 | @Override 24 | public String[] getFieldNames() { 25 | return fNames; 26 | } 27 | 28 | @Override 29 | public TypeInformation[] getFieldTypes() { 30 | return fTypes; 31 | } 32 | 33 | @Override 34 | public void emitDataStream(DataStream> dataStream) { 35 | consumeDataStream(dataStream); 36 | } 37 | 38 | @Override 39 | public DataStreamSink consumeDataStream(DataStream> dataStream) { 40 | return dataStream.addSink(new RowSink()); 41 | } 42 | 43 | @Override 44 | public TableSink> configure(String[] fNames, TypeInformation[] fTypes) { 45 | TestRetractSink copy = new TestRetractSink(); 46 | copy.fNames = fNames; 47 | copy.fTypes = fTypes; 48 | return copy; 49 | } 50 | 51 | private static class RowSink implements SinkFunction> { 52 | @Override 53 | public void invoke(Tuple2 value) throws Exception { 54 | System.out.println(value); 55 | } 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/src/main/java/com/pyflink/table/sources/TestSource.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table.sources; 2 | 3 | import org.apache.flink.api.common.typeinfo.TypeInformation; 4 | import org.apache.flink.streaming.api.datastream.DataStream; 5 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 6 | import org.apache.flink.table.api.TableSchema; 7 | import org.apache.flink.table.sources.StreamTableSource; 8 | import org.apache.flink.types.Row; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | public class TestSource implements StreamTableSource { 14 | 15 | TableSchema schema; 16 | TypeInformation returnType; 17 | 18 | public TestSource(TableSchema tableSchema, TypeInformation returnType) { 19 | this.schema = tableSchema; 20 | this.returnType = returnType; 21 | } 22 | 23 | @Override 24 | public DataStream getDataStream(StreamExecutionEnvironment env) { 25 | Row r1 = new Row(1); 26 | r1.setField(0, "haha"); 27 | Row r2 = new Row(1); 28 | r2.setField(0, "haha"); 29 | Row r3 = new Row(1); 30 | r3.setField(0, "haha"); 31 | List data = new ArrayList<>(); 32 | data.add(r1); 33 | data.add(r2); 34 | data.add(r3); 35 | return env.fromCollection(data, returnType); 36 | } 37 | 38 | @Override 39 | public TableSchema getTableSchema() { 40 | return schema; 41 | } 42 | 43 
| @Override 44 | public TypeInformation getReturnType() { 45 | return returnType; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | com.pyflink.table.factory.TestTableFactory -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukess/pyflink-demo/7e8ae37b3cfb4a6a5560cae2a0d44cddee25e036/utils/__init__.py -------------------------------------------------------------------------------- /utils/elastic_search_utils.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | 3 | 4 | def create_index(index='test', body=''): 5 | es = Elasticsearch() 6 | es.indices.create(index=index, ignore=400, body=body) 7 | 8 | 9 | def add_update_data(index, doc_type, id, body): 10 | es = Elasticsearch() 11 | es.index(index=index, doc_type=doc_type, id=id, body=body) 12 | 13 | 14 | def get_data(index, doc_type, id): 15 | es = Elasticsearch() 16 | return es.get(index=index, doc_type=doc_type, id=id)['_source'] 17 | 18 | 19 | def get_all_data(index, doc_type='_all'): 20 | es = Elasticsearch() 21 | if index is not None: 22 | data = es.search(index=index, doc_type=doc_type) 23 | return data 24 | 25 | 26 | def delete_index(index): 27 | es = Elasticsearch() 28 | if index is not None: 29 | es.indices.delete(index=index, ignore=[400, 404]) 30 | 31 | 32 | if __name__ == '__main__': 33 | create_index('user') 34 | # from datetime import datetime 35 | # 36 | # body = {"any": "data", "timestamp": datetime.now()} 37 | # add_update_data(index='test', doc_type='person', id=1, body=body) 38 | # import time 39 | # time.sleep(1) 40 | # print(get_all_data('test', 'person')) 41 | # delete_index('user') 42 | -------------------------------------------------------------------------------- /utils/kafka_utils.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaProducer 2 | from kafka import KafkaAdminClient 3 | from kafka import KafkaConsumer 4 | from kafka.admin import NewTopic 5 | import json 6 | 7 | 8 | def send_msg(topic='test', msg=None): 9 | producer = KafkaProducer(bootstrap_servers='localhost:9092', 10 | value_serializer=lambda v: json.dumps(v).encode('utf-8')) 11 | if msg is not None: 12 | future = producer.send(topic, msg) 13 | future.get() 14 | 15 | 16 | 
def get_msg(topic='test'): 17 | consumer = KafkaConsumer(topic, auto_offset_reset='earliest') 18 | for message in consumer: 19 | print(message) 20 | 21 | 22 | def list_topics(): 23 | global_consumer = KafkaConsumer(bootstrap_servers='localhost:9092') 24 | topics = global_consumer.topics() 25 | return topics 26 | 27 | 28 | def create_topic(topic='test'): 29 | admin = KafkaAdminClient(bootstrap_servers='localhost:9092') 30 | topics = list_topics() 31 | if topic not in topics: 32 | topic_obj = NewTopic(topic, 1, 1) 33 | admin.create_topics(new_topics=[topic_obj]) 34 | 35 | 36 | if __name__ == '__main__': 37 | print(list_topics()) 38 | # msg = {'user': 'flink', 'message': 'Hello Message', 'time': '2013-01-01T00:14:13Z'} 39 | # msg = {'user': 'flink', 'message': 'Hello Message', 'time': '1990-10-14T12:12:43Z'} 40 | # send_msg('test', msg) 41 | # get_msg('user') 42 | --------------------------------------------------------------------------------
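The helpers in `utils/kafka_utils.py` are enough to seed the `user` topic that the streaming window demos consume (the demos themselves call `table.prepare_environment.prepare_env` for their setup). Below is a minimal hedged usage sketch, assuming the repository root is on `PYTHONPATH` and Kafka is running on `localhost:9092`; the concrete values are made up, but the field names `a`, `b`, `c` and `time` match the JSON schema declared in the window examples.

```python
from utils.kafka_utils import create_topic, list_topics, send_msg

if __name__ == '__main__':
    # create the topic consumed by the streaming window demos (skipped if it already exists)
    create_topic('user')
    print(list_topics())

    # send a few JSON records; the demos' JSON schema expects string fields
    # a, b, c plus an ISO-8601 'time' field that is used as the rowtime
    for i, key in enumerate(['a', 'b', 'a'], start=1):
        msg = {'a': key, 'b': str(i), 'c': str(i),
               'time': '2013-01-01T00:%02d:13Z' % (14 + 10 * i)}
        send_msg('user', msg)
```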