├── .gitignore ├── README.md ├── requirements.txt ├── table ├── __init__.py ├── batch │ ├── add_columns.py │ ├── add_or_replace_columns.py │ ├── alias.py │ ├── distinct_agg.py │ ├── drop_columns.py │ ├── filter.py │ ├── full_outer_join.py │ ├── group_by_agg.py │ ├── group_by_window_agg.py │ ├── in.py │ ├── inner_join.py │ ├── intersect.py │ ├── intersect_all.py │ ├── left_outer_join.py │ ├── minus.py │ ├── minus_all.py │ ├── offset_and_fetch.py │ ├── order_by.py │ ├── rename_columns.py │ ├── right_outer_join.py │ ├── scan.py │ ├── session_window.py │ ├── slide_window.py │ ├── table_select.py │ ├── tumble_window.py │ ├── union.py │ ├── union_all.py │ └── where.py ├── javaudf │ ├── README.md │ ├── aggregate-function │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── pyflink │ │ │ └── table │ │ │ └── WeightedAvg.java │ ├── aggregate_func_demo.py │ ├── scalar-function │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── pyflink │ │ │ └── table │ │ │ └── HashCode.java │ ├── scalar_func_demo.py │ ├── table-function │ │ ├── pom.xml │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── pyflink │ │ │ └── table │ │ │ └── Split.java │ └── table_func_demo.py ├── prepare_environment.py ├── resources │ └── table_orders.csv ├── streaming │ ├── add_columns.py │ ├── add_or_replace_columns.py │ ├── alias.py │ ├── distinct.py │ ├── distinct_agg.py │ ├── drop_columns.py │ ├── filter.py │ ├── full_outer_join.py │ ├── group_by_agg.py │ ├── group_by_window_agg.py │ ├── in.py │ ├── inner_join.py │ ├── left_outer_join.py │ ├── over_window_agg.py │ ├── rename_columns.py │ ├── right_outer_join.py │ ├── scan.py │ ├── session_window.py │ ├── slide_window.py │ ├── table_select.py │ ├── tumble_window.py │ ├── union_all.py │ └── where.py ├── user_case │ ├── __init__.py │ └── pv_uv │ │ ├── README.md │ │ ├── __init__.py │ │ ├── create_data.sh │ │ ├── env.sh │ │ ├── pv_uv_example.py │ │ └── user_behavior.log └── user_defined_sources_and_sinks │ ├── CustomTableSourceDemo.py │ ├── README.md │ ├── __init__.py │ ├── pom.xml │ └── src │ └── main │ ├── java │ └── com │ │ └── pyflink │ │ └── table │ │ ├── factory │ │ └── TestTableFactory.java │ │ ├── sinks │ │ └── TestRetractSink.java │ │ └── sources │ │ └── TestSource.java │ └── resources │ └── META-INF │ └── services │ └── org.apache.flink.table.factories.TableFactory └── utils ├── __init__.py ├── elastic_search_utils.py └── kafka_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | table/result 3 | target 4 | .DS_Store 5 | *.iml 6 | *dependency-reduced-pom.xml 7 | *__pycache__* 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyflink-demo 2 | This project is to help users easier to write their pyflink job. 
3 | 
4 | **contents**
5 | 
6 | - [Quick Start](#quick-start)
7 |   + [Setup](#setup)
8 |     + [Requirements](#requirements)
9 |     + [Install python2](#install-python2)
10 |     + [Install pip](#install-pip)
11 |     + [Install java 8](#install-java-8)
12 |     + [Install maven](#install-maven)
13 |   + [Build PyFlink](#build-pyflink)
14 |   + [Prepare Kafka](#prepare-kafka)
15 |   + [Prepare ElasticSearch](#prepare-elasticsearch)
16 |   + [Install Dependency](#install-dependency)
17 |   + [Run Demo](#run-demo)
18 |     + [[optional] Importing the project on PyCharm](#optionalimporting-the-project-on-pycharm)
19 |     + [Run pyflink table api example](#run-pyflink-table-api-example)
20 | 
21 | ## Quick Start
22 | 
23 | ### Setup
24 | 
25 | #### Requirements
26 | 1. python2.7 or python3
27 | 2. pip
28 | 3. java 1.8
29 | 4. maven version >=3.3.0
30 | 
31 | #### Install python2
32 | 
33 | macOS
34 | ```shell
35 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
36 | export PATH="/usr/local/bin:/usr/local/sbin:$PATH"
37 | brew install python@2
38 | ```
39 | Ubuntu
40 | ```shell
41 | sudo apt install python-dev
42 | ```
43 | 
44 | #### Install pip
45 | 
46 | macOS
47 | 
48 | ```shell
49 | curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
50 | python get-pip.py
51 | ```
52 | 
53 | Ubuntu
54 | ```shell
55 | sudo apt install python-pip
56 | ```
57 | 
58 | #### Install java 8
59 | 
60 | [java download page](http://www.oracle.com/technetwork/java/javase/downloads/index.html)
61 | 
62 | #### Install maven
63 | 
64 | maven version >=3.3.0
65 | 
66 | [download maven page](http://maven.apache.org/download.cgi)
67 | 
68 | ```shell
69 | tar -xvf apache-maven-3.6.1-bin.tar.gz
70 | mv apache-maven-3.6.1 /usr/local/
71 | ```
72 | Configure the environment variables:
73 | ```shell
74 | MAVEN_HOME=/usr/local/apache-maven-3.6.1
75 | export MAVEN_HOME
76 | export PATH=${PATH}:${MAVEN_HOME}/bin
77 | ```
78 | 
79 | 
80 | ### Build PyFlink
81 | 
82 | If you want to build a PyFlink package that can be used for pip installation, you need to build the Flink jars first, as described in https://ci.apache.org/projects/flink/flink-docs-master/flinkDev/building.html
83 | 
84 | ```shell
85 | mvn clean install -DskipTests -Dfast
86 | ```
87 | 
88 | Then copy flink-sql-connector-kafka-0.11_*-SNAPSHOT.jar from the directory flink-connectors/flink-sql-connector-kafka-0.11 into build-target/lib:
89 | 
90 | ```shell
91 | cp flink-connectors/flink-sql-connector-kafka-0.11/target/flink-sql-connector-kafka-0.11_*-SNAPSHOT.jar build-target/lib
92 | ```
93 | 
94 | Then copy flink-connector-elasticsearch6_*-SNAPSHOT.jar from the directory flink-connectors/flink-connector-elasticsearch6:
95 | 
96 | ```shell
97 | cp flink-connectors/flink-connector-elasticsearch6/target/flink-connector-elasticsearch6_*-SNAPSHOT.jar build-target/lib
98 | ```
99 | 
100 | Next, copy flink-json-*-SNAPSHOT-sql-jar.jar from the directory flink-formats/flink-json:
101 | 
102 | ```shell
103 | cp flink-formats/flink-json/target/flink-json-*-SNAPSHOT-sql-jar.jar build-target/lib
104 | ```
105 | 
106 | Next, go to the root directory of the Flink source code and run this command to build the sdist and wheel packages:
107 | 
108 | ```shell
109 | cd flink-python; python3 setup.py sdist bdist_wheel
110 | ```
111 | 
112 | The sdist and wheel packages can be found under `./flink-python/dist/`.
Either of them can be used for pip installation, for example:
113 | 
114 | ```shell
115 | pip install dist/*.tar.gz
116 | ```
117 | 
118 | ### Prepare Kafka
119 | Some demos use Kafka as a source, so you need to install and run Kafka on your local host. The version used here is kafka_2.11-0.11 (https://archive.apache.org/dist/kafka/0.11.0.3/kafka_2.11-0.11.0.3.tgz).
120 | Download it with the following command:
121 | 
122 | ```shell
123 | wget https://archive.apache.org/dist/kafka/0.11.0.3/kafka_2.11-0.11.0.3.tgz
124 | ```
125 | 
126 | Then extract the tar package:
127 | 
128 | ```shell
129 | tar zxvf kafka_2.11-0.11.0.3.tgz
130 | ```
131 | Next, start ZooKeeper:
132 | 
133 | ```shell
134 | cd kafka_2.11-0.11.0.3; bin/zookeeper-server-start.sh config/zookeeper.properties
135 | ```
136 | 
137 | Finally, start the Kafka server:
138 | 
139 | ```shell
140 | bin/kafka-server-start.sh config/server.properties
141 | ```
142 | 
143 | ### Prepare ElasticSearch
144 | Some demos use Elasticsearch as a sink, so you need to install and run Elasticsearch on your local host. The version used here is elasticsearch-6.0.1 (https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.0.1.tar.gz).
145 | Download it with the following command:
146 | 
147 | ```shell
148 | wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-6.0.1.tar.gz
149 | ```
150 | 
151 | Then extract the tar package:
152 | 
153 | ```shell
154 | tar zxvf elasticsearch-6.0.1.tar.gz
155 | ```
156 | 
157 | Finally, start Elasticsearch:
158 | 
159 | ```shell
160 | cd elasticsearch-6.0.1; ./bin/elasticsearch
161 | ```
162 | 
163 | ### Install Dependency
164 | Install the project's Python dependencies:
165 | 
166 | ```shell
167 | pip install -r requirements.txt
168 | ```
169 | 
170 | ### Run demo
171 | #### [optional]Importing the project on PyCharm
172 | You can open the project in PyCharm and select, as the project interpreter, the Python installation whose pip was used to install pyflink and the dependencies in requirements.txt.
173 | The following describes the setup steps for PyCharm 2019.1.3 ([https://www.jetbrains.com/pycharm/download/](https://www.jetbrains.com/pycharm/download/)).
174 | 
175 | If you are on the PyCharm startup screen:
176 | 1. Start PyCharm and choose "Open".
177 | 2. Select the cloned pyflink-demo repository.
178 | 3. Click on "System Interpreter" in the Python interpreter options (PyCharm -> Preferences -> Python Interpreter).
179 | 4. Choose the Python installation that has pyflink and the dependencies from requirements.txt installed.
180 | 
181 | If you already have a project open in PyCharm:
182 | 1. Select "File -> Open".
183 | 2. Select the cloned pyflink-demo repository.
184 | 3. Click on "System Interpreter" in the Python interpreter options (PyCharm -> Preferences -> Python Interpreter).
185 | 4. Choose the Python installation that has pyflink and the dependencies from requirements.txt installed.
186 | #### Run pyflink table api example
187 | The Table API demos live in the pyflink-demo/table/batch and pyflink-demo/table/streaming directories; an example of running one is shown below.
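For example, a batch demo can be run directly with the Python interpreter that has pyflink installed (a minimal sketch; the demo scripts resolve ../resources/table_orders.csv relative to the working directory, so run them from inside table/batch or table/streaming):

```shell
# run one of the batch Table API demos; any script in table/batch works the same way
cd table/batch
python table_select.py
# the demo writes its result to /tmp/table_select_batch.csv
cat /tmp/table_select_batch.csv
```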
188 | Demos about udf is in the pyflink-demo/table/javaudf 189 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | elasticsearch>=6.0.0,<7.0.0 2 | kafka-python 3 | py4j==0.10.8.1 -------------------------------------------------------------------------------- /table/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukess/pyflink-demo/7e8ae37b3cfb4a6a5560cae2a0d44cddee25e036/table/__init__.py -------------------------------------------------------------------------------- /table/batch/add_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def add_columns_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_add_columns_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime", "d"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP(), 28 | DataTypes.STRING()], 29 | result_file)) 30 | orders = bt_env.scan("Orders") 31 | result = orders.add_columns("concat(a, '_sunny') as d") 32 | result.insert_into("result") 33 | bt_env.execute("add columns batch") 34 | # cat /tmp/table_add_columns_batch.csv 35 | # a,1,1,2013-01-01 00:14:13.0,a_sunny 36 | # b,2,2,2013-01-01 00:24:13.0,b_sunny 37 | # a,3,3,2013-01-01 00:34:13.0,a_sunny 38 | # a,4,4,2013-01-01 01:14:13.0,a_sunny 39 | # b,4,5,2013-01-01 01:24:13.0,b_sunny 40 | # a,5,2,2013-01-01 01:34:13.0,a_sunny 41 | 42 | 43 | if __name__ == '__main__': 44 | add_columns_batch() 45 | -------------------------------------------------------------------------------- /table/batch/add_or_replace_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def add_or_replace_columns_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_add_or_replace_columns_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = bt_env.scan("Orders") 30 | result = 
orders.add_or_replace_columns("concat(a, '_sunny') as a") 31 | result.insert_into("result") 32 | bt_env.execute("add or replace columns batch") 33 | # cat /tmp/table_add_or_replace_columns_batch.csv 34 | # a_sunny,1,1,2013-01-01 00:14:13.0 35 | # b_sunny,2,2,2013-01-01 00:24:13.0 36 | # a_sunny,3,3,2013-01-01 00:34:13.0 37 | # a_sunny,4,4,2013-01-01 01:14:13.0 38 | # b_sunny,4,5,2013-01-01 01:24:13.0 39 | # a_sunny,5,2,2013-01-01 01:34:13.0 40 | 41 | 42 | if __name__ == '__main__': 43 | add_or_replace_columns_batch() -------------------------------------------------------------------------------- /table/batch/alias.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def alias_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_alias_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = bt_env.scan("Orders") 30 | result = orders.alias("x, y, z, t").select("x, y, z, t") 31 | result.insert_into("result") 32 | bt_env.execute("alias batch") 33 | # cat table/result/table_alias_batch.csv 34 | # a,1,1,2013-01-01 00:14:13.0 35 | # b,2,2,2013-01-01 00:24:13.0 36 | # a,3,3,2013-01-01 00:34:13.0 37 | # a,4,4,2013-01-01 01:14:13.0 38 | # b,4,5,2013-01-01 01:24:13.0 39 | # a,5,2,2013-01-01 01:34:13.0 40 | 41 | 42 | if __name__ == '__main__': 43 | alias_batch() 44 | -------------------------------------------------------------------------------- /table/batch/distinct_agg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | # DISTINCT window aggregates are currently not supported in Batch mode. 
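# Worked example with the bundled table_orders.csv (what this demo computes):
# the group with a='a' has b values 1, 3, 4, 5, so the sum of distinct values is 13;
# the group with a='b' has b values 2, 4, so the sum of distinct values is 6.
# In other words, b.sum.distinct adds each distinct value of b only once per group.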
8 | def distinct_agg_batch(): 9 | b_env = ExecutionEnvironment.get_execution_environment() 10 | b_env.set_parallelism(1) 11 | bt_env = BatchTableEnvironment.create(b_env) 12 | source_file = os.getcwd() + "/../resources/table_orders.csv" 13 | result_file = "/tmp/table_distinct_agg_batch.csv" 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["b"], 25 | [DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.group_by("a") \ 29 | .select("b.sum.distinct as d") 30 | result.insert_into("result") 31 | bt_env.execute("distinct agg batch") 32 | # cat table/result/table_distinct_batch.csv 33 | # 13 34 | # 6 35 | 36 | 37 | if __name__ == '__main__': 38 | distinct_agg_batch() 39 | -------------------------------------------------------------------------------- /table/batch/drop_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def drop_columns_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_drop_columns_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.TIMESTAMP()], 27 | result_file)) 28 | orders = bt_env.scan("Orders") 29 | result = orders.drop_columns("c") 30 | result.insert_into("result") 31 | bt_env.execute("drop columns batch") 32 | # cat table/result/table_drop_columns_batch.csv 33 | # a,1,2013-01-01 00:14:13.0 34 | # b,2,2013-01-01 00:24:13.0 35 | # a,3,2013-01-01 00:34:13.0 36 | # a,4,2013-01-01 01:14:13.0 37 | # b,4,2013-01-01 01:24:13.0 38 | # a,5,2013-01-01 01:34:13.0 39 | 40 | 41 | if __name__ == '__main__': 42 | drop_columns_batch() 43 | -------------------------------------------------------------------------------- /table/batch/filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def filter_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_filter_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | 
CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = bt_env.scan("Orders") 30 | result = orders.filter("b % 2 === 0") 31 | result.insert_into("result") 32 | bt_env.execute("filter batch") 33 | # cat /tmp/table_filter_batch.csv 34 | # b,2,2,2013-01-01 00:24:13.0 35 | # a,4,4,2013-01-01 01:14:13.0 36 | # b,4,5,2013-01-01 01:24:13.0 37 | 38 | 39 | if __name__ == '__main__': 40 | filter_batch() 41 | -------------------------------------------------------------------------------- /table/batch/full_outer_join.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def full_outer_join_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_full_outer_join_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["d", "e", "f"]).select("d, e, f") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.full_outer_join(right, "a = d").select("a, b, e") 27 | result.insert_into("result") 28 | bt_env.execute("full outer join batch") 29 | # cat /tmp/table_full_outer_join_batch.csv 30 | # 1,1a,1b 31 | # 1,1a,3b 32 | # 2,2a, 33 | # 2,4b, 34 | # 3,, 35 | # 5,5a, 36 | # 4b 37 | 38 | 39 | if __name__ == '__main__': 40 | full_outer_join_batch() 41 | -------------------------------------------------------------------------------- /table/batch/group_by_agg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def group_by_agg_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_group_by_agg_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.group_by("a").select("a, b.sum as d") 29 | result.insert_into("result") 30 | bt_env.execute("group by agg batch") 31 | # cat /tmp/table_group_by_agg_batch.csv 32 | # a,13 33 | # b,6 34 | 35 | 36 | if __name__ == '__main__': 37 | group_by_agg_batch() 38 | -------------------------------------------------------------------------------- /table/batch/group_by_window_agg.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | from pyflink.table.window import Tumble 6 | 7 | 8 | def group_by_window_agg_batch(): 9 | b_env = ExecutionEnvironment.get_execution_environment() 10 | b_env.set_parallelism(1) 11 | bt_env = BatchTableEnvironment.create(b_env) 12 | source_file = os.getcwd() + "/../resources/table_orders.csv" 13 | result_file = "/tmp/table_group_by_window_agg_batch.csv" 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a", "start", "end", "rowtime", "d"], 25 | [DataTypes.STRING(), 26 | DataTypes.TIMESTAMP(), 27 | DataTypes.TIMESTAMP(), 28 | DataTypes.TIMESTAMP(), 29 | DataTypes.INT()], 30 | result_file)) 31 | orders = bt_env.scan("Orders") 32 | result = orders.window(Tumble.over("1.hours").on("rowtime").alias("w")) \ 33 | .group_by("a, w") \ 34 | .select("a, w.start, w.end, w.rowtime, b.sum as d") 35 | result.insert_into("result") 36 | bt_env.execute("group by agg batch") 37 | # cat /tmp/table_group_by_window_agg_batch.csv 38 | # a,2013-01-01 00:00:00.0,2013-01-01 01:00:00.0,2013-01-01 00:59:59.999,4 39 | # a,2013-01-01 01:00:00.0,2013-01-01 02:00:00.0,2013-01-01 01:59:59.999,9 40 | # b,2013-01-01 00:00:00.0,2013-01-01 01:00:00.0,2013-01-01 00:59:59.999,2 41 | # b,2013-01-01 01:00:00.0,2013-01-01 02:00:00.0,2013-01-01 01:59:59.999,4 42 | 43 | 44 | if __name__ == '__main__': 45 | group_by_window_agg_batch() 46 | -------------------------------------------------------------------------------- /table/batch/in.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def in_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_in_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 18 | ["a", "b", "c"]).select("a") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.where("a.in(%s)" % right) 27 | result.insert_into("result") 28 | # another way 29 | # bt_env.register_table("RightTable", right) 30 | # result = left.where("a.in(RightTable)") 31 | bt_env.execute("in batch") 32 | 33 | # cat /tmp/table_in_batch.csv 34 | # 1,ra,raa 35 | # 2,lb,lbb 36 | # 2,lb,lbb 37 | # 3,,lcc 38 | 39 | 40 | if __name__ == '__main__': 41 | in_batch() 42 | -------------------------------------------------------------------------------- /table/batch/inner_join.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from 
pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def inner_join_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_inner_join_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["d", "e", "f"]).select("d, e, f") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.join(right).where("a = d").select("a, b, e") 27 | result.insert_into("result") 28 | bt_env.execute("inner join batch") 29 | # cat table/result/table_inner_join_batch.csv 30 | # 1,1a,1b 31 | # 2,2a, 32 | # 2,4b, 33 | # 1,1a,3b 34 | 35 | 36 | if __name__ == '__main__': 37 | inner_join_batch() 38 | -------------------------------------------------------------------------------- /table/batch/intersect.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def intersect_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_intersect_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements([(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (1, "ra", "raa")], 15 | ["a", "b", "c"]).select("a, b, c") 16 | right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 17 | ["a", "b", "c"]).select("a, b, c") 18 | bt_env.register_table_sink("result", 19 | CsvTableSink(["a", "b", "c"], 20 | [DataTypes.BIGINT(), 21 | DataTypes.STRING(), 22 | DataTypes.STRING()], 23 | result_file)) 24 | 25 | result = left.intersect(right) 26 | result.insert_into("result") 27 | bt_env.execute("intersect batch") 28 | # cat /tmp/table_intersect_batch.csv 29 | # 1,ra,raa 30 | 31 | 32 | if __name__ == '__main__': 33 | intersect_batch() 34 | -------------------------------------------------------------------------------- /table/batch/intersect_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def intersect_all_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_intersect_all_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements([(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (1, "ra", "raa")], 15 | ["a", "b", "c"]).select("a, b, c") 16 | right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 17 | ["a", "b", "c"]).select("a, b, c") 18 | bt_env.register_table_sink("result", 19 | 
CsvTableSink(["a", "b", "c"], 20 | [DataTypes.BIGINT(), 21 | DataTypes.STRING(), 22 | DataTypes.STRING()], 23 | result_file)) 24 | 25 | result = left.intersect_all(right) 26 | result.insert_into("result") 27 | bt_env.execute("intersect all batch") 28 | # cat /tmp/table_intersect_all_batch.csv 29 | # 1,ra,raa 30 | # 1,ra,raa 31 | 32 | 33 | if __name__ == '__main__': 34 | intersect_all_batch() 35 | -------------------------------------------------------------------------------- /table/batch/left_outer_join.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def left_outer_join_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_left_outer_join_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["d", "e", "f"]).select("d, e, f") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.left_outer_join(right, "a = d").select("a, b, e") 27 | result.insert_into("result") 28 | bt_env.execute("left outer join batch") 29 | # cat /tmp/table_left_outer_join_batch.csv 30 | # 1,1a,1b 31 | # 1,1a,3b 32 | # 2,2a, 33 | # 2,4b, 34 | # 3,, 35 | # 5,5a, 36 | 37 | 38 | if __name__ == '__main__': 39 | left_outer_join_batch() 40 | -------------------------------------------------------------------------------- /table/batch/minus.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def minus_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_minus_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (1, "ra", "raa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 18 | ["a", "b", "c"]).select("a, b, c") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.minus(right) 27 | result.insert_into("result") 28 | bt_env.execute("minus batch") 29 | # cat /tmp/table_minus_batch.csv 30 | # 2,lb,lbb 31 | # 3,,lcc 32 | 33 | 34 | if __name__ == '__main__': 35 | minus_batch() 36 | -------------------------------------------------------------------------------- /table/batch/minus_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, 
CsvTableSink, DataTypes 5 | 6 | 7 | def minus_all_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_minus_all_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (1, "ra", "raa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 18 | ["a", "b", "c"]).select("a, b, c") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.minus_all(right) 27 | result.insert_into("result") 28 | bt_env.execute("minus all batch") 29 | # cat /tmp/table_minus_all_batch.csv 30 | # 2,lb,lbb 31 | # 2,lb,lbb 32 | # 3,,lcc 33 | 34 | 35 | if __name__ == '__main__': 36 | minus_all_batch() 37 | -------------------------------------------------------------------------------- /table/batch/offset_and_fetch.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def offset_and_fetch_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file_1 = "/tmp/table_offset_and_fetch_batch_1.csv" 12 | result_file_2 = "/tmp/table_offset_and_fetch_batch_2.csv" 13 | result_file_3 = "/tmp/table_offset_and_fetch_batch_3.csv" 14 | if os.path.exists(result_file_1): 15 | os.remove(result_file_1) 16 | if os.path.exists(result_file_2): 17 | os.remove(result_file_2) 18 | if os.path.exists(result_file_3): 19 | os.remove(result_file_3) 20 | 21 | bt_env.register_table_sink("result1", 22 | CsvTableSink(["a", "b", "c"], 23 | [DataTypes.BIGINT(), 24 | DataTypes.STRING(), 25 | DataTypes.STRING()], 26 | result_file_1)) 27 | 28 | bt_env.register_table_sink("result2", 29 | CsvTableSink(["a", "b", "c"], 30 | [DataTypes.BIGINT(), 31 | DataTypes.STRING(), 32 | DataTypes.STRING()], 33 | result_file_2)) 34 | 35 | bt_env.register_table_sink("result3", 36 | CsvTableSink(["a", "b", "c"], 37 | [DataTypes.BIGINT(), 38 | DataTypes.STRING(), 39 | DataTypes.STRING()], 40 | result_file_3)) 41 | 42 | left = bt_env.from_elements( 43 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")], 44 | ["a", "b", "c"]).select("a, b, c") 45 | 46 | ordered_table = left.order_by("a.asc") 47 | 48 | ordered_table.fetch(5).insert_into("result1") 49 | ordered_table.offset(1).insert_into("result2") 50 | ordered_table.offset(1).fetch(2).insert_into("result3") 51 | 52 | bt_env.execute("offset and fetch batch") 53 | # cat /tmp/able_offset_and_fetch_batch_1.csv 54 | # 1,ra,raa 55 | # 2,lb,lbb 56 | # 2,lb,lbb 57 | # 3,,lcc 58 | # 4,ra,raa 59 | 60 | # cat /tmp/table_offset_and_fetch_batch_2.csv 61 | # 2,lb,lbb 62 | # 2,lb,lbb 63 | # 3,,lcc 64 | # 4,ra,raa 65 | 66 | # cat /tmp/table_offset_and_fetch_batch_3.csv 67 | # 2,lb,lbb 68 | # 2,lb,lbb 69 | 70 | 71 | if __name__ == '__main__': 72 | offset_and_fetch_batch() 73 | -------------------------------------------------------------------------------- /table/batch/order_by.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def order_by_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_order_by_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | 15 | left = bt_env.from_elements( 16 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")], 17 | ["a", "b", "c"]).select("a, b, c") 18 | bt_env.register_table_sink("result", 19 | CsvTableSink(["a", "b", "c"], 20 | [DataTypes.BIGINT(), 21 | DataTypes.STRING(), 22 | DataTypes.STRING()], 23 | result_file)) 24 | 25 | result = left.order_by("a.asc") 26 | result.insert_into("result") 27 | bt_env.execute("order by batch") 28 | 29 | # cat /tmp/table_order_by_batch.csv 30 | # 1,ra,raa 31 | # 2,lb,lbb 32 | # 2,lb,lbb 33 | # 3,,lcc 34 | # 4,ra,raa 35 | 36 | 37 | if __name__ == '__main__': 38 | order_by_batch() 39 | -------------------------------------------------------------------------------- /table/batch/rename_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def rename_columns_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_rename_columns_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.rename_columns("a as a2, b as b2").select("a2, b2") 29 | result.insert_into("result") 30 | bt_env.execute("rename columns batch") 31 | # cat /tmp/table_rename_columns_batch.csv 32 | # a,1 33 | # b,2 34 | # a,3 35 | # a,4 36 | # b,4 37 | # a,5 38 | 39 | 40 | if __name__ == '__main__': 41 | rename_columns_batch() 42 | -------------------------------------------------------------------------------- /table/batch/right_outer_join.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def right_outer_join_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = "/tmp/table_right_outer_join_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 
18 | ["d", "e", "f"]).select("d, e, f") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.right_outer_join(right, "a = d").select("a, b, e") 27 | result.insert_into("result") 28 | bt_env.execute("right outer join batch") 29 | # cat /tmp/table_right_outer_join_batch.csv 30 | # 1,1a,1b 31 | # 1,1a,3b 32 | # 2,2a, 33 | # 2,4b, 34 | # 4b 35 | 36 | 37 | if __name__ == '__main__': 38 | right_outer_join_batch() 39 | -------------------------------------------------------------------------------- /table/batch/scan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def scan_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_scan_batch.csv" 13 | 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a", "b", "c", "rowtime"], 25 | [DataTypes.STRING(), 26 | DataTypes.INT(), 27 | DataTypes.INT(), 28 | DataTypes.TIMESTAMP()], 29 | result_file)) 30 | orders = bt_env.scan("Orders") 31 | orders.insert_into("result") 32 | bt_env.execute("scan batch") 33 | # cat /tmp/table_scan_batch.csv 34 | # a,1,1,2013-01-01 00:14:13.0 35 | # b,2,2,2013-01-01 00:24:13.0 36 | # a,3,3,2013-01-01 00:34:13.0 37 | # a,4,4,2013-01-01 01:14:13.0 38 | # b,4,5,2013-01-01 01:24:13.0 39 | # a,5,2,2013-01-01 01:34:13.0 40 | 41 | 42 | if __name__ == '__main__': 43 | scan_batch() 44 | -------------------------------------------------------------------------------- /table/batch/session_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | from pyflink.table.window import Session 6 | 7 | 8 | def session_time_window_batch(): 9 | b_env = ExecutionEnvironment.get_execution_environment() 10 | b_env.set_parallelism(1) 11 | bt_env = BatchTableEnvironment.create(b_env) 12 | source_file = os.getcwd() + "/../resources/table_orders.csv" 13 | result_file = "/tmp/table_session_time_window_batch.csv" 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a"], 25 | [DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.window(Session.with_gap("10.minutes").on("rowtime").alias("w")) \ 29 | .group_by("w").select("b.sum") 30 | result.insert_into("result") 31 | bt_env.execute("session time window batch") 32 | # cat /tmp/table_session_time_window_batch.csv 33 | # 6 34 | # 13 35 | 36 | 37 | if __name__ == '__main__': 38 | 
session_time_window_batch() 39 | -------------------------------------------------------------------------------- /table/batch/slide_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | from pyflink.table.window import Slide 6 | 7 | 8 | def slide_time_window_batch(): 9 | b_env = ExecutionEnvironment.get_execution_environment() 10 | b_env.set_parallelism(1) 11 | bt_env = BatchTableEnvironment.create(b_env) 12 | source_file = os.getcwd() + "/../resources/table_orders.csv" 13 | result_file = "/tmp/table_slide_time_window_batch.csv" 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a"], 25 | [DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.window(Slide.over("60.minutes").every("10.minutes").on("rowtime").alias("w")) \ 29 | .group_by("w").select("b.sum") 30 | result.insert_into("result") 31 | bt_env.execute("slide time window batch") 32 | # cat /tmp/table_slide_time_window_batch.csv 33 | # 1 34 | # 3 35 | # 6 36 | # 6 37 | # 6 38 | # 6 39 | # 9 40 | # 11 41 | # 13 42 | # 13 43 | # 13 44 | # 13 45 | # 9 46 | # 5 47 | 48 | 49 | if __name__ == '__main__': 50 | slide_time_window_batch() 51 | -------------------------------------------------------------------------------- /table/batch/table_select.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def select_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_select_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a", "c"], 25 | [DataTypes.STRING(), 26 | DataTypes.INT()], 27 | result_file)) 28 | orders = bt_env.scan("Orders") 29 | result = orders.select("a, b") 30 | result.insert_into("result") 31 | bt_env.execute("select batch") 32 | # cat /tmp/table_select_batch.csv 33 | # a,1 34 | # b,2 35 | # a,3 36 | # a,4 37 | # b,4 38 | # a,5 39 | 40 | 41 | if __name__ == '__main__': 42 | select_batch() 43 | -------------------------------------------------------------------------------- /table/batch/tumble_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | from pyflink.table.window import Tumble 6 | 7 | 8 | def tumble_row_window_batch(): 9 | b_env = ExecutionEnvironment.get_execution_environment() 10 | b_env.set_parallelism(1) 11 
| bt_env = BatchTableEnvironment.create(b_env) 12 | source_file = os.getcwd() + "/../resources/table_orders.csv" 13 | result_file = "/tmp/table_tumble_row_window_batch.csv" 14 | if os.path.exists(result_file): 15 | os.remove(result_file) 16 | bt_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | bt_env.register_table_sink("result", 24 | CsvTableSink(["a"], 25 | [DataTypes.INT()], 26 | result_file)) 27 | orders = bt_env.scan("Orders") 28 | result = orders.window(Tumble.over("2.rows").on("rowtime").alias("w")) \ 29 | .group_by("w, a").select("b.sum") 30 | result.insert_into("result") 31 | bt_env.execute("tumble row window batch") 32 | # cat /tmp/table_tumble_row_window_batch.csv 33 | # 4 34 | # 9 35 | # 6 36 | 37 | 38 | def tumble_time_window_batch(): 39 | b_env = ExecutionEnvironment.get_execution_environment() 40 | b_env.set_parallelism(1) 41 | bt_env = BatchTableEnvironment.create(b_env) 42 | source_file = os.getcwd() + "/../resources/table_orders.csv" 43 | result_file = "/tmp/table_tumble_time_window_batch.csv" 44 | if os.path.exists(result_file): 45 | os.remove(result_file) 46 | bt_env.register_table_source("Orders", 47 | CsvTableSource(source_file, 48 | ["a", "b", "c", "rowtime"], 49 | [DataTypes.STRING(), 50 | DataTypes.INT(), 51 | DataTypes.INT(), 52 | DataTypes.TIMESTAMP()])) 53 | bt_env.register_table_sink("result", 54 | CsvTableSink(["a"], 55 | [DataTypes.INT()], 56 | result_file)) 57 | orders = bt_env.scan("Orders") 58 | result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \ 59 | .group_by("w, a").select("b.sum") 60 | result.insert_into("result") 61 | bt_env.execute("tumble time window batch") 62 | # cat /tmp/table_tumble_time_window_batch.csv 63 | # 1 64 | # 3 65 | # 4 66 | # 5 67 | # 2 68 | # 4 69 | 70 | 71 | if __name__ == '__main__': 72 | tumble_row_window_batch() 73 | # tumble_time_window_batch() 74 | -------------------------------------------------------------------------------- /table/batch/union.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def union_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = os.getcwd() + "/tmp/table_union_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (1, "1a", "1laa"), (1, "1b", "1bb")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["a", "b", "c"]).select("a, b, c") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.union(right) 27 | result.insert_into("result") 28 | bt_env.execute("union batch") 29 | # cat /tmp/table_union_batch.csv 30 | # 1,1a,1laa 31 | # 1,1b,1bb 32 | # 1,3b,3bb 33 | # 2,,2bb 34 | # 2,2a,2aa 35 | # 3,,3aa 36 | # 4,4b,4bb 37 | # note : Unions two tables with duplicate records removed whatever the duplicate record from 38 | # the same table or the other. 
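    # For example, (1, '1a', '1laa') appears twice in the left table and (1, '1b', '1bb') appears
    # in both tables, yet each of them shows up only once in the result above; compare
    # union_all.py, which keeps all 9 input rows including the duplicates.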
39 | 40 | 41 | if __name__ == '__main__': 42 | union_batch() 43 | -------------------------------------------------------------------------------- /table/batch/union_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def union_all_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | result_file = os.getcwd() + "/tmp/table_union_all_batch.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = bt_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (1, "1a", "1laa"), (1, "1b", "1bb")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = bt_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["a", "b", "c"]).select("a, b, c") 19 | bt_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.union_all(right) 27 | result.insert_into("result") 28 | bt_env.execute("union all batch") 29 | # cat /tmp/table_union_all_batch.csv 30 | # 1,1a,1laa 31 | # 2,2a,2aa 32 | # 3,,3aa 33 | # 1,1a,1laa 34 | # 1,1b,1bb 35 | # 1,1b,1bb 36 | # 2,,2bb 37 | # 1,3b,3bb 38 | # 4,4b,4bb 39 | 40 | 41 | if __name__ == '__main__': 42 | union_all_batch() 43 | -------------------------------------------------------------------------------- /table/batch/where.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def where_batch(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = os.getcwd() + "/../result/table_where_batch.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | bt_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | bt_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = bt_env.scan("Orders") 30 | result = orders.where("a === 'b'") 31 | result.insert_into("result") 32 | bt_env.execute("where batch") 33 | # cat table/result/table_where_batch.csv 34 | # b,2,2,2013-01-01 00:24:13.0 35 | # b,4,5,2013-01-01 01:24:13.0 36 | 37 | 38 | if __name__ == '__main__': 39 | where_batch() 40 | -------------------------------------------------------------------------------- /table/javaudf/README.md: -------------------------------------------------------------------------------- 1 | # UDF 2 | This page helps users to use udf in pyflink 3 | 4 | ## Build UDF 5 | 6 | ### Scalar Function 7 | The example of Scalar Function lives in scalar-function. 
You need to build this code: 8 | 9 | ```shell 10 | cd scalar-function; mvn clean package 11 | ``` 12 | 13 | ### Table Function 14 | The example of Scalar Function lives in scalar-function. You need to build this code: 15 | 16 | ```shell 17 | cd table-function; mvn clean package 18 | ``` 19 | 20 | ### Aggregate Function 21 | The example of Scalar Function lives in scalar-function. You need to build this code: 22 | 23 | ```shell 24 | cd aggregate-function; mvn clean package 25 | ``` 26 | 27 | ## Run Java UDF In PyFlink 28 | 29 | ### [optional] Run In Local PVM(Python Virtual Machine) 30 | 1. put udf jar(scalar-function-1.0.jar, table-function-1.0.jar, aggregate-function-1.0.jar) in Python site-packages/pyflink/lib directory 31 | 2. use python interpreter to run the code in scalar_func_demo.py or table_func_demo.py or aggregate_func_demo.py 32 | 33 | ### [optional] Run Job In Flink Cluster 34 | 35 | 1. start flink cluster. You can start the standard alone flink cluster: 36 | 37 | ```shell 38 | bin/start-cluster.sh 39 | ``` 40 | 41 | you need to cd to directory of build-target in flink source code. 42 | 43 | 2. submit the python job: 44 | 45 | ```shell 46 | bin/flink run -py /table/javaudf/scalar_func_demo.py -jar 47 | ``` 48 | 49 | ```shell 50 | bin/flink run -py /table/javaudf/table_func_demo.py -jar 51 | ``` 52 | 53 | ```shell 54 | bin/flink run -py /table/javaudf/aggregate_func_demo.py.py -jar 55 | ``` 56 | -------------------------------------------------------------------------------- /table/javaudf/aggregate-function/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 20 | 22 | 23 | 4.0.0 24 | 25 | org.apache.flink.table 26 | aggregate-function 27 | 1.0 28 | 29 | jar 30 | 31 | 32 | 1.9.0 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | org.apache.flink 41 | flink-core 42 | ${table.version} 43 | provided 44 | 45 | 46 | org.apache.flink 47 | flink-java 48 | ${table.version} 49 | provided 50 | 51 | 52 | org.apache.flink 53 | flink-streaming-java_2.11 54 | ${table.version} 55 | provided 56 | 57 | 58 | org.apache.flink 59 | flink-table-common 60 | ${table.version} 61 | provided 62 | 63 | 64 | org.apache.flink 65 | flink-table-planner_2.11 66 | ${table.version} 67 | provided 68 | 69 | 70 | 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-compiler-plugin 75 | 3.1 76 | 77 | 1.8 78 | 1.8 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /table/javaudf/aggregate-function/src/main/java/com/pyflink/table/WeightedAvg.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table; 2 | 3 | import org.apache.flink.table.functions.AggregateFunction; 4 | 5 | import java.util.Iterator; 6 | 7 | /** 8 | * Weighted Average user-defined aggregate function. 
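 *
 * <p>The accumulator tracks sum(value * weight) and sum(weight); getValue returns
 * sum(value * weight) / sum(weight) (integer division), or null if no weight has been accumulated.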
9 | */ 10 | public class WeightedAvg extends AggregateFunction<Long, WeightedAvgAccum> { 11 | @Override 12 | public Long getValue(WeightedAvgAccum weightedAvgAccum) { 13 | if (weightedAvgAccum.count == 0) { 14 | return null; 15 | } else { 16 | return weightedAvgAccum.sum / weightedAvgAccum.count; 17 | } 18 | } 19 | 20 | @Override 21 | public WeightedAvgAccum createAccumulator() { 22 | return new WeightedAvgAccum(); 23 | } 24 | 25 | public void accumulate(WeightedAvgAccum acc, long iValue, int iWeight) { 26 | acc.sum += iValue * iWeight; 27 | acc.count += iWeight; 28 | } 29 | 30 | public void accumulate(WeightedAvgAccum acc, long iValue, long iWeight) { 31 | acc.sum += iValue * iWeight; 32 | acc.count += iWeight; 33 | } 34 | 35 | public void retract(WeightedAvgAccum acc, long iValue, int iWeight) { 36 | acc.sum -= iValue * iWeight; 37 | acc.count -= iWeight; 38 | } 39 | 40 | public void merge(WeightedAvgAccum acc, Iterable<WeightedAvgAccum> it) { 41 | Iterator<WeightedAvgAccum> iter = it.iterator(); 42 | while (iter.hasNext()) { 43 | WeightedAvgAccum other = iter.next(); 44 | acc.count += other.count; 45 | acc.sum += other.sum; 46 | } 47 | } 48 | 49 | public void resetAccumulator(WeightedAvgAccum acc) { 50 | acc.count = 0; 51 | acc.sum = 0L; 52 | } 53 | 54 | /** 55 | * Accumulator for WeightedAvg. 56 | */ 57 | public static class WeightedAvgAccum { 58 | public long sum = 0; 59 | public int count = 0; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /table/javaudf/aggregate_func_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def aggregate_func_python_table_api(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | source_table = bt_env.from_elements([("a", 1, 1), ("a", 2, 2), ("b", 3, 2), ("a", 5, 2)], 12 | ["user", "points", "level"]) 13 | 14 | result_file = "/tmp/aggregate_func_python_table_api.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | bt_env.register_table_sink("result", 18 | CsvTableSink(["a", "b"], 19 | [DataTypes.STRING(), 20 | DataTypes.BIGINT()], 21 | result_file)) 22 | bt_env.register_java_function("wAvg", "com.pyflink.table.WeightedAvg") 23 | result = source_table.group_by("user").select("user, wAvg(points, level) as avgPoints") 24 | result.insert_into("result") 25 | bt_env.execute("aggregate func python table api") 26 | # cat /tmp/aggregate_func_python_table_api.csv 27 | # a,3 28 | # b,3 29 | 30 | 31 | def aggregate_func_python_sql_api(): 32 | b_env = ExecutionEnvironment.get_execution_environment() 33 | b_env.set_parallelism(1) 34 | bt_env = BatchTableEnvironment.create(b_env) 35 | source_table = bt_env.from_elements([("a", 1, 1), ("a", 2, 2), ("b", 3, 2), ("a", 5, 2)], 36 | ["user", "points", "level"]) 37 | 38 | result_file = "/tmp/aggregate_func_python_sql_api.csv" 39 | if os.path.exists(result_file): 40 | os.remove(result_file) 41 | bt_env.register_table_sink("result", 42 | CsvTableSink(["a", "b"], 43 | [DataTypes.STRING(), 44 | DataTypes.BIGINT()], 45 | result_file)) 46 | 47 | bt_env.register_java_function("wAvg", "com.pyflink.table.WeightedAvg") 48 | bt_env.register_table("userScores", source_table) 49 | result = bt_env.sql_query("SELECT user, wAvg(points, level) AS avgPoints FROM userScores GROUP BY user") 50 | result.insert_into("result") 51 |
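    # insert_into only declares the write to the registered "result" sink; nothing runs until
    # execute() is called below, and the WeightedAvg jar built from aggregate-function has to be
    # on the classpath (see the javaudf README).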
bt_env.execute("aggregate func python sql api") 52 | # cat /tmp/aggregate_func_python_sql_api.csv 53 | # a,3 54 | # b,3 55 | 56 | 57 | if __name__ == '__main__': 58 | aggregate_func_python_table_api() 59 | # aggregate_func_python_sql_api() 60 | -------------------------------------------------------------------------------- /table/javaudf/scalar-function/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 20 | 22 | 23 | 4.0.0 24 | 25 | org.apache.flink.table 26 | scalar-function 27 | 1.0 28 | 29 | jar 30 | 31 | 32 | 1.9.0 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | org.apache.flink 41 | flink-core 42 | ${table.version} 43 | provided 44 | 45 | 46 | org.apache.flink 47 | flink-java 48 | ${table.version} 49 | provided 50 | 51 | 52 | org.apache.flink 53 | flink-streaming-java_2.11 54 | ${table.version} 55 | provided 56 | 57 | 58 | org.apache.flink 59 | flink-table-common 60 | ${table.version} 61 | provided 62 | 63 | 64 | org.apache.flink 65 | flink-table-planner_2.11 66 | ${table.version} 67 | provided 68 | 69 | 70 | 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-compiler-plugin 75 | 3.1 76 | 77 | 1.8 78 | 1.8 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /table/javaudf/scalar-function/src/main/java/com/pyflink/table/HashCode.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table; 2 | 3 | import org.apache.flink.table.functions.ScalarFunction; 4 | 5 | public class HashCode extends ScalarFunction { 6 | private int factor = 12; 7 | 8 | public int eval(String s) { 9 | return s.hashCode() * factor; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /table/javaudf/scalar_func_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def scalar_func_python_table_api(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | 12 | source_table = bt_env.from_elements([("a", "aa"), ("b", "bb"), ("c", "cc")], ["a", "b"]).select("a, b") 13 | 14 | result_file = "/tmp/scalar_func_python_table_api.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | bt_env.register_table_sink("result", 18 | CsvTableSink(["a", "b", "c"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT()], 22 | result_file)) 23 | 24 | # register the java scalar function 25 | bt_env.register_java_function("hashCode", "com.pyflink.table.HashCode") 26 | 27 | # use the java scalar function in Python Table API 28 | result = source_table.select("a, a.hashCode(), hashCode(a)") 29 | result.insert_into("result") 30 | bt_env.execute("scalar func python table api") 31 | # cat /tmp/scalar_func_python_table_api.csv 32 | # a,1164,1164 33 | # b,1176,1176 34 | # c,1188,1188 35 | 36 | 37 | def scalar_func_python_sql(): 38 | b_env = ExecutionEnvironment.get_execution_environment() 39 | b_env.set_parallelism(1) 40 | bt_env = BatchTableEnvironment.create(b_env) 41 | 42 | source_table = bt_env.from_elements([("a", 1), ("b", 2), ("c", 3)], ["a", "b"]).select("a, b") 43 | 44 | result_file = "/tmp/scalar_func_python_sql.csv" 45 | if os.path.exists(result_file): 46 | os.remove(result_file) 47 | bt_env.register_table_sink("result", 48 
| CsvTableSink(["a", "b"], 49 | [DataTypes.STRING(), 50 | DataTypes.INT()], 51 | result_file)) 52 | 53 | # register the java scalar function 54 | bt_env.register_java_function("hashCode", "com.pyflink.table.HashCode") 55 | 56 | # register the table for using in the sql query 57 | bt_env.register_table("MyTable", source_table) 58 | 59 | result = bt_env.sql_query("SELECT a, hashCode(a) FROM MyTable") 60 | result.insert_into("result") 61 | bt_env.execute("scalar func python sql") 62 | # cat /tmp/scalar_func_python_sql.csv 63 | # a,1164 64 | # b,1176 65 | # c,1188 66 | 67 | 68 | if __name__ == '__main__': 69 | scalar_func_python_table_api() 70 | # scalar_func_python_sql() 71 | -------------------------------------------------------------------------------- /table/javaudf/table-function/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 20 | 22 | 23 | 4.0.0 24 | 25 | org.apache.flink.table 26 | table-function 27 | 1.0 28 | 29 | jar 30 | 31 | 32 | 1.9.0 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | org.apache.flink 41 | flink-core 42 | ${table.version} 43 | provided 44 | 45 | 46 | org.apache.flink 47 | flink-java 48 | ${table.version} 49 | provided 50 | 51 | 52 | org.apache.flink 53 | flink-streaming-java_2.11 54 | ${table.version} 55 | provided 56 | 57 | 58 | org.apache.flink 59 | flink-table-common 60 | ${table.version} 61 | provided 62 | 63 | 64 | org.apache.flink 65 | flink-table-planner_2.11 66 | ${table.version} 67 | provided 68 | 69 | 70 | 71 | 72 | 73 | 74 | org.apache.maven.plugins 75 | maven-compiler-plugin 76 | 3.1 77 | 78 | 1.8 79 | 1.8 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /table/javaudf/table-function/src/main/java/com/pyflink/table/Split.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table; 2 | 3 | import org.apache.flink.api.java.tuple.Tuple2; 4 | import org.apache.flink.table.functions.TableFunction; 5 | 6 | public class Split extends TableFunction> { 7 | private String separator = " "; 8 | 9 | public void eval(String str) { 10 | for (String s : str.split(separator)) { 11 | collect(new Tuple2(s, s.length())); 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /table/javaudf/table_func_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.dataset import ExecutionEnvironment 4 | from pyflink.table import BatchTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def table_func_python_table_join_lateral_api(): 8 | b_env = ExecutionEnvironment.get_execution_environment() 9 | b_env.set_parallelism(1) 10 | bt_env = BatchTableEnvironment.create(b_env) 11 | 12 | source_table = bt_env.from_elements([("a aa aaa", "aa"), ("b bb bbb", "bb"), ("c cc ccc", "cc")], 13 | ["a", "b"]).select("a, b") 14 | 15 | result_file = "/tmp/table_func_python_table_join_lateral_api.csv" 16 | if os.path.exists(result_file): 17 | os.remove(result_file) 18 | bt_env.register_table_sink("result", 19 | CsvTableSink(["a", "b", "c"], 20 | [DataTypes.STRING(), 21 | DataTypes.STRING(), 22 | DataTypes.INT()], 23 | result_file)) 24 | 25 | bt_env.register_java_function("split", "com.pyflink.table.Split") 26 | 27 | result = source_table.join_lateral("Split(a) as (word, length)").select("a, word, length") 28 | 29 | result.insert_into("result") 30 | 31 | bt_env.execute("table func python table join lateral api") 32 | # cat 
/tmp/table_func_python_table_join_lateral_api.csv 33 | # a aa aaa,a,1 34 | # a aa aaa,aa,2 35 | # a aa aaa,aaa,3 36 | # b bb bbb,b,1 37 | # b bb bbb,bb,2 38 | # b bb bbb,bbb,3 39 | # c cc ccc,c,1 40 | # c cc ccc,cc,2 41 | # c cc ccc,ccc,3 42 | 43 | 44 | def table_func_python_table_left_outer_join_lateral_api(): 45 | b_env = ExecutionEnvironment.get_execution_environment() 46 | b_env.set_parallelism(1) 47 | bt_env = BatchTableEnvironment.create(b_env) 48 | 49 | source_table = bt_env.from_elements([("a aa aaa", "aa"), ("b bb bbb", "bb"), ("c cc ccc", "cc")], 50 | ["a", "b"]).select("a, b") 51 | 52 | result_file = "/tmp/table_func_python_table_left_outer_join_lateral_api.csv" 53 | if os.path.exists(result_file): 54 | os.remove(result_file) 55 | bt_env.register_table_sink("result", 56 | CsvTableSink(["a", "b", "c"], 57 | [DataTypes.STRING(), 58 | DataTypes.STRING(), 59 | DataTypes.INT()], 60 | result_file)) 61 | 62 | bt_env.register_java_function("split", "com.pyflink.table.Split") 63 | 64 | result = source_table.left_outer_join_lateral("Split(a) as (word, length)").select("a, word, length") 65 | 66 | result.insert_into("result") 67 | 68 | bt_env.execute("table func python table left outer join lateral api") 69 | # cat /tmp/table_func_python_table_left_outer_join_lateral_api.csv 70 | # a aa aaa,a,1 71 | # a aa aaa,aa,2 72 | # a aa aaa,aaa,3 73 | # b bb bbb,b,1 74 | # b bb bbb,bb,2 75 | # b bb bbb,bbb,3 76 | # c cc ccc,c,1 77 | # c cc ccc,cc,2 78 | # c cc ccc,ccc,3 79 | 80 | 81 | def table_func_python_sql_join_lateral_api(): 82 | b_env = ExecutionEnvironment.get_execution_environment() 83 | b_env.set_parallelism(1) 84 | bt_env = BatchTableEnvironment.create(b_env) 85 | 86 | source_table = bt_env.from_elements([("a aa aaa", "aa"), ("b bb bbb", "bb"), ("c cc ccc", "cc")], 87 | ["a", "b"]).select("a, b") 88 | 89 | result_file = "/tmp/table_func_python_sql_join_lateral_api.csv" 90 | if os.path.exists(result_file): 91 | os.remove(result_file) 92 | bt_env.register_table_sink("result", 93 | CsvTableSink(["a", "b", "c"], 94 | [DataTypes.STRING(), 95 | DataTypes.STRING(), 96 | DataTypes.INT()], 97 | result_file)) 98 | 99 | bt_env.register_java_function("split", "com.pyflink.table.Split") 100 | bt_env.register_table("MyTable", source_table) 101 | 102 | result = bt_env.sql_query("SELECT a, word, length FROM MyTable, LATERAL TABLE(split(a)) as T(word, length)") 103 | 104 | result.insert_into("result") 105 | 106 | bt_env.execute("table func python sql join lateral api") 107 | # cat /tmp/table_func_python_sql_join_lateral_api.csv 108 | # a aa aaa,a,1 109 | # a aa aaa,aa,2 110 | # a aa aaa,aaa,3 111 | # b bb bbb,b,1 112 | # b bb bbb,bb,2 113 | # b bb bbb,bbb,3 114 | # c cc ccc,c,1 115 | # c cc ccc,cc,2 116 | # c cc ccc,ccc,3 117 | 118 | 119 | def table_func_python_sql_left_outer_join_lateral_api(): 120 | b_env = ExecutionEnvironment.get_execution_environment() 121 | b_env.set_parallelism(1) 122 | bt_env = BatchTableEnvironment.create(b_env) 123 | 124 | source_table = bt_env.from_elements([("a aa aaa", "aa"), ("b bb bbb", "bb"), ("c cc ccc", "cc")], 125 | ["a", "b"]).select("a, b") 126 | 127 | result_file = "/tmp/table_func_python_sql_left_outer_join_lateral_api.csv" 128 | if os.path.exists(result_file): 129 | os.remove(result_file) 130 | bt_env.register_table_sink("result", 131 | CsvTableSink(["a", "b", "c"], 132 | [DataTypes.STRING(), 133 | DataTypes.STRING(), 134 | DataTypes.INT()], 135 | result_file)) 136 | 137 | bt_env.register_java_function("split", "com.pyflink.table.Split") 138 | 
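    # the source table has to be registered under a name before SQL can reference it; the
    # LEFT JOIN LATERAL ... ON TRUE form below keeps every left row even if split(a) emits nothing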
bt_env.register_table("MyTable", source_table) 139 | 140 | result = bt_env.sql_query( 141 | "SELECT a, word, length FROM MyTable LEFT JOIN LATERAL TABLE(split(a)) as T(word, length) ON TRUE") 142 | 143 | result.insert_into("result") 144 | 145 | bt_env.execute("table func python sql left outer join lateral api") 146 | # cat /tmp/table_func_python_sql_left_outer_join_lateral_api.csv 147 | # a aa aaa,a,1 148 | # a aa aaa,aa,2 149 | # a aa aaa,aaa,3 150 | # b bb bbb,b,1 151 | # b bb bbb,bb,2 152 | # b bb bbb,bbb,3 153 | # c cc ccc,c,1 154 | # c cc ccc,cc,2 155 | # c cc ccc,ccc,3 156 | 157 | 158 | if __name__ == '__main__': 159 | table_func_python_table_join_lateral_api() 160 | # table_func_python_table_left_outer_join_lateral_api() 161 | # table_func_python_sql_join_lateral_api() 162 | # table_func_python_sql_left_outer_join_lateral_api() 163 | -------------------------------------------------------------------------------- /table/prepare_environment.py: -------------------------------------------------------------------------------- 1 | from utils import kafka_utils, elastic_search_utils 2 | 3 | 4 | def prepare_env(need_stream_source=False, need_upsert_sink=False): 5 | elastic_search_used_method = ['group_by_agg_streaming', 'distinct_agg_streaming'] 6 | 7 | if need_stream_source: 8 | topics = kafka_utils.list_topics() 9 | if 'user' not in topics: 10 | kafka_utils.create_topic('user') 11 | msgs = [{'a': 'a', 'b': 1, 'c': 1, 'time': '2013-01-01T00:14:13Z'}, 12 | {'a': 'b', 'b': 2, 'c': 2, 'time': '2013-01-01T00:24:13Z'}, 13 | {'a': 'a', 'b': 3, 'c': 3, 'time': '2013-01-01T00:34:13Z'}, 14 | {'a': 'a', 'b': 4, 'c': 4, 'time': '2013-01-01T01:14:13Z'}, 15 | {'a': 'b', 'b': 4, 'c': 5, 'time': '2013-01-01T01:24:13Z'}, 16 | {'a': 'a', 'b': 5, 'c': 2, 'time': '2013-01-01T01:34:13Z'}] 17 | for msg in msgs: 18 | kafka_utils.send_msg('user', msg) 19 | 20 | if need_upsert_sink: 21 | mapping = ''' 22 | { 23 | "mappings" : { 24 | "pyflink" : { 25 | "properties" : { 26 | "a" : { 27 | "type" : "text", 28 | "fields" : { 29 | "keyword" : { 30 | "type" : "keyword", 31 | "ignore_above" : 256 32 | } 33 | } 34 | }, 35 | "b" : { 36 | "type" : "text", 37 | "fields" : { 38 | "keyword" : { 39 | "type" : "keyword", 40 | "ignore_above" : 256 41 | } 42 | } 43 | } 44 | } 45 | } 46 | } 47 | } 48 | ''' 49 | for method in elastic_search_used_method: 50 | elastic_search_utils.delete_index(method) 51 | elastic_search_utils.create_index(method, mapping) 52 | -------------------------------------------------------------------------------- /table/resources/table_orders.csv: -------------------------------------------------------------------------------- 1 | a,1,1,2013-01-01 00:14:13 2 | b,2,2,2013-01-01 00:24:13 3 | a,3,3,2013-01-01 00:34:13 4 | a,4,4,2013-01-01 01:14:13 5 | b,4,5,2013-01-01 01:24:13 6 | a,5,2,2013-01-01 01:34:13 -------------------------------------------------------------------------------- /table/streaming/add_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes, EnvironmentSettings 5 | 6 | 7 | def add_columns_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | # use blink table planner 11 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 12 | 
.in_streaming_mode().use_blink_planner().build()) 13 | # use flink table planner 14 | # st_env = StreamTableEnvironment.create(s_env) 15 | source_file = os.getcwd() + "/../resources/table_orders.csv" 16 | result_file = "/tmp/table_add_columns_streaming.csv" 17 | if os.path.exists(result_file): 18 | os.remove(result_file) 19 | st_env.register_table_source("Orders", 20 | CsvTableSource(source_file, 21 | ["a", "b", "c", "rowtime"], 22 | [DataTypes.STRING(), 23 | DataTypes.INT(), 24 | DataTypes.INT(), 25 | DataTypes.TIMESTAMP()])) 26 | st_env.register_table_sink("result", 27 | CsvTableSink(["a", "b", "c", "rowtime", "d"], 28 | [DataTypes.STRING(), 29 | DataTypes.INT(), 30 | DataTypes.INT(), 31 | DataTypes.TIMESTAMP(), 32 | DataTypes.STRING()], 33 | result_file)) 34 | orders = st_env.scan("Orders") 35 | result = orders.add_columns("concat(a, '_sunny') as d") 36 | result.insert_into("result") 37 | st_env.execute("add columns streaming") 38 | # cat /tmp/table_add_columns_streaming.csv 39 | # a,1,1,2013-01-01 00:14:13.0,a_sunny 40 | # b,2,2,2013-01-01 00:24:13.0,b_sunny 41 | # a,3,3,2013-01-01 00:34:13.0,a_sunny 42 | # a,4,4,2013-01-01 01:14:13.0,a_sunny 43 | # b,4,5,2013-01-01 01:24:13.0,b_sunny 44 | # a,5,2,2013-01-01 01:34:13.0,a_sunny 45 | 46 | 47 | if __name__ == '__main__': 48 | add_columns_streaming() 49 | -------------------------------------------------------------------------------- /table/streaming/add_or_replace_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes, EnvironmentSettings 5 | 6 | 7 | def add_or_replace_columns_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | # use blink table planner 11 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 12 | .in_streaming_mode().use_blink_planner().build()) 13 | # use flink table planner 14 | # st_env = StreamTableEnvironment.create(s_env) source_file = os.getcwd() + "/../resources/table_orders.csv" 15 | result_file = "/tmp/table_add_or_replace_columns_streaming.csv" 16 | source_file = os.getcwd() + "/../resources/table_orders.csv" 17 | if os.path.exists(result_file): 18 | os.remove(result_file) 19 | st_env.register_table_source("Orders", 20 | CsvTableSource(source_file, 21 | ["a", "b", "c", "rowtime"], 22 | [DataTypes.STRING(), 23 | DataTypes.INT(), 24 | DataTypes.INT(), 25 | DataTypes.TIMESTAMP()])) 26 | st_env.register_table_sink("result", 27 | CsvTableSink(["a", "b", "c", "rowtime"], 28 | [DataTypes.STRING(), 29 | DataTypes.INT(), 30 | DataTypes.INT(), 31 | DataTypes.TIMESTAMP()], 32 | result_file)) 33 | orders = st_env.scan("Orders") 34 | result = orders.add_or_replace_columns("concat(a, '_sunny') as a") 35 | result.insert_into("result") 36 | st_env.execute("add or replace columns streaming") 37 | # cat /tmp/table_add_or_replace_columns_streaming.csv 38 | # a_sunny,1,1,2013-01-01 00:14:13.0 39 | # b_sunny,2,2,2013-01-01 00:24:13.0 40 | # a_sunny,3,3,2013-01-01 00:34:13.0 41 | # a_sunny,4,4,2013-01-01 01:14:13.0 42 | # b_sunny,4,5,2013-01-01 01:24:13.0 43 | # a_sunny,5,2,2013-01-01 01:34:13.0 44 | 45 | 46 | if __name__ == '__main__': 47 | add_or_replace_columns_streaming() 48 | -------------------------------------------------------------------------------- /table/streaming/alias.py: 
-------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes, EnvironmentSettings 3 | import os 4 | 5 | 6 | def alias_streaming(): 7 | s_env = StreamExecutionEnvironment.get_execution_environment() 8 | s_env.set_parallelism(1) 9 | # use blink table planner 10 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 11 | .in_streaming_mode().use_blink_planner().build()) 12 | # use flink table planner 13 | # st_env = StreamTableEnvironment.create(s_env) 14 | source_file = os.getcwd() + "/../resources/table_orders.csv" 15 | result_file = "/tmp/table_alias_streaming.csv" 16 | if os.path.exists(result_file): 17 | os.remove(result_file) 18 | st_env.register_table_source("Orders", 19 | CsvTableSource(source_file, 20 | ["a", "b", "c", "rowtime"], 21 | [DataTypes.STRING(), 22 | DataTypes.INT(), 23 | DataTypes.INT(), 24 | DataTypes.TIMESTAMP()])) 25 | st_env.register_table_sink("result", 26 | CsvTableSink(["a", "b", "c", "rowtime"], 27 | [DataTypes.STRING(), 28 | DataTypes.INT(), 29 | DataTypes.INT(), 30 | DataTypes.TIMESTAMP()], 31 | result_file)) 32 | orders = st_env.scan("Orders") 33 | result = orders.alias("x, y, z, t").select("x, y, z, t") 34 | result.insert_into("result") 35 | st_env.execute("alias streaming") 36 | # cat /tmp/table_alias_streaming.csv 37 | # a,1,1,2013-01-01 00:14:13.0 38 | # b,2,2,2013-01-01 00:24:13.0 39 | # a,3,3,2013-01-01 00:34:13.0 40 | # a,4,4,2013-01-01 01:14:13.0 41 | # b,4,5,2013-01-01 01:24:13.0 42 | # a,5,2,2013-01-01 01:34:13.0 43 | 44 | 45 | if __name__ == '__main__': 46 | alias_streaming() 47 | -------------------------------------------------------------------------------- /table/streaming/distinct.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, DataTypes 5 | 6 | from table.user_defined_sources_and_sinks.TestRetractSink import TestRetractSink 7 | 8 | 9 | def distinct_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | st_env = StreamTableEnvironment.create(s_env) 13 | source_file = os.getcwd() + "/../resources/table_orders.csv" 14 | st_env.register_table_source("Orders", 15 | CsvTableSource(source_file, 16 | ["a", "b", "c", "rowtime"], 17 | [DataTypes.STRING(), 18 | DataTypes.INT(), 19 | DataTypes.INT(), 20 | DataTypes.TIMESTAMP()])) 21 | 22 | orders = st_env.scan("Orders") 23 | result = orders.select("a, b").distinct() 24 | # use custom retract sink connector 25 | sink = TestRetractSink(["a", "b"], 26 | [DataTypes.STRING(), 27 | DataTypes.INT()]) 28 | st_env.register_table_sink("sink", sink) 29 | result.insert_into("sink") 30 | st_env.execute("distinct streaming") 31 | # (true, a, 1) 32 | # (true, b, 2) 33 | # (true, a, 3) 34 | # (true, a, 4) 35 | # (true, b, 4) 36 | # (true, a, 5) 37 | 38 | 39 | if __name__ == '__main__': 40 | distinct_streaming() 41 | -------------------------------------------------------------------------------- /table/streaming/distinct_agg.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 2 | from pyflink.table import StreamTableEnvironment, DataTypes, 
EnvironmentSettings 3 | from pyflink.table.descriptors import Schema, Rowtime, Elasticsearch, Json, Kafka 4 | from pyflink.table.window import Tumble 5 | 6 | 7 | def distinct_agg_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 11 | # use blink table planner 12 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 13 | .in_streaming_mode().use_blink_planner().build()) 14 | # use flink table planner 15 | # st_env = StreamTableEnvironment.create(s_env) 16 | st_env \ 17 | .connect( # declare the external system to connect to 18 | Kafka() 19 | .version("0.11") 20 | .topic("user") 21 | .start_from_earliest() 22 | .property("zookeeper.connect", "localhost:2181") 23 | .property("bootstrap.servers", "localhost:9092") 24 | ) \ 25 | .with_format( # declare a format for this system 26 | Json() 27 | .fail_on_missing_field(True) 28 | .json_schema( 29 | "{" 30 | " type: 'object'," 31 | " properties: {" 32 | " a: {" 33 | " type: 'string'" 34 | " }," 35 | " b: {" 36 | " type: 'string'" 37 | " }," 38 | " c: {" 39 | " type: 'string'" 40 | " }," 41 | " time: {" 42 | " type: 'string'," 43 | " format: 'date-time'" 44 | " }" 45 | " }" 46 | "}" 47 | ) 48 | ) \ 49 | .with_schema( # declare the schema of the table 50 | Schema() 51 | .field("rowtime", DataTypes.TIMESTAMP()) 52 | .rowtime( 53 | Rowtime() 54 | .timestamps_from_field("time") 55 | .watermarks_periodic_bounded(60000)) 56 | .field("a", DataTypes.STRING()) 57 | .field("b", DataTypes.STRING()) 58 | .field("c", DataTypes.STRING()) 59 | ) \ 60 | .in_append_mode() \ 61 | .register_table_source("Orders") 62 | st_env.connect( 63 | Elasticsearch() 64 | .version("6") 65 | .host("localhost", 9200, "http") 66 | .index("distinct_agg_streaming") 67 | .document_type('pyflink') 68 | .key_delimiter("_") 69 | .key_null_literal("null") 70 | .failure_handler_ignore() 71 | .disable_flush_on_checkpoint() 72 | .bulk_flush_max_actions(2) 73 | .bulk_flush_max_size("1 mb") 74 | .bulk_flush_interval(5000) 75 | ) \ 76 | .with_schema( 77 | Schema() 78 | .field("a", DataTypes.STRING()) 79 | .field("b", DataTypes.STRING()) 80 | ) \ 81 | .with_format( 82 | Json() 83 | .derive_schema() 84 | ) \ 85 | .in_upsert_mode() \ 86 | .register_table_sink("result") 87 | orders = st_env.scan("Orders") 88 | result = orders.window(Tumble.over("30.minutes").on("rowtime").alias("w")) \ 89 | .group_by("a, w").select("a, b.max.distinct as d") 90 | result.insert_into("result") 91 | st_env.execute("distinct agg streaming") 92 | # curl -X GET 'http://localhost:9200/distinct_agg_streaming/_search' 93 | # { 94 | # "took": 3, 95 | # "timed_out": false, 96 | # "_shards": { 97 | # "total": 5, 98 | # "successful": 5, 99 | # "skipped": 0, 100 | # "failed": 0 101 | # }, 102 | # "hits": { 103 | # "total": 5, 104 | # "max_score": 1, 105 | # "hits": [ 106 | # { 107 | # "_index": "distinct_agg_streaming", 108 | # "_type": "pyflink", 109 | # "_id": "3zfsHWwBHRafi3KHm2Ve", 110 | # "_score": 1, 111 | # "_source": { 112 | # "a": "a", 113 | # "b": "3" 114 | # } 115 | # }, 116 | # { 117 | # "_index": "distinct_agg_streaming", 118 | # "_type": "pyflink", 119 | # "_id": "4TfsHWwBHRafi3KHrmU-", 120 | # "_score": 1, 121 | # "_source": { 122 | # "a": "b", 123 | # "b": "4" 124 | # } 125 | # }, 126 | # { 127 | # "_index": "distinct_agg_streaming", 128 | # "_type": "pyflink", 129 | # "_id": "4DfsHWwBHRafi3KHm2Ve", 130 | # "_score": 1, 131 | # "_source": 
{ 132 | # "a": "a", 133 | # "b": "4" 134 | # } 135 | # }, 136 | # { 137 | # "_index": "distinct_agg_streaming", 138 | # "_type": "pyflink", 139 | # "_id": "3TfsHWwBHRafi3KHm2Uf", 140 | # "_score": 1, 141 | # "_source": { 142 | # "a": "a", 143 | # "b": "1" 144 | # } 145 | # }, 146 | # { 147 | # "_index": "distinct_agg_streaming", 148 | # "_type": "pyflink", 149 | # "_id": "3jfsHWwBHRafi3KHm2Uf", 150 | # "_score": 1, 151 | # "_source": { 152 | # "a": "b", 153 | # "b": "2" 154 | # } 155 | # } 156 | # ] 157 | # } 158 | # } 159 | 160 | 161 | if __name__ == '__main__': 162 | from table.prepare_environment import prepare_env 163 | prepare_env(need_stream_source=True, need_upsert_sink=True) 164 | distinct_agg_streaming() 165 | -------------------------------------------------------------------------------- /table/streaming/drop_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes, EnvironmentSettings 5 | 6 | 7 | def drop_columns_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | # use blink table planner 11 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 12 | .in_streaming_mode().use_blink_planner().build()) 13 | # use flink table planner 14 | # st_env = StreamTableEnvironment.create(s_env) 15 | source_file = os.getcwd() + "/../resources/table_orders.csv" 16 | result_file = "/tmp/table_drop_columns_streaming.csv" 17 | if os.path.exists(result_file): 18 | os.remove(result_file) 19 | st_env.register_table_source("Orders", 20 | CsvTableSource(source_file, 21 | ["a", "b", "c", "rowtime"], 22 | [DataTypes.STRING(), 23 | DataTypes.INT(), 24 | DataTypes.INT(), 25 | DataTypes.TIMESTAMP()])) 26 | st_env.register_table_sink("result", 27 | CsvTableSink(["a", "b", "rowtime"], 28 | [DataTypes.STRING(), 29 | DataTypes.INT(), 30 | DataTypes.TIMESTAMP()], 31 | result_file)) 32 | orders = st_env.scan("Orders") 33 | result = orders.drop_columns("c") 34 | result.insert_into("result") 35 | st_env.execute("drop columns streaming") 36 | # cat /tmp/table_drop_columns_streaming.csv 37 | # a,1,2013-01-01 00:14:13.0 38 | # b,2,2013-01-01 00:24:13.0 39 | # a,3,2013-01-01 00:34:13.0 40 | # a,4,2013-01-01 01:14:13.0 41 | # b,4,2013-01-01 01:24:13.0 42 | # a,5,2013-01-01 01:34:13.0 43 | 44 | 45 | if __name__ == '__main__': 46 | drop_columns_streaming() 47 | -------------------------------------------------------------------------------- /table/streaming/filter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes, EnvironmentSettings 5 | 6 | 7 | def filter_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | # use blink table planner 11 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 12 | .in_streaming_mode().use_blink_planner().build()) 13 | # use flink table planner 14 | # st_env = StreamTableEnvironment.create(s_env) 15 | source_file = os.getcwd() + "/../resources/table_orders.csv" 16 | result_file = "/tmp/table_filter_streaming.csv" 17 | if os.path.exists(result_file): 18 | 
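        # remove the result of any previous run so the CsvTableSink can write a fresh file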
os.remove(result_file) 19 | st_env.register_table_source("Orders", 20 | CsvTableSource(source_file, 21 | ["a", "b", "c", "rowtime"], 22 | [DataTypes.STRING(), 23 | DataTypes.INT(), 24 | DataTypes.INT(), 25 | DataTypes.TIMESTAMP()])) 26 | st_env.register_table_sink("result", 27 | CsvTableSink(["a", "b", "c", "rowtime"], 28 | [DataTypes.STRING(), 29 | DataTypes.INT(), 30 | DataTypes.INT(), 31 | DataTypes.TIMESTAMP()], 32 | result_file)) 33 | orders = st_env.scan("Orders") 34 | result = orders.filter("b % 2 === 0") 35 | result.insert_into("result") 36 | st_env.execute("filter streaming") 37 | # cat /tmp/table_filter_streaming.csv 38 | # b,2,2,2013-01-01 00:24:13.0 39 | # a,4,4,2013-01-01 01:14:13.0 40 | # b,4,5,2013-01-01 01:24:13.0 41 | 42 | 43 | if __name__ == '__main__': 44 | filter_streaming() 45 | -------------------------------------------------------------------------------- /table/streaming/full_outer_join.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import StreamTableEnvironment, EnvironmentSettings, DataTypes 3 | 4 | from table.user_defined_sources_and_sinks.TestRetractSink import TestRetractSink 5 | 6 | 7 | def full_outer_join_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | # use blink table planner 11 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 12 | .in_streaming_mode().use_blink_planner().build()) 13 | # use flink table planner 14 | # st_env = StreamTableEnvironment.create(s_env) 15 | left = st_env.from_elements( 16 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 17 | ["a", "b", "c"]).select("a, b, c") 18 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 19 | ["d", "e", "f"]).select("d, e, f") 20 | 21 | result = left.full_outer_join(right, "a = d").select("a, b, e") 22 | # use custom retract sink connector 23 | sink = TestRetractSink(["a", "b", "c"], 24 | [DataTypes.BIGINT(), 25 | DataTypes.STRING(), 26 | DataTypes.STRING()]) 27 | st_env.register_table_sink("sink", sink) 28 | result.insert_into("sink") 29 | st_env.execute("full outer join streaming") 30 | # (true, 1, 1a, null) 31 | # (true, 2, 2a, null) 32 | # (true, 3, null, null) 33 | # (true, 2, 4b, null) 34 | # (true, 5, 5a, null) 35 | # (false, 1, 1a, null) 36 | # (true, 1, 1a, 1b) 37 | # (false, 2, 2a, null) 38 | # (false, 2, 4b, null) 39 | # (true, 2, 2a, null) 40 | # (true, 2, 4b, null) 41 | # (true, 1, 1a, 3b) 42 | # (true, null, null, 4b) 43 | 44 | 45 | if __name__ == '__main__': 46 | full_outer_join_streaming() 47 | -------------------------------------------------------------------------------- /table/streaming/group_by_agg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, DataTypes, EnvironmentSettings 5 | from pyflink.table.descriptors import Elasticsearch, Schema, Json 6 | 7 | 8 | def group_by_agg_streaming(): 9 | s_env = StreamExecutionEnvironment.get_execution_environment() 10 | s_env.set_parallelism(1) 11 | # use blink table planner 12 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 13 | 
.in_streaming_mode().use_blink_planner().build()) 14 | # use flink table planner 15 | # st_env = StreamTableEnvironment.create(s_env) 16 | source_file = os.getcwd() + "/../resources/table_orders.csv" 17 | st_env.register_table_source("Orders", 18 | CsvTableSource(source_file, 19 | ["a", "b", "c", "rowtime"], 20 | [DataTypes.STRING(), 21 | DataTypes.INT(), 22 | DataTypes.INT(), 23 | DataTypes.TIMESTAMP()])) 24 | st_env.connect( 25 | Elasticsearch() 26 | .version("6") 27 | .host("localhost", 9200, "http") 28 | .index("group_by_agg_streaming") 29 | .document_type('pyflink') 30 | .key_delimiter("_") 31 | .key_null_literal("null") 32 | .failure_handler_ignore() 33 | .disable_flush_on_checkpoint() 34 | .bulk_flush_max_actions(2) 35 | .bulk_flush_max_size("1 mb") 36 | .bulk_flush_interval(5000) 37 | ) \ 38 | .with_schema( 39 | Schema() 40 | .field("a", DataTypes.STRING()) 41 | .field("b", DataTypes.STRING()) 42 | ) \ 43 | .with_format( 44 | Json() 45 | .derive_schema() 46 | ) \ 47 | .in_upsert_mode() \ 48 | .register_table_sink("result") 49 | 50 | orders = st_env.scan("Orders") 51 | groub_by_table = orders.group_by("a").select("a, b.sum as d") 52 | # Because the schema of index user in elasticsearch is 53 | # {"a":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}}, 54 | # "b":{"type":"text","fields":{"keyword":{"type":"keyword","ignore_above":256}}} 55 | # so we need to cast the type in our demo. 56 | st_env.register_table("group_table", groub_by_table) 57 | result = st_env.sql_query("SELECT a, CAST(d AS VARCHAR) from group_table") 58 | result.insert_into("result") 59 | st_env.execute("group by agg streaming") 60 | # curl -X GET 'http://localhost:9200/group_by_agg_streaming/_search' 61 | # { 62 | # "took": 2, 63 | # "timed_out": false, 64 | # "_shards": { 65 | # "total": 5, 66 | # "successful": 5, 67 | # "skipped": 0, 68 | # "failed": 0 69 | # }, 70 | # "hits": { 71 | # "total": 2, 72 | # "max_score": 1, 73 | # "hits": [ 74 | # { 75 | # "_index": "group_by_agg_streaming", 76 | # "_type": "group_by_agg_streaming", 77 | # "_id": "b", 78 | # "_score": 1, 79 | # "_source": { 80 | # "a": "b", 81 | # "b": "6" 82 | # } 83 | # }, 84 | # { 85 | # "_index": "group_by_agg_streaming", 86 | # "_type": "group_by_agg_streaming", 87 | # "_id": "a", 88 | # "_score": 1, 89 | # "_source": { 90 | # "a": "a", 91 | # "b": "13" 92 | # } 93 | # } 94 | # ] 95 | # } 96 | # } 97 | 98 | 99 | if __name__ == '__main__': 100 | from table.prepare_environment import prepare_env 101 | prepare_env(need_upsert_sink=True) 102 | group_by_agg_streaming() 103 | -------------------------------------------------------------------------------- /table/streaming/group_by_window_agg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes, EnvironmentSettings 5 | from pyflink.table.descriptors import Schema, Rowtime, Json, Kafka 6 | from pyflink.table.window import Tumble 7 | 8 | 9 | def group_by_window_agg_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 13 | # use blink table planner 14 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 15 | .in_streaming_mode().use_blink_planner().build()) 16 | # use flink table planner 17 | # 
st_env = StreamTableEnvironment.create(s_env) 18 | result_file = "/tmp/table_group_by_window_agg_streaming.csv" 19 | if os.path.exists(result_file): 20 | os.remove(result_file) 21 | st_env \ 22 | .connect( # declare the external system to connect to 23 | Kafka() 24 | .version("0.11") 25 | .topic("user") 26 | .start_from_earliest() 27 | .property("zookeeper.connect", "localhost:2181") 28 | .property("bootstrap.servers", "localhost:9092") 29 | ) \ 30 | .with_format( # declare a format for this system 31 | Json() 32 | .fail_on_missing_field(True) 33 | .json_schema( 34 | "{" 35 | " type: 'object'," 36 | " properties: {" 37 | " a: {" 38 | " type: 'string'" 39 | " }," 40 | " b: {" 41 | " type: 'string'" 42 | " }," 43 | " c: {" 44 | " type: 'string'" 45 | " }," 46 | " time: {" 47 | " type: 'string'," 48 | " format: 'date-time'" 49 | " }" 50 | " }" 51 | "}" 52 | ) 53 | ) \ 54 | .with_schema( # declare the schema of the table 55 | Schema() 56 | .field("rowtime", DataTypes.TIMESTAMP()) 57 | .rowtime( 58 | Rowtime() 59 | .timestamps_from_field("time") 60 | .watermarks_periodic_bounded(60000)) 61 | .field("a", DataTypes.STRING()) 62 | .field("b", DataTypes.STRING()) 63 | .field("c", DataTypes.STRING()) 64 | ) \ 65 | .in_append_mode() \ 66 | .register_table_source("source") 67 | 68 | st_env.register_table_sink("result", 69 | CsvTableSink(["a", "b"], 70 | [DataTypes.STRING(), 71 | DataTypes.STRING()], 72 | result_file)) 73 | 74 | st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \ 75 | .group_by("w, a") \ 76 | .select("a, max(b)").insert_into("result") 77 | 78 | st_env.execute("group by window agg streaming") 79 | # cat /tmp/table_group_by_window_agg_streaming.csv 80 | # a,3 81 | # b,2 82 | 83 | 84 | if __name__ == '__main__': 85 | from table.prepare_environment import prepare_env 86 | prepare_env(need_stream_source=True) 87 | group_by_window_agg_streaming() 88 | -------------------------------------------------------------------------------- /table/streaming/in.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import StreamTableEnvironment, DataTypes 3 | 4 | from table.user_defined_sources_and_sinks.TestRetractSink import TestRetractSink 5 | 6 | 7 | def in_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | left = st_env.from_elements( 12 | [(1, "ra", "raa"), (2, "lb", "lbb"), (3, "", "lcc"), (2, "lb", "lbb"), (4, "ra", "raa")], 13 | ["a", "b", "c"]).select("a, b, c") 14 | right = st_env.from_elements([(1, "ra", "raa"), (2, "", "rbb"), (3, "rc", "rcc"), (1, "ra", "raa")], 15 | ["a", "b", "c"]).select("a") 16 | 17 | result = left.where("a.in(%s)" % right).select("b, c") 18 | # another way 19 | # st_env.register_table("RightTable", right) 20 | # result = left.where("a.in(RightTable)") 21 | 22 | # use custom retract sink connector 23 | sink = TestRetractSink(["a", "b"], 24 | [DataTypes.STRING(), 25 | DataTypes.STRING()]) 26 | st_env.register_table_sink("sink", sink) 27 | result.insert_into("sink") 28 | st_env.execute("in streaming") 29 | # (true, ra, raa) 30 | # (true, lb, lbb) 31 | # (true, lb, lbb) 32 | # (true,, lcc) 33 | 34 | 35 | if __name__ == '__main__': 36 | in_streaming() 37 | -------------------------------------------------------------------------------- /table/streaming/inner_join.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def inner_join_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | result_file = "/tmp/table_inner_join_streaming.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = st_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["d", "e", "f"]).select("d, e, f") 19 | st_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.join(right).where("a = d").select("a, b, e") 27 | result.insert_into("result") 28 | st_env.execute("inner join streaming") 29 | # cat /tmp/table_inner_join_streaming.csv 30 | # 1,1a,1b 31 | # 2,4b, 32 | # 2,2a, 33 | # 1,1a,3b 34 | 35 | 36 | if __name__ == '__main__': 37 | inner_join_streaming() 38 | -------------------------------------------------------------------------------- /table/streaming/left_outer_join.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import StreamTableEnvironment, DataTypes 3 | 4 | from table.user_defined_sources_and_sinks.TestRetractSink import TestRetractSink 5 | 6 | 7 | def left_outer_join_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | left = st_env.from_elements( 12 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 13 | ["a", "b", "c"]).select("a, b, c") 14 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 15 | ["d", "e", "f"]).select("d, e, f") 16 | 17 | result = left.left_outer_join(right, "a = d").select("a, b, e") 18 | # use custom retract sink connector 19 | sink = TestRetractSink(["a", "b", "c"], 20 | [DataTypes.BIGINT(), 21 | DataTypes.STRING(), 22 | DataTypes.STRING()]) 23 | st_env.register_table_sink("sink", sink) 24 | result.insert_into("sink") 25 | st_env.execute("left outer join streaming") 26 | # (true, 1, 1a, null) 27 | # (true, 2, 2a, null) 28 | # (true, 3, null, null) 29 | # (true, 2, 4b, null) 30 | # (true, 5, 5a, null) 31 | # (false, 1, 1a, null) 32 | # (true, 1, 1a, 1b) 33 | # (false, 2, 4b, null) 34 | # (true, 2, 4b, null) 35 | # (false, 2, 2a, null) 36 | # (true, 2, 2a, null) 37 | # (true, 1, 1a, 3b) 38 | 39 | 40 | if __name__ == '__main__': 41 | left_outer_join_streaming() 42 | -------------------------------------------------------------------------------- /table/streaming/over_window_agg.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | from pyflink.table.descriptors import Kafka, Json, Schema, Rowtime 6 | from pyflink.table.window import Over 7 | 
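# Unlike a group window, an over window emits one aggregated value per input row, computed over
# a range of neighbouring rows (below: rows with the same key "a" within the preceding 30 minutes
# of event time).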
8 | 9 | def over_window_agg_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 13 | st_env = StreamTableEnvironment.create(s_env) 14 | result_file = "/tmp/table_over_window_agg_streaming.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | st_env \ 18 | .connect( # declare the external system to connect to 19 | Kafka() 20 | .version("0.11") 21 | .topic("user") 22 | .start_from_earliest() 23 | .property("zookeeper.connect", "localhost:2181") 24 | .property("bootstrap.servers", "localhost:9092") 25 | ) \ 26 | .with_format( # declare a format for this system 27 | Json() 28 | .fail_on_missing_field(True) 29 | .json_schema( 30 | "{" 31 | " type: 'object'," 32 | " properties: {" 33 | " a: {" 34 | " type: 'string'" 35 | " }," 36 | " b: {" 37 | " type: 'string'" 38 | " }," 39 | " c: {" 40 | " type: 'string'" 41 | " }," 42 | " time: {" 43 | " type: 'string'," 44 | " format: 'date-time'" 45 | " }" 46 | " }" 47 | "}" 48 | ) 49 | ) \ 50 | .with_schema( # declare the schema of the table 51 | Schema() 52 | .field("rowtime", DataTypes.TIMESTAMP()) 53 | .rowtime( 54 | Rowtime() 55 | .timestamps_from_field("time") 56 | .watermarks_periodic_bounded(60000)) 57 | .field("a", DataTypes.STRING()) 58 | .field("b", DataTypes.STRING()) 59 | .field("c", DataTypes.STRING()) 60 | ) \ 61 | .in_append_mode() \ 62 | .register_table_source("source") 63 | 64 | st_env.register_table_sink("result", 65 | CsvTableSink(["a", "b", "c"], 66 | [DataTypes.STRING(), 67 | DataTypes.STRING(), 68 | DataTypes.STRING()], 69 | result_file)) 70 | 71 | st_env.scan("source").over_window(Over.partition_by("a") 72 | .order_by("rowtime").preceding("30.minutes").alias("w")) \ 73 | .select("a, max(b) over w, min(c) over w").insert_into("result") 74 | 75 | st_env.execute("over window agg streaming") 76 | # cat /tmp/table_over_window_agg_streaming.csv 77 | # a,1,1 78 | # b,2,2 79 | # a,3,1 80 | # a,4,4 81 | # b,4,5 82 | 83 | # if preceding("unbounded_ranges") e.g: 84 | # st_env.scan("source").over_window(Over.partition_by("a") 85 | # .order_by("rowtime").preceding("unbounded_range").alias("w")) \ 86 | # .select("a, max(b) over w, min(c) over w").insert_into("result") 87 | # the result is 88 | # a,1,1 89 | # a,3,1 90 | # a,4,1 91 | # b,2,2 92 | # b,4,2 93 | # rows is similar to time, you can refer to the doc. 
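    # A count-based variant (a sketch, not executed by this demo) could look like:
    # st_env.scan("source").over_window(Over.partition_by("a")
    #                                   .order_by("rowtime").preceding("2.rows").alias("w")) \
    #     .select("a, max(b) over w, min(c) over w").insert_into("result")
    # which aggregates each row together with the two earlier rows of the same key.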
94 | 95 | 96 | if __name__ == '__main__': 97 | from table.prepare_environment import prepare_env 98 | prepare_env(need_stream_source=True) 99 | over_window_agg_streaming() 100 | -------------------------------------------------------------------------------- /table/streaming/rename_columns.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def rename_columns_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_rename_columns_streaming.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | st_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | st_env.register_table_sink("result", 23 | CsvTableSink(["a", "b"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT()], 26 | result_file)) 27 | orders = st_env.scan("Orders") 28 | result = orders.rename_columns("a as a2, b as b2").select("a2, b2") 29 | result.insert_into("result") 30 | st_env.execute("rename columns streaming") 31 | # cat /tmp/table_rename_columns_streaming.csv 32 | # a,1 33 | # b,2 34 | # a,3 35 | # a,4 36 | # b,4 37 | # a,5 38 | 39 | 40 | if __name__ == '__main__': 41 | rename_columns_streaming() 42 | -------------------------------------------------------------------------------- /table/streaming/right_outer_join.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment 2 | from pyflink.table import StreamTableEnvironment, DataTypes 3 | 4 | from table.user_defined_sources_and_sinks.TestRetractSink import TestRetractSink 5 | 6 | 7 | def right_outer_join_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | left = st_env.from_elements( 12 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 13 | ["a", "b", "c"]).select("a, b, c") 14 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 15 | ["d", "e", "f"]).select("d, e, f") 16 | 17 | result = left.right_outer_join(right, "a = d").select("b, e") 18 | # use custom retract sink connector 19 | sink = TestRetractSink(["a", "b"], 20 | [DataTypes.STRING(), 21 | DataTypes.STRING()]) 22 | st_env.register_table_sink("sink", sink) 23 | result.insert_into("sink") 24 | st_env.execute("right outer join streaming") 25 | # (true, null, null) 26 | # (true, null, 3b) 27 | # (true, null, 4b) 28 | # (false, null, 1b) 29 | # (true, 1a, 1b) 30 | # (false, null, 3b) 31 | # (true, 1a, 3b) 32 | # (false, null, null) 33 | # (true, 2a, null) 34 | # (true, 4b, null) 35 | 36 | 37 | if __name__ == '__main__': 38 | right_outer_join_streaming() 39 | -------------------------------------------------------------------------------- /table/streaming/scan.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import 
StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def scan_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_scan_streaming.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | st_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | st_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = st_env.scan("Orders") 30 | orders.insert_into("result") 31 | st_env.execute("scan streaming") 32 | # cat /tmp/table_scan_streaming.csv 33 | # a,1,1,2013-01-01 00:14:13.0 34 | # b,2,2,2013-01-01 00:24:13.0 35 | # a,3,3,2013-01-01 00:34:13.0 36 | # a,4,4,2013-01-01 01:14:13.0 37 | # b,4,5,2013-01-01 01:24:13.0 38 | # a,5,2,2013-01-01 01:34:13.0 39 | 40 | 41 | if __name__ == '__main__': 42 | scan_streaming() 43 | -------------------------------------------------------------------------------- /table/streaming/session_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | from pyflink.table.descriptors import Schema, Rowtime, Json, Kafka 6 | from pyflink.table.window import Session 7 | 8 | 9 | def session_time_window_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 13 | st_env = StreamTableEnvironment.create(s_env) 14 | result_file = "/tmp/session_time_window_streaming.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | st_env \ 18 | .connect( # declare the external system to connect to 19 | Kafka() 20 | .version("0.11") 21 | .topic("user") 22 | .start_from_earliest() 23 | .property("zookeeper.connect", "localhost:2181") 24 | .property("bootstrap.servers", "localhost:9092") 25 | ) \ 26 | .with_format( # declare a format for this system 27 | Json() 28 | .fail_on_missing_field(True) 29 | .json_schema( 30 | "{" 31 | " type: 'object'," 32 | " properties: {" 33 | " a: {" 34 | " type: 'string'" 35 | " }," 36 | " b: {" 37 | " type: 'string'" 38 | " }," 39 | " c: {" 40 | " type: 'string'" 41 | " }," 42 | " time: {" 43 | " type: 'string'," 44 | " format: 'date-time'" 45 | " }" 46 | " }" 47 | "}" 48 | ) 49 | ) \ 50 | .with_schema( # declare the schema of the table 51 | Schema() 52 | .field("rowtime", DataTypes.TIMESTAMP()) 53 | .rowtime( 54 | Rowtime() 55 | .timestamps_from_field("time") 56 | .watermarks_periodic_bounded(60000)) 57 | .field("a", DataTypes.STRING()) 58 | .field("b", DataTypes.STRING()) 59 | .field("c", DataTypes.STRING()) 60 | ) \ 61 | .in_append_mode() \ 62 | .register_table_source("source") 63 | 64 | st_env.register_table_sink("result", 65 | CsvTableSink(["a", "b"], 66 | [DataTypes.STRING(), 67 | DataTypes.STRING()], 68 | result_file)) 69 | 70 | st_env.scan("source").window(Session.with_gap("10.minutes").on("rowtime").alias("w")) \ 71 | .group_by("w, a") \ 72 | .select("a, 
max(b)").insert_into("result") 73 | 74 | st_env.execute("session time window streaming") 75 | # cat /tmp/session_time_window_streaming.csv 76 | # a,1 77 | # b,2 78 | # a,3 79 | # a,4 80 | 81 | 82 | if __name__ == '__main__': 83 | from table.prepare_environment import prepare_env 84 | prepare_env(need_stream_source=True) 85 | session_time_window_streaming() 86 | -------------------------------------------------------------------------------- /table/streaming/slide_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | from pyflink.table.descriptors import Schema, Rowtime, Json, Kafka 6 | from pyflink.table.window import Slide 7 | 8 | 9 | def slide_time_window_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 13 | st_env = StreamTableEnvironment.create(s_env) 14 | result_file = "/tmp/slide_time_window_streaming.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | st_env \ 18 | .connect( # declare the external system to connect to 19 | Kafka() 20 | .version("0.11") 21 | .topic("user") 22 | .start_from_earliest() 23 | .property("zookeeper.connect", "localhost:2181") 24 | .property("bootstrap.servers", "localhost:9092") 25 | ) \ 26 | .with_format( # declare a format for this system 27 | Json() 28 | .fail_on_missing_field(True) 29 | .json_schema( 30 | "{" 31 | " type: 'object'," 32 | " properties: {" 33 | " a: {" 34 | " type: 'string'" 35 | " }," 36 | " b: {" 37 | " type: 'string'" 38 | " }," 39 | " c: {" 40 | " type: 'string'" 41 | " }," 42 | " time: {" 43 | " type: 'string'," 44 | " format: 'date-time'" 45 | " }" 46 | " }" 47 | "}" 48 | ) 49 | ) \ 50 | .with_schema( # declare the schema of the table 51 | Schema() 52 | .field("rowtime", DataTypes.TIMESTAMP()) 53 | .rowtime( 54 | Rowtime() 55 | .timestamps_from_field("time") 56 | .watermarks_periodic_bounded(60000)) 57 | .field("a", DataTypes.STRING()) 58 | .field("b", DataTypes.STRING()) 59 | .field("c", DataTypes.STRING()) 60 | ) \ 61 | .in_append_mode() \ 62 | .register_table_source("source") 63 | 64 | st_env.register_table_sink("result", 65 | CsvTableSink(["a", "b"], 66 | [DataTypes.STRING(), 67 | DataTypes.STRING()], 68 | result_file)) 69 | 70 | st_env.scan("source").window(Slide.over("1.hours").every("10.minutes").on("rowtime").alias("w")) \ 71 | .group_by("w, a") \ 72 | .select("a, max(b)").insert_into("result") 73 | 74 | st_env.execute("slide time window streaming") 75 | # cat /tmp/slide_time_window_streaming.csv 76 | # a,1 77 | # a,1 78 | # b,2 79 | # a,3 80 | # b,2 81 | # b,2 82 | # a,3 83 | # b,2 84 | # a,3 85 | # b,2 86 | # a,3 87 | # b,2 88 | # a,4 89 | # b,4 90 | # a,4 91 | 92 | 93 | def slide_row_window_streaming(): 94 | s_env = StreamExecutionEnvironment.get_execution_environment() 95 | s_env.set_parallelism(1) 96 | s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime) 97 | st_env = StreamTableEnvironment.create(s_env) 98 | result_file = "/tmp/slide_row_window_streaming.csv" 99 | if os.path.exists(result_file): 100 | os.remove(result_file) 101 | st_env \ 102 | .connect( # declare the external system to connect to 103 | Kafka() 104 | .version("0.11") 105 | .topic("user") 106 | .start_from_earliest() 107 | .property("zookeeper.connect", 
"localhost:2181") 108 | .property("bootstrap.servers", "localhost:9092") 109 | ) \ 110 | .with_format( # declare a format for this system 111 | Json() 112 | .fail_on_missing_field(True) 113 | .json_schema( 114 | "{" 115 | " type: 'object'," 116 | " properties: {" 117 | " a: {" 118 | " type: 'string'" 119 | " }," 120 | " b: {" 121 | " type: 'string'" 122 | " }," 123 | " c: {" 124 | " type: 'string'" 125 | " }," 126 | " time: {" 127 | " type: 'string'," 128 | " format: 'date-time'" 129 | " }" 130 | " }" 131 | "}" 132 | ) 133 | ) \ 134 | .with_schema( # declare the schema of the table 135 | Schema() 136 | .field("proctime", DataTypes.TIMESTAMP()) 137 | .proctime() 138 | .field("a", DataTypes.STRING()) 139 | .field("b", DataTypes.STRING()) 140 | .field("c", DataTypes.STRING()) 141 | ) \ 142 | .in_append_mode() \ 143 | .register_table_source("source") 144 | 145 | st_env.register_table_sink("result", 146 | CsvTableSink(["a", "b"], 147 | [DataTypes.STRING(), 148 | DataTypes.STRING()], 149 | result_file)) 150 | 151 | st_env.scan("source").window(Slide.over("2.rows").every("1.rows").on("proctime").alias("w")) \ 152 | .group_by("w, a") \ 153 | .select("a, max(b)").insert_into("result") 154 | 155 | st_env.execute("slide row window streaming") 156 | # cat /tmp/slide_row_window_streaming.csv 157 | # a,1 158 | # b,2 159 | # a,3 160 | # a,4 161 | # b,4 162 | # a,5 163 | 164 | 165 | if __name__ == '__main__': 166 | from table.prepare_environment import prepare_env 167 | prepare_env(need_stream_source=True) 168 | # slide_time_window_streaming() 169 | slide_row_window_streaming() 170 | -------------------------------------------------------------------------------- /table/streaming/table_select.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def select_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_select_streaming.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | 16 | st_env.register_table_source("Orders", 17 | CsvTableSource(source_file, 18 | ["a", "b", "c", "rowtime"], 19 | [DataTypes.STRING(), 20 | DataTypes.INT(), 21 | DataTypes.INT(), 22 | DataTypes.TIMESTAMP()])) 23 | st_env.register_table_sink("result", 24 | CsvTableSink(["a", "c"], 25 | [DataTypes.STRING(), 26 | DataTypes.INT()], 27 | result_file)) 28 | orders = st_env.scan("Orders") 29 | result = orders.select("a, b") 30 | result.insert_into("result") 31 | st_env.execute("select streaming") 32 | 33 | # cat /tmp/table_select_streaming.csv 34 | # a,1 35 | # b,2 36 | # a,3 37 | # a,4 38 | # b,4 39 | # a,5 40 | 41 | 42 | if __name__ == '__main__': 43 | select_streaming() 44 | -------------------------------------------------------------------------------- /table/streaming/tumble_window.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | from pyflink.table.descriptors import Schema, Rowtime, Json, Kafka 6 | from pyflink.table.window import Tumble 7 | 8 | 9 | def 
tumble_time_window_streaming(): 10 | s_env = StreamExecutionEnvironment.get_execution_environment() 11 | s_env.set_parallelism(1) 12 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 13 | st_env = StreamTableEnvironment.create(s_env) 14 | result_file = "/tmp/tumble_time_window_streaming.csv" 15 | if os.path.exists(result_file): 16 | os.remove(result_file) 17 | st_env \ 18 | .connect( # declare the external system to connect to 19 | Kafka() 20 | .version("0.11") 21 | .topic("user") 22 | .start_from_earliest() 23 | .property("zookeeper.connect", "localhost:2181") 24 | .property("bootstrap.servers", "localhost:9092") 25 | ) \ 26 | .with_format( # declare a format for this system 27 | Json() 28 | .fail_on_missing_field(True) 29 | .json_schema( 30 | "{" 31 | " type: 'object'," 32 | " properties: {" 33 | " a: {" 34 | " type: 'string'" 35 | " }," 36 | " b: {" 37 | " type: 'string'" 38 | " }," 39 | " c: {" 40 | " type: 'string'" 41 | " }," 42 | " time: {" 43 | " type: 'string'," 44 | " format: 'date-time'" 45 | " }" 46 | " }" 47 | "}" 48 | ) 49 | ) \ 50 | .with_schema( # declare the schema of the table 51 | Schema() 52 | .field("rowtime", DataTypes.TIMESTAMP()) 53 | .rowtime( 54 | Rowtime() 55 | .timestamps_from_field("time") 56 | .watermarks_periodic_bounded(60000)) 57 | .field("a", DataTypes.STRING()) 58 | .field("b", DataTypes.STRING()) 59 | .field("c", DataTypes.STRING()) 60 | ) \ 61 | .in_append_mode() \ 62 | .register_table_source("source") 63 | 64 | st_env.register_table_sink("result", 65 | CsvTableSink(["a", "b"], 66 | [DataTypes.STRING(), 67 | DataTypes.STRING()], 68 | result_file)) 69 | 70 | st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \ 71 | .group_by("w, a") \ 72 | .select("a, max(b)").insert_into("result") 73 | 74 | st_env.execute("tumble time window streaming") 75 | # cat /tmp/tumble_time_window_streaming.csv 76 | # a,3 77 | # b,2 78 | 79 | 80 | def tumble_row_window_streaming(): 81 | s_env = StreamExecutionEnvironment.get_execution_environment() 82 | s_env.set_parallelism(1) 83 | s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime) 84 | st_env = StreamTableEnvironment.create(s_env) 85 | result_file = "/tmp/tumble_row_window_streaming.csv" 86 | if os.path.exists(result_file): 87 | os.remove(result_file) 88 | st_env \ 89 | .connect( # declare the external system to connect to 90 | Kafka() 91 | .version("0.11") 92 | .topic("user") 93 | .start_from_earliest() 94 | .property("zookeeper.connect", "localhost:2181") 95 | .property("bootstrap.servers", "localhost:9092") 96 | ) \ 97 | .with_format( # declare a format for this system 98 | Json() 99 | .fail_on_missing_field(True) 100 | .json_schema( 101 | "{" 102 | " type: 'object'," 103 | " properties: {" 104 | " a: {" 105 | " type: 'string'" 106 | " }," 107 | " b: {" 108 | " type: 'string'" 109 | " }," 110 | " c: {" 111 | " type: 'string'" 112 | " }," 113 | " time: {" 114 | " type: 'string'," 115 | " format: 'date-time'" 116 | " }" 117 | " }" 118 | "}" 119 | ) 120 | ) \ 121 | .with_schema( # declare the schema of the table 122 | Schema() 123 | .field("proctime", DataTypes.TIMESTAMP()) 124 | .proctime() 125 | .field("a", DataTypes.STRING()) 126 | .field("b", DataTypes.STRING()) 127 | .field("c", DataTypes.STRING()) 128 | ) \ 129 | .in_append_mode() \ 130 | .register_table_source("source") 131 | 132 | st_env.register_table_sink("result", 133 | CsvTableSink(["a", "b"], 134 | [DataTypes.STRING(), 135 | DataTypes.STRING()], 136 | result_file)) 137 | 138 | 
st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \ 139 | .group_by("w, a") \ 140 | .select("a, max(b)").insert_into("result") 141 | 142 | st_env.execute("tumble row window streaming") 143 | # cat /tmp/tumble_row_window_streaming.csv 144 | # a,3 145 | # b,4 146 | # a 5 147 | 148 | 149 | if __name__ == '__main__': 150 | from table.prepare_environment import prepare_env 151 | prepare_env(need_stream_source=True) 152 | # tumble_time_window_streaming() 153 | tumble_row_window_streaming() 154 | -------------------------------------------------------------------------------- /table/streaming/union_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSink, DataTypes 5 | 6 | 7 | def union_all_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | result_file = "/tmp/table_union_all_streaming.csv" 12 | if os.path.exists(result_file): 13 | os.remove(result_file) 14 | left = st_env.from_elements( 15 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (1, "1a", "1laa"), (1, "1b", "1bb")], 16 | ["a", "b", "c"]).select("a, b, c") 17 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 18 | ["a", "b", "c"]).select("a, b, c") 19 | st_env.register_table_sink("result", 20 | CsvTableSink(["a", "b", "c"], 21 | [DataTypes.BIGINT(), 22 | DataTypes.STRING(), 23 | DataTypes.STRING()], 24 | result_file)) 25 | 26 | result = left.union_all(right) 27 | result.insert_into("result") 28 | st_env.execute("union all streaming") 29 | # cat /tmp/table_union_all_streaming.csv 30 | # 1,1b,1bb 31 | # 2,,2bb 32 | # 1,3b,3bb 33 | # 4,4b,4bb 34 | # 1,1a,1laa 35 | # 2,2a,2aa 36 | # 3,,3aa 37 | # 1,1a,1laa 38 | # 1,1b,1bb 39 | 40 | 41 | if __name__ == '__main__': 42 | union_all_streaming() 43 | -------------------------------------------------------------------------------- /table/streaming/where.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment 4 | from pyflink.table import StreamTableEnvironment, CsvTableSource, CsvTableSink, DataTypes 5 | 6 | 7 | def where_streaming(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_parallelism(1) 10 | st_env = StreamTableEnvironment.create(s_env) 11 | source_file = os.getcwd() + "/../resources/table_orders.csv" 12 | result_file = "/tmp/table_where_streaming.csv" 13 | if os.path.exists(result_file): 14 | os.remove(result_file) 15 | st_env.register_table_source("Orders", 16 | CsvTableSource(source_file, 17 | ["a", "b", "c", "rowtime"], 18 | [DataTypes.STRING(), 19 | DataTypes.INT(), 20 | DataTypes.INT(), 21 | DataTypes.TIMESTAMP()])) 22 | st_env.register_table_sink("result", 23 | CsvTableSink(["a", "b", "c", "rowtime"], 24 | [DataTypes.STRING(), 25 | DataTypes.INT(), 26 | DataTypes.INT(), 27 | DataTypes.TIMESTAMP()], 28 | result_file)) 29 | orders = st_env.scan("Orders") 30 | result = orders.where("a === 'b'") 31 | result.insert_into("result") 32 | st_env.execute("where streaming") 33 | # cat /tmp/table_where_streaming.csv 34 | # b,2,2,2013-01-01 00:24:13.0 35 | # b,4,5,2013-01-01 01:24:13.0 36 | 37 | 38 | if __name__ == '__main__': 39 | where_streaming() 40 | 
-------------------------------------------------------------------------------- /table/user_case/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukess/pyflink-demo/7e8ae37b3cfb4a6a5560cae2a0d44cddee25e036/table/user_case/__init__.py -------------------------------------------------------------------------------- /table/user_case/pv_uv/README.md: -------------------------------------------------------------------------------- 1 | # pv_uv_demo 2 | This demo is to help users to use pyflink api to develop a pv/uv demo 3 | 4 | **contents** 5 | 6 | - [Quick Start](#quick-start) 7 | + [Setup](#setup) 8 | + [Requirements](#requirements) 9 | + [Install python2](#install-python2) 10 | + [Install pip](#install-pip) 11 | + [Install java 8](#install-java-8) 12 | + [Install maven](#install-maven) 13 | + [Build PyFlink](#build-pyflink) 14 | + [Prepare Kafka](#prepare-kafka) 15 | + [Prepare Derby](#prepare-derby) 16 | + [Install Dependency](#install-dependency) 17 | + [Prepare Data](#prepare-data) 18 | + [Run Demo](#run-the-demo) 19 | + [See the result](#see-the-result) 20 | 21 | ## Quick Start 22 | 23 | ### Setup 24 | 25 | #### Requirements 26 | 1. python2.7 or python3 27 | 2. pip 28 | 3. java 1.8 29 | 4. maven version >=3.3.0 30 | 31 | #### Install python2 32 | 33 | macOS 34 | ```shell 35 | /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" 36 | export PATH="/usr/local/bin:/usr/local/sbin:$PATH" 37 | brew install python@2 38 | ``` 39 | Ubuntu 40 | ```shell 41 | sudo apt install python-dev 42 | ``` 43 | 44 | #### Install pip 45 | 46 | macOS 47 | 48 | ```shell 49 | curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py 50 | python get-pip.py 51 | ``` 52 | 53 | Ubuntu 54 | ```shell 55 | sudo apt install python-pip 56 | ``` 57 | 58 | #### Install java 8 59 | 60 | [java download page](http://www.oracle.com/technetwork/java/javase/downloads/index.html) 61 | 62 | #### Install maven 63 | 64 | maven version >=3.3.0 65 | 66 | [download maven page](http://maven.apache.org/download.cgi) 67 | 68 | ```shell 69 | tar -xvf apache-maven-3.6.1-bin.tar.gz 70 | mv -rf apache-maven-3.6.1 /usr/local/ 71 | ``` 72 | configuration environment variables 73 | ```shell 74 | MAVEN_HOME=/usr/local/apache-maven-3.6.1 75 | export MAVEN_HOME 76 | export PATH=${PATH}:${MAVEN_HOME}/bin 77 | ``` 78 | 79 | 80 | ### Build PyFlink 81 | 82 | If you want to build a PyFlink package that can be used for pip installation, you need to build Flink jars first, as described in https://ci.apache.org/projects/flink/flink-docs-master/flinkDev/building.html 83 | 84 | ```shell 85 | mvn clean install -DskipTests -Dfast 86 | ``` 87 | 88 | Then you need to copy the jar package flink-sql-connector-kafka-0.11_*-SNAPSHOT.jar in the directory of flink-connectors/flink-sql-connector-kafka-0.11 89 | 90 | ```shell 91 | cp flink-connectors/flink-sql-connector-kafka-0.11/target/flink-sql-connector-kafka-0.11_*-SNAPSHOT.jar build-target/lib 92 | ``` 93 | 94 | Then you need to copy the jar package flink-jdbc_*-SNAPSHOT.jar in the directory of flink-connectors/flink-jdbc 95 | 96 | ```shell 97 | cp flink-connectors/flink-jdbc/target/flink-jdbc_*-SNAPSHOT.jar build-target/lib 98 | ``` 99 | 100 | Next you need to copy the jar package flink-json-*-SNAPSHOT-sql-jar.jar in the directory of flink-formats/flink-json 101 | 102 | ```shell 103 | cp flink-formats/flink-json/target/flink-json-*-SNAPSHOT-sql-jar.jar build-target/lib 104 | ``` 105 | 106 | 
Next, go to the root directory of the Flink source code and run this command to build the sdist and wheel packages: 107 | 108 | ```shell 109 | cd flink-python; python3 setup.py sdist bdist_wheel 110 | ``` 111 | 112 | The sdist and wheel packages can be found under `./flink-python/dist/`. Either of them can be used for pip installation, for example: 113 | 114 | ```shell 115 | pip install dist/*.tar.gz 116 | ``` 117 | 118 | ### Prepare Kafka 119 | Some demos use Kafka as the source, so you need to install and run Kafka on localhost. The version we use is kafka_2.11-0.11.0.3 (https://archive.apache.org/dist/kafka/0.11.0.3/kafka_2.11-0.11.0.3.tgz). 120 | You can use the following command to download it: 121 | 122 | ```shell 123 | wget https://archive.apache.org/dist/kafka/0.11.0.3/kafka_2.11-0.11.0.3.tgz 124 | ``` 125 | 126 | Then extract the tar package: 127 | 128 | ```shell 129 | tar zxvf kafka_2.11-0.11.0.3.tgz 130 | ``` 131 | 132 | ### Prepare Derby 133 | The pv/uv demo needs an upsert sink connector, and we choose Derby, so you need to install and run Derby on localhost. The version we use is db-derby-10.14.2.0-lib (http://apache.mirrors.pair.com//db/derby/db-derby-10.14.2.0/db-derby-10.14.2.0-lib.tar.gz). 134 | You can use the following command to download it: 135 | 136 | ```shell 137 | wget http://apache.mirrors.pair.com//db/derby/db-derby-10.14.2.0/db-derby-10.14.2.0-lib.tar.gz 138 | ``` 139 | 140 | Then extract the tar package: 141 | 142 | ```shell 143 | tar zxvf db-derby-10.14.2.0-lib.tar.gz 144 | ``` 145 | 146 | Next, start the Derby server: 147 | 148 | ```shell 149 | ./bin/startNetworkServer -h 0.0.0.0 150 | ``` 151 | 152 | Next, run ij in another terminal: 153 | 154 | ```shell 155 | ./bin/ij 156 | ``` 157 | 158 | Next, connect to the server from the ij interactive prompt: 159 | 160 | ```shell 161 | ij> connect 'jdbc:derby://localhost:1527/firstdb;create=true'; 162 | ``` 163 | 164 | Next, create the result table pv_uv_table in the ij terminal: 165 | 166 | ```shell 167 | ij> create table pv_uv_table(startTime TIMESTAMP,endTime TIMESTAMP,pv bigint,uv bigint); 168 | ``` 169 | 170 | Finally, put derby.jar, derbyclient.jar and derbytools.jar from the lib directory of the extracted Derby package into the Python directory site-packages/pyflink/lib 171 | 172 | ### Install Dependency 173 | Install the environment dependencies: 174 | 175 | ```shell 176 | pip install -r requirements.txt 177 | ``` 178 | 179 | ### Prepare Data 180 | First, replace the variable KAFKA_DIR in env.sh with your installed Kafka binary directory, for example: 181 | 182 | ```shell 183 | KAFKA_DIR=/Users/duanchen/Applications/kafka_2.11-0.11.0.3 184 | ``` 185 | 186 | Next, source create_data.sh: 187 | 188 | ```shell 189 | source create_data.sh 190 | ``` 191 | 192 | Next, start Kafka: 193 | 194 | ```shell 195 | start_kafka 196 | ``` 197 | 198 | Next, create the topic that will be used in the demo: 199 | 200 | ```shell 201 | create_kafka_topic 1 1 user_behavior 202 | ``` 203 | 204 | Finally, send the messages to the topic user_behavior: 205 | 206 | ```shell 207 | send_message user_behavior user_behavior.log 208 | ``` 209 | 210 | ## Run The Demo 211 | The demo code is in pv_uv_example.py; you can run it directly. 212 | 213 | ### See the result 214 | You can see the result in the ij terminal: 215 | 216 | ```shell 217 | ij> select * from pv_uv_table; 218 | STARTTIME |ENDTIME |PV |UV 219 | 
----------------------------------------------------------------------------------------------------- 220 | 2017-11-26 01:00:00.0 |2017-11-26 02:00:00.0 |47244 |30837 221 | 2017-11-26 02:00:00.0 |2017-11-26 03:00:00.0 |53902 |35261 222 | 2017-11-26 03:00:00.0 |2017-11-26 04:00:00.0 |53135 |35302 223 | 2017-11-26 04:00:00.0 |2017-11-26 05:00:00.0 |49863 |33537 224 | 2017-11-26 05:00:00.0 |2017-11-26 06:00:00.0 |54305 |35748 225 | 2017-11-26 06:00:00.0 |2017-11-26 07:00:00.0 |56718 |36934 226 | 2017-11-26 07:00:00.0 |2017-11-26 08:00:00.0 |58324 |37763 227 | 2017-11-26 08:00:00.0 |2017-11-26 09:00:00.0 |58672 |37961 228 | 229 | 8 rows selected 230 | ``` 231 | -------------------------------------------------------------------------------- /table/user_case/pv_uv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukess/pyflink-demo/7e8ae37b3cfb4a6a5560cae2a0d44cddee25e036/table/user_case/pv_uv/__init__.py -------------------------------------------------------------------------------- /table/user_case/pv_uv/create_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ################################################################################ 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | ################################################################################ 19 | source "$(dirname "$0")"/env.sh 20 | 21 | function check_kafka_dir_set { 22 | if [[ -z $KAFKA_DIR ]]; then 23 | echo "Failed to set KAFKA_DIR, you can check the code in env.sh" 24 | exit 1 25 | fi 26 | } 27 | 28 | function start_zookeeper { 29 | check_kafka_dir_set 30 | $KAFKA_DIR/bin/zookeeper-server-start.sh $KAFKA_DIR/config/zookeeper.properties & 31 | } 32 | 33 | function stop_zookeeper { 34 | check_kafka_dir_set 35 | $KAFKA_DIR/bin/zookeeper-server-stop.sh 36 | } 37 | 38 | function start_kafka_server { 39 | check_kafka_dir_set 40 | $KAFKA_DIR/bin/kafka-server-start.sh $KAFKA_DIR/config/server.properties & 41 | } 42 | 43 | function stop_kafka_server { 44 | check_kafka_dir_set 45 | $KAFKA_DIR/bin/kafka-server-stop.sh 46 | } 47 | 48 | function check_start { 49 | # zookeeper outputs the "Node does not exist" bit to stderr 50 | while [[ $($KAFKA_DIR/bin/zookeeper-shell.sh localhost:2181 get /brokers/ids/0 2>&1) =~ .*Node\ does\ not\ exist.* ]]; do 51 | echo "Waiting for broker..." 
52 | sleep 1 53 | done 54 | } 55 | 56 | function start_kafka { 57 | start_zookeeper 58 | start_kafka_server 59 | check_start 60 | } 61 | 62 | function stop_kafka { 63 | check_kafka_dir_set 64 | stop_kafka_server 65 | stop_zookeeper 66 | 67 | # Terminate Kafka process if it still exists 68 | PIDS=$(jps -vl | grep -i 'kafka\.Kafka' | grep java | grep -v grep | awk '{print $1}'|| echo "") 69 | 70 | if [ ! -z "$PIDS" ]; then 71 | kill -s TERM $PIDS || true 72 | fi 73 | 74 | # Terminate QuorumPeerMain process if it still exists 75 | PIDS=$(jps -vl | grep java | grep -i QuorumPeerMain | grep -v grep | awk '{print $1}'|| echo "") 76 | 77 | if [ ! -z "$PIDS" ]; then 78 | kill -s TERM $PIDS || true 79 | fi 80 | } 81 | 82 | function create_kafka_topic { 83 | check_kafka_dir_set 84 | $KAFKA_DIR/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor $1 --partitions $2 --topic $3 2>&1 >/dev/null 85 | } 86 | 87 | function drop_kafka_topic { 88 | check_kafka_dir_set 89 | $KAFKA_DIR/bin/kafka-topics.sh --delete --zookeeper localhost:2181 --topic $1 2>&1 >/dev/null 90 | sleep 1 91 | } 92 | 93 | function send_message { 94 | check_kafka_dir_set 95 | # batch produce to kafka 96 | $KAFKA_DIR/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic $1 < $2 97 | } 98 | 99 | function send_demo_message { 100 | check_kafka_dir_set 101 | send_messages_to_kafka '{"user_id": "543462", "item_id":"1715", "category_id": "1464116", "behavior": "pv", "ts": "2017-11-26T01:00:00Z"}' $1 102 | send_messages_to_kafka '{"user_id": "662867", "item_id":"2244074", "category_id": "1575622", "behavior": "pv", "ts": "2017-11-26T01:00:00Z"}' $1 103 | send_messages_to_kafka '{"user_id": "561558", "item_id":"3611281", "category_id": "965809", "behavior": "pv", "ts": "2017-11-26T01:00:00Z"}' $1 104 | } 105 | 106 | function send_messages_to_kafka { 107 | echo -e $1 | $KAFKA_DIR/bin/kafka-console-producer.sh --broker-list localhost:9092 --topic $2 108 | } 109 | 110 | # stop_kafka 111 | # start_kafka 112 | # drop_kafka_topic user_behavior 113 | # create_kafka_topic 1 1 user_behavior 114 | # send_message user_behavior user_behavior.log 115 | # send_demo_message user_behavior 116 | # stop_kafka 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /table/user_case/pv_uv/env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ################################################################################ 3 | # Licensed to the Apache Software Foundation (ASF) under one 4 | # or more contributor license agreements. See the NOTICE file 5 | # distributed with this work for additional information 6 | # regarding copyright ownership. The ASF licenses this file 7 | # to you under the Apache License, Version 2.0 (the 8 | # "License"); you may not use this file except in compliance 9 | # with the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | ################################################################################ 19 | 20 | KAFKA_DIR=/Users/duanchen/Applications/kafka_2.11-0.11.0.3 21 | -------------------------------------------------------------------------------- /table/user_case/pv_uv/pv_uv_example.py: -------------------------------------------------------------------------------- 1 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 2 | from pyflink.table import StreamTableEnvironment, DataTypes, EnvironmentSettings 3 | from pyflink.table.descriptors import CustomConnectorDescriptor, Schema, Kafka, Json, Rowtime 4 | from pyflink.table.window import Tumble 5 | 6 | 7 | def pv_uv_demo(): 8 | s_env = StreamExecutionEnvironment.get_execution_environment() 9 | s_env.set_stream_time_characteristic(TimeCharacteristic.EventTime) 10 | s_env.set_parallelism(1) 11 | # use blink table planner 12 | st_env = StreamTableEnvironment.create(s_env, environment_settings=EnvironmentSettings.new_instance() 13 | .in_streaming_mode().use_blink_planner().build()) 14 | # use flink table planner 15 | # st_env = StreamTableEnvironment.create(s_env) 16 | st_env \ 17 | .connect( # declare the external system to connect to 18 | Kafka() 19 | .version("0.11") 20 | .topic("user_behavior") 21 | .start_from_earliest() 22 | .property("zookeeper.connect", "localhost:2181") 23 | .property("bootstrap.servers", "localhost:9092") 24 | ) \ 25 | .with_format( # declare a format for this system 26 | Json() 27 | .fail_on_missing_field(True) 28 | .json_schema( 29 | "{" 30 | " type: 'object'," 31 | " properties: {" 32 | " user_id: {" 33 | " type: 'string'" 34 | " }," 35 | " item_id: {" 36 | " type: 'string'" 37 | " }," 38 | " category_id: {" 39 | " type: 'string'" 40 | " }," 41 | " behavior: {" 42 | " type: 'string'" 43 | " }," 44 | " ts: {" 45 | " type: 'string'," 46 | " format: 'date-time'" 47 | " }" 48 | " }" 49 | "}" 50 | ) 51 | ) \ 52 | .with_schema( # declare the schema of the table 53 | Schema() 54 | .field("user_id", DataTypes.STRING()) 55 | .field("item_id", DataTypes.STRING()) 56 | .field("category_id", DataTypes.STRING()) 57 | .field("behavior", DataTypes.STRING()) 58 | .field("rowtime", DataTypes.TIMESTAMP()) 59 | .rowtime( 60 | Rowtime() 61 | .timestamps_from_field("ts") 62 | .watermarks_periodic_bounded(60000)) 63 | ) \ 64 | .in_append_mode() \ 65 | .register_table_source("source") 66 | 67 | # use custom retract sink connector 68 | custom_connector = CustomConnectorDescriptor('jdbc', 1, False) \ 69 | .property("connector.driver", "org.apache.derby.jdbc.ClientDriver") \ 70 | .property("connector.url", "jdbc:derby://localhost:1527/firstdb") \ 71 | .property("connector.table", "pv_uv_table") \ 72 | .property("connector.write.flush.max-rows", "1") 73 | st_env.connect(custom_connector) \ 74 | .with_schema( 75 | Schema() 76 | .field("startTime", DataTypes.TIMESTAMP()) 77 | .field("endTime", DataTypes.TIMESTAMP()) 78 | .field("pv", DataTypes.BIGINT()) 79 | .field("uv", DataTypes.BIGINT()) 80 | ).register_table_sink("sink") 81 | 82 | st_env.scan("source").window(Tumble.over("1.hours").on("rowtime").alias("w")) \ 83 | .group_by("w") \ 84 | .select("w.start as startTime, w.end as endTime, COUNT(1) as pv, user_id.count.distinct as uv").insert_into("sink") 85 | 86 | st_env.execute("table pv uv") 87 | 88 | 89 | if __name__ == '__main__': 90 | pv_uv_demo() 91 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/CustomTableSourceDemo.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, TimeCharacteristic 4 | from pyflink.table import StreamTableEnvironment, DataTypes, CsvTableSink 5 | from pyflink.table.descriptors import Schema, CustomFormatDescriptor, CustomConnectorDescriptor, Json 6 | from pyflink.table.window import Tumble 7 | 8 | 9 | def custom_kafka_source_demo(): 10 | custom_connector = CustomConnectorDescriptor('kafka', 1, True) \ 11 | .property('connector.topic', 'user') \ 12 | .property('connector.properties.0.key', 'zookeeper.connect') \ 13 | .property('connector.properties.0.value', 'localhost:2181') \ 14 | .property('connector.properties.1.key', 'bootstrap.servers') \ 15 | .property('connector.properties.1.value', 'localhost:9092') \ 16 | .properties({'connector.version': '0.11', 'connector.startup-mode': 'earliest-offset'}) 17 | 18 | # the key is 'format.json-schema' 19 | custom_format = CustomFormatDescriptor('json', 1) \ 20 | .property('format.json-schema', 21 | "{" 22 | " type: 'object'," 23 | " properties: {" 24 | " a: {" 25 | " type: 'string'" 26 | " }," 27 | " b: {" 28 | " type: 'string'" 29 | " }," 30 | " c: {" 31 | " type: 'string'" 32 | " }," 33 | " time: {" 34 | " type: 'string'," 35 | " format: 'date-time'" 36 | " }" 37 | " }" 38 | "}") \ 39 | .properties({'format.fail-on-missing-field': 'true'}) 40 | 41 | s_env = StreamExecutionEnvironment.get_execution_environment() 42 | s_env.set_parallelism(1) 43 | s_env.set_stream_time_characteristic(TimeCharacteristic.ProcessingTime) 44 | st_env = StreamTableEnvironment.create(s_env) 45 | result_file = "/tmp/custom_kafka_source_demo.csv" 46 | if os.path.exists(result_file): 47 | os.remove(result_file) 48 | st_env \ 49 | .connect(custom_connector) \ 50 | .with_format( 51 | custom_format 52 | ) \ 53 | .with_schema( # declare the schema of the table 54 | Schema() 55 | .field("proctime", DataTypes.TIMESTAMP()) 56 | .proctime() 57 | .field("a", DataTypes.STRING()) 58 | .field("b", DataTypes.STRING()) 59 | .field("c", DataTypes.STRING()) 60 | ) \ 61 | .in_append_mode() \ 62 | .register_table_source("source") 63 | 64 | st_env.register_table_sink("result", 65 | CsvTableSink(["a", "b"], 66 | [DataTypes.STRING(), 67 | DataTypes.STRING()], 68 | result_file)) 69 | 70 | st_env.scan("source").window(Tumble.over("2.rows").on("proctime").alias("w")) \ 71 | .group_by("w, a") \ 72 | .select("a, max(b)").insert_into("result") 73 | 74 | st_env.execute("custom kafka source demo") 75 | # cat /tmp/custom_kafka_source_demo.csv 76 | # a,3 77 | # b,4 78 | # a 5 79 | 80 | 81 | def custom_test_source_demo(): 82 | s_env = StreamExecutionEnvironment.get_execution_environment() 83 | s_env.set_parallelism(1) 84 | st_env = StreamTableEnvironment.create(s_env) 85 | result_file = "/tmp/custom_test_source_demo.csv" 86 | if os.path.exists(result_file): 87 | os.remove(result_file) 88 | custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False) 89 | st_env.connect(custom_connector) \ 90 | .with_schema( 91 | Schema() 92 | .field("a", DataTypes.STRING()) 93 | ).register_table_source("source") 94 | 95 | st_env.register_table_sink("result", 96 | CsvTableSink(["a"], 97 | [DataTypes.STRING()], 98 | result_file)) 99 | orders = st_env.scan("source") 100 | orders.insert_into("result") 101 | st_env.execute("custom test source demo") 102 | # cat /tmp/custom_test_source_demo.csv 103 | # haha 104 | # haha 105 | # haha 106 | 107 | 108 | def custom_test_sink_demo(): 109 | s_env = 
StreamExecutionEnvironment.get_execution_environment() 110 | s_env.set_parallelism(1) 111 | st_env = StreamTableEnvironment.create(s_env) 112 | left = st_env.from_elements( 113 | [(1, "1a", "1laa"), (2, "2a", "2aa"), (3, None, "3aa"), (2, "4b", "4bb"), (5, "5a", "5aa")], 114 | ["a", "b", "c"]).select("a, b, c") 115 | right = st_env.from_elements([(1, "1b", "1bb"), (2, None, "2bb"), (1, "3b", "3bb"), (4, "4b", "4bb")], 116 | ["d", "e", "f"]).select("d, e, f") 117 | 118 | result = left.left_outer_join(right, "a = d").select("a, b, e") 119 | # use custom retract sink connector 120 | custom_connector = CustomConnectorDescriptor('pyflink-test', 1, False) 121 | st_env.connect(custom_connector) \ 122 | .with_schema( 123 | Schema() 124 | .field("a", DataTypes.BIGINT()) 125 | .field("b", DataTypes.STRING()) 126 | .field("c", DataTypes.STRING()) 127 | ).register_table_sink("sink") 128 | result.insert_into("sink") 129 | st_env.execute("custom test sink demo") 130 | # (true, 1, 1a, null) 131 | # (true, 2, 2a, null) 132 | # (true, 3, null, null) 133 | # (true, 2, 4b, null) 134 | # (true, 5, 5a, null) 135 | # (false, 1, 1a, null) 136 | # (true, 1, 1a, 1b) 137 | # (false, 2, 4b, null) 138 | # (true, 2, 4b, null) 139 | # (false, 2, 2a, null) 140 | # (true, 2, 2a, null) 141 | # (true, 1, 1a, 3b) 142 | 143 | 144 | if __name__ == '__main__': 145 | # from table.prepare_environment import prepare_env 146 | # prepare_env(need_stream_source=True) 147 | # custom_kafka_source_demo() 148 | # custom_test_sink_demo() 149 | custom_test_source_demo() 150 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/README.md: -------------------------------------------------------------------------------- 1 | # User-defined Sources & Sinks 2 | This page helps users to custom create sources & sinks 3 | 4 | ## Build Sources & Sinks 5 | 6 | ### Custom Sink 7 | The example of custom restract table sink lives in sinks module. You need to build this code: 8 | 9 | ```shell 10 | cd sinks; mvn clean package 11 | ``` 12 | 13 | 1. put jar(source or sink jar) into Python site-packages/pyflink/lib directory 14 | 15 | 2. 
create your python code wrapped the java class(you can refer to TestRetractSink.py) 16 | 17 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukess/pyflink-demo/7e8ae37b3cfb4a6a5560cae2a0d44cddee25e036/table/user_defined_sources_and_sinks/__init__.py -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 20 | 22 | 23 | 4.0.0 24 | 25 | org.apache.flink.table 26 | user-defined-connectors 27 | 1.0 28 | 29 | jar 30 | 31 | 32 | 1.9.0 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | org.apache.flink 41 | flink-core 42 | ${table.version} 43 | provided 44 | 45 | 46 | org.apache.flink 47 | flink-java 48 | ${table.version} 49 | provided 50 | 51 | 52 | org.apache.flink 53 | flink-streaming-java_2.11 54 | ${table.version} 55 | provided 56 | 57 | 58 | org.apache.flink 59 | flink-table-common 60 | ${table.version} 61 | provided 62 | 63 | 64 | org.apache.flink 65 | flink-table-planner_2.11 66 | ${table.version} 67 | provided 68 | 69 | 70 | 71 | 72 | 73 | org.apache.maven.plugins 74 | maven-compiler-plugin 75 | 3.1 76 | 77 | 1.8 78 | 1.8 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/src/main/java/com/pyflink/table/factory/TestTableFactory.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table.factory; 2 | 3 | import com.pyflink.table.sinks.TestRetractSink; 4 | import com.pyflink.table.sources.TestSource; 5 | import org.apache.flink.api.java.tuple.Tuple2; 6 | import org.apache.flink.api.java.typeutils.RowTypeInfo; 7 | import org.apache.flink.table.api.TableSchema; 8 | import org.apache.flink.table.descriptors.DescriptorProperties; 9 | import org.apache.flink.table.descriptors.SchemaValidator; 10 | import org.apache.flink.table.factories.StreamTableSinkFactory; 11 | import org.apache.flink.table.factories.StreamTableSourceFactory; 12 | import org.apache.flink.table.sinks.StreamTableSink; 13 | import org.apache.flink.table.sources.StreamTableSource; 14 | import org.apache.flink.types.Row; 15 | 16 | import java.util.ArrayList; 17 | import java.util.HashMap; 18 | import java.util.List; 19 | import java.util.Map; 20 | 21 | import static org.apache.flink.table.descriptors.ConnectorDescriptorValidator.CONNECTOR_PROPERTY_VERSION; 22 | import static org.apache.flink.table.descriptors.ConnectorDescriptorValidator.CONNECTOR_TYPE; 23 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_TIMESTAMPS_CLASS; 24 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_TIMESTAMPS_FROM; 25 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_TIMESTAMPS_SERIALIZED; 26 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_TIMESTAMPS_TYPE; 27 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_WATERMARKS_CLASS; 28 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_WATERMARKS_DELAY; 29 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_WATERMARKS_SERIALIZED; 30 | import static org.apache.flink.table.descriptors.Rowtime.ROWTIME_WATERMARKS_TYPE; 31 | import static org.apache.flink.table.descriptors.Schema.SCHEMA; 32 | import 
static org.apache.flink.table.descriptors.Schema.SCHEMA_FROM; 33 | import static org.apache.flink.table.descriptors.Schema.SCHEMA_NAME; 34 | import static org.apache.flink.table.descriptors.Schema.SCHEMA_PROCTIME; 35 | import static org.apache.flink.table.descriptors.Schema.SCHEMA_TYPE; 36 | 37 | public class TestTableFactory implements StreamTableSourceFactory, StreamTableSinkFactory> { 38 | @Override 39 | public StreamTableSink> createStreamTableSink(Map map) { 40 | DescriptorProperties params = new DescriptorProperties(true); 41 | params.putProperties(map); 42 | new SchemaValidator(true, true, true).validate(params); 43 | TableSchema tableSchema = params.getTableSchema(SCHEMA); 44 | TestRetractSink sink = new TestRetractSink(); 45 | return (StreamTableSink>) sink.configure(tableSchema.getFieldNames(), tableSchema.getFieldTypes()); 46 | } 47 | 48 | @Override 49 | public StreamTableSource createStreamTableSource(Map map) { 50 | DescriptorProperties params = new DescriptorProperties(true); 51 | params.putProperties(map); 52 | new SchemaValidator(true, true, true).validate(params); 53 | TableSchema tableSchema = params.getTableSchema(SCHEMA); 54 | return new TestSource(tableSchema, new RowTypeInfo(tableSchema.getFieldTypes(), tableSchema.getFieldNames())); 55 | } 56 | 57 | @Override 58 | public Map requiredContext() { 59 | Map context = new HashMap<>(); 60 | context.put(CONNECTOR_TYPE, "pyflink-test"); 61 | context.put(CONNECTOR_PROPERTY_VERSION, "1"); 62 | return context; 63 | } 64 | 65 | @Override 66 | public List supportedProperties() { 67 | List properties = new ArrayList<>(); 68 | 69 | // schema 70 | properties.add(SCHEMA + ".#." + SCHEMA_TYPE); 71 | properties.add(SCHEMA + ".#." + SCHEMA_NAME); 72 | properties.add(SCHEMA + ".#." + SCHEMA_FROM); 73 | 74 | // time attributes 75 | properties.add(SCHEMA + ".#." + SCHEMA_PROCTIME); 76 | properties.add(SCHEMA + ".#." + ROWTIME_TIMESTAMPS_TYPE); 77 | properties.add(SCHEMA + ".#." + ROWTIME_TIMESTAMPS_FROM); 78 | properties.add(SCHEMA + ".#." + ROWTIME_TIMESTAMPS_CLASS); 79 | properties.add(SCHEMA + ".#." + ROWTIME_TIMESTAMPS_SERIALIZED); 80 | properties.add(SCHEMA + ".#." + ROWTIME_WATERMARKS_TYPE); 81 | properties.add(SCHEMA + ".#." + ROWTIME_WATERMARKS_CLASS); 82 | properties.add(SCHEMA + ".#." + ROWTIME_WATERMARKS_SERIALIZED); 83 | properties.add(SCHEMA + ".#." 
+ ROWTIME_WATERMARKS_DELAY); 84 | return properties; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/src/main/java/com/pyflink/table/sinks/TestRetractSink.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table.sinks; 2 | 3 | import org.apache.flink.api.common.typeinfo.TypeInformation; 4 | import org.apache.flink.api.java.tuple.Tuple2; 5 | import org.apache.flink.api.java.typeutils.RowTypeInfo; 6 | import org.apache.flink.streaming.api.datastream.DataStream; 7 | import org.apache.flink.streaming.api.datastream.DataStreamSink; 8 | import org.apache.flink.streaming.api.functions.sink.SinkFunction; 9 | import org.apache.flink.table.sinks.RetractStreamTableSink; 10 | import org.apache.flink.table.sinks.TableSink; 11 | import org.apache.flink.types.Row; 12 | 13 | public class TestRetractSink implements RetractStreamTableSink { 14 | 15 | String[] fNames; 16 | TypeInformation[] fTypes; 17 | 18 | @Override 19 | public TypeInformation getRecordType() { 20 | return new RowTypeInfo(fTypes, fNames); 21 | } 22 | 23 | @Override 24 | public String[] getFieldNames() { 25 | return fNames; 26 | } 27 | 28 | @Override 29 | public TypeInformation[] getFieldTypes() { 30 | return fTypes; 31 | } 32 | 33 | @Override 34 | public void emitDataStream(DataStream> dataStream) { 35 | consumeDataStream(dataStream); 36 | } 37 | 38 | @Override 39 | public DataStreamSink consumeDataStream(DataStream> dataStream) { 40 | return dataStream.addSink(new RowSink()); 41 | } 42 | 43 | @Override 44 | public TableSink> configure(String[] fNames, TypeInformation[] fTypes) { 45 | TestRetractSink copy = new TestRetractSink(); 46 | copy.fNames = fNames; 47 | copy.fTypes = fTypes; 48 | return copy; 49 | } 50 | 51 | private static class RowSink implements SinkFunction> { 52 | @Override 53 | public void invoke(Tuple2 value) throws Exception { 54 | System.out.println(value); 55 | } 56 | } 57 | } 58 | 59 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/src/main/java/com/pyflink/table/sources/TestSource.java: -------------------------------------------------------------------------------- 1 | package com.pyflink.table.sources; 2 | 3 | import org.apache.flink.api.common.typeinfo.TypeInformation; 4 | import org.apache.flink.streaming.api.datastream.DataStream; 5 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 6 | import org.apache.flink.table.api.TableSchema; 7 | import org.apache.flink.table.sources.StreamTableSource; 8 | import org.apache.flink.types.Row; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | public class TestSource implements StreamTableSource { 14 | 15 | TableSchema schema; 16 | TypeInformation returnType; 17 | 18 | public TestSource(TableSchema tableSchema, TypeInformation returnType) { 19 | this.schema = tableSchema; 20 | this.returnType = returnType; 21 | } 22 | 23 | @Override 24 | public DataStream getDataStream(StreamExecutionEnvironment env) { 25 | Row r1 = new Row(1); 26 | r1.setField(0, "haha"); 27 | Row r2 = new Row(1); 28 | r2.setField(0, "haha"); 29 | Row r3 = new Row(1); 30 | r3.setField(0, "haha"); 31 | List data = new ArrayList<>(); 32 | data.add(r1); 33 | data.add(r2); 34 | data.add(r3); 35 | return env.fromCollection(data, returnType); 36 | } 37 | 38 | @Override 39 | public TableSchema getTableSchema() { 40 | return schema; 41 | } 42 | 43 
| @Override 44 | public TypeInformation getReturnType() { 45 | return returnType; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /table/user_defined_sources_and_sinks/src/main/resources/META-INF/services/org.apache.flink.table.factories.TableFactory: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | com.pyflink.table.factory.TestTableFactory -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mukess/pyflink-demo/7e8ae37b3cfb4a6a5560cae2a0d44cddee25e036/utils/__init__.py -------------------------------------------------------------------------------- /utils/elastic_search_utils.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | 3 | 4 | def create_index(index='test', body=''): 5 | es = Elasticsearch() 6 | es.indices.create(index=index, ignore=400, body=body) 7 | 8 | 9 | def add_update_data(index, doc_type, id, body): 10 | es = Elasticsearch() 11 | es.index(index=index, doc_type=doc_type, id=id, body=body) 12 | 13 | 14 | def get_data(index, doc_type, id): 15 | es = Elasticsearch() 16 | return es.get(index=index, doc_type=doc_type, id=id)['_source'] 17 | 18 | 19 | def get_all_data(index, doc_type='_all'): 20 | es = Elasticsearch() 21 | if index is not None: 22 | data = es.search(index=index, doc_type=doc_type) 23 | return data 24 | 25 | 26 | def delete_index(index): 27 | es = Elasticsearch() 28 | if index is not None: 29 | es.indices.delete(index=index, ignore=[400, 404]) 30 | 31 | 32 | if __name__ == '__main__': 33 | create_index('user') 34 | # from datetime import datetime 35 | # 36 | # body = {"any": "data", "timestamp": datetime.now()} 37 | # add_update_data(index='test', doc_type='person', id=1, body=body) 38 | # import time 39 | # time.sleep(1) 40 | # print(get_all_data('test', 'person')) 41 | # delete_index('user') 42 | -------------------------------------------------------------------------------- /utils/kafka_utils.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaProducer 2 | from kafka import KafkaAdminClient 3 | from kafka import KafkaConsumer 4 | from kafka.admin import NewTopic 5 | import json 6 | 7 | 8 | def send_msg(topic='test', msg=None): 9 | producer = KafkaProducer(bootstrap_servers='localhost:9092', 10 | value_serializer=lambda v: json.dumps(v).encode('utf-8')) 11 | if msg is not None: 12 | future = producer.send(topic, msg) 13 | future.get() 14 | 15 | 16 | 
def get_msg(topic='test'): 17 | consumer = KafkaConsumer(topic, auto_offset_reset='earliest') 18 | for message in consumer: 19 | print(message) 20 | 21 | 22 | def list_topics(): 23 | global_consumer = KafkaConsumer(bootstrap_servers='localhost:9092') 24 | topics = global_consumer.topics() 25 | return topics 26 | 27 | 28 | def create_topic(topic='test'): 29 | admin = KafkaAdminClient(bootstrap_servers='localhost:9092') 30 | topics = list_topics() 31 | if topic not in topics: 32 | topic_obj = NewTopic(topic, 1, 1) 33 | admin.create_topics(new_topics=[topic_obj]) 34 | 35 | 36 | if __name__ == '__main__': 37 | print(list_topics()) 38 | # msg = {'user': 'flink', 'message': 'Hello Message', 'time': '2013-01-01T00:14:13Z'} 39 | # msg = {'user': 'flink', 'message': 'Hello Message', 'time': '1990-10-14T12:12:43Z'} 40 | # send_msg('test', msg) 41 | # get_msg('user') 42 | --------------------------------------------------------------------------------
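The helpers in `utils/kafka_utils.py` are enough to seed the `user` topic that the streaming window demos consume (the demos themselves call `table.prepare_environment.prepare_env` for their setup). Below is a minimal hedged usage sketch, assuming the repository root is on `PYTHONPATH` and Kafka is running on `localhost:9092`; the concrete values are made up, but the field names `a`, `b`, `c` and `time` match the JSON schema declared in the window examples.

```python
from utils.kafka_utils import create_topic, list_topics, send_msg

if __name__ == '__main__':
    # create the topic consumed by the streaming window demos (skipped if it already exists)
    create_topic('user')
    print(list_topics())

    # send a few JSON records; the demos' JSON schema expects string fields
    # a, b, c plus an ISO-8601 'time' field that is used as the rowtime
    for i, key in enumerate(['a', 'b', 'a'], start=1):
        msg = {'a': key, 'b': str(i), 'c': str(i),
               'time': '2013-01-01T00:%02d:13Z' % (14 + 10 * i)}
        send_msg('user', msg)
```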