├── .gitignore ├── README.md ├── pom.xml └── src └── main ├── java └── myflink │ ├── BatchJob.java │ ├── HotItems.java │ ├── SocketWindowWordCount.java │ └── StreamingJob.java └── resources ├── UserBehavior.csv └── log4j.properties /.gitignore: -------------------------------------------------------------------------------- 1 | .cache 2 | scalastyle-output.xml 3 | .classpath 4 | .idea 5 | .metadata 6 | .settings 7 | .project 8 | .version.properties 9 | filter.properties 10 | logs.zip 11 | target 12 | tmp 13 | *.class 14 | *.iml 15 | *.swp 16 | *.jar 17 | *.log 18 | .DS_Store 19 | build-target 20 | flink-end-to-end-tests/flink-datastream-allround-test/src/main/java/org/apache/flink/streaming/tests/avro/ 21 | flink-formats/flink-avro/src/test/java/org/apache/flink/formats/avro/generated/ 22 | flink-runtime-web/web-dashboard/assets/fonts/ 23 | flink-runtime-web/web-dashboard/node_modules/ 24 | flink-runtime-web/web-dashboard/bower_components/ 25 | atlassian-ide-plugin.xml 26 | out/ 27 | /docs/api 28 | /docs/content 29 | /docs/.bundle 30 | /docs/.rubydeps 31 | /docs/ruby2/.bundle 32 | /docs/ruby2/.rubydeps 33 | /docs/.jekyll-metadata 34 | *.ipr 35 | *.iws 36 | tools/flink 37 | tools/flink-* 38 | tools/releasing/release 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 本项目主要用于我最近写的几篇 《Flink 入门》系列文章的示例工程代码。并不是直接生产可用的代码。 2 | 3 | 如果你对《Flink 入门》系列比较感兴趣,欢迎关注我的博客:http://wuchong.me/tags/Flink入门/ 4 | 5 | 也可以关注我的个人微信公众号: 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 19 | 21 | 4.0.0 22 | 23 | my-flink-project 24 | my-flink-project 25 | 0.1 26 | jar 27 | 28 | Flink Quickstart Job 29 | http://www.myorganization.org 30 | 31 | 32 | UTF-8 33 | 1.6.1 34 | 1.8 35 | 2.11 36 | ${java.version} 37 | ${java.version} 38 | 39 | 40 | 41 | 42 | apache.snapshots 43 | Apache Development Snapshot Repository 44 | https://repository.apache.org/content/repositories/snapshots/ 45 | 46 | false 47 | 48 | 49 | true 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | org.apache.flink 59 | flink-java 60 | ${flink.version} 61 | provided 62 | 63 | 64 | org.apache.flink 65 | flink-streaming-java_${scala.binary.version} 66 | ${flink.version} 67 | provided 68 | 69 | 70 | 71 | 72 | 80 | 81 | 82 | 83 | 84 | org.slf4j 85 | slf4j-log4j12 86 | 1.7.7 87 | runtime 88 | 89 | 90 | log4j 91 | log4j 92 | 1.2.17 93 | runtime 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | org.apache.maven.plugins 103 | maven-compiler-plugin 104 | 3.1 105 | 106 | ${java.version} 107 | ${java.version} 108 | 109 | 110 | 111 | 112 | 113 | 114 | org.apache.maven.plugins 115 | maven-shade-plugin 116 | 3.0.0 117 | 118 | 119 | 120 | package 121 | 122 | shade 123 | 124 | 125 | 126 | 127 | org.apache.flink:force-shading 128 | com.google.code.findbugs:jsr305 129 | org.slf4j:* 130 | log4j:* 131 | 132 | 133 | 134 | 135 | 137 | *:* 138 | 139 | META-INF/*.SF 140 | META-INF/*.DSA 141 | META-INF/*.RSA 142 | 143 | 144 | 145 | 146 | 147 | myflink.StreamingJob 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | org.eclipse.m2e 162 | lifecycle-mapping 163 | 1.0.0 164 | 165 | 166 | 167 | 168 | 169 | org.apache.maven.plugins 170 | maven-shade-plugin 171 | [3.0.0,) 172 | 173 | shade 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | org.apache.maven.plugins 183 | maven-compiler-plugin 184 | [3.1,) 185 | 186 | testCompile 187 | compile 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | add-dependencies-for-IDEA 208 | 209 | 210 | 211 | idea.version 212 | 213 | 214 | 215 | 216 | 217 | org.apache.flink 218 | flink-java 219 | ${flink.version} 220 | compile 221 | 222 | 223 | org.apache.flink 224 | flink-streaming-java_${scala.binary.version} 225 | ${flink.version} 226 | compile 227 | 228 | 229 | 230 | 231 | 232 | 233 | -------------------------------------------------------------------------------- /src/main/java/myflink/BatchJob.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package myflink; 20 | 21 | import org.apache.flink.api.java.ExecutionEnvironment; 22 | 23 | /** 24 | * Skeleton for a Flink Batch Job. 25 | * 26 | *

For a tutorial how to write a Flink batch application, check the 27 | * tutorials and examples on the Flink Website. 28 | * 29 | *

To package your application into a JAR file for execution, 30 | * change the main class in the POM.xml file to this class (simply search for 'mainClass') 31 | * and run 'mvn clean package' on the command line. 32 | */ 33 | public class BatchJob { 34 | 35 | public static void main(String[] args) throws Exception { 36 | // set up the batch execution environment 37 | final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); 38 | 39 | /* 40 | * Here, you can start creating your execution plan for Flink. 41 | * 42 | * Start with getting some data from the environment, like 43 | * env.readTextFile(textPath); 44 | * 45 | * then, transform the resulting DataSet using operations 46 | * like 47 | * .filter() 48 | * .flatMap() 49 | * .join() 50 | * .coGroup() 51 | * 52 | * and many more. 53 | * Have a look at the programming guide for the Java API: 54 | * 55 | * http://flink.apache.org/docs/latest/apis/batch/index.html 56 | * 57 | * and the examples 58 | * 59 | * http://flink.apache.org/docs/latest/apis/batch/examples.html 60 | * 61 | */ 62 | 63 | // execute program 64 | env.execute("Flink Batch Java API Skeleton"); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/myflink/HotItems.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package myflink; 19 | 20 | import org.apache.flink.api.common.functions.AggregateFunction; 21 | import org.apache.flink.api.common.functions.FilterFunction; 22 | import org.apache.flink.api.common.state.ListState; 23 | import org.apache.flink.api.common.state.ListStateDescriptor; 24 | import org.apache.flink.api.java.io.PojoCsvInputFormat; 25 | import org.apache.flink.api.java.tuple.Tuple; 26 | import org.apache.flink.api.java.tuple.Tuple1; 27 | import org.apache.flink.api.java.typeutils.PojoTypeInfo; 28 | import org.apache.flink.api.java.typeutils.TypeExtractor; 29 | import org.apache.flink.configuration.Configuration; 30 | import org.apache.flink.core.fs.Path; 31 | import org.apache.flink.streaming.api.TimeCharacteristic; 32 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 33 | import org.apache.flink.streaming.api.functions.KeyedProcessFunction; 34 | import org.apache.flink.streaming.api.functions.timestamps.AscendingTimestampExtractor; 35 | import org.apache.flink.streaming.api.functions.windowing.WindowFunction; 36 | import org.apache.flink.streaming.api.windowing.time.Time; 37 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow; 38 | import org.apache.flink.util.Collector; 39 | 40 | import java.io.File; 41 | import java.net.URL; 42 | import java.sql.Timestamp; 43 | import java.util.ArrayList; 44 | import java.util.Comparator; 45 | import java.util.List; 46 | 47 | public class HotItems { 48 | 49 | public static void main(String[] args) throws Exception { 50 | 51 | // 创建 execution environment 52 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 53 | // 告诉系统按照 EventTime 处理 54 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); 55 | // 为了打印到控制台的结果不乱序,我们配置全局的并发为1,改变并发对结果正确性没有影响 56 | env.setParallelism(1); 57 | 58 | // UserBehavior.csv 的本地文件路径, 在 resources 目录下 59 | URL fileUrl = HotItems.class.getClassLoader().getResource("UserBehavior.csv"); 60 | Path filePath = Path.fromLocalFile(new File(fileUrl.toURI())); 61 | // 抽取 UserBehavior 的 TypeInformation,是一个 PojoTypeInfo 62 | PojoTypeInfo pojoType = (PojoTypeInfo) TypeExtractor.createTypeInfo(UserBehavior.class); 63 | // 由于 Java 反射抽取出的字段顺序是不确定的,需要显式指定下文件中字段的顺序 64 | String[] fieldOrder = new String[]{"userId", "itemId", "categoryId", "behavior", "timestamp"}; 65 | // 创建 PojoCsvInputFormat 66 | PojoCsvInputFormat csvInput = new PojoCsvInputFormat<>(filePath, pojoType, fieldOrder); 67 | 68 | 69 | env 70 | // 创建数据源,得到 UserBehavior 类型的 DataStream 71 | .createInput(csvInput, pojoType) 72 | // 抽取出时间和生成 watermark 73 | .assignTimestampsAndWatermarks(new AscendingTimestampExtractor() { 74 | @Override 75 | public long extractAscendingTimestamp(UserBehavior userBehavior) { 76 | // 原始数据单位秒,将其转成毫秒 77 | return userBehavior.timestamp * 1000; 78 | } 79 | }) 80 | // 过滤出只有点击的数据 81 | .filter(new FilterFunction() { 82 | @Override 83 | public boolean filter(UserBehavior userBehavior) throws Exception { 84 | // 过滤出只有点击的数据 85 | return userBehavior.behavior.equals("pv"); 86 | } 87 | }) 88 | .keyBy("itemId") 89 | .timeWindow(Time.minutes(60), Time.minutes(5)) 90 | .aggregate(new CountAgg(), new WindowResultFunction()) 91 | .keyBy("windowEnd") 92 | .process(new TopNHotItems(3)) 93 | .print(); 94 | 95 | env.execute("Hot Items Job"); 96 | } 97 | 98 | /** 求某个窗口中前 N 名的热门点击商品,key 为窗口时间戳,输出为 TopN 的结果字符串 */ 99 | public static class TopNHotItems extends KeyedProcessFunction { 100 | 101 | private final int topSize; 102 | 103 | public TopNHotItems(int topSize) { 104 | this.topSize = topSize; 105 | } 106 | 107 | // 用于存储商品与点击数的状态,待收齐同一个窗口的数据后,再触发 TopN 计算 108 | private ListState itemState; 109 | 110 | @Override 111 | public void open(Configuration parameters) throws Exception { 112 | super.open(parameters); 113 | ListStateDescriptor itemsStateDesc = new ListStateDescriptor<>( 114 | "itemState-state", 115 | ItemViewCount.class); 116 | itemState = getRuntimeContext().getListState(itemsStateDesc); 117 | } 118 | 119 | @Override 120 | public void processElement( 121 | ItemViewCount input, 122 | Context context, 123 | Collector collector) throws Exception { 124 | 125 | // 每条数据都保存到状态中 126 | itemState.add(input); 127 | // 注册 windowEnd+1 的 EventTime Timer, 当触发时,说明收齐了属于windowEnd窗口的所有商品数据 128 | context.timerService().registerEventTimeTimer(input.windowEnd + 1); 129 | } 130 | 131 | @Override 132 | public void onTimer( 133 | long timestamp, OnTimerContext ctx, Collector out) throws Exception { 134 | // 获取收到的所有商品点击量 135 | List allItems = new ArrayList<>(); 136 | for (ItemViewCount item : itemState.get()) { 137 | allItems.add(item); 138 | } 139 | // 提前清除状态中的数据,释放空间 140 | itemState.clear(); 141 | // 按照点击量从大到小排序 142 | allItems.sort(new Comparator() { 143 | @Override 144 | public int compare(ItemViewCount o1, ItemViewCount o2) { 145 | return (int) (o2.viewCount - o1.viewCount); 146 | } 147 | }); 148 | // 将排名信息格式化成 String, 便于打印 149 | StringBuilder result = new StringBuilder(); 150 | result.append("====================================\n"); 151 | result.append("时间: ").append(new Timestamp(timestamp-1)).append("\n"); 152 | for (int i=0; i { 171 | 172 | @Override 173 | public void apply( 174 | Tuple key, // 窗口的主键,即 itemId 175 | TimeWindow window, // 窗口 176 | Iterable aggregateResult, // 聚合函数的结果,即 count 值 177 | Collector collector // 输出类型为 ItemViewCount 178 | ) throws Exception { 179 | Long itemId = ((Tuple1) key).f0; 180 | Long count = aggregateResult.iterator().next(); 181 | collector.collect(ItemViewCount.of(itemId, window.getEnd(), count)); 182 | } 183 | } 184 | 185 | /** COUNT 统计的聚合函数实现,每出现一条记录加一 */ 186 | public static class CountAgg implements AggregateFunction { 187 | 188 | @Override 189 | public Long createAccumulator() { 190 | return 0L; 191 | } 192 | 193 | @Override 194 | public Long add(UserBehavior userBehavior, Long acc) { 195 | return acc + 1; 196 | } 197 | 198 | @Override 199 | public Long getResult(Long acc) { 200 | return acc; 201 | } 202 | 203 | @Override 204 | public Long merge(Long acc1, Long acc2) { 205 | return acc1 + acc2; 206 | } 207 | } 208 | 209 | /** 商品点击量(窗口操作的输出类型) */ 210 | public static class ItemViewCount { 211 | public long itemId; // 商品ID 212 | public long windowEnd; // 窗口结束时间戳 213 | public long viewCount; // 商品的点击量 214 | 215 | public static ItemViewCount of(long itemId, long windowEnd, long viewCount) { 216 | ItemViewCount result = new ItemViewCount(); 217 | result.itemId = itemId; 218 | result.windowEnd = windowEnd; 219 | result.viewCount = viewCount; 220 | return result; 221 | } 222 | } 223 | 224 | /** 用户行为数据结构 **/ 225 | public static class UserBehavior { 226 | public long userId; // 用户ID 227 | public long itemId; // 商品ID 228 | public int categoryId; // 商品类目ID 229 | public String behavior; // 用户行为, 包括("pv", "buy", "cart", "fav") 230 | public long timestamp; // 行为发生的时间戳,单位秒 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /src/main/java/myflink/SocketWindowWordCount.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package myflink; 19 | 20 | import org.apache.flink.api.common.functions.FlatMapFunction; 21 | import org.apache.flink.api.java.tuple.Tuple2; 22 | import org.apache.flink.streaming.api.datastream.DataStream; 23 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 24 | import org.apache.flink.streaming.api.windowing.time.Time; 25 | import org.apache.flink.util.Collector; 26 | 27 | public class SocketWindowWordCount { 28 | 29 | public static void main(String[] args) throws Exception { 30 | 31 | // 创建 execution environment 32 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 33 | 34 | // 通过连接 socket 获取输入数据,这里连接到本地9000端口,如果9000端口已被占用,请换一个端口 35 | DataStream text = env.socketTextStream("localhost", 9000, "\n"); 36 | 37 | // 解析数据,按 word 分组,开窗,聚合 38 | DataStream> windowCounts = text 39 | .flatMap(new FlatMapFunction>() { 40 | @Override 41 | public void flatMap(String value, Collector> out) { 42 | for (String word : value.split("\\s")) { 43 | out.collect(Tuple2.of(word, 1)); 44 | } 45 | } 46 | }) 47 | .keyBy(0) 48 | .timeWindow(Time.seconds(5)) 49 | .sum(1); 50 | 51 | // 将结果打印到控制台,注意这里使用的是单线程打印,而非多线程 52 | windowCounts.print().setParallelism(1); 53 | 54 | env.execute("Socket Window WordCount"); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/myflink/StreamingJob.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package myflink; 20 | 21 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 22 | 23 | /** 24 | * Skeleton for a Flink Streaming Job. 25 | * 26 | *

For a tutorial how to write a Flink streaming application, check the 27 | * tutorials and examples on the Flink Website. 28 | * 29 | *

To package your application into a JAR file for execution, run 30 | * 'mvn clean package' on the command line. 31 | * 32 | *

If you change the name of the main class (with the public static void main(String[] args)) 33 | * method, change the respective entry in the POM.xml file (simply search for 'mainClass'). 34 | */ 35 | public class StreamingJob { 36 | 37 | public static void main(String[] args) throws Exception { 38 | // set up the streaming execution environment 39 | final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 40 | 41 | /* 42 | * Here, you can start creating your execution plan for Flink. 43 | * 44 | * Start with getting some data from the environment, like 45 | * env.readTextFile(textPath); 46 | * 47 | * then, transform the resulting DataStream using operations 48 | * like 49 | * .filter() 50 | * .flatMap() 51 | * .join() 52 | * .coGroup() 53 | * 54 | * and many more. 55 | * Have a look at the programming guide for the Java API: 56 | * 57 | * http://flink.apache.org/docs/latest/apis/streaming/index.html 58 | * 59 | */ 60 | 61 | // execute program 62 | env.execute("Flink Streaming Java API Skeleton"); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | log4j.rootLogger=INFO, console 20 | 21 | log4j.appender.console=org.apache.log4j.ConsoleAppender 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 24 | --------------------------------------------------------------------------------