├── .gitignore
├── Flink
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── cn
│           │       └── edu
│           │           └── ecnu
│           │               └── flink
│           │                   └── examples
│           │                       └── java
│           │                           ├── fibonacciexample
│           │                           │   └── FibonacciExample.java
│           │                           ├── integersum
│           │                           │   ├── DataflowModel_How.java
│           │                           │   ├── DataflowModel_What.java
│           │                           │   ├── DataflowModel_When.java
│           │                           │   ├── DataflowModel_Where.java
│           │                           │   ├── producer
│           │                           │   │   └── Producer.java
│           │                           │   └── trigger
│           │                           │       ├── CustomerTrigger.java
│           │                           │       └── CustomerTriggerWithAccumulation.java
│           │                           ├── wordcount
│           │                           │   └── WordCount.java
│           │                           └── wordcountwithfaulttolerance
│           │                               └── WordCountWithFaultTolerance.java
│           ├── resources
│           │   └── META-INF
│           │       └── MANIFEST.MF
│           └── scala
│               └── cn
│                   └── edu
│                       └── ecnu
│                           └── flink
│                               └── examples
│                                   └── scala
│                                       ├── fibonacciexample
│                                       │   └── FibonacciExample.scala
│                                       ├── integersum
│                                       │   ├── DataflowModel_How.scala
│                                       │   ├── DataflowModel_What.scala
│                                       │   ├── DataflowModel_When.scala
│                                       │   ├── DataflowModel_Where.scala
│                                       │   ├── producer
│                                       │   │   └── Producer.scala
│                                       │   └── trigger
│                                       │       ├── CustomerTrigger.scala
│                                       │       └── CustomerTriggerWithAccumulation.scala
│                                       ├── wordcount
│                                       │   └── WordCount.scala
│                                       └── wordcountwithfaulttolerance
│                                           └── WordCountWithFaultTolerance.scala
├── Giraph
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── cn
│           │       └── edu
│           │           └── ecnu
│           │               └── giraph
│           │                   └── examples
│           │                       ├── cc
│           │                       │   ├── ConnectedComponentsComputation.java
│           │                       │   ├── ConnectedComponentsRunner.java
│           │                       │   └── README.md
│           │                       ├── kmeans
│           │                       │   ├── KMeansComputation.java
│           │                       │   ├── KMeansMasterCompute.java
│           │                       │   ├── KMeansRunner.java
│           │                       │   ├── README.md
│           │                       │   └── utils
│           │                       │       ├── FileOperation.java
│           │                       │       └── PointsOperation.java
│           │                       ├── pagerank
│           │                       │   ├── PageRankComputation.java
│           │                       │   ├── PageRankRunner.java
│           │                       │   └── README.md
│           │                       └── sssp
│           │                           ├── README.md
│           │                           ├── ShortestPathComputation.java
│           │                           └── ShortestPathRunner.java
│           └── resources
│               ├── inputs
│               │   ├── cc
│               │   │   └── data.txt
│               │   ├── kmeans
│               │   │   ├── center
│               │   │   │   └── centers.txt
│               │   │   └── data.txt
│               │   ├── pagerank
│               │   │   └── data.txt
│               │   └── sssp
│               │       └── data.txt
│               └── log4j.properties
├── HDFS
│   ├── HDFS.md
│   ├── README.md
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── cn
│           │       └── edu
│           │           └── ecnu
│           │               └── hdfs
│           │                   └── examples
│           │                       ├── read
│           │                       │   └── Reader.java
│           │                       └── write
│           │                           └── Writer.java
│           └── resources
│               └── example
│                   └── example.txt
├── LICENSE
├── MapReduce
│   ├── MapReduce.md
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── cn
│           │       └── edu
│           │           └── ecnu
│           │               └── mapreduce
│           │                   └── examples
│           │                       ├── Constants.java
│           │                       ├── join
│           │                       │   ├── README.md
│           │                       │   ├── entity
│           │                       │   │   └── ReduceJoinWritable.java
│           │                       │   ├── mapjoin
│           │                       │   │   ├── MapJoin.java
│           │                       │   │   └── MapJoinMapper.java
│           │                       │   └── reducejoin
│           │                       │       ├── ReduceJoin.java
│           │                       │       ├── ReduceJoinMapper.java
│           │                       │       └── ReduceJoinReducer.java
│           │                       ├── kmeans
│           │                       │   ├── KMeans.java
│           │                       │   ├── KMeansMapper.java
│           │                       │   ├── KMeansReducer.java
│           │                       │   ├── README.md
│           │                       │   └── utils
│           │                       │       ├── CentersOperation.java
│           │                       │       └── FileOperation.java
│           │                       ├── pagerank
│           │                       │   ├── PageRank.java
│           │                       │   ├── PageRankMapper.java
│           │                       │   ├── PageRankReducer.java
│           │                       │   ├── README.md
│           │                       │   └── ReducePageRankWritable.java
│           │                       └── wordcount
│           │                           ├── README.md
│           │                           ├── WordCount.java
│           │                           ├── WordCountCombiner.java
│           │                           ├── WordCountMapper.java
│           │                           └── WordCountReducer.java
│           └── resources
│               ├── inputs
│               │   ├── join
│               │   │   ├── department.csv
│               │   │   ├── employee.csv
│               │   │   └── input_cluster
│               │   │       ├── department.csv
│               │   │       └── employee.csv
│               │   ├── kmeans
│               │   │   ├── center
│               │   │   │   └── centers.txt
│               │   │   └── data.txt
│               │   ├── pagerank
│               │   │   └── data.txt
│               │   └── wordcount
│               │       └── data.txt
│               └── log4j.properties
├── README.md
├── Spark
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── cn
│           │       └── edu
│           │           └── ecnu
│           │               └── spark
│           │                   └── examples
│           │                       └── java
│           │                           ├── checkpoint
│           │                           │   └── Checkpoint.java
│           │                           ├── join
│           │                           │   ├── BroadcastJoin.java
│           │                           │   └── ShuffleJoin.java
│           │                           ├── kmeans
│           │                           │   └── KMeans.java
│           │                           ├── pagerank
│           │                           │   └── PageRank.java
│           │                           └── wordcount
│           │                               └── WordCount.java
│           ├── resources
│           │   └── input
│           │       ├── join
│           │       │   ├── department.csv
│           │       │   └── employee.csv
│           │       ├── kmeans
│           │       │   ├── centers.txt
│           │       │   └── data.txt
│           │       ├── overleaf
│           │       │   ├── a.txt
│           │       │   ├── c.txt
│           │       │   └── e.txt
│           │       ├── pagerank
│           │       │   └── pagerank.txt
│           │       └── wordcount
│           │           └── words.txt
│           └── scala
│               └── cn
│                   └── edu
│                       └── ecnu
│                           └── spark
│                               └── examples
│                                   └── scala
│                                       ├── checkpoint
│                                       │   └── Checkpoint.scala
│                                       ├── join
│                                       │   ├── BroadcastJoin.scala
│                                       │   └── ShuffleJoin.scala
│                                       ├── kmeans
│                                       │   └── KMeans.scala
│                                       ├── pagerank
│                                       │   └── PageRank.scala
│                                       └── wordcount
│                                           └── WordCount.scala
├── SparkStreaming
│   ├── pom.xml
│   └── src
│       └── main
│           ├── java
│           │   └── cn
│           │       └── edu
│           │           └── ecnu
│           │               └── sparkstreaming
│           │                   └── examples
│           │                       └── java
│           │                           ├── anomaly
│           │                           │   └── AnomalyDetection.java
│           │                           ├── window
│           │                           │   └── Window.java
│           │                           └── wordcount
│           │                               ├── BatchWordCount.java
│           │                               └── GlobalWordCount.java
│           └── scala
│               └── cn
│                   └── edu
│                       └── ecnu
│                           └── sparkstreaming
│                               └── examples
│                                   └── scala
│                                       ├── anomaly
│                                       │   └── AnomalyDetection.scala
│                                       ├── window
│                                       │   └── Window.scala
│                                       └── wordcount
│                                           ├── BatchWordCount.scala
│                                           └── GlobalWordCount.scala
└── Storm
    ├── README.md
    ├── pom.xml
    └── src
        └── main
            └── java
                └── cn
                    └── edu
                        └── ecnu
                            └── example
                                └── storm
                                    ├── common
                                    │   └── SocketSpout.java
                                    ├── detection
                                    │   ├── DetectionBolt.java
                                    │   └── OutlierTopology.java
                                    └── wordcount
                                        ├── CountBolt.java
                                        ├── SplitBolt.java
                                        ├── window
                                        │   ├── WindowBolt.java
                                        │   └── WindowWordCountTopology.java
                                        ├── withAck
                                        │   ├── SocketSpoutWithAck.java
                                        │   └── WordCountTopologyWithAck.java
                                        └── withoutAck
                                            └── WordCountTopology.java
/.gitignore:
--------------------------------------------------------------------------------
*/.*
*/logs/*
*/target/*
*.iml
*/.idea/*
*/output/*
*/out/*
*/classes/*
.idea/
--------------------------------------------------------------------------------
/Flink/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>Flink_Dataflow</groupId>
  <artifactId>Flink_Dataflow</artifactId>
  <version>1.0-SNAPSHOT</version>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-streaming-java_2.11</artifactId>
      <version>1.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-streaming-scala_2.11</artifactId>
      <version>1.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-table_2.11</artifactId>
      <version>1.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-scala_2.11</artifactId>
      <version>1.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-core</artifactId>
      <version>1.7.2</version>
    </dependency>
  </dependencies>
</project>
--------------------------------------------------------------------------------
/Flink/src/main/java/cn/edu/ecnu/flink/examples/java/fibonacciexample/FibonacciExample.java:
--------------------------------------------------------------------------------
package cn.edu.ecnu.flink.examples.java.fibonacciexample;

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.IterativeStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class FibonacciExample {
  public static void main(String[] args) throws Exception {
    run(args);
  }

  public static void run(String[] args) throws Exception {
    /* |Step 1: create the StreamExecutionEnvironment object| */
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    /* |Step 2: build the DAG from operators (data sources, transformations, data sinks, etc.) according to the application logic| */
    // |Receive data from the socket and create a DataStream named inputStream|
    DataStream<String> inputStream = env.socketTextStream("localhost", 9099, "\n");
    // |Parse the records in inputStream and create a DataStream named first|
    DataStream<Tuple3<Character, Long, Long>> first =
        inputStream.map(
            new MapFunction<String, Tuple3<Character, Long, Long>>() {
              @Override
              public Tuple3<Character, Long, Long> map(String value) throws Exception {
                return new Tuple3<>(
                    value.split(" ")[0].charAt(0),
                    Long.valueOf(value.split(" ")[1]),
                    Long.valueOf(value.split(" ")[2]));
              }
            });
    // |Create the iteration operator|
    IterativeStream<Tuple3<Character, Long, Long>> iteration = first.iterate(5000L);
    // |Implement the iteration step: compute the next Fibonacci number|
    DataStream<Tuple3<Character, Long, Long>> iteratedStream =
        iteration.flatMap(
            new FlatMapFunction<Tuple3<Character, Long, Long>, Tuple3<Character, Long, Long>>() {
              @Override
              public void flatMap(
                  Tuple3<Character, Long, Long> value,
                  Collector<Tuple3<Character, Long, Long>> out)
                  throws Exception {
                // |E.g., if the iteration input is (A, 1, 2), this transformation turns it into (A, 2, 3)|
                Tuple3<Character, Long, Long> feedbackValue =
                    new Tuple3<>(value.f0, value.f2, value.f1 + value.f2);
                // |E.g., if the iteration input is (A, 1, 2), this transformation turns it into (A, 1, Min)|
                Tuple3<Character, Long, Long> outputValue =
                    new Tuple3<>(value.f0, value.f1, Long.MIN_VALUE);
                out.collect(feedbackValue);
                out.collect(outputValue);
              }
            }).setParallelism(2);
    // |Create the feedback stream|
    // |Select the tuples whose third field is not Min, e.g. (A, 2, 3)|
    DataStream<Tuple3<Character, Long, Long>> feedback =
        iteratedStream.filter(
            new FilterFunction<Tuple3<Character, Long, Long>>() {
              @Override
              public boolean filter(Tuple3<Character, Long, Long> value) throws Exception {
                return value.f2 != Long.MIN_VALUE;
              }
            });
    iteration.closeWith(feedback);
    // |Create the output stream|
    // |Select the tuples whose third field is Min, e.g. (A, 1, Min), and convert them to (A, 1)|
    DataStream<Tuple2<Character, Long>> output =
        iteratedStream
            .filter(
                new FilterFunction<Tuple3<Character, Long, Long>>() {
                  @Override
                  public boolean filter(Tuple3<Character, Long, Long> value) throws Exception {
                    return value.f2 == Long.MIN_VALUE;
                  }
                })
            .map(
                new MapFunction<Tuple3<Character, Long, Long>, Tuple2<Character, Long>>() {
                  @Override
                  public Tuple2<Character, Long> map(Tuple3<Character, Long, Long> value)
                      throws Exception {
                    return new Tuple2<>(value.f0, value.f1);
                  }
                });
    // |Print the results of the streaming iteration|
    output.print();

    /* |Step 3: trigger program execution| */
    env.execute("Streaming Iteration");
  }
}
--------------------------------------------------------------------------------
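Note: FibonacciExample connects to localhost:9099 as a plain socket client, so something must already be listening on that port and sending lines of the form "A 1 2" (a key followed by the first two numbers of the sequence). Running `nc -lk 9099` and typing lines is enough; the class below is a hypothetical stand-alone feeder (not part of this repository) for local testing.

import java.io.PrintWriter;
import java.net.ServerSocket;
import java.net.Socket;

// Hypothetical test driver: listen on port 9099 and send a few seed lines in the
// "key first second" format that FibonacciExample expects, then keep the
// connection open while the streaming iteration runs.
public class FibonacciSeedServer {
  public static void main(String[] args) throws Exception {
    try (ServerSocket server = new ServerSocket(9099);
        Socket client = server.accept();
        PrintWriter out = new PrintWriter(client.getOutputStream(), true)) {
      out.println("A 1 1");
      out.println("B 2 3");
      Thread.sleep(60_000); // leave the connection open while the job iterates
    }
  }
}
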
/Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/DataflowModel_How.java:
--------------------------------------------------------------------------------
package cn.edu.ecnu.flink.examples.java.integersum;

import cn.edu.ecnu.flink.examples.java.integersum.trigger.CustomerTriggerWithAccumulation;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import cn.edu.ecnu.flink.examples.java.integersum.producer.Producer;

import java.text.SimpleDateFormat;
import java.util.Iterator;

/**
 * Event-time windowed aggregation with watermarks, implemented with the Flink DataStream API
 * (data is allowed to arrive late, and late data is processed).
 */
public class DataflowModel_How {
  public static void main(String[] args) throws Exception {
    run(args);
  }

  public static void run(String[] args) throws Exception {
    /* |Step 1: create the StreamExecutionEnvironment object| */
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    env.setParallelism(1);

    /* |Step 2: build the DAG from operators (data sources, transformations, data sinks, etc.) according to the application logic| */
    DataStream<Tuple2<String, Integer>> source = env.addSource(new Producer(false));
    DataStream<String> sink =
        source
            .keyBy(0)
            .window(TumblingEventTimeWindows.of(Time.seconds(120L)))
            // |Custom trigger: on top of the watermark mechanism, emit a result once per minute
            // in the processing-time domain; late data corrects the window result|
            .trigger(new CustomerTriggerWithAccumulation(60L))
            // |Allow data to be up to 300s late|
            .allowedLateness(Time.seconds(300L))
            .apply(new myWindowFunction());
    sink.print();

    /* |Step 3: trigger program execution| */
    env.execute("Dataflow Model-How");
  }

  static class myWindowFunction
      implements WindowFunction<Tuple2<String, Integer>, String, Tuple, TimeWindow> {
    @Override
    public void apply(
        Tuple tuple,
        TimeWindow window,
        Iterable<Tuple2<String, Integer>> input,
        Collector<String> out)
        throws Exception {
      final SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");
      // |Running sum of the integers|
      int sum = 0;
      // |Iterator over the key/value pairs in the window|
      Iterator<Tuple2<String, Integer>> it = input.iterator();
      // |Traverse the pairs in the window and sum the integers|
      while (it.hasNext()) {
        Tuple2<String, Integer> next = it.next();
        sum = sum + next.f1;
      }
      // |Emit the window result as a string such as "the sum of window [12:00:00,12:02:00) is 14"|
      String res =
          "the sum of window ["
              + sdf.format(window.getStart())
              + ","
              + sdf.format(window.getEnd())
              + ") is "
              + sum;
      out.collect(res);
    }
  }
}
--------------------------------------------------------------------------------
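In DataflowModel_How, allowedLateness(Time.seconds(300L)) keeps each window's state alive for five extra minutes so the accumulating trigger can re-fire with a corrected sum; anything arriving later than that is silently dropped. A minimal sketch (not in this repository; the tag name and variable names are illustrative) of how such too-late records could instead be captured with Flink's side outputs:

// Additional imports assumed:
// import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
// import org.apache.flink.util.OutputTag;

// OutputTag is subclassed anonymously so Flink can capture the element type.
final OutputTag<Tuple2<String, Integer>> lateTag =
    new OutputTag<Tuple2<String, Integer>>("too-late") {};

SingleOutputStreamOperator<String> result =
    source
        .keyBy(0)
        .window(TumblingEventTimeWindows.of(Time.seconds(120L)))
        .trigger(new CustomerTriggerWithAccumulation(60L))
        .allowedLateness(Time.seconds(300L))
        .sideOutputLateData(lateTag) // records later than 300s land here instead of being dropped
        .apply(new myWindowFunction());

// Inspect (or reconcile) the records that missed the allowed lateness
result.getSideOutput(lateTag).print();
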
/Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/DataflowModel_What.java:
--------------------------------------------------------------------------------
package cn.edu.ecnu.flink.examples.java.integersum;

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import cn.edu.ecnu.flink.examples.java.integersum.producer.Producer;

public class DataflowModel_What {
  public static void main(String[] args) throws Exception {
    run(args);
  }

  private static void run(String[] args) throws Exception {
    /* |Step 1: create the StreamExecutionEnvironment object| */
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    /* |Step 2: build the DAG from operators (data sources, transformations, data sinks, etc.) according to the application logic| */
    // |Receive records from the custom source, discarding the records that represent watermarks,
    // and create a DataStream named source|
    DataStream<Tuple2<String, Integer>> source = env.addSource(new Producer(true));
    // |Group the key/value pairs by key and accumulate the integers with sum, creating a DataStream named sink|
    DataStream<Tuple2<String, Integer>> sink = source.keyBy(0).sum(1);
    // |Print the integer sums|
    sink.print();

    /* |Step 3: trigger program execution| */
    env.execute("Dataflow Model-What");
  }
}
--------------------------------------------------------------------------------
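Because sum(1) in DataflowModel_What is an unwindowed rolling aggregate, the job prints one updated sum per input record rather than one result per window. A self-contained sketch of the same behaviour, with fromElements standing in for the custom Producer (the three values are the first three integers Producer emits):

// With inputs 5, 7, 3 under the key "dataflow", this prints the running sums
// (dataflow,5), (dataflow,12), (dataflow,15).
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
env.fromElements(
        Tuple2.of("dataflow", 5),
        Tuple2.of("dataflow", 7),
        Tuple2.of("dataflow", 3))
    .keyBy(0)
    .sum(1)
    .print();
env.execute("rolling sum sketch");
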
/Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/DataflowModel_When.java:
--------------------------------------------------------------------------------
package cn.edu.ecnu.flink.examples.java.integersum;

import cn.edu.ecnu.flink.examples.java.integersum.trigger.CustomerTrigger;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.EventTimeTrigger;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import cn.edu.ecnu.flink.examples.java.integersum.producer.Producer;

import java.text.SimpleDateFormat;
import java.util.Iterator;

/** Event-time windowed aggregation with watermarks, implemented with the Flink DataStream API. */
public class DataflowModel_When {
  public static void main(String[] args) throws Exception {
    run(args);
  }

  public static void run(String[] args) throws Exception {
    /* |Step 1: create the StreamExecutionEnvironment object| */
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    env.setParallelism(1);

    /* |Step 2: build the DAG from operators (data sources, transformations, data sinks, etc.) according to the application logic| */
    // |Create a DataStream named source from the unbounded dataset generated by the custom source|
    DataStream<Tuple2<String, Integer>> source = env.addSource(new Producer(false));
    DataStream<String> sink =
        source
            .keyBy(0)
            .window(TumblingEventTimeWindows.of(Time.seconds(120L)))
            // |Emit the result when the watermark reaches the window's max timestamp|
            .trigger(EventTimeTrigger.create())
            // .trigger(new CustomerTrigger(60L))
            .apply(new myWindowFunction());
    sink.print();

    /* |Step 3: trigger program execution| */
    env.execute("Dataflow Model-When");
  }

  static class myWindowFunction
      implements WindowFunction<Tuple2<String, Integer>, String, Tuple, TimeWindow> {
    @Override
    public void apply(
        Tuple tuple,
        TimeWindow window,
        Iterable<Tuple2<String, Integer>> input,
        Collector<String> out)
        throws Exception {
      final SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");
      // |Running sum of the integers|
      int sum = 0;
      // |Iterator over the key/value pairs in the window|
      Iterator<Tuple2<String, Integer>> it = input.iterator();
      // |Traverse the pairs in the window and sum the integers|
      while (it.hasNext()) {
        Tuple2<String, Integer> next = it.next();
        sum = sum + next.f1;
      }
      // |Emit the window result as a string such as "the sum of window [12:00:00,12:02:00) is 14"|
      String res =
          "the sum of window ["
              + sdf.format(window.getStart())
              + ","
              + sdf.format(window.getEnd())
              + ") is "
              + sum;
      out.collect(res);
    }
  }
}
--------------------------------------------------------------------------------
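The commented-out .trigger(new CustomerTrigger(60L)) line above swaps the built-in event-time trigger for the repository's custom one (its source appears later in this dump). For orientation, here is a minimal trigger sketch against the Flink 1.7 Trigger API; it mirrors what EventTimeTrigger.create() does and is not the CustomerTrigger implementation:

import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.triggers.TriggerResult;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;

// Fire a window exactly once, when the watermark passes its max timestamp.
public class EventTimeOnlySketch extends Trigger<Object, TimeWindow> {
  @Override
  public TriggerResult onElement(
      Object element, long timestamp, TimeWindow window, TriggerContext ctx) {
    // Ask for a callback once the watermark reaches the end of the window
    ctx.registerEventTimeTimer(window.maxTimestamp());
    return TriggerResult.CONTINUE;
  }

  @Override
  public TriggerResult onEventTime(long time, TimeWindow window, TriggerContext ctx) {
    // The watermark passed the window end: emit the pane
    return time == window.maxTimestamp() ? TriggerResult.FIRE : TriggerResult.CONTINUE;
  }

  @Override
  public TriggerResult onProcessingTime(long time, TimeWindow window, TriggerContext ctx) {
    return TriggerResult.CONTINUE; // processing time plays no role here
  }

  @Override
  public void clear(TimeWindow window, TriggerContext ctx) {
    ctx.deleteEventTimeTimer(window.maxTimestamp());
  }
}
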
/Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/DataflowModel_Where.java:
--------------------------------------------------------------------------------
package cn.edu.ecnu.flink.examples.java.integersum;

import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import cn.edu.ecnu.flink.examples.java.integersum.producer.Producer;

import java.text.SimpleDateFormat;
import java.util.Iterator;

public class DataflowModel_Where {
  public static void main(String[] args) throws Exception {
    run(args);
  }

  public static void run(String[] args) throws Exception {
    /* |Step 1: create the StreamExecutionEnvironment object| */
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // |Use event time as the time characteristic|
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

    /* |Step 2: build the DAG from operators (data sources, transformations, data sinks, etc.) according to the application logic| */
    DataStream<Tuple2<String, Integer>> source = env.addSource(new Producer(true));
    // |Partition the records in source into 2-minute windows by event time|
    DataStream<String> sink =
        source
            .keyBy(0)
            .window(TumblingEventTimeWindows.of(Time.seconds(120L)))
            // |Use myWindowFunction as the window function to sum the integers|
            .apply(new myWindowFunction());
    sink.print();

    /* |Step 3: trigger program execution| */
    env.execute("Dataflow Model-Where");
  }

  static class myWindowFunction
      implements WindowFunction<Tuple2<String, Integer>, String, Tuple, TimeWindow> {
    @Override
    public void apply(
        Tuple tuple,
        TimeWindow window,
        Iterable<Tuple2<String, Integer>> input,
        Collector<String> out)
        throws Exception {
      final SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");
      // |Running sum of the integers|
      int sum = 0;
      // |Iterator over the key/value pairs in the window|
      Iterator<Tuple2<String, Integer>> it = input.iterator();
      // |Traverse the pairs in the window and sum the integers|
      while (it.hasNext()) {
        Tuple2<String, Integer> next = it.next();
        sum = sum + next.f1;
      }
      // |Emit the window result as a string such as "the sum of window [12:00:00,12:02:00) is 14"|
      String res =
          "the sum of window ["
              + sdf.format(window.getStart())
              + ","
              + sdf.format(window.getEnd())
              + ") is "
              + sum;
      out.collect(res);
    }
  }
}
--------------------------------------------------------------------------------
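myWindowFunction buffers every element of a window in state and sums them only when the window fires. A sketch (a variant, not in this repository) of the same 2-minute sums computed incrementally with a ReduceFunction, so that only one partial sum per key and window is kept:

// Additional import assumed:
// import org.apache.flink.api.common.functions.ReduceFunction;

DataStream<Tuple2<String, Integer>> sums =
    source
        .keyBy(0)
        .window(TumblingEventTimeWindows.of(Time.seconds(120L)))
        .reduce(
            new ReduceFunction<Tuple2<String, Integer>>() {
              @Override
              public Tuple2<String, Integer> reduce(
                  Tuple2<String, Integer> a, Tuple2<String, Integer> b) {
                // Fold two records of the same key into one partial sum
                return new Tuple2<>(a.f0, a.f1 + b.f1);
              }
            });
sums.print();
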
/Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/producer/Producer.java:
--------------------------------------------------------------------------------
package cn.edu.ecnu.flink.examples.java.integersum.producer;

import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.watermark.Watermark;

import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class Producer implements SourceFunction<Tuple2<String, Integer>> {
  boolean isBounded;
  final List<Tuple3<String, Integer, String>> data =
      new ArrayList<>(
          Arrays.asList(
              new Tuple3<>("dataflow", 5, "12:00:30"),
              new Tuple3<>("dataflow", 7, "12:02:30"),
              new Tuple3<>("dataflow", 3, "12:03:45"),
              // watermark
              new Tuple3<>("dataflow", null, "12:02:00"),
              new Tuple3<>("dataflow", 4, "12:03:50"),
              new Tuple3<>("dataflow", 3, "12:04:30"),
              new Tuple3<>("dataflow", 8, "12:03:30"),
              // watermark
              new Tuple3<>("dataflow", null, "12:04:00"),
              // watermark
              new Tuple3<>("dataflow", null, "12:06:00"),
              new Tuple3<>("dataflow", 3, "12:06:30"),
              new Tuple3<>("dataflow", 9, "12:01:30"),
              new Tuple3<>("dataflow", 8, "12:07:30"),
              new Tuple3<>("dataflow", 1, "12:07:50"),
              // watermark
              new Tuple3<>("dataflow", null, "12:08:00")));

  // Processing-time interval (in seconds) before each record is emitted
  final List<Integer> processInterval =
      new ArrayList<>(Arrays.asList(40, 15, 25, 10, 5, 15, 30, 10, 10, 30, 20, 40, 20, 20));

  final SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");

  public Producer(boolean isBounded) {
    this.isBounded = isBounded;
  }

  @Override
  public void run(SourceContext<Tuple2<String, Integer>> ctx) throws Exception {
    waitForMinute();
    for (int i = 0; i < 14; i++) {
      // |Delay the record's emission by its processing-time interval|
      Thread.sleep(processInterval.get(i) * 1000);
      Long timestamp = sdf.parse(data.get(i).f2).getTime();
      Integer value = data.get(i).f1;
      // |If this is a watermark record and the input is treated as an unbounded dataset,
      // emit a system watermark|
      if (value == null) {
        if (!isBounded) {
          ctx.emitWatermark(new Watermark(sdf.parse(data.get(i).f2).getTime()));
        }
      }
      // |Otherwise attach the event time to the key/value pair and send it downstream|
      else {
        ctx.collectWithTimestamp(new Tuple2<>(data.get(i).f0, value), timestamp);
      }
    }
  }

  @Override
  public void cancel() {}

  // Align the start of emission with the next minute boundary
  private void waitForMinute() throws InterruptedException {
    Long interval = 60 * 1000L;
    Long timestamp = System.currentTimeMillis();
    Thread.sleep(interval - (timestamp % interval));
  }
}
--------------------------------------------------------------------------------
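Producer injects both the records and the watermarks by hand, which is what makes the Dataflow examples deterministic. In ordinary jobs the watermarks are usually derived from the records instead; a sketch (hypothetical: it assumes the event-time string stays on the record as field f2, and the 2-minute out-of-orderness bound is an assumed figure) using Flink's built-in extractor:

import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;

import java.text.ParseException;
import java.text.SimpleDateFormat;

// Derive event time and watermarks from the records themselves.
public class TimestampAssignmentSketch {
  public static DataStream<Tuple3<String, Integer, String>> withEventTime(
      DataStream<Tuple3<String, Integer, String>> raw) {
    return raw.assignTimestampsAndWatermarks(
        new BoundedOutOfOrdernessTimestampExtractor<Tuple3<String, Integer, String>>(
            Time.minutes(2L)) {
          private final SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");

          @Override
          public long extractTimestamp(Tuple3<String, Integer, String> element) {
            try {
              return sdf.parse(element.f2).getTime(); // f2 carries the event time
            } catch (ParseException e) {
              throw new RuntimeException(e);
            }
          }
        });
  }
}
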
/Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/trigger/CustomerTrigger.java:
--------------------------------------------------------------------------------
package cn.edu.ecnu.flink.examples.java.integersum.trigger;

import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.state.ReducingState;
import org.apache.flink.api.common.state.ReducingStateDescriptor;
import org.apache.flink.api.common.typeutils.base.LongSerializer;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.triggers.Trigger;
import org.apache.flink.streaming.api.windowing.triggers.TriggerResult;
import org.apache.flink.streaming.api.windowing.windows.Window;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

public class CustomerTrigger extends Trigger<Object, Window>