├── .gitignore ├── Flink ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── ecnu │ │ └── flink │ │ └── examples │ │ └── java │ │ ├── fibonacciexample │ │ └── FibonacciExample.java │ │ ├── integersum │ │ ├── DataflowModel_How.java │ │ ├── DataflowModel_What.java │ │ ├── DataflowModel_When.java │ │ ├── DataflowModel_Where.java │ │ ├── producer │ │ │ └── Producer.java │ │ └── trigger │ │ │ ├── CustomerTrigger.java │ │ │ └── CustomerTriggerWithAccumulation.java │ │ ├── wordcount │ │ └── WordCount.java │ │ └── wordcountwithfaulttolerance │ │ └── WordCountWithFaultTolerance.java │ ├── resources │ └── META-INF │ │ └── MANIFEST.MF │ └── scala │ └── cn │ └── edu │ └── ecnu │ └── flink │ └── examples │ └── scala │ ├── fibonacciexample │ └── FibonacciExample.scala │ ├── integersum │ ├── DataflowModel_How.scala │ ├── DataflowModel_What.scala │ ├── DataflowModel_When.scala │ ├── DataflowModel_Where.scala │ ├── producer │ │ └── Producer.scala │ └── trigger │ │ ├── CustomerTrigger.scala │ │ └── CustomerTriggerWithAccumulation.scala │ ├── wordcount │ └── WordCount.scala │ └── wordcountwithfaulttolerance │ └── WordCountWithFaultTolerance.scala ├── Giraph ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── ecnu │ │ └── giraph │ │ └── examples │ │ ├── cc │ │ ├── ConnectedComponentsComputation.java │ │ ├── ConnectedComponentsRunner.java │ │ └── README.md │ │ ├── kmeans │ │ ├── KMeansComputation.java │ │ ├── KMeansMasterCompute.java │ │ ├── KMeansRunner.java │ │ ├── README.md │ │ └── utils │ │ │ ├── FileOperation.java │ │ │ └── PointsOperation.java │ │ ├── pagerank │ │ ├── PageRankComputation.java │ │ ├── PageRankRunner.java │ │ └── README.md │ │ └── sssp │ │ ├── README.md │ │ ├── ShortestPathComputation.java │ │ └── ShortestPathRunner.java │ └── resources │ ├── inputs │ ├── cc │ │ └── data.txt │ ├── kmeans │ │ ├── center │ │ │ └── centers.txt │ │ └── data.txt │ ├── pagerank │ │ └── data.txt │ └── sssp │ │ └── data.txt │ └── log4j.properties ├── HDFS ├── HDFS.md ├── README.md ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── ecnu │ │ └── hdfs │ │ └── examples │ │ ├── read │ │ └── Reader.java │ │ └── write │ │ └── Writer.java │ └── resources │ └── example │ └── example.txt ├── LICENSE ├── MapReduce ├── MapReduce.md ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── ecnu │ │ └── mapreduce │ │ └── examples │ │ ├── Constants.java │ │ ├── join │ │ ├── README.md │ │ ├── entity │ │ │ └── ReduceJoinWritable.java │ │ ├── mapjoin │ │ │ ├── MapJoin.java │ │ │ └── MapJoinMapper.java │ │ └── reducejoin │ │ │ ├── ReduceJoin.java │ │ │ ├── ReduceJoinMapper.java │ │ │ └── ReduceJoinReducer.java │ │ ├── kmeans │ │ ├── KMeans.java │ │ ├── KMeansMapper.java │ │ ├── KMeansReducer.java │ │ ├── README.md │ │ └── utils │ │ │ ├── CentersOperation.java │ │ │ └── FileOperation.java │ │ ├── pagerank │ │ ├── PageRank.java │ │ ├── PageRankMapper.java │ │ ├── PageRankReducer.java │ │ ├── README.md │ │ └── ReducePageRankWritable.java │ │ └── wordcount │ │ ├── README.md │ │ ├── WordCount.java │ │ ├── WordCountCombiner.java │ │ ├── WordCountMapper.java │ │ └── WordCountReducer.java │ └── resources │ ├── inputs │ ├── join │ │ ├── department.csv │ │ ├── employee.csv │ │ └── input_cluster │ │ │ ├── department.csv │ │ │ └── employee.csv │ ├── kmeans │ │ ├── center │ │ │ └── centers.txt │ │ └── data.txt │ ├── pagerank │ │ └── data.txt │ └── wordcount │ │ └── data.txt │ └── log4j.properties ├── README.md ├── Spark ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ 
└── edu │ │ └── ecnu │ │ └── spark │ │ └── examples │ │ └── java │ │ ├── checkpoint │ │ └── Checkpoint.java │ │ ├── join │ │ ├── BroadcastJoin.java │ │ └── ShuffleJoin.java │ │ ├── kmeans │ │ └── KMeans.java │ │ ├── pagerank │ │ └── PageRank.java │ │ └── wordcount │ │ └── WordCount.java │ ├── resources │ └── input │ │ ├── join │ │ ├── department.csv │ │ └── employee.csv │ │ ├── kmeans │ │ ├── centers.txt │ │ └── data.txt │ │ ├── overleaf │ │ ├── a.txt │ │ ├── c.txt │ │ └── e.txt │ │ ├── pagerank │ │ └── pagerank.txt │ │ └── wordcount │ │ └── words.txt │ └── scala │ └── cn │ └── edu │ └── ecnu │ └── spark │ └── examples │ └── scala │ ├── checkpoint │ └── Checkpoint.scala │ ├── join │ ├── BroadcastJoin.scala │ └── ShuffleJoin.scala │ ├── kmeans │ └── KMeans.scala │ ├── pagerank │ └── PageRank.scala │ └── wordcount │ └── WordCount.scala ├── SparkStreaming ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── ecnu │ │ └── sparkstreaming │ │ └── examples │ │ └── java │ │ ├── anomaly │ │ └── AnomalyDetection.java │ │ ├── window │ │ └── Window.java │ │ └── wordcount │ │ ├── BatchWordCount.java │ │ └── GlobalWordCount.java │ └── scala │ └── cn │ └── edu │ └── ecnu │ └── sparkstreaming │ └── examples │ └── scala │ ├── anomaly │ └── AnomalyDetection.scala │ ├── window │ └── Window.scala │ └── wordcount │ ├── BatchWordCount.scala │ └── GlobalWordCount.scala └── Storm ├── README.md ├── pom.xml └── src └── main └── java └── cn └── edu └── ecnu └── example └── storm ├── common └── SocketSpout.java ├── detection ├── DetectionBolt.java └── OutlierTopology.java └── wordcount ├── CountBolt.java ├── SplitBolt.java ├── window ├── WindowBolt.java └── WindowWordCountTopology.java ├── withAck ├── SocketSpoutWithAck.java └── WordCountTopologyWithAck.java └── withoutAck └── WordCountTopology.java /.gitignore: -------------------------------------------------------------------------------- 1 | */.* 2 | */logs/* 3 | */target/* 4 | *.iml 5 | */.idea/* 6 | */output/* 7 | */out/* 8 | */classes/* 9 | .idea/ 10 | -------------------------------------------------------------------------------- /Flink/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | Flink_Dataflow 8 | Flink_Dataflow 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 13 | org.apache.maven.plugins 14 | maven-compiler-plugin 15 | 16 | 1.8 17 | 1.8 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | org.apache.flink 26 | flink-streaming-java_2.11 27 | 1.7.2 28 | 29 | 30 | org.apache.flink 31 | flink-streaming-scala_2.11 32 | 1.7.2 33 | 34 | 35 | org.apache.flink 36 | flink-table_2.11 37 | 1.7.2 38 | 39 | 40 | 41 | 42 | 43 | org.apache.flink 44 | flink-scala_2.11 45 | 1.7.2 46 | 47 | 48 | 49 | 50 | org.apache.flink 51 | flink-core 52 | 1.7.2 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /Flink/src/main/java/cn/edu/ecnu/flink/examples/java/fibonacciexample/FibonacciExample.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.java.fibonacciexample; 2 | 3 | import org.apache.flink.api.common.functions.FilterFunction; 4 | import org.apache.flink.api.common.functions.FlatMapFunction; 5 | import org.apache.flink.api.common.functions.MapFunction; 6 | import org.apache.flink.api.java.tuple.Tuple2; 7 | import org.apache.flink.api.java.tuple.Tuple3; 8 | import org.apache.flink.streaming.api.datastream.DataStream; 9 | import org.apache.flink.streaming.api.datastream.IterativeStream; 10 | import 
org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 11 | import org.apache.flink.util.Collector; 12 | 13 | public class FibonacciExample { 14 | public static void main(String[] args) throws Exception { 15 | run(args); 16 | } 17 | 18 | public static void run(String[] args) throws Exception { 19 | /* |步骤1: 创建StreamExecutionEnvironment对象| */ 20 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 21 | 22 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换和数据池等| */ 23 | // |接收来自Socket数据,创建名为inputStream的DataStream| 24 | DataStream inputStream = env.socketTextStream("localhost", 9099, "\n"); 25 | // |解析inputStream中的数据,创建名为first的DataStream| 26 | DataStream> first = 27 | inputStream.map( 28 | new MapFunction>() { 29 | @Override 30 | public Tuple3 map(String value) throws Exception { 31 | return new Tuple3<>( 32 | value.split(" ")[0].charAt(0), 33 | Long.valueOf(value.split(" ")[1]), 34 | Long.valueOf(value.split(" ")[2])); 35 | } 36 | }); 37 | // |创建迭代算子| 38 | IterativeStream> iteration = first.iterate(5000L); 39 | // |实现迭代步逻辑,计算下一个斐波那契数| 40 | DataStream> iteratedStream = 41 | iteration.flatMap( 42 | new FlatMapFunction, Tuple3>() { 43 | @Override 44 | public void flatMap( 45 | Tuple3 value, Collector> out) 46 | throws Exception { 47 | // |例如迭代算子的输入输入为(A, 1, 2),此处转换将(A, 1, 2)转换为(A, 2, 3)| 48 | Tuple3 feedbackValue = 49 | new Tuple3(value.f0, value.f2, value.f1 + value.f2); 50 | // |例如迭代算子的输入输入为(A, 1, 2),此处转换将(A, 1, 2)转换为(A, 1, Min)| 51 | Tuple3 outputValue = 52 | new Tuple3(value.f0, value.f1, Long.MIN_VALUE); 53 | out.collect(feedbackValue); 54 | out.collect(outputValue); 55 | } 56 | }).setParallelism(2); 57 | // |创建反馈流| 58 | // |选择第三位置不为Min的元组,例如(A, 2, 3)| 59 | DataStream> feedback = 60 | iteratedStream.filter( 61 | new FilterFunction>() { 62 | @Override 63 | public boolean filter(Tuple3 value) throws Exception { 64 | return value.f2 != Long.MIN_VALUE; 65 | } 66 | }); 67 | iteration.closeWith(feedback); 68 | // |创建输出流| 69 | // |选择第三位置为Min的元组,例如(A, 1, 0),并将其转换为(A, 1)| 70 | DataStream> output = 71 | iteratedStream 72 | .filter( 73 | new FilterFunction>() { 74 | @Override 75 | public boolean filter(Tuple3 value) throws Exception { 76 | return value.f2 == Long.MIN_VALUE; 77 | } 78 | }) 79 | .map( 80 | new MapFunction, Tuple2>() { 81 | @Override 82 | public Tuple2 map(Tuple3 value) 83 | throws Exception { 84 | return new Tuple2<>(value.f0, value.f1); 85 | } 86 | }); 87 | // |输出流式迭代计算结果| 88 | output.print(); 89 | 90 | /* |步骤3:触发程序执行| */ 91 | env.execute("Streaming Iteration"); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/DataflowModel_How.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.java.integersum; 2 | 3 | import cn.edu.ecnu.flink.examples.java.integersum.trigger.CustomerTriggerWithAccumulation; 4 | import org.apache.flink.api.java.tuple.Tuple; 5 | import org.apache.flink.api.java.tuple.Tuple2; 6 | import org.apache.flink.streaming.api.TimeCharacteristic; 7 | import org.apache.flink.streaming.api.datastream.DataStream; 8 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 9 | import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction; 10 | import org.apache.flink.streaming.api.functions.windowing.WindowFunction; 11 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows; 12 | import 
org.apache.flink.streaming.api.windowing.time.Time; 13 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow; 14 | import org.apache.flink.util.Collector; 15 | import cn.edu.ecnu.flink.examples.java.integersum.producer.Producer; 16 | import cn.edu.ecnu.flink.examples.java.integersum.trigger.CustomerTrigger; 17 | 18 | import java.text.SimpleDateFormat; 19 | import java.util.Iterator; 20 | 21 | /** 使用Flink DataStream 实现基于事件时间并且带有水位线的窗口的聚合操作(允许数据存在延迟,并对延迟数据进行处理) */ 22 | public class DataflowModel_How { 23 | public static void main(String[] args) throws Exception { 24 | run(args); 25 | } 26 | 27 | public static void run(String[] args) throws Exception { 28 | /* |步骤1: 创建StreamExecutionEnvironment对象| */ 29 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 30 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); 31 | env.setParallelism(1); 32 | 33 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换和数据池等| */ 34 | DataStream> source = env.addSource(new Producer(false)); 35 | DataStream sink = 36 | source 37 | .keyBy(0) 38 | .window(TumblingEventTimeWindows.of(Time.seconds(120L))) 39 | // |自定义触发器:在水位线机制的基础上,在处理时间域上每隔一分钟输出一次结果。同时,迟到数据修正窗口结果| 40 | .trigger(new CustomerTriggerWithAccumulation(60L)) 41 | // |设置允许延迟时间为300s| 42 | .allowedLateness(Time.seconds(300L)) 43 | .apply(new myWindowFunction()); 44 | sink.print(); 45 | 46 | /* |步骤3:触发程序执行| */ 47 | env.execute("Dataflow Model-How"); 48 | } 49 | 50 | static class myWindowFunction 51 | implements WindowFunction, String, Tuple, TimeWindow> { 52 | @Override 53 | public void apply( 54 | Tuple tuple, 55 | TimeWindow window, 56 | Iterable> input, 57 | Collector out) 58 | throws Exception { 59 | final SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss"); 60 | // |记录整数的累加和| 61 | int sum = 0; 62 | // |获取窗口中键值对的迭代器| 63 | Iterator> it = input.iterator(); 64 | // |遍历窗口中的键值对,并对整数进行求和| 65 | while (it.hasNext()) { 66 | Tuple2 next = it.next(); 67 | sum = sum + next.f1; 68 | } 69 | // |以字符串形式返回形如”the sum of window [12:00:00,12:02:00) is 14”的窗口函数结果| 70 | String res = 71 | "the sum of window [" 72 | + sdf.format(window.getStart()) 73 | + "," 74 | + sdf.format(window.getEnd()) 75 | + ") is " 76 | + sum; 77 | out.collect(res); 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/DataflowModel_What.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.java.integersum; 2 | 3 | import org.apache.flink.api.common.functions.MapFunction; 4 | import org.apache.flink.api.java.tuple.Tuple2; 5 | import org.apache.flink.streaming.api.datastream.DataStream; 6 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 7 | import cn.edu.ecnu.flink.examples.java.integersum.producer.Producer; 8 | 9 | public class DataflowModel_What { 10 | public static void main(String[] args) throws Exception { 11 | run(args); 12 | } 13 | 14 | private static void run(String[] args) throws Exception { 15 | /* |步骤1: 创建StreamExecutionEnvironment对象| */ 16 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 17 | 18 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换和数据池等| */ 19 | // |接收来自CustomSource的记录,抛弃代表watermark的记录,创建名为source的DataStream| 20 | DataStream> source = env.addSource(new Producer(true)); 21 | // |对键值对按键聚合,并使用sum对整数进行累加,创建名为sink的DataStream| 22 | DataStream> sink = source.keyBy(0).sum(1); 23 | // |输出整数求和结果| 
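// keyBy(0).sum(1) keeps per-key state and emits an updated running total for every incoming
// record rather than a single final value, so with the sample data produced by Producer
// (5, 7, 3, 4, 3, 8, 3, 9, 8, 1 for the key "dataflow") the printed output should end with
// the overall total (dataflow,51).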
24 | sink.print(); 25 | 26 | /* |步骤3:触发程序执行| */ 27 | env.execute("Dataflow Model-What"); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/DataflowModel_When.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.java.integersum; 2 | 3 | import cn.edu.ecnu.flink.examples.java.integersum.trigger.CustomerTrigger; 4 | import org.apache.flink.api.java.tuple.Tuple; 5 | import org.apache.flink.api.java.tuple.Tuple2; 6 | import org.apache.flink.streaming.api.TimeCharacteristic; 7 | import org.apache.flink.streaming.api.datastream.DataStream; 8 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 9 | import org.apache.flink.streaming.api.functions.windowing.AllWindowFunction; 10 | import org.apache.flink.streaming.api.functions.windowing.WindowFunction; 11 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows; 12 | import org.apache.flink.streaming.api.windowing.time.Time; 13 | import org.apache.flink.streaming.api.windowing.triggers.EventTimeTrigger; 14 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow; 15 | import org.apache.flink.util.Collector; 16 | import cn.edu.ecnu.flink.examples.java.integersum.producer.Producer; 17 | 18 | import java.text.SimpleDateFormat; 19 | import java.util.Iterator; 20 | 21 | /** 使用Flink DataStream 实现基于事件时间并且带有水位线的窗口的聚合操作 */ 22 | public class DataflowModel_When { 23 | public static void main(String[] args) throws Exception { 24 | run(args); 25 | } 26 | 27 | public static void run(String[] args) throws Exception { 28 | /* |步骤1: 创建StreamExecutionEnvironment对象| */ 29 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 30 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); 31 | env.setParallelism(1); 32 | 33 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换和数据池等| */ 34 | // |通过由CustomSource产生的无界数据集创建名为source的DataStream| 35 | DataStream> source = env.addSource(new Producer(false)); 36 | DataStream sink = 37 | source 38 | .keyBy(0) 39 | .window(TumblingEventTimeWindows.of(Time.seconds(120L))) 40 | // |定义水位线到达窗口最大时间戳的时候输出结果| 41 | .trigger(EventTimeTrigger.create()) 42 | // .trigger(new CustomerTrigger(60L)) 43 | .apply(new myWindowFunction()); 44 | sink.print(); 45 | 46 | /* |步骤3:触发程序执行| */ 47 | env.execute("Dataflow Model-When"); 48 | } 49 | 50 | static class myWindowFunction 51 | implements WindowFunction, String, Tuple, TimeWindow> { 52 | @Override 53 | public void apply( 54 | Tuple tuple, 55 | TimeWindow window, 56 | Iterable> input, 57 | Collector out) 58 | throws Exception { 59 | final SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss"); 60 | // |记录整数的累加和| 61 | int sum = 0; 62 | // |获取窗口中键值对的迭代器| 63 | Iterator> it = input.iterator(); 64 | // |遍历窗口中的键值对,并对整数进行求和| 65 | while (it.hasNext()) { 66 | Tuple2 next = it.next(); 67 | sum = sum + next.f1; 68 | } 69 | // |以字符串形式返回形如”the sum of window [12:00:00,12:02:00) is 14”的窗口函数结果| 70 | String res = 71 | "the sum of window [" 72 | + sdf.format(window.getStart()) 73 | + "," 74 | + sdf.format(window.getEnd()) 75 | + ") is " 76 | + sum; 77 | out.collect(res); 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/DataflowModel_Where.java: 
-------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.java.integersum; 2 | 3 | import org.apache.flink.api.java.tuple.Tuple; 4 | import org.apache.flink.api.java.tuple.Tuple2; 5 | import org.apache.flink.streaming.api.TimeCharacteristic; 6 | import org.apache.flink.streaming.api.datastream.DataStream; 7 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 8 | import org.apache.flink.streaming.api.functions.windowing.WindowFunction; 9 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows; 10 | import org.apache.flink.streaming.api.windowing.time.Time; 11 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow; 12 | import org.apache.flink.streaming.api.windowing.windows.Window; 13 | import org.apache.flink.util.Collector; 14 | import cn.edu.ecnu.flink.examples.java.integersum.producer.Producer; 15 | 16 | import java.text.SimpleDateFormat; 17 | import java.util.Iterator; 18 | 19 | public class DataflowModel_Where { 20 | public static void main(String[] args) throws Exception { 21 | run(args); 22 | } 23 | 24 | public static void run(String[] args) throws Exception { 25 | /* |步骤1: 创建StreamExecutionEnvironment对象| */ 26 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 27 | // |设置时间特征为事件时间| 28 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime); 29 | 30 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换和数据池等| */ 31 | DataStream> source = env.addSource(new Producer(true)); 32 | // |将source中的记录按照事件时间以2分钟为单位进行窗口划分| 33 | DataStream sink = 34 | source 35 | .keyBy(0) 36 | .window(TumblingEventTimeWindows.of(Time.seconds(120L))) 37 | // |使用myWindowFunction作为窗口函数对整数进行累加求和| 38 | .apply(new myWindowFunction()); 39 | sink.print(); 40 | 41 | /* |步骤3:触发程序执行| */ 42 | env.execute("Dataflow Model-Where"); 43 | } 44 | 45 | static class myWindowFunction 46 | implements WindowFunction, String, Tuple, TimeWindow> { 47 | @Override 48 | public void apply( 49 | Tuple tuple, 50 | TimeWindow window, 51 | Iterable> input, 52 | Collector out) 53 | throws Exception { 54 | final SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss"); 55 | // |记录整数的累加和| 56 | int sum = 0; 57 | // |获取窗口中键值对的迭代器| 58 | Iterator> it = input.iterator(); 59 | // |遍历窗口中的键值对,并对整数进行求和| 60 | while (it.hasNext()) { 61 | Tuple2 next = it.next(); 62 | sum = sum + next.f1; 63 | } 64 | // |以字符串形式返回形如”the sum of window [12:00:00,12:02:00) is 14”的窗口函数结果| 65 | String res = 66 | "the sum of window [" 67 | + sdf.format(window.getStart()) 68 | + "," 69 | + sdf.format(window.getEnd()) 70 | + ") is " 71 | + sum; 72 | out.collect(res); 73 | } 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/producer/Producer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.java.integersum.producer; 2 | 3 | import org.apache.flink.api.java.tuple.Tuple2; 4 | import org.apache.flink.api.java.tuple.Tuple3; 5 | import org.apache.flink.streaming.api.functions.source.SourceFunction; 6 | import org.apache.flink.streaming.api.watermark.Watermark; 7 | 8 | import java.text.SimpleDateFormat; 9 | import java.util.ArrayList; 10 | import java.util.Arrays; 11 | import java.util.List; 12 | 13 | public class Producer implements SourceFunction> { 14 | boolean isBounded; 15 | final List> data = 16 | new ArrayList>( 17 | Arrays.asList( 18 | 
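// Each element below is (key, integer value, event-time string in HH:mm:ss); elements whose
// value is null stand for watermarks rather than real records, and processInterval further
// down gives the delay in seconds before each element is emitted.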
new Tuple3("dataflow", 5, "12:00:30"), 19 | new Tuple3("dataflow", 7, "12:02:30"), 20 | new Tuple3("dataflow", 3, "12:03:45"), 21 | // 水位线 22 | new Tuple3("dataflow", null, "12:02:00"), 23 | new Tuple3("dataflow", 4, "12:03:50"), 24 | new Tuple3("dataflow", 3, "12:04:30"), 25 | new Tuple3("dataflow", 8, "12:03:30"), 26 | // 水位线 27 | new Tuple3("dataflow", null, "12:04:00"), 28 | // 水位线 29 | new Tuple3("dataflow", null, "12:06:00"), 30 | new Tuple3("dataflow", 3, "12:06:30"), 31 | new Tuple3("dataflow", 9, "12:01:30"), 32 | new Tuple3("dataflow", 8, "12:07:30"), 33 | new Tuple3("dataflow", 1, "12:07:50"), 34 | // 水位线 35 | new Tuple3("dataflow", null, "12:08:00"))); 36 | 37 | // 每个数据的处理时间间隔 38 | final List processInterval = 39 | new ArrayList(Arrays.asList(40, 15, 25, 10, 5, 15, 30, 10, 10, 30, 20, 40, 20, 20)); 40 | 41 | final SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss"); 42 | 43 | public Producer(boolean isBounded) { 44 | this.isBounded = isBounded; 45 | } 46 | 47 | @Override 48 | public void run(SourceContext> ctx) throws Exception { 49 | waitForMinute(); 50 | for (int i = 0; i < 14; i++) { 51 | // |记录发送延迟时间| 52 | Thread.sleep(processInterval.get(i) * 1000); 53 | Long timestamp = sdf.parse(data.get(i).f2).getTime(); 54 | Integer value = data.get(i).f1; 55 | // |若为水位线记录且输入数据作为无界数据集,则生成系统的水位线| 56 | if (value == null) { 57 | if (!isBounded) { 58 | ctx.emitWatermark(new Watermark(sdf.parse(data.get(i).f2).getTime())); 59 | } 60 | } 61 | // |设置键值对的事件时间并发送至下游| 62 | else { 63 | ctx.collectWithTimestamp(new Tuple2(data.get(i).f0, value), timestamp); 64 | } 65 | } 66 | } 67 | 68 | @Override 69 | public void cancel() {} 70 | 71 | private void waitForMinute() throws InterruptedException { 72 | Long interval = 60 * 1000L; 73 | Long timestamp = System.currentTimeMillis(); 74 | Thread.sleep(interval - (timestamp % interval)); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/trigger/CustomerTrigger.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.java.integersum.trigger; 2 | 3 | import org.apache.flink.api.common.functions.ReduceFunction; 4 | import org.apache.flink.api.common.state.ReducingState; 5 | import org.apache.flink.api.common.state.ReducingStateDescriptor; 6 | import org.apache.flink.api.common.typeutils.base.LongSerializer; 7 | import org.apache.flink.streaming.api.windowing.time.Time; 8 | import org.apache.flink.streaming.api.windowing.triggers.Trigger; 9 | import org.apache.flink.streaming.api.windowing.triggers.TriggerResult; 10 | import org.apache.flink.streaming.api.windowing.windows.Window; 11 | import java.text.SimpleDateFormat; 12 | import java.util.ArrayList; 13 | import java.util.HashMap; 14 | import java.util.List; 15 | import java.util.Map; 16 | import java.util.concurrent.TimeUnit; 17 | 18 | public class CustomerTrigger extends Trigger { 19 | final SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss"); 20 | 21 | // |以自定义状态形式保存每个窗口中处理时间定时器所对应的触发时间| 22 | private final ReducingStateDescriptor processTimerStateDescriptor = 23 | new ReducingStateDescriptor( 24 | "processTimer", new CustomerTrigger.Update(), LongSerializer.INSTANCE); 25 | 26 | // |基于处理时间触发间隔| 27 | Long interval = 60L; 28 | 29 | public CustomerTrigger(long interval) { 30 | this.interval = interval * 1000; 31 | } 32 | 33 | // |当有记录进入相应窗口时触发器将调用此方法| 34 | @Override 35 | public TriggerResult onElement(Object element, long 
timestamp, W window, TriggerContext ctx) 36 | throws Exception { 37 | // |从自定义状态中获取处理时间定时器所对应的触发时间| 38 | ReducingState fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor); 39 | // |窗口中进入第一条记录,处理时间定时器所对应的触发时间状态不存在| 40 | if (fireTimestamp.get() == null) { 41 | Long timeStamp = ctx.getCurrentProcessingTime(); 42 | // |计算窗口的下一次触发时间| 43 | Long start = timeStamp - (timeStamp % interval); 44 | Long nextFireTimestamp = start + interval; 45 | // |注册处理时间定时器| 46 | ctx.registerProcessingTimeTimer(nextFireTimestamp); 47 | // |将处理时间定时器所对应触发时间存入自定义状态| 48 | fireTimestamp.add(nextFireTimestamp); 49 | } 50 | // |根据记录所在窗口的最大时间戳,注册事件时间定时器| 51 | ctx.registerEventTimeTimer(window.maxTimestamp()); 52 | // |对窗口不采取任何操作| 53 | return TriggerResult.CONTINUE; 54 | } 55 | 56 | // |当注册的处理时间定时器到达指定时间时调用此方法| 57 | @Override 58 | public TriggerResult onProcessingTime(long time, W window, TriggerContext ctx) throws Exception { 59 | ReducingState fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor); 60 | Long timestamp = fireTimestamp.get(); 61 | // |更新自定义状态中的触发时间| 62 | fireTimestamp.add(timestamp + interval); 63 | // |根据窗口下一次触发的处理时间,注册处理时间定时器| 64 | ctx.registerProcessingTimeTimer(timestamp + interval); 65 | System.out.println("第 " + sdf.format(time) + " 分钟触发 ..."); 66 | // |触发窗口操作时,调用窗口函数进行计算并保留窗口状态| 67 | return TriggerResult.FIRE; 68 | } 69 | 70 | // |当注册的事件时间定时器到达指定时间时调用此方法| 71 | @Override 72 | public TriggerResult onEventTime(long time, W window, TriggerContext triggerContext) 73 | throws Exception { 74 | if (time == window.maxTimestamp()) { 75 | System.out.println("水位线触发 ..."); 76 | // |触发窗口操作时,调用窗口函数进行计算并清除窗口状态| 77 | return TriggerResult.FIRE_AND_PURGE; 78 | } else { 79 | return TriggerResult.CONTINUE; 80 | } 81 | } 82 | 83 | // |清除窗口状态| 84 | @Override 85 | public void clear(W window, TriggerContext ctx) throws Exception { 86 | ReducingState fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor); 87 | ctx.deleteProcessingTimeTimer(fireTimestamp.get()); // |清除处理时间定时器| 88 | fireTimestamp.clear(); // |清除自定义状态中的触发时间| 89 | ctx.deleteEventTimeTimer(window.maxTimestamp()); // |清除事件时间定时器| 90 | } 91 | 92 | // |更新状态时,使用新值替代旧值| 93 | private static class Update implements ReduceFunction { 94 | private static final long serialVersionUID = 1L; 95 | 96 | @Override 97 | public Long reduce(Long value1, Long value2) throws Exception { 98 | return value2; 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /Flink/src/main/java/cn/edu/ecnu/flink/examples/java/integersum/trigger/CustomerTriggerWithAccumulation.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.java.integersum.trigger; 2 | 3 | import org.apache.flink.api.common.functions.ReduceFunction; 4 | import org.apache.flink.api.common.state.ReducingState; 5 | import org.apache.flink.api.common.state.ReducingStateDescriptor; 6 | import org.apache.flink.api.common.typeutils.base.LongSerializer; 7 | import org.apache.flink.streaming.api.windowing.triggers.Trigger; 8 | import org.apache.flink.streaming.api.windowing.triggers.TriggerResult; 9 | import org.apache.flink.streaming.api.windowing.windows.Window; 10 | 11 | import java.text.SimpleDateFormat; 12 | 13 | 14 | public class CustomerTriggerWithAccumulation extends Trigger { 15 | final SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss"); 16 | 17 | // |以自定义状态形式保存每个窗口中处理时间定时器所对应的触发时间| 18 | private final ReducingStateDescriptor processTimerStateDescriptor = 
19 | new ReducingStateDescriptor( 20 | "processTimer", new CustomerTriggerWithAccumulation.Update(), LongSerializer.INSTANCE); 21 | 22 | // |基于处理时间触发间隔| 23 | Long interval = 60L; 24 | 25 | public CustomerTriggerWithAccumulation(long interval) { 26 | this.interval = interval * 1000; 27 | } 28 | 29 | // |当有记录进入相应窗口时触发器将调用此方法| 30 | @Override 31 | public TriggerResult onElement(Object element, long timestamp, W window, TriggerContext ctx) 32 | throws Exception { 33 | // |从自定义状态中获取处理时间定时器所对应的触发时间| 34 | ReducingState fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor); 35 | // |窗口中进入第一条记录,处理时间定时器所对应的触发时间状态不存在| 36 | if (fireTimestamp.get() == null) { 37 | Long timeStamp = ctx.getCurrentProcessingTime(); 38 | // |计算窗口的下一次触发时间| 39 | Long start = timeStamp - (timeStamp % interval); 40 | Long nextFireTimestamp = start + interval; 41 | // |注册处理时间定时器| 42 | ctx.registerProcessingTimeTimer(nextFireTimestamp); 43 | // |将处理时间定时器所对应触发时间存入自定义状态| 44 | fireTimestamp.add(nextFireTimestamp); 45 | } 46 | // |迟到记录处理| 47 | if (window.maxTimestamp() <= ctx.getCurrentWatermark()) { 48 | System.out.println("迟到记录触发 ..."); 49 | // |触发窗口操作时,调用窗口函数进行计算并保留窗口状态| 50 | return TriggerResult.FIRE; 51 | } else { 52 | // |根据记录所在窗口的最大时间戳,注册事件时间定时器| 53 | ctx.registerEventTimeTimer(window.maxTimestamp()); 54 | // |对窗口不采取任何操作| 55 | return TriggerResult.CONTINUE; 56 | } 57 | } 58 | 59 | // |当注册的处理时间定时器到达指定时间时调用此方法| 60 | @Override 61 | public TriggerResult onProcessingTime(long time, W window, TriggerContext ctx) throws Exception { 62 | ReducingState fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor); 63 | Long timestamp = fireTimestamp.get(); 64 | // |更新自定义状态中的触发时间| 65 | fireTimestamp.add(timestamp + interval); 66 | // |根据窗口下一次触发的处理时间,注册处理时间定时器| 67 | ctx.registerProcessingTimeTimer(timestamp + interval); 68 | System.out.println("第 " + sdf.format(time) + " 分钟触发 ..."); 69 | // |触发窗口操作时,调用窗口函数进行计算并保留窗口状态| 70 | return TriggerResult.FIRE; 71 | } 72 | 73 | // |当注册的事件时间定时器到达指定时间时调用此方法| 74 | @Override 75 | public TriggerResult onEventTime(long time, W window, TriggerContext triggerContext) 76 | throws Exception { 77 | if (time == window.maxTimestamp()) { 78 | System.out.println("水位线触发 ..."); 79 | // |触发窗口操作时,调用窗口函数进行计算并清除窗口状态| 80 | return TriggerResult.FIRE; 81 | } else { 82 | return TriggerResult.CONTINUE; 83 | } 84 | } 85 | 86 | // |清除窗口状态| 87 | @Override 88 | public void clear(W window, TriggerContext ctx) throws Exception { 89 | ReducingState fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor); 90 | ctx.deleteProcessingTimeTimer(fireTimestamp.get()); // |清除处理时间定时器| 91 | fireTimestamp.clear(); // |清除自定义状态中的触发时间| 92 | ctx.deleteEventTimeTimer(window.maxTimestamp()); // |清除事件时间定时器| 93 | } 94 | 95 | // |更新状态时,使用新值替代旧值| 96 | private static class Update implements ReduceFunction { 97 | private static final long serialVersionUID = 1L; 98 | 99 | @Override 100 | public Long reduce(Long value1, Long value2) throws Exception { 101 | return value2; 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /Flink/src/main/java/cn/edu/ecnu/flink/examples/java/wordcount/WordCount.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.java.wordcount; 2 | 3 | import org.apache.flink.api.common.functions.FlatMapFunction; 4 | import org.apache.flink.api.common.functions.MapFunction; 5 | import org.apache.flink.api.java.tuple.Tuple2; 6 | import 
org.apache.flink.streaming.api.datastream.DataStream; 7 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 8 | import org.apache.flink.util.Collector; 9 | 10 | import javax.xml.crypto.Data; 11 | 12 | public class WordCount { 13 | public static void main(String[] args) throws Exception { 14 | run(args); 15 | } 16 | 17 | public static void run(String[] args) throws Exception { 18 | /* |步骤1:创建StreamExecutionEnvironment对象 |*/ 19 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 20 | 21 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换、数据池等| */ 22 | // |从指定的主机名和端口号接收数据,创建名为lines的DataStream| 23 | DataStream lines = env.socketTextStream("localhost", 9099, "\n"); 24 | // |将lines中的每一个文本行按空格分割成单个单词| 25 | DataStream words = 26 | lines.flatMap( 27 | new FlatMapFunction() { 28 | @Override 29 | public void flatMap(String value, Collector out) throws Exception { 30 | for (String word : value.split(" ")) { 31 | out.collect(word); 32 | } 33 | } 34 | }); 35 | // |将每个单词的频数设置为1,即将每个单词映射为[单词, 1]| 36 | DataStream> pairs = 37 | words.map( 38 | new MapFunction>() { 39 | @Override 40 | public Tuple2 map(String value) throws Exception { 41 | return new Tuple2<>(value, 1); 42 | } 43 | }); 44 | // |按单词聚合,并对相同单词的频数使用sum进行累计| 45 | DataStream> counts = pairs.keyBy(0).sum(1); 46 | // |输出词频统计结果| 47 | counts.print(); 48 | 49 | /* |步骤3:触发程序执行| */ 50 | env.execute("Streaming WordCount"); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /Flink/src/main/java/cn/edu/ecnu/flink/examples/java/wordcountwithfaulttolerance/WordCountWithFaultTolerance.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.java.wordcountwithfaulttolerance; 2 | 3 | import org.apache.flink.api.common.functions.FlatMapFunction; 4 | import org.apache.flink.api.common.functions.MapFunction; 5 | import org.apache.flink.api.java.tuple.Tuple2; 6 | import org.apache.flink.runtime.state.filesystem.FsStateBackend; 7 | import org.apache.flink.streaming.api.datastream.DataStream; 8 | import org.apache.flink.streaming.api.environment.CheckpointConfig; 9 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 10 | import org.apache.flink.util.Collector; 11 | 12 | public class WordCountWithFaultTolerance { 13 | public static void main(String[] args) throws Exception { 14 | run(args); 15 | } 16 | 17 | public static void run(String[] args) throws Exception { 18 | /* |步骤1:创建StreamExecutionEnvironment对象 |*/ 19 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 20 | // |设置checkpoint的周期,每隔1000ms尝试启动一个检查点| 21 | env.enableCheckpointing(1000); 22 | // |设置检查点的最大并发数| 23 | env.getCheckpointConfig().setMaxConcurrentCheckpoints(Integer.MAX_VALUE); 24 | // |设置statebackend,使用FsStateBackend将状态存储至hdfs| 25 | env.setStateBackend(new FsStateBackend("hdfs://hadoop:9000/flink/checkpoints")); 26 | // |处理程序被cancel后,会保留checkpoint数据| 27 | env.getCheckpointConfig() 28 | .enableExternalizedCheckpoints( 29 | CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION); 30 | 31 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换、数据池等| */ 32 | DataStream lines = env.socketTextStream("localhost", 9099, "\n"); 33 | DataStream words = 34 | lines.flatMap( 35 | new FlatMapFunction() { 36 | @Override 37 | public void flatMap(String value, Collector out) throws Exception { 38 | for (String word : value.split(" ")) { 39 | out.collect(word); 40 | } 41 | } 42 | }); 43 | 
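// The per-word running counts maintained by keyBy/sum below are stored as keyed state; each
// checkpoint triggered above (every 1000 ms) snapshots that state to the configured
// FsStateBackend, which is what makes the counts recoverable after a failure.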
DataStream> pairs = 44 | words.map( 45 | new MapFunction>() { 46 | @Override 47 | public Tuple2 map(String value) throws Exception { 48 | return new Tuple2<>(value, 1); 49 | } 50 | }); 51 | DataStream> counts = pairs.keyBy(0).sum(1); 52 | counts.print(); 53 | 54 | /* |步骤3:触发程序执行| */ 55 | env.execute("WordCount With Fault Tolerance"); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /Flink/src/main/resources/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | 3 | -------------------------------------------------------------------------------- /Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/fibonacciexample/FibonacciExample.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.scala.fibonacciexample 2 | 3 | import java.util.Random 4 | 5 | import org.apache.flink.api.common.functions.Partitioner 6 | import org.apache.flink.api.scala._ 7 | import org.apache.flink.streaming.api.functions.source.SourceFunction 8 | import org.apache.flink.streaming.api.functions.source.SourceFunction.SourceContext 9 | import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment} 10 | 11 | 12 | object FibonacciExample { 13 | def run(args: Array[String]): Unit = { 14 | /* |步骤1: 创建StreamExecutionEnvironment对象| */ 15 | val env = StreamExecutionEnvironment.getExecutionEnvironment 16 | env.setParallelism(1) 17 | 18 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换和数据池等| */ 19 | // |接收来自Socket数据,创建名为inputStream的DataStream| 20 | val inputStream = env.socketTextStream("localhost", 9099) 21 | // |解析inputStream中的数据,创建名为first的DataStream| 22 | val first: DataStream[(Char, Long, Long)] = inputStream.map(lines => (lines.split(" ")(0).charAt(0), 23 | lines.split(" ")(1).toLong, lines.split(" ")(2).toLong)) 24 | val outputStream = first 25 | // |创建迭代算子| 26 | .iterate( 27 | (iteration: DataStream[(Char, Long, Long)]) => { 28 | // |实现迭代步逻辑,计算下一个斐波那契数| 29 | val step = iteration.flatMap(t => { 30 | // |例如迭代算子的输入输入为(A, 1, 2),此处转换将(A, 1, 2)转换为(A, 2, 3)| 31 | val feedbackValue = (t._1, t._3, t._2 + t._3) 32 | // |例如迭代算子的输入输入为(A, 1, 2),此处转换将(A, 1, 2)转换为(A, 1, Min)| 33 | val outputValue = (t._1, t._2, Long.MinValue) 34 | val list = feedbackValue :: outputValue :: Nil 35 | list.toIterator 36 | }).setParallelism(2) 37 | // |创建反馈流| 38 | // |选择第三位置不为Min的元组,例如(A, 2, 3)| 39 | val feedback = step.filter(_._3 != Long.MinValue) 40 | // |创建输出流| 41 | // |选择第三位置为Min的元组,例如(A, 1, 0),并将其转换为(A, 1)| 42 | val output = step.filter(_._3 == Long.MinValue).map(t => (t._1, t._2)) 43 | (feedback, output) 44 | } 45 | // |设置等待反馈输入的最大时间间隔为5s| 46 | , 5000L 47 | ) 48 | // |输出流式迭代计算结果| 49 | outputStream.print() 50 | 51 | /* |步骤3:触发程序执行| */ 52 | env.execute("Streaming Iteration") 53 | } 54 | 55 | def main(args: Array[String]): Unit = { 56 | run(args) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/integersum/DataflowModel_How.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.scala.integersum 2 | 3 | import java.text.SimpleDateFormat 4 | 5 | import cn.edu.ecnu.flink.examples.scala.integersum.producer.Producer 6 | import cn.edu.ecnu.flink.examples.scala.integersum.trigger.CustomerTriggerWithAccumulation 7 | import org.apache.flink.api.java.tuple.Tuple 8 | import 
org.apache.flink.streaming.api.TimeCharacteristic 9 | import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _} 10 | import org.apache.flink.streaming.api.scala.function.WindowFunction 11 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows 12 | import org.apache.flink.streaming.api.windowing.time.Time 13 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 14 | import org.apache.flink.util.Collector 15 | 16 | object DataflowModel_How { 17 | def run(args: Array[String]): Unit = { 18 | /* |步骤1: 创建StreamExecutionEnvironment对象| */ 19 | val env = StreamExecutionEnvironment.getExecutionEnvironment 20 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 21 | env.setParallelism(1) 22 | 23 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换和数据池等| */ 24 | val source = env.addSource(new Producer(false)) 25 | val sink = source.keyBy(0) 26 | .window(TumblingEventTimeWindows.of(Time.seconds(120L))) 27 | // |自定义触发器:在水位线机制的基础上,在处理时间域上每隔一分钟输出一次结果。同时,迟到数据修正窗口结果| 28 | .trigger(new CustomerTriggerWithAccumulation(60L)) 29 | // |设置允许延迟时间为300s| 30 | .allowedLateness(Time.seconds(300L)) 31 | .apply(new myWindowFunction) 32 | sink.print() 33 | 34 | /* |步骤3:触发程序执行| */ 35 | env.execute("Dataflow Model-How") 36 | } 37 | 38 | val sdf = new SimpleDateFormat("HH:mm:ss") 39 | 40 | class myWindowFunction extends WindowFunction[Tuple2[String, Integer], String, Tuple, TimeWindow] { 41 | override def apply(key: Tuple, window: TimeWindow, input: Iterable[(String, Integer)], out: Collector[String]): Unit = { 42 | // |记录整数的累加和| 43 | var sum = 0 44 | // |获取窗口中键值对的迭代器| 45 | val it = input.iterator 46 | // |遍历窗口中的键值对,并对整数进行求和| 47 | while (it.hasNext) { 48 | val next = it.next() 49 | sum = sum + next._2 50 | } 51 | // |以字符串形式返回形如”the sum of window [12:00:00,12:02:00) is 14”的窗口函数结果| 52 | val res = "the sum of window [" + sdf.format(window.getStart) + "," + sdf.format(window.getEnd) + ") is " + sum 53 | out.collect(res) 54 | } 55 | } 56 | 57 | def main(args: Array[String]): Unit = { 58 | run(args) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/integersum/DataflowModel_What.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.scala.integersum 2 | 3 | import cn.edu.ecnu.flink.examples.scala.integersum.producer.Producer 4 | import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _} 5 | 6 | object DataflowModel_What { 7 | def run(args: Array[String]): Unit = { 8 | /* |步骤1: 创建StreamExecutionEnvironment对象| */ 9 | val env = StreamExecutionEnvironment.getExecutionEnvironment 10 | 11 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换和数据池等| */ 12 | // |接收来自CustomSource的记录,抛弃代表watermark的记录,创建名为source的DataStream| 13 | val source = env.addSource(new Producer(true)) 14 | // |对键值对按键聚合,并使用sum对整数进行累加,创建名为sink的DataStream| 15 | val sink = source.keyBy(0).sum(1) 16 | // |输出整数求和结果| 17 | sink.print() 18 | 19 | /* |步骤3:触发程序执行| */ 20 | env.execute("Dataflow Model-What") 21 | } 22 | 23 | def main(args: Array[String]): Unit = { 24 | run(args) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/integersum/DataflowModel_When.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.scala.integersum 2 | 3 | import java.text.SimpleDateFormat 4 
| 5 | import cn.edu.ecnu.flink.examples.scala.integersum.producer.Producer 6 | import cn.edu.ecnu.flink.examples.scala.integersum.trigger.CustomerTrigger 7 | import org.apache.flink.api.java.tuple.Tuple 8 | import org.apache.flink.streaming.api.TimeCharacteristic 9 | import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _} 10 | import org.apache.flink.streaming.api.scala.function.WindowFunction 11 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows 12 | import org.apache.flink.streaming.api.windowing.time.Time 13 | import org.apache.flink.streaming.api.windowing.triggers.EventTimeTrigger 14 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 15 | import org.apache.flink.util.Collector 16 | 17 | object DataflowModel_When { 18 | def run(args: Array[String]): Unit = { 19 | /* |步骤1: 创建StreamExecutionEnvironment对象| */ 20 | val env = StreamExecutionEnvironment.getExecutionEnvironment 21 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 22 | env.setParallelism(1) 23 | 24 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换和数据池等| */ 25 | // |通过由CustomSource产生的无界数据集创建名为source的DataStream| 26 | val source = env.addSource(new Producer(false)) 27 | val sink = source.keyBy(0) 28 | .window(TumblingEventTimeWindows.of(Time.seconds(120L))) 29 | // |定义水位线到达窗口最大时间戳的时候输出结果| 30 | .trigger(EventTimeTrigger.create()) 31 | // .trigger(new CustomerTrigger(60L)) 32 | .apply(new myWindowFunction) 33 | sink.print() 34 | 35 | /* |步骤3:触发程序执行| */ 36 | env.execute("Dataflow Model-When") 37 | } 38 | 39 | val sdf = new SimpleDateFormat("HH:mm:ss") 40 | 41 | class myWindowFunction extends WindowFunction[Tuple2[String, Integer], String, Tuple, TimeWindow] { 42 | override def apply(key: Tuple, window: TimeWindow, input: Iterable[(String, Integer)], out: Collector[String]): Unit = { 43 | // |记录整数的累加和| 44 | var sum = 0 45 | // |获取窗口中键值对的迭代器| 46 | val it = input.iterator 47 | // |遍历窗口中的键值对,并对整数进行求和| 48 | while (it.hasNext) { 49 | val next = it.next() 50 | sum = sum + next._2 51 | } 52 | // |以字符串形式返回形如”the sum of window [12:00:00,12:02:00) is 14”的窗口函数结果| 53 | val res = "the sum of window [" + sdf.format(window.getStart) + "," + sdf.format(window.getEnd) + ") is " + sum 54 | out.collect(res) 55 | } 56 | } 57 | 58 | def main(args: Array[String]): Unit = { 59 | run(args) 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/integersum/DataflowModel_Where.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.scala.integersum 2 | 3 | import java.text.SimpleDateFormat 4 | 5 | import cn.edu.ecnu.flink.examples.scala.integersum.producer.Producer 6 | import org.apache.flink.api.java.tuple.Tuple 7 | import org.apache.flink.streaming.api.TimeCharacteristic 8 | import org.apache.flink.streaming.api.scala.{StreamExecutionEnvironment, _} 9 | import org.apache.flink.streaming.api.scala.function.WindowFunction 10 | import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows 11 | import org.apache.flink.streaming.api.windowing.time.Time 12 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 13 | import org.apache.flink.util.Collector 14 | 15 | object DataflowModel_Where { 16 | def run(args: Array[String]): Unit = { 17 | /* |步骤1: 创建StreamExecutionEnvironment对象| */ 18 | val env = StreamExecutionEnvironment.getExecutionEnvironment 19 | // |设置时间特征为事件时间| 20 | 
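// In Flink 1.7 the default time characteristic is processing time, so event time has to be
// enabled explicitly here; the timestamps used for window assignment are the ones attached
// in Producer via ctx.collectWithTimestamp.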
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 21 | env.setParallelism(1) 22 | 23 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换和数据池等| */ 24 | val source = env.addSource(new Producer(true)) 25 | // |将source中的记录按照事件时间以2分钟为单位进行窗口划分| 26 | val sink = source.keyBy(0) 27 | .window(TumblingEventTimeWindows.of(Time.seconds(120L))) 28 | // |使用myWindowFunction作为窗口函数对整数进行累加求和| 29 | .apply(new myWindowFunction) 30 | sink.print() 31 | 32 | /* |步骤3:触发程序执行| */ 33 | env.execute("Dataflow Model-Where") 34 | } 35 | 36 | // |设置时间输出格式为HH:mm:ss| 37 | val sdf = new SimpleDateFormat("HH:mm:ss") 38 | 39 | class myWindowFunction extends WindowFunction[Tuple2[String, Integer], String, Tuple, TimeWindow] { 40 | override def apply(key: Tuple, window: TimeWindow, input: Iterable[(String, Integer)], out: Collector[String]): Unit = { 41 | // |记录整数的累加和| 42 | var sum = 0 43 | // |获取窗口中键值对的迭代器| 44 | val it = input.iterator 45 | // |遍历窗口中的键值对,并对整数进行求和| 46 | while (it.hasNext) { 47 | val next = it.next() 48 | sum = sum + next._2 49 | } 50 | // |以字符串形式返回形如”the sum of window [12:00:00,12:02:00) is 14”的窗口函数结果| 51 | val res = "the sum of window [" + sdf.format(window.getStart) + "," + sdf.format(window.getEnd) + ") is " + sum 52 | out.collect(res) 53 | } 54 | } 55 | 56 | def main(args: Array[String]): Unit = { 57 | run(args) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/integersum/producer/Producer.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.scala.integersum.producer 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util 5 | 6 | import org.apache.flink.streaming.api.functions.source.SourceFunction 7 | import org.apache.flink.streaming.api.watermark.Watermark 8 | 9 | class Producer(var isBounded: Boolean) extends SourceFunction[Tuple2[String, Integer]] { 10 | final val data = util.Arrays.asList( 11 | new Tuple3[String, Integer, String]("dataflow", 5, "12:00:30"), 12 | new Tuple3[String, Integer, String]("dataflow", 7, "12:02:30"), 13 | new Tuple3[String, Integer, String]("dataflow", 3, "12:03:45"), 14 | // |水位线| 15 | new Tuple3[String, Integer, String]("dataflow", null, "12:02:00"), 16 | new Tuple3[String, Integer, String]("dataflow", 4, "12:03:50"), 17 | new Tuple3[String, Integer, String]("dataflow", 3, "12:04:30"), 18 | new Tuple3[String, Integer, String]("dataflow", 8, "12:03:30"), 19 | // |水位线| 20 | new Tuple3[String, Integer, String]("dataflow", null, "12:04:00"), 21 | // |水位线| 22 | new Tuple3[String, Integer, String]("dataflow", null, "12:06:00"), 23 | new Tuple3[String, Integer, String]("dataflow", 3, "12:06:30"), 24 | new Tuple3[String, Integer, String]("dataflow", 9, "12:01:30"), 25 | new Tuple3[String, Integer, String]("dataflow", 8, "12:07:30"), 26 | new Tuple3[String, Integer, String]("dataflow", 1, "12:07:50"), 27 | // |水位线| 28 | new Tuple3[String, Integer, String]("dataflow", null, "12:08:00")) 29 | 30 | // |每条记录的处理时间间隔| 31 | final val processInterval = util.Arrays.asList(40, 15, 25, 10, 5, 15, 30, 10, 10, 30, 20, 40, 20, 20) 32 | 33 | final val sdf = new SimpleDateFormat("HH:mm:ss") 34 | 35 | override def run(ctx: SourceFunction.SourceContext[(String, Integer)]): Unit = { 36 | waitForMinute() 37 | for (i <- 0 to 13) { 38 | // |记录发送延迟时间| 39 | Thread.sleep(processInterval.get(i) * 1000) 40 | val timestamp = sdf.parse(data.get(i)._3).getTime 41 | val value = data.get(i)._2 42 | // |若为水位线记录且输入数据作为无界数据集,则生成系统的水位线| 
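// Emitting Watermark(t) tells downstream operators that no records with event time <= t are
// expected any more, so event-time windows ending at or before t may fire. When watermarks
// are emitted (isBounded = false), the element ("dataflow", 9, "12:01:30") arrives after the
// 12:02:00 watermark and is therefore late for the [12:00:00, 12:02:00) window; handling such
// late records is what the allowedLateness variant in DataflowModel_How demonstrates.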
43 | if (value == null) { 44 | if (!isBounded) { 45 | ctx.emitWatermark(new Watermark(sdf.parse(data.get(i)._3).getTime)) 46 | } 47 | } 48 | else { 49 | // |设置键值对的事件时间并发送至下游| 50 | ctx.collectWithTimestamp(new Tuple2[String, Integer](data.get(i)._1, value), timestamp) 51 | } 52 | } 53 | } 54 | 55 | override def cancel(): Unit = {} 56 | 57 | def waitForMinute(): Unit = { 58 | val interval = 60 * 1000 59 | val timestamp = System.currentTimeMillis() 60 | Thread.sleep(interval - (timestamp % interval)) 61 | } 62 | } -------------------------------------------------------------------------------- /Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/integersum/trigger/CustomerTrigger.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.scala.integersum.trigger 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util 5 | import java.util.concurrent.TimeUnit 6 | 7 | import org.apache.flink.api.common.functions.ReduceFunction 8 | import org.apache.flink.api.common.state.ReducingStateDescriptor 9 | import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult} 10 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 11 | 12 | class CustomerTrigger extends Trigger[Any, TimeWindow] { 13 | val sdf = new SimpleDateFormat("HH:mm:ss") 14 | 15 | // |以自定义状态形式保存每个窗口中处理时间定时器所对应的触发时间| 16 | private lazy val processTimerStateDescriptor: ReducingStateDescriptor[Long] = new ReducingStateDescriptor[Long]("processTimer", new Update, classOf[Long]) 17 | 18 | // |基于处理时间触发间隔| 19 | var interval = 60L 20 | 21 | // |构造函数| 22 | def this(interval: Long) { 23 | this() 24 | this.interval = interval * 1000 25 | } 26 | 27 | // |当有记录进入相应窗口时触发器将调用此方法| 28 | override def onElement(element: Any, timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = { 29 | // |从自定义状态中获取处理时间定时器所对应的触发时间| 30 | val fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor) 31 | // |窗口中进入第一条记录,处理时间定时器所对应的触发时间状态不存在| 32 | if (fireTimestamp.get == null) { 33 | val timestamp = ctx.getCurrentProcessingTime 34 | // |计算窗口的下一次触发时间| 35 | val start = timestamp - (timestamp % interval) 36 | val nextFireTimestamp = start + interval 37 | // |注册处理时间定时器| 38 | ctx.registerProcessingTimeTimer(nextFireTimestamp) 39 | // |将处理时间定时器所对应触发时间存入自定义状态| 40 | fireTimestamp.add(nextFireTimestamp) 41 | } 42 | // |根据记录所在窗口的最大时间戳,注册事件时间定时器| 43 | ctx.registerEventTimeTimer(window.maxTimestamp) 44 | // |对窗口不采取任何操作| 45 | TriggerResult.CONTINUE 46 | } 47 | 48 | // |当注册的处理时间定时器到达指定时间时调用此方法| 49 | override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = { 50 | val fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor) 51 | val timestamp = fireTimestamp.get 52 | // |更新自定义状态中的触发时间| 53 | fireTimestamp.add(timestamp + interval) 54 | // |根据窗口下一次触发的处理时间,注册处理时间定时器| 55 | ctx.registerProcessingTimeTimer(timestamp + interval) 56 | System.out.println("第 " + sdf.format(time) + " 分钟触发 ...") 57 | // |触发窗口操作时,调用窗口函数进行计算并保留窗口状态| 58 | TriggerResult.FIRE 59 | } 60 | 61 | // |当注册的事件时间定时器到达指定时间时调用此方法| 62 | override def onEventTime(time: Long, window: TimeWindow, triggerContext: Trigger.TriggerContext): TriggerResult = { 63 | if (time == window.maxTimestamp) { 64 | System.out.println("水位线触发 ...") 65 | // |触发窗口操作时,调用窗口函数进行计算并清除窗口状态| 66 | TriggerResult.FIRE_AND_PURGE 67 | } else { 68 | TriggerResult.CONTINUE 69 | } 70 | } 71 | 72 | // |清除窗口状态| 73 | override def clear(window: TimeWindow, ctx: 
Trigger.TriggerContext) = { 74 | val fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor) 75 | ctx.deleteProcessingTimeTimer(fireTimestamp.get) // |清除处理时间定时器| 76 | fireTimestamp.clear() // |清除自定义状态中的触发时间| 77 | ctx.deleteEventTimeTimer(window.maxTimestamp) // |清除事件时间定时器| 78 | } 79 | 80 | // |更新状态时,使用新值替代旧值| 81 | class Update extends ReduceFunction[Long] { 82 | override def reduce(value1: Long, value2: Long): Long = value2 83 | } 84 | 85 | } -------------------------------------------------------------------------------- /Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/integersum/trigger/CustomerTriggerWithAccumulation.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.scala.integersum.trigger 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util 5 | 6 | import org.apache.flink.api.common.functions.ReduceFunction 7 | import org.apache.flink.api.common.state.ReducingStateDescriptor 8 | import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult} 9 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 10 | 11 | class CustomerTriggerWithAccumulation extends Trigger[Any, TimeWindow] { 12 | val sdf = new SimpleDateFormat("HH:mm:ss") 13 | 14 | // |以自定义状态形式保存每个窗口中处理时间定时器所对应的触发时间| 15 | private lazy val processTimerStateDescriptor: ReducingStateDescriptor[Long] = new ReducingStateDescriptor[Long]("processTimer", new Update, classOf[Long]) 16 | 17 | // |基于处理时间触发间隔| 18 | var interval = 60L 19 | 20 | // |构造函数| 21 | def this(interval: Long) { 22 | this() 23 | this.interval = interval * 1000 24 | } 25 | 26 | // |当有记录进入相应窗口时触发器将调用此方法| 27 | override def onElement(element: Any, timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = { 28 | // |从自定义状态中获取处理时间定时器所对应的触发时间| 29 | val fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor) 30 | // |窗口中进入第一条记录,处理时间定时器所对应的触发时间状态不存在| 31 | if (fireTimestamp.get == null) { 32 | val timestamp = ctx.getCurrentProcessingTime 33 | // |计算窗口的下一次触发时间| 34 | val start = timestamp - (timestamp % interval) 35 | val nextFireTimestamp = start + interval 36 | // |注册处理时间定时器| 37 | ctx.registerProcessingTimeTimer(nextFireTimestamp) 38 | // |将处理时间定时器所对应触发时间存入自定义状态| 39 | fireTimestamp.add(nextFireTimestamp) 40 | } 41 | // |迟到记录处理| 42 | if (window.maxTimestamp <= ctx.getCurrentWatermark) { 43 | System.out.println("迟到记录触发 ...") 44 | // |触发窗口操作时,调用窗口函数进行计算并保留窗口状态| 45 | TriggerResult.FIRE 46 | } else { 47 | // |根据记录所在窗口的最大时间戳,注册事件时间定时器| 48 | ctx.registerEventTimeTimer(window.maxTimestamp) 49 | // |对窗口不采取任何操作| 50 | TriggerResult.CONTINUE 51 | } 52 | } 53 | 54 | // |当注册的处理时间定时器到达指定时间时调用此方法| 55 | override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = { 56 | val fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor) 57 | val timestamp = fireTimestamp.get 58 | // |更新自定义状态中的触发时间| 59 | fireTimestamp.add(timestamp + interval) 60 | // |根据窗口下一次触发的处理时间,注册处理时间定时器| 61 | ctx.registerProcessingTimeTimer(timestamp + interval) 62 | System.out.println("第 " + sdf.format(time) + " 分钟触发 ...") 63 | // |触发窗口操作时,调用窗口函数进行计算并保留窗口状态| 64 | TriggerResult.FIRE 65 | } 66 | 67 | // |当注册的事件时间定时器到达指定时间时调用此方法| 68 | override def onEventTime(time: Long, window: TimeWindow, triggerContext: Trigger.TriggerContext): TriggerResult = { 69 | if (time == window.maxTimestamp) { 70 | System.out.println("水位线触发 ...") 71 | // |触发窗口操作时,调用窗口函数进行计算并保留窗口状态| 72 | TriggerResult.FIRE 73 | } else { 74 | 
TriggerResult.CONTINUE 75 | } 76 | } 77 | 78 | // |清除窗口状态| 79 | override def clear(window: TimeWindow, ctx: Trigger.TriggerContext) = { 80 | val fireTimestamp = ctx.getPartitionedState(processTimerStateDescriptor) 81 | ctx.deleteProcessingTimeTimer(fireTimestamp.get) // |清除处理时间定时器| 82 | fireTimestamp.clear() // |清除自定义状态中的触发时间| 83 | ctx.deleteEventTimeTimer(window.maxTimestamp) // |清除事件时间定时器| 84 | } 85 | 86 | // |更新状态时,使用新值替代旧值| 87 | class Update extends ReduceFunction[Long] { 88 | override def reduce(value1: Long, value2: Long): Long = value2 89 | } 90 | 91 | } -------------------------------------------------------------------------------- /Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/wordcount/WordCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.scala.wordcount 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.streaming.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.assigners.{GlobalWindows, TumblingEventTimeWindows, TumblingProcessingTimeWindows} 6 | import org.apache.flink.streaming.api.windowing.time.Time 7 | 8 | object WordCount { 9 | def run(args: Array[String]): Unit = { 10 | /* |步骤1:创建StreamExecutionEnvironment对象 |*/ 11 | val env = StreamExecutionEnvironment.getExecutionEnvironment 12 | 13 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换、数据池等| */ 14 | // |从指定的主机名和端口号接收数据,创建名为lines的DataStream| 15 | val lines = env.socketTextStream("localhost", 9099) 16 | // |将lines中的每一个文本行按空格分割成单个单词| 17 | val words = lines.flatMap(w => w.split(" ")) 18 | // |将每个单词的频数设置为1,即将每个单词映射为[单词, 1]| 19 | val pairs = words.map(word => (word, 1)) 20 | // |按单词聚合,并对相同单词的频数使用sum进行累计| 21 | val counts = pairs.keyBy(0) 22 | .sum(1) 23 | // |输出词频统计结果| 24 | counts.print() 25 | 26 | /* |步骤3:触发程序执行| */ 27 | env.execute("Streaming WordCount") 28 | } 29 | 30 | def main(args: Array[String]): Unit = { 31 | run(args) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/wordcountwithfaulttolerance/WordCountWithFaultTolerance.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.flink.examples.scala.wordcountwithfaulttolerance 2 | 3 | import org.apache.flink.streaming.api.scala._ 4 | import org.apache.flink.runtime.state.filesystem.FsStateBackend 5 | import org.apache.flink.streaming.api.CheckpointingMode 6 | import org.apache.flink.streaming.api.environment.CheckpointConfig 7 | import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment} 8 | 9 | object WordCountWithFaultTolerance { 10 | def run(args: Array[String]): Unit = { 11 | /* |步骤1: 创建StreamExecutionEnvironment对象| */ 12 | val env = StreamExecutionEnvironment.getExecutionEnvironment 13 | // |设置checkpoint的周期,每隔1000ms试图启动一个检查点| 14 | env.enableCheckpointing(1000) 15 | // |设置检查点的最大并发数| 16 | env.getCheckpointConfig.setMaxConcurrentCheckpoints(Integer.MAX_VALUE) 17 | // |设置statebackend,使用FsStateBackend将状态存储至hdfs| 18 | env.setStateBackend(new FsStateBackend("hdfs://hadoop:9000/flink/checkpoints")) 19 | // |处理程序被cancel后,会保留checkpoint数据| 20 | env.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig 21 | .ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) 22 | 23 | /* |步骤2:按应用逻辑使用操作算子编写DAG,操作算子包括数据源、转换和数据池等| */ 24 | val lines: DataStream[String] = env.socketTextStream("localhost", 9099) 25 | val words = lines.flatMap(w => 
w.split(" ")) 26 | val pairs: DataStream[(String, Int)] = words.map(word => (word, 1)) 27 | val counts: DataStream[(String, Int)] = pairs 28 | .keyBy(0) 29 | .sum(1) 30 | counts.print() 31 | 32 | /* |步骤3:触发程序执行| */ 33 | env.execute("WordCount With Fault Tolerance") 34 | } 35 | 36 | def main(args: Array[String]): Unit = { 37 | run(args) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /Giraph/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.z_fan 8 | giraph-app-demo 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 13 | 14 | org.apache.giraph 15 | giraph-core 16 | 1.2.0-hadoop2 17 | 18 | 19 | 20 | 21 | org.apache.giraph 22 | giraph-examples 23 | 1.2.0-hadoop2 24 | 25 | 26 | 27 | 28 | org.apache.hadoop 29 | hadoop-common 30 | 2.5.1 31 | 32 | 33 | 34 | 35 | org.apache.hadoop 36 | hadoop-client 37 | 2.5.1 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | org.apache.maven.plugins 46 | maven-compiler-plugin 47 | 48 | 8 49 | 8 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/cc/ConnectedComponentsComputation.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.giraph.examples.cc; 2 | 3 | import java.io.IOException; 4 | import org.apache.giraph.graph.BasicComputation; 5 | import org.apache.giraph.graph.Vertex; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | 9 | public class ConnectedComponentsComputation 10 | extends BasicComputation { 11 | 12 | @Override 13 | public void compute(Vertex vertex, 14 | Iterable messages) throws IOException { 15 | /* 步骤2:编写与顶点计算、更新相关的处理逻辑以及发送消息 */ 16 | // 超步0时向所有邻居顶点发送消息 17 | if (getSuperstep() == 0) { 18 | sendMessageToAllEdges(vertex, vertex.getValue()); 19 | vertex.voteToHalt(); 20 | return; 21 | } 22 | 23 | boolean changed = false; 24 | int currentComponent = vertex.getValue().get(); 25 | // 从消息中挑选出最小的连通分量编号 26 | for (IntWritable message : messages) { 27 | int candidateComponent = message.get(); 28 | if (candidateComponent < currentComponent) { 29 | currentComponent = candidateComponent; 30 | changed = true; 31 | } 32 | } 33 | 34 | if (changed) { 35 | // 更新计算值并向邻居顶点发送消息 36 | vertex.setValue(new IntWritable(currentComponent)); 37 | sendMessageToAllEdges(vertex, vertex.getValue()); 38 | } 39 | vertex.voteToHalt(); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/cc/ConnectedComponentsRunner.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.giraph.examples.cc; 2 | 3 | import org.apache.giraph.conf.GiraphConfiguration; 4 | import org.apache.giraph.conf.GiraphConstants; 5 | import org.apache.giraph.io.formats.GiraphTextInputFormat; 6 | import org.apache.giraph.io.formats.GiraphTextOutputFormat; 7 | import org.apache.giraph.io.formats.IdWithValueTextOutputFormat; 8 | import org.apache.giraph.io.formats.IntIntNullTextInputFormat; 9 | import org.apache.giraph.job.GiraphJob; 10 | import org.apache.hadoop.conf.Configured; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.util.Tool; 13 | import org.apache.hadoop.util.ToolRunner; 14 | 15 | public class ConnectedComponentsRunner extends Configured implements Tool { 16 | 17 | @Override 18 | public int run(String[] args) throws Exception { 19 | 
/* 步骤1: 设置作业的信息 */ 20 | GiraphConfiguration giraphConf = new GiraphConfiguration(getConf()); 21 | 22 | // 设置compute方法 23 | giraphConf.setComputationClass(ConnectedComponentsComputation.class); 24 | // 设置图数据的输入格式 25 | giraphConf.setVertexInputFormatClass(IntIntNullTextInputFormat.class); 26 | // 设置图数据的输出格式 27 | giraphConf.setVertexOutputFormatClass(IdWithValueTextOutputFormat.class); 28 | 29 | // 启用本地调试模式 30 | giraphConf.setLocalTestMode(true); 31 | // 最小和最大的Worker数量均为1,Master协调超步时所需Worker响应的百分比为100 32 | giraphConf.setWorkerConfiguration(1, 1, 100); 33 | // Master和Worker位于同一进程 34 | GiraphConstants.SPLIT_MASTER_WORKER.set(giraphConf, false); 35 | 36 | // 创建Giraph作业 37 | GiraphJob giraphJob = new GiraphJob(giraphConf, getClass().getSimpleName()); 38 | 39 | // 设置图数据的输入路径 40 | GiraphTextInputFormat.addVertexInputPath(giraphConf, new Path(args[0])); 41 | // 设置图数据的输出路径 42 | GiraphTextOutputFormat.setOutputPath(giraphJob.getInternalJob(), new Path(args[1])); 43 | 44 | return giraphJob.run(true) ? 0 : -1; 45 | } 46 | 47 | public static void main(String[] args) throws Exception { 48 | /* 步骤2: 运行作业 */ 49 | int exitCode = ToolRunner.run(new ConnectedComponentsRunner(), args); 50 | System.exit(exitCode); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/cc/README.md: -------------------------------------------------------------------------------- 1 | #### 运行方法 2 | 3 | 修改运行配置,在 Program arguments 中填入 `src/main/resources/inputs/cc/data.txt src/main/resources/outputs/cc/` -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/kmeans/KMeansComputation.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.giraph.examples.kmeans; 2 | 3 | import cn.edu.ecnu.giraph.examples.kmeans.utils.PointsOperation; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import org.apache.giraph.graph.BasicComputation; 7 | import org.apache.giraph.graph.Vertex; 8 | import org.apache.hadoop.io.DoubleWritable; 9 | import org.apache.hadoop.io.NullWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.util.StringUtils; 12 | 13 | /* 步骤1:确定顶点标识I、顶点的计算值V、边的权值E以及消息值M的数据类型 */ 14 | public class KMeansComputation 15 | extends BasicComputation { 16 | // 最大超步数 17 | private static final int MAX_SUPERSTEP = 20; 18 | // 聚类中心集的名称 19 | private static final String CENTERS = "centers"; 20 | // 聚类中心1和2的名称前缀 21 | private static final String CENTER_PREFIX = "center"; 22 | 23 | @Override 24 | public void compute(Vertex vertex, 25 | Iterable iterable) { 26 | /* 步骤2:编写与顶点计算、更新相关的处理逻辑以及发送消息 */ 27 | if (getSuperstep() < MAX_SUPERSTEP) { 28 | // 通过Aggregator获取聚类中心集并进行解析 29 | String centersStr = getAggregatedValue(CENTERS).toString(); 30 | List> centers = PointsOperation.parse(centersStr); 31 | 32 | // 解析顶点中保存的数据点 33 | List point = new ArrayList<>(); 34 | for (String dimension : vertex.getId().toString().split(",")) { 35 | point.add(Double.parseDouble(dimension)); 36 | } 37 | 38 | // 遍历聚类中心集并计算与数据点的距离 39 | double minDistance = Double.MAX_VALUE; 40 | int centerIndex = -1; 41 | 42 | for (int i = 0; i < centers.size(); i++) { 43 | double distance = 0; 44 | List center = centers.get(i); 45 | for (int j = 0; j < center.size(); j++) { 46 | distance += Math.pow(point.get(j) - center.get(j), 2); 47 | } 48 | 49 | distance = Math.sqrt(distance); 50 | if (distance < minDistance) { 51 | minDistance = 
distance; 52 | centerIndex = i + 1; 53 | } 54 | } 55 | vertex.setValue(new DoubleWritable(centerIndex)); 56 | // 将数据点提供给所属聚类中心的Aggregator 57 | aggregate(CENTER_PREFIX + centerIndex, 58 | new Text(StringUtils.join(",", point) + "\t")); 59 | } else { 60 | vertex.voteToHalt(); 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/kmeans/KMeansMasterCompute.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.giraph.examples.kmeans; 2 | 3 | import cn.edu.ecnu.giraph.examples.kmeans.utils.PointsOperation; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import org.apache.giraph.aggregators.TextAppendAggregator; 7 | import org.apache.giraph.master.DefaultMasterCompute; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.util.StringUtils; 10 | 11 | public class KMeansMasterCompute extends DefaultMasterCompute { 12 | 13 | // 聚类中心个数 14 | private static final int CENTER_SIZE = 2; 15 | // 最大超步数 16 | private static final int MAX_SUPERSTEP = 20; 17 | // 聚类中心集的名称 18 | private static final String CENTERS = "centers"; 19 | // 聚类中心1和2的名称前缀 20 | private static final String CENTER_PREFIX = "center"; 21 | 22 | @Override 23 | public void initialize() throws InstantiationException, IllegalAccessException { 24 | /* 步骤1:注册Aggregator */ 25 | // 注册聚类中心集的Aggregator 26 | registerAggregator(CENTERS, TextAppendAggregator.class); 27 | for (int i = 1; i <= CENTER_SIZE; i++) { 28 | registerAggregator(CENTER_PREFIX + i, TextAppendAggregator.class); 29 | } 30 | } 31 | 32 | @Override 33 | public void compute() { 34 | /* 步骤2:对汇总的数据进行处理 */ 35 | StringBuilder centers = new StringBuilder(); 36 | // 超步0时从文件中读取聚类中心集 37 | if (getSuperstep() == 0) { 38 | String centersPath = getConf().get(CENTERS); 39 | for (String center : PointsOperation.getCenters(centersPath)) { 40 | centers.append(center).append("\t"); 41 | } 42 | } else if (getSuperstep() < MAX_SUPERSTEP) { 43 | // 依次处理聚类中心1和2的Aggregator汇总的数据点 44 | for (int i = 1; i <= CENTER_SIZE; i++) { 45 | List newCenter = new ArrayList<>(); 46 | // 获取并解析出属于同一聚类中心的数据点 47 | String datas = getAggregatedValue(CENTER_PREFIX + i).toString(); 48 | List> points = PointsOperation.parse(datas); 49 | // 计算每个维度的平均值从而得到新的聚类中心 50 | for (int j = 0; j < points.get(0).size(); j++) { 51 | double sum = 0; 52 | for (List point : points) { 53 | sum += point.get(j); 54 | } 55 | newCenter.add(sum / points.size()); 56 | } 57 | centers.append(StringUtils.join(",", newCenter)).append("\t"); 58 | } 59 | } 60 | // 汇总聚类中心到Aggregator 61 | setAggregatedValue(CENTERS, new Text(centers.toString())); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/kmeans/KMeansRunner.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.giraph.examples.kmeans; 2 | 3 | import org.apache.giraph.conf.GiraphConfiguration; 4 | import org.apache.giraph.conf.GiraphConstants; 5 | import org.apache.giraph.io.formats.GiraphTextInputFormat; 6 | import org.apache.giraph.io.formats.GiraphTextOutputFormat; 7 | import org.apache.giraph.io.formats.IdWithValueTextOutputFormat; 8 | import org.apache.giraph.io.formats.TextDoubleDoubleAdjacencyListVertexInputFormat; 9 | import org.apache.giraph.job.GiraphJob; 10 | import org.apache.hadoop.conf.Configured; 11 | import org.apache.hadoop.fs.Path; 12 | import 
org.apache.hadoop.util.Tool; 13 | import org.apache.hadoop.util.ToolRunner; 14 | 15 | public class KMeansRunner extends Configured implements Tool { 16 | 17 | // 聚类中心集的名称 18 | private static final String CENTERS = "centers"; 19 | 20 | @Override 21 | public int run(String[] args) throws Exception { 22 | /* 步骤1: 设置作业的信息 */ 23 | GiraphConfiguration giraphConf = new GiraphConfiguration(getConf()); 24 | 25 | // 设置compute方法 26 | giraphConf.setComputationClass(KMeansComputation.class); 27 | // 设置图数据的输入格式 28 | giraphConf.setVertexInputFormatClass(TextDoubleDoubleAdjacencyListVertexInputFormat.class); 29 | // 设置图数据的输出格式 30 | giraphConf.setVertexOutputFormatClass(IdWithValueTextOutputFormat.class); 31 | // 设置MasterCompute,启用Aggregator机制 32 | giraphConf.setMasterComputeClass(KMeansMasterCompute.class); 33 | // 设置初始聚类中心集的文件路径的配置项 34 | giraphConf.set(CENTERS, args[2]); 35 | 36 | // 启用本地调试模式 37 | giraphConf.setLocalTestMode(true); 38 | // 最小的Worker数量和最大的Worker数量均为1,Master协调超步时所需Worker响应的百分比为100 39 | giraphConf.setWorkerConfiguration(1, 1, 100); 40 | // Master和Worker位于同一进程 41 | GiraphConstants.SPLIT_MASTER_WORKER.set(giraphConf, false); 42 | 43 | // 创建Giraph作业 44 | GiraphJob giraphJob = new GiraphJob(giraphConf, getClass().getSimpleName()); 45 | 46 | // 设置图数据的输入路径 47 | GiraphTextInputFormat.addVertexInputPath(giraphConf, new Path(args[0])); 48 | // 设置图数据的输出路径 49 | GiraphTextOutputFormat.setOutputPath(giraphJob.getInternalJob(), new Path(args[1])); 50 | 51 | return giraphJob.run(true) ? 0 : -1; 52 | } 53 | 54 | public static void main(String[] args) throws Exception { 55 | /* 步骤2: 运行作业 */ 56 | int exitCode = ToolRunner.run(new KMeansRunner(), args); 57 | System.exit(exitCode); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/kmeans/README.md: -------------------------------------------------------------------------------- 1 | #### 运行方法 2 | 3 | 修改运行配置,在 Program arguments 中填入 `src/main/resources/inputs/kmeans/data.txt src/main/resources/outputs/kmeans/ src/main/resources/inputs/kmeans/center/centers.txt` -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/kmeans/utils/FileOperation.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.giraph.examples.kmeans.utils; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileOutputStream; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.OutputStream; 9 | import java.net.URI; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import org.apache.hadoop.conf.Configuration; 13 | import org.apache.hadoop.fs.FileStatus; 14 | import org.apache.hadoop.fs.FileSystem; 15 | import org.apache.hadoop.fs.Path; 16 | 17 | public class FileOperation { 18 | 19 | /** 用于标识传入路径为 HDFS 路径 */ 20 | private static final String HDFS = "hdfs"; 21 | 22 | /** 23 | * 从指定路径获取输入流 24 | * 25 | * @param path 本地路径或 HDFS 路径 26 | */ 27 | public static InputStream read(String path) { 28 | boolean isHDFS = path.contains(HDFS); 29 | InputStream inputStream = null; 30 | try { 31 | if (isHDFS) { 32 | FileSystem fs = getFileSystem(path); 33 | inputStream = fs.open(new Path(path)); 34 | } else { 35 | inputStream = new FileInputStream(new File(path)); 36 | } 37 | } catch (IOException e) { 38 | e.printStackTrace(); 39 | } 40 | return inputStream; 41 | } 42 | 43 | /** 44 | * 从指定路径获取输出流 45 | * 46 | * 
@param path 本地路径或 HDFS 路径 47 | * @param isOverwrite 标识文件内容是否需要覆盖 48 | */ 49 | public static OutputStream write(String path, boolean isOverwrite) { 50 | boolean isHDFS = path.contains(HDFS); 51 | OutputStream outputStream = null; 52 | try { 53 | if (isHDFS) { 54 | FileSystem fs = getFileSystem(path); 55 | outputStream = fs.create(new Path(path), isOverwrite); 56 | } else { 57 | boolean isAppend = !isOverwrite; 58 | outputStream = new FileOutputStream(new File(path), isAppend); 59 | } 60 | } catch (IOException e) { 61 | e.printStackTrace(); 62 | } 63 | return outputStream; 64 | } 65 | 66 | /** 67 | * 获取目录下的文件信息 68 | * 69 | * @param directory 目录路径 70 | * @return 目录下的文件路径集合 71 | */ 72 | public static List getPaths(String directory) { 73 | List paths = new ArrayList<>(); 74 | boolean isHDFS = directory.contains(HDFS); 75 | try { 76 | if (isHDFS) { 77 | FileSystem fs = getFileSystem(directory); 78 | // 读取目录下文件信息 79 | FileStatus[] fileStatuses = fs.listStatus(new Path(directory)); 80 | for (FileStatus fileStatus : fileStatuses) { 81 | paths.add(fileStatus.getPath().toString()); 82 | } 83 | } else { 84 | File root = new File(directory); 85 | File[] files = root.listFiles(); 86 | if (files != null) { 87 | for (File file : files) { 88 | String path = file.getPath(); 89 | // 本地文件系统中需要对输出文件进行过滤,不过滤可能产生错误 90 | if (path.matches(".*[0~9]+") || path.contains("centers")) { 91 | paths.add(file.getPath()); 92 | } 93 | } 94 | } 95 | } 96 | } catch (IOException e) { 97 | e.printStackTrace(); 98 | } 99 | return paths.size() > 0 ? paths : null; 100 | } 101 | 102 | /** 103 | * 用于删除输出目录,便于下一次输出 104 | * 105 | * @param path 目录路径 106 | */ 107 | public static void deletePath(String path, boolean isDirectory) { 108 | boolean isHDFS = path.contains(HDFS); 109 | try { 110 | if (isHDFS) { 111 | FileSystem fs = getFileSystem(path); 112 | if (fs.exists(new Path(path))) { 113 | fs.delete(new Path(path), isDirectory); 114 | } 115 | } else { 116 | File file = new File(path); 117 | if (file.exists()) { 118 | // 本地目录递归删除 119 | if (isDirectory) { 120 | File[] subFiles = file.listFiles(); 121 | for (File subFile : subFiles) { 122 | if (subFile.isFile()) { 123 | subFile.delete(); 124 | } else { 125 | deletePath(subFile.getPath(), true); 126 | } 127 | } 128 | } 129 | file.delete(); 130 | } 131 | } 132 | } catch (IOException e) { 133 | e.printStackTrace(); 134 | } 135 | } 136 | 137 | /** 138 | * @param path HDFS 文件系统的目标路径,此路径决定将要使用的文件系统,如果没有指定则会使用默认的文件系统 139 | * @return 返回 FileSystem 实例,用于访问文件系统 140 | */ 141 | public static FileSystem getFileSystem(String path) { 142 | FileSystem fs = null; 143 | try { 144 | fs = FileSystem.get(URI.create(path), new Configuration()); 145 | } catch (IOException e) { 146 | e.printStackTrace(); 147 | } 148 | return fs; 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/kmeans/utils/PointsOperation.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.giraph.examples.kmeans.utils; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | public class PointsOperation { 11 | public static List getCenters(String centersPath) { 12 | List centers = new ArrayList<>(); 13 | InputStream inputStream = FileOperation.read(centersPath); 14 | BufferedReader reader = new BufferedReader(new 
InputStreamReader(inputStream)); 15 | String line; 16 | try { 17 | while ((line = reader.readLine()) != null) { 18 | centers.add(line); 19 | } 20 | } catch (IOException e) { 21 | e.printStackTrace(); 22 | } 23 | return centers; 24 | } 25 | 26 | public static List> parse(String pointsStr) { 27 | pointsStr = pointsStr.replace("\u0000", ""); 28 | String[] datas = pointsStr.split("\t"); 29 | List> points = new ArrayList<>(); 30 | for (String data : datas) { 31 | List point = new ArrayList<>(); 32 | for (String dimension : data.split("[ ,]")) { 33 | point.add(Double.parseDouble(dimension)); 34 | } 35 | points.add(point); 36 | } 37 | return points; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/pagerank/PageRankComputation.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.giraph.examples.pagerank; 2 | 3 | import org.apache.giraph.graph.BasicComputation; 4 | import org.apache.giraph.graph.Vertex; 5 | import org.apache.hadoop.io.DoubleWritable; 6 | import org.apache.hadoop.io.Text; 7 | 8 | /* 步骤1:确定顶点标识I、顶点的计算值V、边的权值E以及消息值M的数据类型 */ 9 | public class PageRankComputation 10 | extends BasicComputation { 11 | 12 | // 阻尼系数 13 | private static final double D = 0.85; 14 | // 最大超步数 15 | private static final int MAX_ITERATION = 20; 16 | 17 | @Override 18 | public void compute(Vertex vertex, 19 | Iterable messages) { 20 | /* 步骤2:编写与顶点计算、更新相关的处理逻辑以及发送消息 */ 21 | if (getSuperstep() > 0) { 22 | // 对接收到的贡献值进行累加 23 | double sum = 0; 24 | for (DoubleWritable message : messages) { 25 | sum += message.get(); 26 | } 27 | // |根据公式计算并更新排名值| 28 | double rankValue = (1 - D) / getTotalNumVertices() + D * sum; 29 | vertex.setValue(new DoubleWritable(rankValue)); 30 | } 31 | 32 | // 小于设定的最大超步数则发送消息,否则使得顶点进入非活跃状态 33 | if (getSuperstep() < MAX_ITERATION) { 34 | // 存在出站链接时,各网页将网页的贡献值发送给链向的网页 35 | if (vertex.getNumEdges() != 0) { 36 | sendMessageToAllEdges( 37 | vertex, new DoubleWritable(vertex.getValue().get() / vertex.getNumEdges())); 38 | } 39 | } else { 40 | // 将当前网页的排名值四舍五入保留5位小数 41 | double rankValue = vertex.getValue().get(); 42 | rankValue = Double.parseDouble(String.format("%.5f", rankValue)); 43 | vertex.setValue(new DoubleWritable(rankValue)); 44 | vertex.voteToHalt(); 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/pagerank/PageRankRunner.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.giraph.examples.pagerank; 2 | 3 | import org.apache.giraph.conf.GiraphConfiguration; 4 | import org.apache.giraph.conf.GiraphConstants; 5 | import org.apache.giraph.io.formats.GiraphTextInputFormat; 6 | import org.apache.giraph.io.formats.GiraphTextOutputFormat; 7 | import org.apache.giraph.io.formats.IdWithValueTextOutputFormat; 8 | import org.apache.giraph.io.formats.TextDoubleDoubleAdjacencyListVertexInputFormat; 9 | import org.apache.giraph.job.GiraphJob; 10 | import org.apache.hadoop.conf.Configured; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.util.Tool; 13 | import org.apache.hadoop.util.ToolRunner; 14 | 15 | public class PageRankRunner extends Configured implements Tool { 16 | 17 | @Override 18 | public int run(String[] args) throws Exception { 19 | /* 步骤1: 设置作业的信息 */ 20 | GiraphConfiguration giraphConf = new GiraphConfiguration(getConf()); 21 | // 设置compute方法 22 | 
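/* PageRankComputation supplies the per-vertex compute method; note that this runner also checkpoints every 5 supersteps via setCheckpointFrequency below. */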
giraphConf.setComputationClass(PageRankComputation.class); 23 | // 设置图数据的输入格式 24 | giraphConf.setVertexInputFormatClass(TextDoubleDoubleAdjacencyListVertexInputFormat.class); 25 | // 设置图数据的输出格式 26 | giraphConf.setVertexOutputFormatClass(IdWithValueTextOutputFormat.class); 27 | 28 | giraphConf.setCheckpointFrequency(5); 29 | 30 | // 启用本地调试模式 31 | giraphConf.setLocalTestMode(true); 32 | // 最小和最大的Worker数量均为1,Master协调超步时所需Worker响应的百分比为100 33 | giraphConf.setWorkerConfiguration(1, 1, 100); 34 | // Master和Worker位于同一进程 35 | GiraphConstants.SPLIT_MASTER_WORKER.set(giraphConf, false); 36 | 37 | // 创建Giraph作业 38 | GiraphJob giraphJob = new GiraphJob(giraphConf, getClass().getSimpleName()); 39 | // 设置图数据的输入路径 40 | GiraphTextInputFormat.addVertexInputPath(giraphConf, new Path(args[0])); 41 | // 设置图数据的输出路径 42 | GiraphTextOutputFormat.setOutputPath(giraphJob.getInternalJob(), new Path(args[1])); 43 | 44 | return giraphJob.run(true) ? 0 : -1; 45 | } 46 | 47 | public static void main(String[] args) throws Exception { 48 | /* 步骤2: 运行作业 */ 49 | int exitCode = ToolRunner.run(new PageRankRunner(), args); 50 | System.exit(exitCode); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/pagerank/README.md: -------------------------------------------------------------------------------- 1 | #### 运行方法 2 | 3 | 修改运行配置,在 Program arguments 中填入 `src/main/resources/inputs/pagerank/data.txt src/main/resources/outputs/pagerank/` -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/sssp/README.md: -------------------------------------------------------------------------------- 1 | #### 运行方法 2 | 3 | 修改运行配置,在 Program arguments 中填入 `src/main/resources/inputs/sssp/data.txt src/main/resources/outputs/sssp/` -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/sssp/ShortestPathComputation.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.giraph.examples.sssp; 2 | 3 | import org.apache.giraph.edge.Edge; 4 | import org.apache.giraph.graph.BasicComputation; 5 | import org.apache.giraph.graph.Vertex; 6 | import org.apache.hadoop.io.DoubleWritable; 7 | import org.apache.hadoop.io.FloatWritable; 8 | import org.apache.hadoop.io.LongWritable; 9 | 10 | /* 步骤1:确定顶点标识I、顶点的计算值V、边的权值E以及消息值M的数据类型 */ 11 | public class ShortestPathComputation 12 | extends BasicComputation { 13 | 14 | // 源点 15 | protected static final int SOURCE_VERTEX = 0; 16 | // 表示无穷大 17 | protected static final Double INF = Double.MAX_VALUE; 18 | 19 | @Override 20 | public void compute(Vertex vertex, 21 | Iterable messages) { 22 | /* 步骤2:编写与顶点计算、更新相关的处理逻辑以及发送消息 */ 23 | // 超步0时将顶点初始化为表示无穷大的INF 24 | if (getSuperstep() == 0) { 25 | vertex.setValue(new DoubleWritable(INF)); 26 | } 27 | 28 | // 根据接收到的消息计算当前距离源点的最短路径值 29 | double minDist = vertex.getId().get() == SOURCE_VERTEX ? 
0d : INF; 30 | for (DoubleWritable message : messages) { 31 | minDist = Math.min(minDist, message.get()); 32 | } 33 | 34 | // 当minDist小于顶点的计算值时将计算值更新为minDist 35 | if (minDist < vertex.getValue().get()) { 36 | vertex.setValue(new DoubleWritable(minDist)); 37 | for (Edge edge : vertex.getEdges()) { 38 | double distance = minDist + edge.getValue().get(); 39 | sendMessage(edge.getTargetVertexId(), new DoubleWritable(distance)); 40 | } 41 | } 42 | 43 | vertex.voteToHalt(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /Giraph/src/main/java/cn/edu/ecnu/giraph/examples/sssp/ShortestPathRunner.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.giraph.examples.sssp; 2 | 3 | import org.apache.giraph.conf.GiraphConfiguration; 4 | import org.apache.giraph.conf.GiraphConstants; 5 | import org.apache.giraph.io.formats.GiraphTextInputFormat; 6 | import org.apache.giraph.io.formats.GiraphTextOutputFormat; 7 | import org.apache.giraph.io.formats.IdWithValueTextOutputFormat; 8 | import org.apache.giraph.io.formats.JsonLongDoubleFloatDoubleVertexInputFormat; 9 | import org.apache.giraph.job.GiraphJob; 10 | import org.apache.hadoop.conf.Configured; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.util.Tool; 13 | import org.apache.hadoop.util.ToolRunner; 14 | 15 | 16 | public class ShortestPathRunner extends Configured implements Tool { 17 | 18 | @Override 19 | public int run(String[] args) throws Exception { 20 | /* 步骤1: 设置作业的信息 */ 21 | GiraphConfiguration giraphConf = new GiraphConfiguration(getConf()); 22 | 23 | // 设置compute方法 24 | giraphConf.setComputationClass(ShortestPathComputation.class); 25 | // 设置图数据的输入格式 26 | giraphConf.setVertexInputFormatClass(JsonLongDoubleFloatDoubleVertexInputFormat.class); 27 | // 设置图数据的输出格式 28 | giraphConf.setVertexOutputFormatClass(IdWithValueTextOutputFormat.class); 29 | 30 | // 启用本地调试模式 31 | giraphConf.setLocalTestMode(true); 32 | // 最小和最大的Worker数量均为1,Master协调超步时所需Worker响应的百分比为100 33 | giraphConf.setWorkerConfiguration(1, 1, 100); 34 | // Master和Worker位于同一进程 35 | GiraphConstants.SPLIT_MASTER_WORKER.set(giraphConf, false); 36 | 37 | // 创建Giraph作业 38 | GiraphJob giraphJob = new GiraphJob(giraphConf, getClass().getSimpleName()); 39 | 40 | // 设置图数据的输入路径 41 | GiraphTextInputFormat.addVertexInputPath(giraphConf, new Path(args[0])); 42 | // 设置图数据的输出路径 43 | GiraphTextOutputFormat.setOutputPath(giraphJob.getInternalJob(), new Path(args[1])); 44 | 45 | return giraphJob.run(true) ? 
0 : -1; 46 | } 47 | 48 | public static void main(String[] args) throws Exception { 49 | /* 步骤2: 运行作业 */ 50 | int exitCode = ToolRunner.run(new ShortestPathRunner(), args); 51 | System.exit(exitCode); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /Giraph/src/main/resources/inputs/cc/data.txt: -------------------------------------------------------------------------------- 1 | 0 1 2 | 1 0 2 3 3 | 2 1 3 4 | 3 1 2 5 | 4 5 6 6 | 5 4 6 7 | 6 4 5 -------------------------------------------------------------------------------- /Giraph/src/main/resources/inputs/kmeans/center/centers.txt: -------------------------------------------------------------------------------- 1 | 1,2 2 | 3,1 -------------------------------------------------------------------------------- /Giraph/src/main/resources/inputs/kmeans/data.txt: -------------------------------------------------------------------------------- 1 | 0,0 -1 2 | 1,2 -1 3 | 3,1 -1 4 | 8,8 -1 5 | 9,10 -1 6 | 10,7 -1 -------------------------------------------------------------------------------- /Giraph/src/main/resources/inputs/pagerank/data.txt: -------------------------------------------------------------------------------- 1 | A 1 B 1 D 1 2 | B 1 C 1 3 | C 1 A 1 B 1 4 | D 1 B 1 C 1 -------------------------------------------------------------------------------- /Giraph/src/main/resources/inputs/sssp/data.txt: -------------------------------------------------------------------------------- 1 | [0,-1,[[1,1.0],[3,3.0]]] 2 | [1,-1,[[2,1.0]]] 3 | [2,-1,[[0,1.0],[1,1.0]]] 4 | [3,-1,[[1,1.0],[2,1.0]]] -------------------------------------------------------------------------------- /Giraph/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n -------------------------------------------------------------------------------- /HDFS/HDFS.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dasebigdata-ecnu/DistributedComputingSystems_Example/8dd34541736d06f8cd8e1b2a55fc6455b1c7b0e8/HDFS/HDFS.md -------------------------------------------------------------------------------- /HDFS/README.md: -------------------------------------------------------------------------------- 1 | #### 功能说明 2 | 3 | 工程包含 HDFS 的读写操作示例 4 | 5 | #### 运行方式 6 | 7 | 首先需要启动 HDFS,启动 HDFS 的命令为 `$HADOOP_HOME/sbin/start-dfs.sh`。 8 | 1. 运行写程序 9 | 10 | 修改运行配置,在 Program arguments 中依次填入 `hdfs://localhost:9000/ecnu/hdfs/example.txt` 和 `src/main/resources/example/example.txt` 11 | 12 | 2. 
运行读程序 13 | 14 | 修改运行配置,在 Program arguments 中填入`hdfs://localhost:9000/ecnu/hdfs/example.txt` 和 `src/main/resources/example/output.txt` 15 | 16 | -------------------------------------------------------------------------------- /HDFS/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.ecnu.hdfs 8 | HDFS 9 | 1.0 10 | 11 | 12 | 13 | 14 | org.apache.maven.plugins 15 | maven-compiler-plugin 16 | 17 | 7 18 | 7 19 | 20 | 21 | 22 | 23 | 24 | 25 | UTF-8 26 | 1.7 27 | 1.7 28 | 29 | 2.9.2 30 | 31 | 32 | 33 | 34 | org.apache.hadoop 35 | hadoop-client 36 | ${hadoop.version} 37 | 38 | 39 | 40 | org.apache.hadoop 41 | hadoop-common 42 | ${hadoop.version} 43 | 44 | 45 | 46 | org.apache.hadoop 47 | hadoop-hdfs 48 | ${hadoop.version} 49 | 50 | 51 | 52 | org.apache.hadoop 53 | hadoop-mapreduce-client 54 | ${hadoop.version} 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /HDFS/src/main/java/cn/edu/ecnu/hdfs/examples/read/Reader.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.hdfs.examples.read; 2 | 3 | import java.io.FileOutputStream; 4 | import java.io.IOException; 5 | import java.net.URI; 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.FSDataInputStream; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.IOUtils; 11 | 12 | public class Reader { 13 | 14 | public void read(String hdfsFilePath, String localFilePath) throws IOException { 15 | /* 步骤1:获取HDFS的文件系统对象 */ 16 | Configuration conf = new Configuration(); 17 | FileSystem fs = FileSystem.get(URI.create(hdfsFilePath), conf); 18 | /* 步骤2:获取输入流hdfsInputStream */ 19 | FSDataInputStream hdfsInputStream = fs.open(new Path(hdfsFilePath)); 20 | /* 步骤3:利用输入流读取HDFS文件 */ 21 | // 写入本地文件的输出流 22 | FileOutputStream localOutputStream = new FileOutputStream(localFilePath); 23 | // 将HDFS文件的输入流拷贝至本地文件的输出流 24 | IOUtils.copyBytes(hdfsInputStream, localOutputStream, 4096, true); 25 | } 26 | 27 | public static void main(String[] args) throws IOException { 28 | if (args.length < 1) { 29 | System.err.println("Usage: "); 30 | System.exit(-1); 31 | } 32 | 33 | Reader reader = new Reader(); 34 | reader.read(args[0], args[1]); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /HDFS/src/main/java/cn/edu/ecnu/hdfs/examples/write/Writer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.hdfs.examples.write; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.net.URI; 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.FSDataOutputStream; 8 | import org.apache.hadoop.fs.FileSystem; 9 | import org.apache.hadoop.fs.Path; 10 | import org.apache.hadoop.io.IOUtils; 11 | 12 | public class Writer { 13 | 14 | public void write(String hdfsFilePath, String localFilePath) throws IOException { 15 | 16 | /* 步骤1:获取HDFS的文件系统对象 */ 17 | Configuration conf = new Configuration(); 18 | FileSystem fs = FileSystem.get(URI.create(hdfsFilePath), conf); 19 | 20 | /* 步骤2:获取输出流hdfsOutputStream */ 21 | FSDataOutputStream hdfsOutputStream = fs.create(new Path(hdfsFilePath)); 22 | 23 | /* 步骤3:利用输出流写入HDFS文件 */ 24 | // 读取本地文件的输入流 25 | FileInputStream localInputStream = new FileInputStream(localFilePath); 26 | // 将本地文件的输入流拷贝至HDFS文件的输出流 27 | 
IOUtils.copyBytes(localInputStream, hdfsOutputStream, 4096, true); 28 | } 29 | 30 | public static void main(String[] args) throws IOException { 31 | if (args.length < 2) { 32 | System.err.println("Usage: "); 33 | System.exit(-1); 34 | } 35 | 36 | Writer writer = new Writer(); 37 | writer.write(args[0], args[1]); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /HDFS/src/main/resources/example/example.txt: -------------------------------------------------------------------------------- 1 | On the top of the Crumpetty Tree The Quangle Wangle sat, 2 | But his face you could not see, On account of his Beaver Hat. 3 | For his Hat was a hundred and two feet wide, 4 | With ribbons and bibbons on every side 5 | And bells, and buttons, and loops, and lace, 6 | So that nobody ever could see the face Of the Quangle Wangle Quee. -------------------------------------------------------------------------------- /MapReduce/MapReduce.md: -------------------------------------------------------------------------------- 1 | 项目包括三个模块: 2 | 3 | 1. 单词计数 4 | 2. Join 5 | 3. KMeans -------------------------------------------------------------------------------- /MapReduce/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.ecnu.mapreduce 8 | MapReduce 9 | 1.1 10 | 11 | 12 | 13 | org.apache.maven.plugins 14 | maven-compiler-plugin 15 | 16 | 7 17 | 7 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 2.9.2 26 | 27 | 28 | 29 | 30 | org.apache.hadoop 31 | hadoop-client 32 | ${hadoop.version} 33 | 34 | 35 | 36 | org.apache.hadoop 37 | hadoop-common 38 | ${hadoop.version} 39 | 40 | 41 | 42 | org.apache.hadoop 43 | hadoop-hdfs 44 | ${hadoop.version} 45 | 46 | 47 | 48 | org.apache.hadoop 49 | hadoop-mapreduce-client 50 | ${hadoop.version} 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/Constants.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples; 2 | 3 | public class Constants { 4 | public static final String EMPLOYEE = "employee"; 5 | public static final String DEPARTMENT = "department"; 6 | } 7 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/join/README.md: -------------------------------------------------------------------------------- 1 | #### 运行方法 2 | 3 | ##### Reduce 端 Join 部分 4 | 5 | 修改运行配置,在 Program arguments 中填入 `src/main/resources/inputs/join/ src/main/resources/outputs/reduceJoin` 6 | 7 | ##### Map 端 Join 部分 8 | 9 | 修改运行配置,在 Program arguments 中填入 `src/main/resources/inputs/join/employee.csv src/main/resources/outputs/mapJoin src/main/resources/inputs/join/department.csv` 10 | 11 | **Note:在src/main/resources/inputs/join/input_cluster下放置了一组输入数据。当希望在集群上运行join作业时,可以将这组输入数据作为join作业的输入。需要说明的是,此时需要将这组数据上传至集群的HDFS,然后才能提交join作业。** -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/join/entity/ReduceJoinWritable.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.join.entity; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | public class ReduceJoinWritable implements Writable { 9 | 10 | 
// 保存雇员表或部门表元组 11 | private String data; 12 | // 标识当前对象保存的元组来自雇员表还是部门表 13 | private String tag; 14 | 15 | // 用于标识的常量 16 | public static final String EMPLOYEE = "1"; 17 | public static final String DEPARTMENT = "2"; 18 | 19 | @Override 20 | public void write(DataOutput dataOutput) throws IOException { 21 | dataOutput.writeUTF(tag); 22 | dataOutput.writeUTF(data); 23 | } 24 | 25 | @Override 26 | public void readFields(DataInput dataInput) throws IOException { 27 | tag = dataInput.readUTF(); 28 | data = dataInput.readUTF(); 29 | } 30 | 31 | // get和set方法 32 | public String getData() { 33 | return data; 34 | } 35 | 36 | public void setData(String data) { 37 | this.data = data; 38 | } 39 | 40 | public String getTag() { 41 | return tag; 42 | } 43 | 44 | public void setTag(String tag) { 45 | this.tag = tag; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/join/mapjoin/MapJoin.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.join.mapjoin; 2 | 3 | import java.net.URI; 4 | import org.apache.hadoop.conf.Configured; 5 | import org.apache.hadoop.fs.FileStatus; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.io.NullWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Job; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | import org.apache.hadoop.util.Tool; 14 | import org.apache.hadoop.util.ToolRunner; 15 | 16 | public class MapJoin extends Configured implements Tool { 17 | 18 | @Override 19 | public int run(String[] args) throws Exception { 20 | 21 | /* 步骤1:设置作业的信息 */ 22 | Job job = Job.getInstance(getConf(), getClass().getSimpleName()); 23 | // 设置程序的类名 24 | job.setJarByClass(getClass()); 25 | 26 | // 设置数据的输入输出路径 27 | FileInputFormat.addInputPath(job, new Path(args[0])); 28 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 29 | 30 | // 设置map方法及其输出键值对数据类型 31 | job.setMapperClass(MapJoinMapper.class); 32 | job.setMapOutputKeyClass(Text.class); 33 | job.setMapOutputValueClass(NullWritable.class); 34 | job.setNumReduceTasks(0); 35 | 36 | // 将部门表通过分布式缓存广播出去 37 | job.addCacheFile(new URI(args[2])); 38 | 39 | return job.waitForCompletion(true) ? 
0 : 1; 40 | } 41 | 42 | public static void main(String[] args) throws Exception { 43 | /* 步骤2:运行作业 */ 44 | int exitCode = ToolRunner.run(new MapJoin(), args); 45 | System.exit(exitCode); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/join/mapjoin/MapJoinMapper.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.join.mapjoin; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | import java.net.URI; 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.fs.FileSystem; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.LongWritable; 13 | import org.apache.hadoop.io.NullWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapreduce.Mapper; 16 | 17 | /* 步骤1:确定输入键值对[K1,V1]的数据类型为[LongWritable,Text],确定输出键值对[K2,V2]的数据类型为[Text,NullWritable] */ 18 | public class MapJoinMapper extends Mapper { 19 | 20 | private Map departmentsTable = new HashMap<>(); 21 | 22 | @Override 23 | protected void map(LongWritable key, Text value, Context context) 24 | throws IOException, InterruptedException { 25 | /* |步骤2:编写处理逻辑将[K1,V1]转换为[K2,V2]并输出| */ 26 | if (departmentsTable.isEmpty()) { 27 | URI uri = context.getCacheFiles()[0]; 28 | FileSystem fs = FileSystem.get(uri, new Configuration()); 29 | BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(uri)))); 30 | String content; 31 | // 载入部门表 32 | while ((content = reader.readLine()) != null) { 33 | // 以制表符为分隔符对部门表元组进行切分,以便获取DeptName属性值 34 | String[] datas = content.split("\t"); 35 | // 以DeptName属性值为key将元组保存在集合中 36 | departmentsTable.put(datas[0], datas[1]); 37 | } 38 | } 39 | // 以制表符为分隔符切分雇员表元组 40 | String[] datas = value.toString().split("\t"); 41 | // 获取DeptName的属性值 42 | String deptName = datas[2]; 43 | // 进行连接操作并输出 44 | if (departmentsTable.containsKey(deptName)) { 45 | context.write( 46 | new Text(value.toString() + "\t" + departmentsTable.get(deptName)), NullWritable.get()); 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/join/reducejoin/ReduceJoin.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.join.reducejoin; 2 | 3 | import cn.edu.ecnu.mapreduce.examples.join.entity.ReduceJoinWritable; 4 | import org.apache.hadoop.conf.Configured; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Job; 9 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 10 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 11 | import org.apache.hadoop.util.Tool; 12 | import org.apache.hadoop.util.ToolRunner; 13 | 14 | public class ReduceJoin extends Configured implements Tool { 15 | 16 | @Override 17 | public int run(String[] args) throws Exception { 18 | /* 步骤1:设置作业的信息 */ 19 | Job job = Job.getInstance(getConf(), getClass().getSimpleName()); 20 | // 设置程序的类名 21 | job.setJarByClass(getClass()); 22 | 23 | // 设置数据的输入输出路径 24 | FileInputFormat.addInputPath(job, new Path(args[0])); 25 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 26 | 27 | // 设置map和reduce方法 28 | 
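/* ReduceJoinMapper tags each tuple with its table of origin and keys it by DeptName; ReduceJoinReducer then pairs employee and department tuples sharing the same key to produce the join result. */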
job.setMapperClass(ReduceJoinMapper.class); 29 | job.setReducerClass(ReduceJoinReducer.class); 30 | 31 | // 设置map方法的输出键值对数据类型 32 | job.setMapOutputKeyClass(Text.class); 33 | job.setMapOutputValueClass(ReduceJoinWritable.class); 34 | // 设置reduce方法的输出键值对数据类型 35 | job.setOutputKeyClass(Text.class); 36 | job.setOutputValueClass(NullWritable.class); 37 | 38 | return job.waitForCompletion(true) ? 0 : 1; 39 | } 40 | 41 | public static void main(String[] args) throws Exception { 42 | /* |步骤2:运行作业| */ 43 | int exitCode = ToolRunner.run(new ReduceJoin(), args); 44 | System.exit(exitCode); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/join/reducejoin/ReduceJoinMapper.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.join.reducejoin; 2 | 3 | import cn.edu.ecnu.mapreduce.examples.Constants; 4 | import cn.edu.ecnu.mapreduce.examples.join.entity.ReduceJoinWritable; 5 | import java.io.IOException; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 10 | 11 | /* 步骤1:确定输入键值对[K1,V1]的数据类型为[LongWritable,Text],确定输出键值对[K2,V2]的数据类型为[Text,ReduceJoinWritable] */ 12 | public class ReduceJoinMapper extends Mapper { 13 | 14 | @Override 15 | protected void map(LongWritable key, Text value, Context context) 16 | throws IOException, InterruptedException { 17 | /* 步骤2:编写处理逻辑将[K1,V1]转换为[K2,V2]并输出 */ 18 | // 获取输入键值对所属的Split 19 | FileSplit split = (FileSplit) context.getInputSplit(); 20 | // 通过Split获取键值对所属的文件路径 21 | String path = split.getPath().toString(); 22 | ReduceJoinWritable writable = new ReduceJoinWritable(); 23 | // 用writable的data保存元组 24 | writable.setData(value.toString()); 25 | // 以制表符为分隔符切分元组,方便获取DeptName属性值 26 | String[] datas = value.toString().split("\t"); 27 | // 通过输入数据所属文件路径判断datas的表来源并进行分类处理 28 | if (path.contains(Constants.EMPLOYEE)) { 29 | // 标识data保存的元组来自雇员表 30 | writable.setTag(ReduceJoinWritable.EMPLOYEE); 31 | // 用DeptName属性值为键输出结果 32 | context.write(new Text(datas[2]), writable); 33 | } else if (path.contains(Constants.DEPARTMENT)) { 34 | // 标识data中保存的元组来自部门表 35 | writable.setTag(ReduceJoinWritable.DEPARTMENT); 36 | // 用DeptName属性值为键输出结果 37 | context.write(new Text(datas[0]), writable); 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/join/reducejoin/ReduceJoinReducer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.join.reducejoin; 2 | 3 | import cn.edu.ecnu.mapreduce.examples.join.entity.ReduceJoinWritable; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Reducer; 10 | 11 | /* 步骤1:确定输入键值对[K2,List(V2)]的数据类型为[Text, ReduceJoinWritable],确定输出键值对[K3,V3]的数据类型为[Text,NullWritable] */ 12 | public class ReduceJoinReducer extends Reducer { 13 | 14 | @Override 15 | protected void reduce(Text key, Iterable values, Context context) 16 | throws IOException, InterruptedException { 17 | /* 步骤2:编写处理逻辑将输入键值对[K2,List(V2)]转换为[K3,V3]并输出 */ 18 | List employees = new ArrayList<>(); 19 | List departments = new ArrayList<>(); 20 | // 
分离values集合中雇员表和部门表元组 21 | for (ReduceJoinWritable value : values) { 22 | // 获取ReduceJoinWritable对象的标识 23 | String tag = value.getTag(); 24 | if (tag.equals(ReduceJoinWritable.EMPLOYEE)) { 25 | employees.add(value.getData()); 26 | } else if (tag.equals(ReduceJoinWritable.DEPARTMENT)) { 27 | departments.add(value.getData()); 28 | } 29 | } 30 | 31 | // 进行连接操作并输出连接结果 32 | for (String employee : employees) { 33 | for (String department : departments) { 34 | String[] datas = department.split("\t"); 35 | // 不重复输出DeptName属性值 36 | String result = employee + "\t" + datas[1]; 37 | context.write(new Text(result), NullWritable.get()); 38 | } 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/kmeans/KMeans.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.kmeans; 2 | 3 | import java.net.URI; 4 | import org.apache.hadoop.conf.Configured; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 | import org.apache.hadoop.util.Tool; 13 | import org.apache.hadoop.util.ToolRunner; 14 | 15 | public class KMeans extends Configured implements Tool { 16 | 17 | // 最大的迭代次数 18 | public static final int MAX_ITERATION = 20; 19 | // 从0开始记录迭代步数 20 | private static int iteration = 0; 21 | 22 | // 配置项中用于记录当前迭代步数的键 23 | public static final String ITERATION = "1"; 24 | 25 | @Override 26 | public int run(String[] args) throws Exception { 27 | /* 步骤1:设置作业的信息 */ 28 | getConf().setInt(KMeans.ITERATION, iteration); 29 | 30 | Job job = Job.getInstance(getConf(), getClass().getSimpleName()); 31 | // 设置程序的类名 32 | job.setJarByClass(getClass()); 33 | 34 | // 设置数据的输入输出路径 35 | FileInputFormat.addInputPath(job, new Path(args[0])); 36 | FileOutputFormat.setOutputPath(job, new Path(args[1] + iteration)); 37 | 38 | // 设置map方法及其输出键值对数据类型 39 | job.setMapperClass(KMeansMapper.class); 40 | job.setMapOutputKeyClass(Text.class); 41 | job.setMapOutputValueClass(Text.class); 42 | 43 | // 将聚类中心集通过分布式缓存广播出去 44 | if (iteration == 0) { 45 | // 第一次迭代时的聚类中心集 46 | job.addCacheFile(new URI(args[2])); 47 | } else { 48 | // 广播上一次迭代输出的聚类中心集 49 | job.addCacheFile(new URI(args[1] + (iteration - 1))); 50 | } 51 | 52 | // 最后一次迭代输出的是聚类结果,不需要再计算新的聚类中心 53 | if ((iteration + 1) != MAX_ITERATION) { 54 | job.setReducerClass(KMeansReducer.class); 55 | job.setOutputKeyClass(Text.class); 56 | job.setOutputValueClass(NullWritable.class); 57 | } else { 58 | job.setNumReduceTasks(0); 59 | } 60 | 61 | return job.waitForCompletion(true) ? 
0 : -1; 62 | } 63 | 64 | public static void main(String[] args) throws Exception { 65 | /* 步骤2:运行作业 */ 66 | int exitCode = 0; 67 | // 执行指定次数的迭代,在最后一次迭代时输出聚类结果 68 | while (iteration < MAX_ITERATION) { 69 | exitCode = ToolRunner.run(new KMeans(), args); 70 | if (exitCode == -1) { 71 | break; 72 | } 73 | iteration++; 74 | } 75 | System.exit(exitCode); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/kmeans/KMeansMapper.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.kmeans; 2 | 3 | import cn.edu.ecnu.mapreduce.examples.kmeans.utils.CentersOperation; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.LongWritable; 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | 12 | /* 步骤1:确定输入键值对[K1,V1]的数据类型为[LongWritable,Text],确定输出键值对[K2,V2]的数据类型为[Text,Text] */ 13 | public class KMeansMapper extends Mapper { 14 | 15 | private List> centers = new ArrayList<>(); 16 | 17 | @Override 18 | protected void map(LongWritable key, Text value, Context context) 19 | throws IOException, InterruptedException { 20 | /* 步骤2:编写处理逻辑将[K1,V1]转换为[K2,V2]并输出 */ 21 | String[] dimensions; 22 | List point = new ArrayList<>(); 23 | double centerIndex = 1; 24 | double minDistance = Double.MAX_VALUE; 25 | int iteration = context.getConfiguration().getInt(KMeans.ITERATION, 0); 26 | 27 | if (centers.size() == 0) { 28 | // 获取广播的聚类中心集路径 29 | String centersPath = context.getCacheFiles()[0].toString(); 30 | // 将聚类中心加载到集合centers 31 | centers = CentersOperation.getCenters(centersPath, true); 32 | } 33 | 34 | // 解析数据点 35 | dimensions = value.toString().split("[,\\t]"); 36 | for (int i = 0; i < dimensions.length - 1; i++) { 37 | point.add(Double.parseDouble(dimensions[i])); 38 | } 39 | 40 | // 遍历聚类中心集并计算与数据点的距离 41 | for (int i = 0; i < centers.size(); i++) { 42 | double distance = 0; 43 | List center = centers.get(i); 44 | // 计算数据点与当前聚类中心之间的距离 45 | for (int j = 0; j < center.size(); j++) { 46 | distance += Math.pow((point.get(j) - center.get(j)), 2); 47 | } 48 | distance = Math.sqrt(distance); 49 | // 如果距离小于当前记录的最小距离则将数据点分配给当前聚类中心(类别号标识) 50 | if (distance < minDistance) { 51 | minDistance = distance; 52 | centerIndex = i + 1; 53 | } 54 | } 55 | 56 | // 从输入值中截取数据点 57 | String pointData = value.toString().split("\t")[0]; 58 | if (iteration == (KMeans.MAX_ITERATION - 1)) { 59 | context.write(new Text(pointData), new Text(String.valueOf(centerIndex))); 60 | } else { 61 | // 输出以类别号为键,数据点为值的键值对 62 | context.write(new Text(String.valueOf(centerIndex)), new Text(pointData)); 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/kmeans/KMeansReducer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.kmeans; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Reducer; 10 | 11 | /* 步骤1:确定输出键值对[K2,V2]的数据类型为[Text,Text] ,确定输出键值对[K3,V3]的数据类型为[Text,NullWritable] */ 12 | public class KMeansReducer extends Reducer { 13 | 14 | 
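/* For each cluster key, reduce averages the assigned points dimension by dimension, newCenter[i] = (p1[i] + ... + pn[i]) / n, then emits the comma-joined coordinates as the new center. */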
@Override 15 | protected void reduce(Text key, Iterable values, Context context) 16 | throws IOException, InterruptedException { 17 | /* 步骤2:编写处理逻辑将[K2,V2]转换为[K3,V3]并输出 */ 18 | List> points = new ArrayList<>(); 19 | // 解析数据点并保存到集合points 20 | for (Text text : values) { 21 | String value = text.toString(); 22 | List point = new ArrayList<>(); 23 | for (String s : value.split(",")) { 24 | point.add(Double.parseDouble(s)); 25 | } 26 | points.add(point); 27 | } 28 | 29 | StringBuilder newCenter = new StringBuilder(); 30 | // 计算每个维度的平均值从而得到新的聚类中心 31 | for (int i = 0; i < points.get(0).size(); i++) { 32 | double sum = 0; 33 | // 计算第i个维度值的和 34 | for (List data : points) { 35 | sum += data.get(i); 36 | } 37 | // 计算平均值得到新的聚类中心的第i个维度值并生成需要输出的数据 38 | newCenter.append(sum / points.size()); 39 | newCenter.append(","); 40 | } 41 | 42 | context.write(new Text(newCenter.toString()), NullWritable.get()); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/kmeans/README.md: -------------------------------------------------------------------------------- 1 | #### 运行方法 2 | 3 | 修改运行配置,在 Program arguments 中填入 `src/main/resources/inputs/kmeans/data.txt src/main/resources/outputs/kmeans/ src/main/resources/inputs/kmeans/center/` -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/kmeans/utils/CentersOperation.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.kmeans.utils; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.io.OutputStream; 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import org.apache.hadoop.io.IOUtils; 10 | 11 | public class CentersOperation { 12 | 13 | /** 14 | * 从指定路径读取中心点数据 15 | * 16 | * @param centersPath 中心数据路径 17 | * @param isDirectory 标识路径是否为目录 18 | * @return 返回中心数据 19 | */ 20 | public static List> getCenters(String centersPath, boolean isDirectory) { 21 | List> centers = new ArrayList<>(); 22 | 23 | try { 24 | if (isDirectory) { 25 | List paths = FileOperation.getPaths(centersPath); 26 | if (paths == null) { 27 | throw new Exception(centersPath + "centers directory is empty"); 28 | } 29 | for (String path : paths) { 30 | centers.addAll(getCenters(path, false)); 31 | } 32 | return centers; 33 | } 34 | 35 | InputStream inputStream = FileOperation.read(centersPath); 36 | BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); 37 | String line; 38 | while ((line = reader.readLine()) != null) { 39 | String[] datas = line.split(","); 40 | // 用集合保存每个中心信息 41 | List center = new ArrayList<>(); 42 | for (String data : datas) { 43 | center.add(Double.parseDouble(data)); 44 | } 45 | centers.add(center); 46 | } 47 | 48 | } catch (Exception e) { 49 | e.printStackTrace(); 50 | } 51 | 52 | return centers; 53 | } 54 | 55 | /** 56 | * 比较新旧中心数据,如果相同返回 true 57 | * 58 | * @param centerPath 旧的中心数据路径。即初始时刻设定的中心数据路径 59 | * @param newCenterPath mapReduce 生成的新的中心数据 60 | */ 61 | public static boolean compareCenters(String centerPath, String newCenterPath) { 62 | List> centers = getCenters(centerPath, false); 63 | List> newCenters = getCenters(newCenterPath, true); 64 | 65 | double distance = 0; 66 | for (int i = 0; i < centers.size(); i++) { 67 | for (int j = 1; j < centers.get(0).size(); j++) { 68 | // 计算两个中心之间的距离 69 | distance += 
Math.pow(centers.get(i).get(j) - newCenters.get(i).get(j), 2); 70 | } 71 | } 72 | 73 | if (distance == 0) { 74 | // 中心相同则删除 mapReduce 生成的中心数据,方便输出聚类结果 75 | FileOperation.deletePath(newCenterPath, true); 76 | return true; 77 | } else { 78 | // 中心数据不相同则要将新的中心数据复制到初始中心数据路径处 79 | List paths = FileOperation.getPaths(newCenterPath); 80 | try { 81 | if (paths == null) { 82 | throw new Exception("centers directory is empty"); 83 | } 84 | for (String path : paths) { 85 | InputStream inputStream = FileOperation.read(path); 86 | OutputStream outputStream = FileOperation.write(centerPath, true); 87 | IOUtils.copyBytes(inputStream, outputStream, 4096, true); 88 | } 89 | } catch (Exception e) { 90 | e.printStackTrace(); 91 | } 92 | FileOperation.deletePath(newCenterPath, true); 93 | } 94 | return false; 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/kmeans/utils/FileOperation.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.kmeans.utils; 2 | 3 | import java.io.File; 4 | import java.io.FileInputStream; 5 | import java.io.FileOutputStream; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.OutputStream; 9 | import java.net.URI; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | import org.apache.hadoop.conf.Configuration; 13 | import org.apache.hadoop.fs.FileStatus; 14 | import org.apache.hadoop.fs.FileSystem; 15 | import org.apache.hadoop.fs.Path; 16 | 17 | public class FileOperation { 18 | 19 | /** 用于标识传入路径为 HDFS 路径 */ 20 | private static final String HDFS = "hdfs"; 21 | 22 | /** 23 | * 从指定路径获取输入流 24 | * 25 | * @param path 本地路径或 HDFS 路径 26 | */ 27 | public static InputStream read(String path) { 28 | boolean isHDFS = path.contains(HDFS); 29 | InputStream inputStream = null; 30 | try { 31 | if (isHDFS) { 32 | FileSystem fs = getFileSystem(path); 33 | inputStream = fs.open(new Path(path)); 34 | } else { 35 | inputStream = new FileInputStream(new File(path)); 36 | } 37 | } catch (IOException e) { 38 | e.printStackTrace(); 39 | } 40 | return inputStream; 41 | } 42 | 43 | /** 44 | * 从指定路径获取输出流 45 | * 46 | * @param path 本地路径或 HDFS 路径 47 | * @param isOverwrite 标识文件内容是否需要覆盖 48 | */ 49 | public static OutputStream write(String path, boolean isOverwrite) { 50 | boolean isHDFS = path.contains(HDFS); 51 | OutputStream outputStream = null; 52 | try { 53 | if (isHDFS) { 54 | FileSystem fs = getFileSystem(path); 55 | outputStream = fs.create(new Path(path), isOverwrite); 56 | } else { 57 | boolean isAppend = !isOverwrite; 58 | outputStream = new FileOutputStream(new File(path), isAppend); 59 | } 60 | } catch (IOException e) { 61 | e.printStackTrace(); 62 | } 63 | return outputStream; 64 | } 65 | 66 | /** 67 | * 获取目录下的文件信息 68 | * 69 | * @param directory 目录路径 70 | * @return 目录下的文件路径集合 71 | */ 72 | public static List getPaths(String directory) { 73 | List paths = new ArrayList<>(); 74 | boolean isHDFS = directory.contains(HDFS); 75 | try { 76 | if (isHDFS) { 77 | FileSystem fs = getFileSystem(directory); 78 | // 读取目录下文件信息 79 | FileStatus[] fileStatuses = fs.listStatus(new Path(directory)); 80 | for (FileStatus fileStatus : fileStatuses) { 81 | paths.add(fileStatus.getPath().toString()); 82 | } 83 | } else { 84 | File root = new File(directory); 85 | File[] files = root.listFiles(); 86 | if (files != null) { 87 | for (File file : files) { 88 | String path = file.getPath(); 89 | // 
本地文件系统中需要对输出文件进行过滤,不过滤可能产生错误 90 | if (path.matches(".*[0~9]+") || path.contains("centers")) { 91 | paths.add(file.getPath()); 92 | } 93 | } 94 | } 95 | } 96 | } catch (IOException e) { 97 | e.printStackTrace(); 98 | } 99 | return paths.size() > 0 ? paths : null; 100 | } 101 | 102 | /** 103 | * 用于删除输出目录,便于下一次输出 104 | * 105 | * @param path 目录路径 106 | */ 107 | public static void deletePath(String path, boolean isDirectory) { 108 | boolean isHDFS = path.contains(HDFS); 109 | try { 110 | if (isHDFS) { 111 | FileSystem fs = getFileSystem(path); 112 | if (fs.exists(new Path(path))) { 113 | fs.delete(new Path(path), isDirectory); 114 | } 115 | } else { 116 | File file = new File(path); 117 | if (file.exists()) { 118 | // 本地目录递归删除 119 | if (isDirectory) { 120 | File[] subFiles = file.listFiles(); 121 | for (File subFile : subFiles) { 122 | if (subFile.isFile()) { 123 | subFile.delete(); 124 | } else { 125 | deletePath(subFile.getPath(), true); 126 | } 127 | } 128 | } 129 | file.delete(); 130 | } 131 | } 132 | } catch (IOException e) { 133 | e.printStackTrace(); 134 | } 135 | } 136 | 137 | /** 138 | * @param path HDFS 文件系统的目标路径,此路径决定将要使用的文件系统,如果没有指定则会使用默认的文件系统 139 | * @return 返回 FileSystem 实例,用于访问文件系统 140 | */ 141 | public static FileSystem getFileSystem(String path) { 142 | FileSystem fs = null; 143 | try { 144 | fs = FileSystem.get(URI.create(path), new Configuration()); 145 | } catch (IOException e) { 146 | e.printStackTrace(); 147 | } 148 | return fs; 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/pagerank/PageRank.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.pagerank; 2 | 3 | import org.apache.hadoop.conf.Configured; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | import org.apache.hadoop.util.Tool; 11 | import org.apache.hadoop.util.ToolRunner; 12 | 13 | public class PageRank extends Configured implements Tool { 14 | 15 | // 最大的迭代次数 16 | public static final int MAX_ITERATION = 20; 17 | // 从0开始记录当前迭代步数 18 | private static int iteration = 0; 19 | // 配置项中用于记录网页总数的键 20 | public static final String TOTAL_PAGE = "1"; 21 | // 配置项中用于记录当前迭代步数的键 22 | public static final String ITERATION = "2"; 23 | 24 | @Override 25 | public int run(String[] args) throws Exception { 26 | /* 步骤1:设置作业的信息 */ 27 | int totalPage = Integer.parseInt(args[2]); 28 | getConf().setInt(PageRank.TOTAL_PAGE, totalPage); 29 | getConf().setInt(PageRank.ITERATION, iteration); 30 | 31 | Job job = Job.getInstance(getConf(), getClass().getSimpleName()); 32 | // 设置程序的类名 33 | job.setJarByClass(getClass()); 34 | 35 | // 设置数据的输入路径 36 | if (iteration == 0) { 37 | FileInputFormat.addInputPath(job, new Path(args[0])); 38 | } else { 39 | // 将上一次迭代的输出设置为输入 40 | FileInputFormat.addInputPath(job, new Path(args[1] + (iteration - 1))); 41 | } 42 | // 设置数据的输出路径 43 | FileOutputFormat.setOutputPath(job, new Path(args[1] + iteration)); 44 | 45 | // 设置map方法及其输出键值对的数据类型 46 | job.setMapperClass(PageRankMapper.class); 47 | job.setMapOutputKeyClass(Text.class); 48 | job.setMapOutputValueClass(ReducePageRankWritable.class); 49 | 50 | // 设置reduce方法及其输出键值对的数据类型 51 | job.setReducerClass(PageRankReducer.class); 52 | 
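// The reducer emits the whole updated page-info line as the Text key (value is NullWritable),
// so each iteration's output can be read back and re-parsed as the next iteration's input (see main()).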
job.setOutputKeyClass(Text.class); 53 | job.setOutputValueClass(NullWritable.class); 54 | 55 | return job.waitForCompletion(true) ? 0 : -1; 56 | } 57 | 58 | public static void main(String[] args) throws Exception { 59 | /* 步骤2:运行作业 */ 60 | int exitCode = 0; 61 | while (iteration < MAX_ITERATION) { 62 | exitCode = ToolRunner.run(new PageRank(), args); 63 | if (exitCode == -1) { 64 | break; 65 | } 66 | iteration++; 67 | } 68 | System.exit(exitCode); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/pagerank/PageRankMapper.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.pagerank; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | /* 步骤1:确定输出键值对[K1,V1]的数据类型为[LongWritable,Text],确定输出键值对[K2,V2]的数据类型为[Text,ReducePageRankWritable] */ 9 | public class PageRankMapper extends Mapper { 10 | 11 | @Override 12 | protected void map(LongWritable key, Text value, Context context) 13 | throws IOException, InterruptedException { 14 | /* 步骤2:编写处理逻辑将[K1,V1]转换为[K2,V2]并输出 */ 15 | // 以空格为分隔符切分 16 | String[] pageInfo = value.toString().split(" "); 17 | // 网页的排名值 18 | double pageRank = Double.parseDouble(pageInfo[1]); 19 | // 网页的出站链接数 20 | int outLink = (pageInfo.length - 2) / 2; 21 | ReducePageRankWritable writable; 22 | writable = new ReducePageRankWritable(); 23 | // 计算贡献值并保存 24 | writable.setData(String.valueOf(pageRank / outLink)); 25 | // 设置对应标识 26 | writable.setTag(ReducePageRankWritable.PR_L); 27 | // 对于每一个出站链接,输出贡献值 28 | for (int i = 2; i < pageInfo.length; i += 2) { 29 | context.write(new Text(pageInfo[i]), writable); 30 | } 31 | writable = new ReducePageRankWritable(); 32 | // 保存网页信息并标识 33 | writable.setData(value.toString()); 34 | writable.setTag(ReducePageRankWritable.PAGE_INFO); 35 | // 以输入的网页信息的网页名称为key进行输出 36 | context.write(new Text(pageInfo[0]), writable); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/pagerank/PageRankReducer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.pagerank; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.io.NullWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | /* 步骤1:确定输出键值对[K2,V2]的数据类型为[Text,ReducePageRankWritable],确定输出键值对[K3,V3]的数据类型为[Text,NullWritable] */ 9 | public class PageRankReducer extends Reducer { 10 | 11 | // 阻尼系数 12 | private static final double D = 0.85; 13 | 14 | @Override 15 | protected void reduce(Text key, Iterable values, Context context) 16 | throws IOException, InterruptedException { 17 | /* 步骤2:编写处理逻辑将[K2,V2]转换为[K3,V3]并输出 */ 18 | String[] pageInfo = null; 19 | // 从配置项中读取网页的总数 20 | int totalPage = context.getConfiguration().getInt(PageRank.TOTAL_PAGE, 0); 21 | // 从配置项中读取当前迭代步数 22 | int iteration = context.getConfiguration().getInt(PageRank.ITERATION, 0); 23 | double sum = 0; 24 | for (ReducePageRankWritable value : values) { 25 | String tag = value.getTag(); 26 | // 如果是贡献值则进行求和,否则以空格为分隔符切分后保存到pageInfo 27 | if (tag.equals(ReducePageRankWritable.PR_L)) { 28 | sum += Double.parseDouble(value.getData()); 29 | } else if (tag.equals(ReducePageRankWritable.PAGE_INFO)) { 30 | pageInfo = 
value.getData().split(" "); 31 | } 32 | } 33 | // 根据公式计算排名值 34 | double pageRank = (1 - D) / totalPage + D * sum; 35 | // 更新网页信息中的排名值 36 | pageInfo[1] = String.valueOf(pageRank); 37 | // 最后一次迭代输出网页名及排名值,而其余迭代输出网页信息 38 | StringBuilder result = new StringBuilder(); 39 | if (iteration == (PageRank.MAX_ITERATION - 1)) { 40 | result.append(pageInfo[0]).append(" ").append(String.format("%.5f", pageRank)); 41 | } else { 42 | for (String data : pageInfo) { 43 | result.append(data).append(" "); 44 | } 45 | } 46 | context.write(new Text(result.toString()), NullWritable.get()); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/pagerank/README.md: -------------------------------------------------------------------------------- 1 | #### 运行方法 2 | 3 | 修改运行配置,在 Program arguments 中填入 `src/main/resources/inputs/pagerank/data.txt src/main/resources/outputs/pagerank/ 4` -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/pagerank/ReducePageRankWritable.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.pagerank; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | import org.apache.hadoop.io.Writable; 7 | 8 | public class ReducePageRankWritable implements Writable { 9 | 10 | /// 保存贡献值或网页信息 11 | private String data; 12 | // 标识data保存的是贡献值还是网页信息 13 | private String tag; 14 | 15 | // 用于标识的常量 16 | public static final String PAGE_INFO = "1"; 17 | public static final String PR_L = "2"; 18 | 19 | @Override 20 | public void write(DataOutput dataOutput) throws IOException { 21 | dataOutput.writeUTF(tag); 22 | dataOutput.writeUTF(data); 23 | } 24 | 25 | @Override 26 | public void readFields(DataInput dataInput) throws IOException { 27 | tag = dataInput.readUTF(); 28 | data = dataInput.readUTF(); 29 | } 30 | 31 | public String getData() { 32 | return data; 33 | } 34 | 35 | public void setData(String data) { 36 | this.data = data; 37 | } 38 | 39 | public String getTag() { 40 | return tag; 41 | } 42 | 43 | public void setTag(String tag) { 44 | this.tag = tag; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/wordcount/README.md: -------------------------------------------------------------------------------- 1 | #### 运行方法 2 | 3 | 修改运行配置,在 Program arguments 中填入 `src/main/resources/inputs/wordcount/data.txt src/main/resources/outputs/wordcount` 4 | 5 | **Note:当希望在集群上运行词频统计作业时,可以使用下载[🔗](https://github.com/ymcui/Chinese-Cloze-RC/blob/master/people_daily/pd.zip)的数据作为输入数据** -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/wordcount/WordCount.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.wordcount; 2 | 3 | import org.apache.hadoop.conf.Configured; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 | import org.apache.hadoop.util.Tool; 11 | import org.apache.hadoop.util.ToolRunner; 12 | 13 | public 
class WordCount extends Configured implements Tool { 14 | 15 | @Override 16 | public int run(String[] args) throws Exception { 17 | /* 步骤1:设置作业的信息 */ 18 | Job job = Job.getInstance(getConf(), getClass().getSimpleName()); 19 | // 设置程序的类名 20 | job.setJarByClass(getClass()); 21 | 22 | // 设置数据的输入输出路径 23 | FileInputFormat.addInputPath(job, new Path(args[0])); 24 | FileOutputFormat.setOutputPath(job, new Path(args[1])); 25 | 26 | // 设置map和reduce方法 27 | job.setMapperClass(WordCountMapper.class); 28 | job.setReducerClass(WordCountReducer.class); 29 | job.setCombinerClass(WordCountCombiner.class); 30 | 31 | // 设置map方法的输出键值对数据类型 32 | job.setMapOutputKeyClass(Text.class); 33 | job.setMapOutputValueClass(IntWritable.class); 34 | // 设置reduce方法的输出键值对数据类型 35 | job.setOutputKeyClass(Text.class); 36 | job.setOutputValueClass(IntWritable.class); 37 | 38 | return job.waitForCompletion(true) ? 0 : 1; 39 | } 40 | 41 | public static void main(String[] args) throws Exception { 42 | /* 步骤2:运行作业 */ 43 | int exitCode = ToolRunner.run(new WordCount(), args); 44 | System.exit(exitCode); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/wordcount/WordCountCombiner.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.wordcount; 2 | 3 | import java.io.IOException; 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Reducer; 7 | 8 | /* 步骤1:确定输入键值对[K2,List(V2)]的数据类型为[Text, IntWritable],输出键值对[K3,V3]的数据类型为[Text,IntWritable] */ 9 | public class WordCountCombiner extends Reducer { 10 | 11 | @Override 12 | protected void reduce(Text key, Iterable values, Context context) 13 | throws IOException, InterruptedException { 14 | /* 步骤2:将[K2,List(V2)]合并为[K3,V3]并输出 */ 15 | int sum = 0; 16 | // 进行合并操作 17 | for (IntWritable value : values) { 18 | sum += value.get(); 19 | } 20 | // 输出合并的结果 21 | context.write(key, new IntWritable(sum)); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/wordcount/WordCountMapper.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.wordcount; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapreduce.Mapper; 7 | 8 | import java.io.IOException; 9 | 10 | /* 步骤1:确定输入键值对[K1,V1]的数据类型为[LongWritable,Text],输出键值对[K2,V2]的数据类型为[Text,IntWritable] */ 11 | public class WordCountMapper extends Mapper { 12 | 13 | @Override 14 | protected void map(LongWritable key, Text value, Context context) 15 | throws IOException, InterruptedException { 16 | /* 步骤2:编写处理逻辑将[K1,V1]转换为[K2,V2]并输出 */ 17 | // 以空格作为分隔符拆分成单词 18 | String[] datas = value.toString().split(" "); 19 | for (String data : datas) { 20 | // 输出分词结果 21 | context.write(new Text(data), new IntWritable(1)); 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/wordcount/WordCountReducer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.mapreduce.examples.wordcount; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | import 
org.apache.hadoop.mapreduce.Reducer; 6 | 7 | import java.io.IOException; 8 | 9 | /* 步骤1:确定输入键值对[K2,List(V2)]的数据类型为[Text, IntWritable],输出键值对[K3,V3]的数据类型为[Text,IntWritable] */ 10 | public class WordCountReducer extends Reducer { 11 | @Override 12 | protected void reduce(Text key, Iterable values, Context context) 13 | throws IOException, InterruptedException { 14 | /* 步骤2:编写处理逻辑将[K2,List(V2)]转换为[K3,V3]并输出 */ 15 | int sum = 0; 16 | // 遍历累加求和 17 | for (IntWritable value : values) { 18 | sum += value.get(); 19 | } 20 | // 输出计数结果 21 | context.write(key, new IntWritable(sum)); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /MapReduce/src/main/resources/inputs/join/department.csv: -------------------------------------------------------------------------------- 1 | 会计 George 2 | 销售 Harriet -------------------------------------------------------------------------------- /MapReduce/src/main/resources/inputs/join/employee.csv: -------------------------------------------------------------------------------- 1 | Harry 3415 会计 2 | Sally 2241 销售 3 | George 3401 会计 4 | Harriet 2202 销售 5 | Bart 6077 会计 6 | Elise 9263 销售 7 | Gemma 3870 会计 8 | Tyler 6236 销售 9 | Camden 4527 销售 10 | Sofia 2035 会计 -------------------------------------------------------------------------------- /MapReduce/src/main/resources/inputs/join/input_cluster/department.csv: -------------------------------------------------------------------------------- 1 | Oscar 会计 2 | Gavin 销售 3 | -------------------------------------------------------------------------------- /MapReduce/src/main/resources/inputs/kmeans/center/centers.txt: -------------------------------------------------------------------------------- 1 | 1,2 2 | 3,1 -------------------------------------------------------------------------------- /MapReduce/src/main/resources/inputs/kmeans/data.txt: -------------------------------------------------------------------------------- 1 | 0,0 -1 2 | 1,2 -1 3 | 3,1 -1 4 | 8,8 -1 5 | 9,10 -1 6 | 10,7 -1 -------------------------------------------------------------------------------- /MapReduce/src/main/resources/inputs/pagerank/data.txt: -------------------------------------------------------------------------------- 1 | A 1.0 B 1.0 D 1.0 2 | B 1.0 C 1.0 3 | C 1.0 A 1.0 B 1.0 4 | D 1.0 B 1.0 C 1.0 -------------------------------------------------------------------------------- /MapReduce/src/main/resources/inputs/wordcount/data.txt: -------------------------------------------------------------------------------- 1 | An An 2 | My Me 3 | An An 4 | My He 5 | My My 6 | An My -------------------------------------------------------------------------------- /MapReduce/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 《分布式计算系统》示例代码 2 | 3 | 徐辰 编著 《分布式计算系统》 2022年9月 4 | 5 | ## [第2章 Hadoop文件系统](HDFS) 6 | 7 | 1. [写文件](HDFS/src/main/java/cn/edu/ecnu/hdfs/examples/write) 8 | 2. 
[读文件](HDFS/src/main/java/cn/edu/ecnu/hdfs/examples/read) 9 | 10 | ## [第3章 批处理系统MapReduce](MapReduce) 11 | 12 | 1. [词频统计](MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/wordcount) 13 | 2. [关系表自然连接及其优化](MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/join) 14 | 3. [网页链接排名](MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/pagerank) 15 | 4. [K均值聚类](MapReduce/src/main/java/cn/edu/ecnu/mapreduce/examples/kmeans) 16 | 17 | ## [第4章 批处理系统Spark](Spark) 18 | 19 | 1. [词频统计](Spark/src/main/scala/cn/edu/ecnu/spark/examples/scala/wordcount) 20 | 2. [关系表自然连接及其优化](Spark/src/main/scala/cn/edu/ecnu/spark/examples/scala/join) 21 | 3. [网页链接排名](Spark/src/main/scala/cn/edu/ecnu/spark/examples/scala/pagerank) 22 | 4. [K均值聚类](Spark/src/main/scala/cn/edu/ecnu/spark/examples/scala/kmeans) 23 | 5. [检查点](Spark/src/main/scala/cn/edu/ecnu/spark/examples/scala/checkpoint) 24 | 25 | ## [第7章 流计算系统Storm](Storm) 26 | 27 | 1. [词频统计](Storm/src/main/java/cn/edu/ecnu/example/storm/wordcount/withoutAck) 28 | 2. [支持容错的词频统计](Storm/src/main/java/cn/edu/ecnu/example/storm/wordcount/withAck) 29 | 3. [简化的窗口操作](Storm/src/main/java/cn/edu/ecnu/example/storm/wordcount/window) 30 | 4. [异常检测](Storm/src/main/java/cn/edu/ecnu/example/storm/detection) 31 | 32 | ## [第8章 流计算系统Spark Streaming](SparkStreaming) 33 | 34 | 1. [按批词频统计](SparkStreaming/src/main/scala/cn/edu/ecnu/sparkstreaming/examples/scala/wordcount/BatchWordCount.scala) 35 | 2. [全局词频统计](SparkStreaming/src/main/scala/cn/edu/ecnu/sparkstreaming/examples/scala/wordcount/GlobalWordCount.scala) 36 | 3. [窗口操作](SparkStreaming/src/main/scala/cn/edu/ecnu/sparkstreaming/examples/scala/window) 37 | 4. [异常检测](SparkStreaming/src/main/scala/cn/edu/ecnu/sparkstreaming/examples/scala/anomaly) 38 | 39 | ## [第10章 批流融合系统Flink](Flink) 40 | 41 | 1. [词频统计](Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/wordcount) 42 | 2. [斐波那契数列生成](Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/fibonacciexample) 43 | 3. [整数求和*](Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/integersum) 44 | 4. [支持容错的词频统计](Flink/src/main/scala/cn/edu/ecnu/flink/examples/scala/wordcountwithfaulttolerance) 45 | 46 | ## [第11章 图处理系统Giraph](Giraph) 47 | 48 | 1. [连通分量](Giraph/src/main/java/cn/edu/ecnu/giraph/examples/cc) 49 | 2. [单源最短路径](Giraph/src/main/java/cn/edu/ecnu/giraph/examples/sssp) 50 | 3. [网页链接排名](Giraph/src/main/java/cn/edu/ecnu/giraph/examples/pagerank) 51 | 4. 
[K均值聚类](Giraph/src/main/java/cn/edu/ecnu/giraph/examples/kmeans) 52 | -------------------------------------------------------------------------------- /Spark/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | fortest 8 | sparktest 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 13 | org.apache.maven.plugins 14 | maven-compiler-plugin 15 | 16 | 8 17 | 8 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | org.apache.spark 28 | spark-core_2.11 29 | 2.4.0 30 | 31 | 32 | 33 | 34 | org.apache.spark 35 | spark-sql_2.11 36 | 2.4.0 37 | 38 | 39 | 40 | 41 | org.apache.spark 42 | spark-streaming_2.11 43 | 2.4.0 44 | 45 | 46 | 47 | 48 | org.scala-lang 49 | scala-library 50 | 2.11.12 51 | 52 | 53 | 54 | 55 | org.apache.hadoop 56 | hadoop-hdfs 57 | 3.1.0 58 | 59 | 60 | 61 | 62 | org.apache.hadoop 63 | hadoop-client 64 | 3.1.0 65 | 66 | 67 | 68 | 69 | org.apache.spark 70 | spark-streaming-kafka-0-10_2.11 71 | 2.4.0 72 | 73 | 74 | 75 | 76 | 77 | com.thoughtworks.paranamer 78 | paranamer 79 | 2.8 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /Spark/src/main/java/cn/edu/ecnu/spark/examples/java/checkpoint/Checkpoint.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.spark.examples.java.checkpoint; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.api.java.function.Function; 8 | import org.apache.spark.api.java.function.Function2; 9 | import org.apache.spark.api.java.function.PairFlatMapFunction; 10 | import org.apache.spark.api.java.function.PairFunction; 11 | import org.apache.spark.api.java.function.VoidFunction; 12 | import scala.Tuple2; 13 | 14 | import java.util.ArrayList; 15 | import java.util.Iterator; 16 | import java.util.List; 17 | 18 | public class Checkpoint { 19 | public static void run(String[] args) { 20 | /* 步骤1:通过SparkConf设置配置信息,并创建SparkContext */ 21 | SparkConf conf = new SparkConf(); 22 | conf.setAppName("Checkpoint"); 23 | conf.setMaster("local"); // 仅用于本地进行调试,如在集群中运行则删除本行 24 | JavaSparkContext sc = new JavaSparkContext(conf); 25 | 26 | // 设置检查点路径 27 | sc.setCheckpointDir("hdfs://localhost:9000/sout/ck001"); 28 | 29 | /* 步骤2:按应用逻辑使用操作算子编写DAG,其中包括RDD的创建、转换和行动等 */ 30 | int iterateNum = 20; // 指定迭代次数 31 | double factor = 0.85; // 指定系数 32 | 33 | // 读取输入文本数据 34 | JavaRDD text = sc.textFile("src/main/resources/input/pagerank2/pagerank.txt"); 35 | 36 | // 将文本数据转换成(pageId, List(link0, link1, link2...))的形式 37 | JavaPairRDD> links = 38 | text.mapToPair( 39 | new PairFunction>() { 40 | @Override 41 | public Tuple2> call(String line) throws Exception { 42 | String[] tokens = line.split(" "); 43 | List list = new ArrayList<>(); 44 | for (int i = 2; i < tokens.length; i+=2) { 45 | list.add(tokens[i]); 46 | } 47 | return new Tuple2<>(tokens[0], list); 48 | } 49 | }) 50 | .cache(); // 持久化到内存 51 | 52 | long N = Long.parseLong(args[0]); // 从输入中获取网页总数N 53 | 54 | // 初始化每个页面的排名值(pageId, rank) 55 | JavaPairRDD ranks = 56 | text.mapToPair( 57 | new PairFunction() { 58 | @Override 59 | public Tuple2 call(String line) throws Exception { 60 | String[] tokens = line.split(" "); 61 | return new Tuple2<>(tokens[0], Double.valueOf(tokens[1])); 62 | } 63 | }); 64 | 65 | // 执行iterateNum次迭代计算 66 | for (int iter = 1; iter <= iterateNum; iter++) { 67 | JavaPairRDD contributions = 68 | 
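// Recompute the contribution values from the latest ranks in every iteration;
// the checkpoint taken every 5 iterations below truncates the lineage of ranks.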
links 69 | // 将links和ranks做join,得到(pageId, (List(link0, link1, link2...), rank)) 70 | .join(ranks) 71 | // 计算出每个page对其每个link目标page的贡献值 72 | .flatMapToPair( 73 | new PairFlatMapFunction< 74 | Tuple2, Double>>, String, Double>() { 75 | @Override 76 | public Iterator> call( 77 | Tuple2, Double>> t) throws Exception { 78 | List> list = new ArrayList<>(); 79 | for (int i = 0; i < t._2._1.size(); i++) { 80 | // 网页排名值除以链接总数 81 | list.add(new Tuple2<>(t._2._1.get(i), t._2._2 / t._2._1.size())); 82 | } 83 | return list.iterator(); 84 | } 85 | }); 86 | 87 | ranks = 88 | contributions 89 | // 聚合对相同网页的贡献值,求和得到对每个网页的总贡献值 90 | .reduceByKey( 91 | new Function2() { 92 | @Override 93 | public Double call(Double r1, Double r2) throws Exception { 94 | return r1 + r2; 95 | } 96 | }) 97 | // 根据公式计算得到每个网页的新排名值 98 | .mapValues( 99 | new Function() { 100 | @Override 101 | public Double call(Double v) throws Exception { 102 | return (1 - factor) * 1.0 / N + factor * v; 103 | } 104 | }); 105 | 106 | // 每隔5次迭代保存一次检查点 107 | if (iter % 5 == 0) { 108 | // 将要设置检查点的RDD缓存在内存中,避免写检查点时二次计算 109 | ranks.cache(); 110 | // 调用checkpoint方法设置检查点 111 | ranks.checkpoint(); 112 | } 113 | 114 | // 对排名值保留5位小数,并打印每轮迭代的网页排名中间结果 115 | ranks.foreach(new VoidFunction>() { 116 | @Override 117 | public void call(Tuple2 t) throws Exception { 118 | System.out.println(t._1 + " " + String.format("%.5f", t._2)); 119 | } 120 | }); 121 | } 122 | 123 | /* 步骤3:关闭SparkContext */ 124 | sc.stop(); 125 | } 126 | 127 | public static void main(String[] args) { 128 | run(args); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /Spark/src/main/java/cn/edu/ecnu/spark/examples/java/join/BroadcastJoin.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.spark.examples.java.join; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.apache.spark.api.java.function.Function; 7 | import org.apache.spark.api.java.function.PairFunction; 8 | import org.apache.spark.broadcast.Broadcast; 9 | import scala.Tuple2; 10 | 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | import java.util.Map; 14 | 15 | public class BroadcastJoin { 16 | public static void run(String[] args) { 17 | /* 步骤1:通过SparkConf设置配置信息,并创建SparkContext */ 18 | SparkConf conf = new SparkConf(); 19 | conf.setAppName("ShuffleJoin"); 20 | conf.setMaster("local"); // 仅用于本地进行调试,如在集群中运行则删除本行 21 | JavaSparkContext sc = new JavaSparkContext(conf); 22 | 23 | /* 步骤2:按应用逻辑使用操作算子编写DAG,其中包括RDD的创建、转换和行动等 */ 24 | // 读入部门表 25 | Map departmentsMap = 26 | sc.textFile("src/main/resources/input/join/department.csv") 27 | // 按制表符解析为[DeptName, Manager]键值对后collectAsMap到Driver中 28 | .mapToPair( 29 | new PairFunction() { 30 | @Override 31 | public Tuple2 call(String line) throws Exception { 32 | String[] tokens = line.split("\t"); 33 | return new Tuple2(tokens[0], tokens[1]); 34 | } 35 | }) 36 | .collectAsMap(); 37 | 38 | // 广播部门表 39 | Broadcast> departmentsBroadCast = sc.broadcast(departmentsMap); 40 | 41 | // 读入雇员表 42 | JavaRDD> employeesRDD = 43 | sc.textFile("src/main/resources/input/join/employee.csv") 44 | // 按制表符解析为(DeptName, Name, EmpId)元组 45 | .map( 46 | new Function>() { 47 | @Override 48 | public List call(String line) throws Exception { 49 | String[] tokens = line.split("\t"); 50 | List list = new ArrayList(); 51 | list.add(tokens[2]); 52 | list.add(tokens[0]); 53 | list.add(tokens[1]); 54 | 
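// list is ordered as (DeptName, Name, EmpId); index 0 (DeptName) is used below to probe the broadcast department table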
return list; 55 | } 56 | }); 57 | 58 | // 在map转换操作中对雇员表与广播的部门表做自然连接 59 | employeesRDD 60 | .map( 61 | new Function, List>() { 62 | @Override 63 | public List call(List r) throws Exception { 64 | // 获取广播变量部门表的值 65 | Map departmentsBroadCastValue = departmentsBroadCast.value(); 66 | if (departmentsBroadCastValue.containsKey(r.get(0))) { 67 | // 获取departmentsBroadCastValue中对应key的value 68 | String left = departmentsBroadCastValue.get(r.get(0)); 69 | // 返回连接结果(Name, EmpId, DeptName, Manager) 70 | List joinList = new ArrayList<>(); 71 | joinList.add(r.get(1)); 72 | joinList.add(r.get(2)); 73 | joinList.add(r.get(0)); 74 | joinList.add(left); 75 | return joinList; 76 | } else { 77 | return null; 78 | } 79 | } 80 | }) 81 | // 过滤空值 82 | .filter( 83 | new Function, Boolean>() { 84 | @Override 85 | public Boolean call(List joinList) throws Exception { 86 | return joinList != null; 87 | } 88 | }) 89 | .foreach(item -> System.out.println(item)); 90 | 91 | /* |步骤3:关闭SparkContext| */ 92 | sc.stop(); 93 | } 94 | 95 | public static void main(String[] args) { 96 | run(args); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /Spark/src/main/java/cn/edu/ecnu/spark/examples/java/join/ShuffleJoin.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.spark.examples.java.join; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.apache.spark.api.java.function.FlatMapFunction; 7 | import org.apache.spark.api.java.function.PairFunction; 8 | import scala.Tuple2; 9 | 10 | import java.util.ArrayList; 11 | import java.util.Iterator; 12 | import java.util.List; 13 | 14 | public class ShuffleJoin { 15 | public static void run(String[] args) { 16 | /* 步骤1:通过SparkConf设置配置信息,并创建SparkContext */ 17 | SparkConf conf = new SparkConf(); 18 | conf.setAppName("ShuffleJoin"); 19 | conf.setMaster("local"); // 仅用于本地进行调试,如在集群中运行则删除本行 20 | JavaSparkContext sc = new JavaSparkContext(conf); 21 | 22 | /* 步骤2:按应用逻辑使用操作算子编写DAG,其中包括RDD的创建、转换和行动等 */ 23 | // 读入部门表 24 | JavaPairRDD departmentsRDD = 25 | sc.textFile("src/main/resources/input/join/department.csv") 26 | .mapToPair( 27 | new PairFunction() { 28 | @Override 29 | public Tuple2 call(String line) throws Exception { 30 | // 按制表符分割文本行,将每行文本映射为[DeptName, Manager]键值对 31 | String[] tokens = line.split("\t"); 32 | return new Tuple2(tokens[0], tokens[1]); 33 | } 34 | }); 35 | 36 | // 读入雇员表 37 | JavaPairRDD> employeesRDD = 38 | sc.textFile("src/main/resources/input/join/employee.csv") 39 | .mapToPair( 40 | new PairFunction>() { 41 | @Override 42 | public Tuple2> call(String line) throws Exception { 43 | // 按制表符分割文本行,将每行文本映射为[DeptName, Name EmpId]键值对 44 | String[] tokens = line.split("\t"); 45 | List list = new ArrayList(); 46 | for (int i = 0; i <= 1; i++) { 47 | list.add(tokens[i]); 48 | } 49 | return new Tuple2>(tokens[2], list); 50 | } 51 | }); 52 | 53 | // 用coGroup算子对雇员表和部门表按DeptName聚合 54 | employeesRDD 55 | .cogroup(departmentsRDD) 56 | // 对[DeptName, {{Name EmpId}, {Manager}}]进行连接操作 57 | .flatMap( 58 | new FlatMapFunction< 59 | Tuple2>, Iterable>>, List>() { 60 | @Override 61 | public Iterator> call( 62 | Tuple2>, Iterable>> tuple) 63 | throws Exception { 64 | // 返回连接结果(Name, EmpId, DeptName, Manager) 65 | List> list = new ArrayList<>(); 66 | for (List t1 : tuple._2._1) { 67 | for (String t2 : tuple._2._2) { 68 | List newList = new ArrayList<>(); 69 | newList.add(t1.get(0)); 
70 | newList.add(t1.get(1)); 71 | newList.add(tuple._1); 72 | newList.add(t2); 73 | list.add(newList); 74 | } 75 | } 76 | return list.iterator(); 77 | } 78 | }) 79 | .foreach(item -> System.out.println(item)); 80 | 81 | /* |步骤3:关闭SparkContext| */ 82 | sc.stop(); 83 | } 84 | 85 | public static void main(String[] args) { 86 | run(args); 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /Spark/src/main/java/cn/edu/ecnu/spark/examples/java/pagerank/PageRank.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.spark.examples.java.pagerank; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.api.java.function.*; 8 | import scala.Tuple2; 9 | 10 | import java.util.ArrayList; 11 | import java.util.Iterator; 12 | import java.util.List; 13 | 14 | public class PageRank { 15 | public static void run(String[] args) { 16 | /* 步骤1:通过SparkConf设置配置信息,并创建SparkContext */ 17 | SparkConf conf = new SparkConf(); 18 | conf.setAppName("PageRank"); 19 | conf.setMaster("local"); // 仅用于本地进行调试,如在集群中运行则删除本行 20 | JavaSparkContext sc = new JavaSparkContext(conf); 21 | 22 | /* 步骤2:按应用逻辑使用操作算子编写DAG,其中包括RDD的创建、转换和行动等 */ 23 | int iterateNum = 20; // 指定迭代次数 24 | double factor = 0.85; // 指定系数 25 | // 读取输入文本 26 | JavaRDD text = sc.textFile("src/main/resources/input/pagerank/pagerank.txt"); 27 | 28 | // 将文本数据转换成[网页, {链接列表}]键值对 29 | JavaPairRDD> links = 30 | text.mapToPair( 31 | new PairFunction>() { 32 | @Override 33 | public Tuple2> call(String line) throws Exception { 34 | String[] tokens = line.split(" "); 35 | List list = new ArrayList<>(); 36 | for (int i = 2; i < tokens.length; i+=2) { 37 | list.add(tokens[i]); 38 | } 39 | return new Tuple2<>(tokens[0], list); 40 | } 41 | }) 42 | .cache(); // 持久化到内存 43 | 44 | long N = Long.parseLong(args[0]); // 从输入中获取网页总数N 45 | 46 | // 初始化每个页面的排名值,得到[网页, 排名值]键值对 47 | JavaPairRDD ranks = 48 | text.mapToPair( 49 | new PairFunction() { 50 | @Override 51 | public Tuple2 call(String line) throws Exception { 52 | String[] tokens = line.split(" "); 53 | return new Tuple2<>(tokens[0], Double.valueOf(tokens[1])); 54 | } 55 | }); 56 | 57 | // 执行iterateNum次迭代计算 58 | for (int iter = 1; iter <= iterateNum; iter++) { 59 | JavaPairRDD contributions = 60 | links 61 | // 将links和ranks做join,得到[网页, {{链接列表}, 排名值}] 62 | .join(ranks) 63 | // 计算出每个网页对其每个链接网页的贡献值 64 | .flatMapToPair( 65 | new PairFlatMapFunction< 66 | Tuple2, Double>>, String, Double>() { 67 | @Override 68 | public Iterator> call( 69 | Tuple2, Double>> t) throws Exception { 70 | List> list = new ArrayList<>(); 71 | for (int i = 0; i < t._2._1.size(); i++) { 72 | // 网页排名值除以链接总数 73 | list.add(new Tuple2<>(t._2._1.get(i), t._2._2 / t._2._1.size())); 74 | } 75 | return list.iterator(); 76 | } 77 | }); 78 | 79 | ranks = 80 | contributions 81 | // 聚合对相同网页的贡献值,求和得到对每个网页的总贡献值 82 | .reduceByKey( 83 | new Function2() { 84 | @Override 85 | public Double call(Double r1, Double r2) throws Exception { 86 | return r1 + r2; 87 | } 88 | }) 89 | // 根据公式计算得到每个网页的新排名值 90 | .mapValues( 91 | new Function() { 92 | @Override 93 | public Double call(Double v) throws Exception { 94 | return (1 - factor) * 1.0 / N + factor * v; 95 | } 96 | }); 97 | } 98 | // 对排名值保留5位小数,并打印最终网页排名结果 99 | ranks.foreach(new VoidFunction>() { 100 | @Override 101 | public void call(Tuple2 t) throws Exception { 102 | 
System.out.println(t._1 + " " + String.format("%.5f", t._2)); 103 | } 104 | }); 105 | 106 | /* 步骤3:关闭SparkContext */ 107 | sc.stop(); 108 | } 109 | 110 | public static void main(String[] args) { 111 | run(args); 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /Spark/src/main/java/cn/edu/ecnu/spark/examples/java/wordcount/WordCount.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.spark.examples.java.wordcount; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaPairRDD; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.api.java.JavaSparkContext; 7 | import org.apache.spark.api.java.function.*; 8 | import scala.Tuple2; 9 | 10 | import java.util.Arrays; 11 | import java.util.Iterator; 12 | 13 | public class WordCount { 14 | 15 | public static void run(String[] args) { 16 | /* 步骤1:通过SparkConf设置配置信息,并创建SparkContext */ 17 | SparkConf conf = new SparkConf(); 18 | conf.setAppName("WordCount"); 19 | conf.setMaster("local"); // 仅用于本地进行调试,如在集群中运行则删除本行 20 | JavaSparkContext sc = new JavaSparkContext(conf); 21 | 22 | /* 步骤2:按应用逻辑使用操作算子编写DAG,其中包括RDD的创建、转换和行动等 */ 23 | // 读入文本数据,创建名为lines的RDD 24 | JavaRDD lines = sc.textFile("src/main/resources/input/wordcount/words.txt"); 25 | 26 | // 将lines中的每一个文本行按空格分割成单个单词 27 | JavaRDD words = 28 | lines.flatMap( 29 | new FlatMapFunction() { 30 | @Override 31 | public Iterator call(String line) throws Exception { 32 | return Arrays.asList(line.split(" ")).iterator(); 33 | } 34 | }); 35 | // 将每个单词的频数设置为1,即将每个单词映射为[单词, 1] 36 | JavaPairRDD pairs = 37 | words.mapToPair( 38 | new PairFunction() { 39 | @Override 40 | public Tuple2 call(String word) throws Exception { 41 | return new Tuple2(word, 1); 42 | } 43 | }); 44 | // 按单词聚合,并对相同单词的频数使用sum进行累计 45 | JavaPairRDD wordCounts = 46 | pairs 47 | .groupByKey() 48 | .mapToPair( 49 | new PairFunction>, String, Integer>() { 50 | @Override 51 | public Tuple2 call(Tuple2> t) 52 | throws Exception { 53 | Integer sum = Integer.valueOf(0); 54 | for (Integer i : t._2) { 55 | sum += i; 56 | } 57 | return new Tuple2(t._1, sum); 58 | } 59 | }); 60 | // 合并机制 61 | /*JavaPairRDD wordCounts = 62 | pairs.reduceByKey( 63 | new Function2() { 64 | @Override 65 | public Integer call(Integer t1, Integer t2) throws Exception { 66 | return t1 + t2; 67 | } 68 | });*/ 69 | 70 | // 输出词频统计结果 71 | wordCounts.foreach(t -> System.out.println(t._1 + " " + t._2)); 72 | 73 | /* 步骤3:关闭SparkContext */ 74 | sc.stop(); 75 | } 76 | 77 | public static void main(String[] args) { 78 | run(args); 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /Spark/src/main/resources/input/join/department.csv: -------------------------------------------------------------------------------- 1 | 会计 George 2 | 销售 Harriet -------------------------------------------------------------------------------- /Spark/src/main/resources/input/join/employee.csv: -------------------------------------------------------------------------------- 1 | Harry 3415 会计 2 | Sally 2241 销售 3 | George 3401 会计 4 | Harriet 2202 销售 5 | Bart 6077 会计 6 | Elise 9263 销售 7 | Gemma 3870 会计 8 | Tyler 6236 销售 9 | Camden 4527 销售 10 | Sofia 2035 会计 -------------------------------------------------------------------------------- /Spark/src/main/resources/input/kmeans/centers.txt: -------------------------------------------------------------------------------- 1 | 1,2 2 | 3,1 
-------------------------------------------------------------------------------- /Spark/src/main/resources/input/kmeans/data.txt: -------------------------------------------------------------------------------- 1 | 0,0 -1 2 | 1,2 -1 3 | 3,1 -1 4 | 8,8 -1 5 | 9,10 -1 6 | 10,7 -1 -------------------------------------------------------------------------------- /Spark/src/main/resources/input/overleaf/a.txt: -------------------------------------------------------------------------------- 1 | 数据 2 | 体育 3 | 大数据 -------------------------------------------------------------------------------- /Spark/src/main/resources/input/overleaf/c.txt: -------------------------------------------------------------------------------- 1 | 科学 2 | 足球 3 | Spark -------------------------------------------------------------------------------- /Spark/src/main/resources/input/overleaf/e.txt: -------------------------------------------------------------------------------- 1 | 工程 2 | 篮球 3 | Flink -------------------------------------------------------------------------------- /Spark/src/main/resources/input/pagerank/pagerank.txt: -------------------------------------------------------------------------------- 1 | A 1.0 B 1.0 D 1.0 2 | B 1.0 C 1.0 3 | C 1.0 A 1.0 B 1.0 4 | D 1.0 B 1.0 C 1.0 -------------------------------------------------------------------------------- /Spark/src/main/resources/input/wordcount/words.txt: -------------------------------------------------------------------------------- 1 | An An 2 | My Me 3 | An An 4 | My He 5 | My My 6 | An My -------------------------------------------------------------------------------- /Spark/src/main/scala/cn/edu/ecnu/spark/examples/scala/checkpoint/Checkpoint.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.spark.examples.scala.checkpoint 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | object Checkpoint { 6 | def run(args: Array[String]): Unit = { 7 | /* 步骤1:通过SparkConf设置配置信息,并创建SparkContext */ 8 | val conf = new SparkConf() 9 | .setAppName("Checkpoint") 10 | .setMaster("local") // 仅用于本地进行调试,如在集群中运行则删除本行 11 | val sc = new SparkContext(conf) 12 | 13 | // 设置检查点路径 14 | sc.setCheckpointDir("hdfs://localhost:9000/sout/ck001") 15 | 16 | /* 步骤2:按应用逻辑使用操作算子编写DAG,其中包括RDD的创建、转换和行动等 */ 17 | val iterateNum = 20 // 指定迭代次数 18 | val factor = 0.85 // 指定系数 19 | 20 | // 读取输入文本数据 21 | val text = sc.textFile("src/main/resources/input/pagerank/pagerank.txt") 22 | 23 | // 将文本数据转换成(pageId, List(link0, link1, link2...)) 24 | val links = text.map(line => { 25 | val tokens = line.split(" ") 26 | var list = List[String]() 27 | for (i <- 2 until tokens.size by 2) { 28 | list = list :+ tokens(i) 29 | } 30 | (tokens(0), list) 31 | }).cache() // 持久化到内存 32 | 33 | val N = args(0).toLong // 从输入中获取网页总数N 34 | 35 | // 初始化每个页面的排名值(pageId, rank) 36 | var ranks = text.map(line => { 37 | val tokens = line.split(" ") 38 | (tokens(0), tokens(1).toDouble) 39 | }) 40 | 41 | // 执行iterateNum次迭代计算 42 | for (iter <- 1 to iterateNum) { 43 | val contributions = links 44 | // 将links和ranks做join,得到(pageId, (List(link0, link1, link2...), rank)) 45 | .join(ranks) 46 | // 计算出每个page对其每个link目标page的贡献值 47 | .flatMap { 48 | case (pageId, (links, rank)) => 49 | // 网页排名值除以链接总数 50 | links.map(dest => (dest, rank / links.size)) 51 | } 52 | 53 | ranks = contributions 54 | // 聚合对相同网页的贡献值,求和得到对每个网页的总贡献值 55 | .reduceByKey(_ + _) 56 | // 根据公式计算得到每个网页的新排名值 57 | .mapValues(v => (1 - factor) * 1.0 / N + factor * v) 58 | 59 | // 每隔5次迭代保存一次检查点 60 | if (iter % 5 
== 0) { 61 | // 将要设置检查点的RDD缓存在内存中,避免写检查点时二次计算 62 | ranks.cache() 63 | // 调用checkpoint方法设置检查点 64 | ranks.checkpoint() 65 | } 66 | 67 | // 对排名值保留5位小数,并打印每轮迭代的网页排名值 68 | ranks.foreach(t => println(t._1 + " " + t._2.formatted("%.5f"))) 69 | } 70 | 71 | /* 步骤3:关闭SparkContext */ 72 | sc.stop() 73 | } 74 | 75 | def main(args: Array[String]): Unit = { 76 | run(args) 77 | } 78 | } 79 | 80 | -------------------------------------------------------------------------------- /Spark/src/main/scala/cn/edu/ecnu/spark/examples/scala/join/BroadcastJoin.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.spark.examples.scala.join 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | object BroadcastJoin { 6 | 7 | def run(args: Array[String]): Unit = { 8 | /* 步骤1:通过SparkConf设置配置信息,并创建SparkContext */ 9 | val conf = new SparkConf() 10 | .setAppName("BroadcastJoin") 11 | .setMaster("local") // 仅用于本地进行调试,如在集群中运行则删除本行 12 | val sc = new SparkContext(conf) 13 | 14 | /* 步骤2:按应用逻辑使用操作算子编写DAG,其中包括RDD的创建、转换和行动等 */ 15 | // 读入部门表 16 | val departmentsMap = sc.textFile("src/main/resources/input/join/department.csv") 17 | // 按制表符解析为[DeptName, Manager]键值对后collectAsMap到Driver中 18 | .map(line => { 19 | val tokens = line.split("\t") 20 | (tokens(0), tokens(1)) 21 | }).collectAsMap() 22 | 23 | // 广播部门表 24 | val departmentsBroadCast = sc.broadcast(departmentsMap) 25 | 26 | // 读入雇员表 27 | val employeesRDD = sc.textFile("src/main/resources/input/join/employee.csv") 28 | .map(line => { 29 | // 按制表符解析为(DeptName, Name, EmpId)元组 30 | val tokens = line.split("\t") 31 | (tokens(2), tokens(0), tokens(1)) 32 | }) 33 | 34 | // 在map转换操作中对雇员表与广播的部门表做自然连接 35 | employeesRDD.map(r => { 36 | // 获取广播变量部门表的值 37 | val departmentsBroadCastValue = departmentsBroadCast.value 38 | if (departmentsBroadCastValue.contains(r._1)) { 39 | // 获取departmentsBroadCastValue中对应key的value 40 | val left = departmentsBroadCastValue.get(r._1).get 41 | // 返回连接结果(Name, EmpId, DeptName, Manager) 42 | (r._2, r._3, r._1, left) 43 | } else { 44 | null 45 | } 46 | }) 47 | .filter(_ != null) // 过滤空值 48 | .foreach(println) 49 | 50 | /* |步骤3:关闭SparkContext| */ 51 | sc.stop() 52 | } 53 | 54 | def main(args: Array[String]): Unit = { 55 | run(args) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /Spark/src/main/scala/cn/edu/ecnu/spark/examples/scala/join/ShuffleJoin.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.spark.examples.scala.join 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | object ShuffleJoin { 6 | 7 | def run(args: Array[String]): Unit = { 8 | /* 步骤1:通过SparkConf设置配置信息,并创建SparkContext */ 9 | val conf = new SparkConf() 10 | .setAppName("ShuffleJoin") 11 | .setMaster("local") // 仅用于本地进行调试,如在集群中运行则删除本行 12 | val sc = new SparkContext(conf) 13 | 14 | /* 步骤2:按应用逻辑使用操作算子编写DAG,其中包括RDD的创建、转换和行动等 */ 15 | // 读入部门表 16 | val departmentsRDD = sc.textFile("src/main/resources/input/join/department.csv") 17 | .map(line => { 18 | // 按制表符分割文本行,将每行文本映射为[DeptName, Manager]键值对 19 | val tokens = line.split("\t") 20 | (tokens(0), tokens(1)) 21 | }) 22 | 23 | // 读入雇员表 24 | val employeesRDD = sc.textFile("src/main/resources/input/join/employee.csv") 25 | .map(line => { 26 | val tokens = line.split("\t") 27 | // 按制表符分割文本行,将每行文本映射为[DeptName, Name EmpId]键值对 28 | (tokens(2), (tokens(0), tokens(1))) 29 | }) 30 | 31 | // 用coGroup算子对雇员表和部门表按DeptName聚合 32 | employeesRDD.cogroup(departmentsRDD, 2) 33 | // 
对[DeptName, {{Name EmpId}, {Manager}}]进行连接操作 34 | .flatMap( tuple => 35 | // 返回连接结果(Name, EmpId, DeptName, Manager) 36 | for (v <- tuple._2._1.iterator; w <- tuple._2._2.iterator) yield (v._1, v._2, tuple._1, w) 37 | ) 38 | .foreach(println) 39 | 40 | /* |步骤3:关闭SparkContext| */ 41 | sc.stop() 42 | } 43 | 44 | def main(args: Array[String]): Unit = { 45 | run(args) 46 | } 47 | } -------------------------------------------------------------------------------- /Spark/src/main/scala/cn/edu/ecnu/spark/examples/scala/kmeans/KMeans.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.spark.examples.scala.kmeans 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | import scala.math.pow 7 | 8 | 9 | object KMeans { 10 | // 计算两个的点距离的平方 11 | def distanceSquared(p1: Array[Int], p2: Array[Double]): Double = { 12 | var sum = 0.0 13 | for (i <- 0 until p1.size) { 14 | sum += pow(p1(i).toDouble - p2(i), 2) 15 | } 16 | sum 17 | } 18 | 19 | // 计算两个点的和 20 | def addPoints(p1: Array[Int], p2: Array[Int]): Array[Int] = { 21 | val newPoint = ArrayBuffer[Int]() 22 | for (i <- 0 until p1.size) { 23 | newPoint += p1(i) + p2(i) 24 | } 25 | newPoint.toArray 26 | } 27 | 28 | // 计算一群点中距离某个点最近的点的角标 29 | def closestPoint(p: Array[Int], kPoints: Array[Array[Double]]): Int = { 30 | var bestIndex = 0 31 | var closest = Double.PositiveInfinity 32 | // 遍历聚类中心集,并计算与数据点的距离 33 | for (i <- kPoints.indices) { 34 | // 计算数据点与当前聚类中心的距离 35 | val dist = distanceSquared(p, kPoints(i)) 36 | if (dist < closest) { 37 | closest = dist 38 | bestIndex = i 39 | } 40 | } 41 | bestIndex 42 | } 43 | 44 | def run(args: Array[String]): Unit = { 45 | /* 步骤1:通过SparkConf设置配置信息,并创建SparkContext */ 46 | val conf = new SparkConf() 47 | .setAppName("KMeans") 48 | .setMaster("local") // 仅用于本地进行调试,如在集群中运行则删除本行 49 | val sc = new SparkContext(conf) 50 | 51 | /* 步骤2:按应用逻辑使用操作算子编写DAG,其中包括RDD的创建、转换和行动等 */ 52 | val iterateNum = 20 // 指定迭代次数 53 | 54 | // 从数据源读入数据点 55 | val points = sc.textFile("src/main/resources/input/kmeans/data.txt") 56 | // 解析每行数据,并转换为Int类型,并持久化到内存 57 | .map(_.split("\t")(0).split(",").map(_.toInt)) 58 | .cache() 59 | 60 | // 获取设置的初始中心点 61 | val kPoints = sc.textFile("src/main/resources/input/kmeans/centers.txt") 62 | // 解析每行数据,并转换为Int类型 63 | .map(_.split(",").map(_.toDouble)) 64 | .collect() 65 | 66 | // 执行iterateNum次迭代计算 67 | for (iter <- 1 to iterateNum - 1) { 68 | val closest = points.map(p => { 69 | // 计算距离最近的聚类中心 70 | (closestPoint(p, kPoints), (p, 1)) 71 | }) 72 | 73 | // 按类别号标识聚合,并计算新的聚类中心 74 | val newPoints = closest.reduceByKey { 75 | (t1, t2) => { 76 | // 计算两个点的和,并累加数据点个数 77 | (addPoints(t1._1, t2._1), t1._2 + t2._2) 78 | } 79 | }.map { 80 | case (index, (point, n)) => { 81 | val newPoint = ArrayBuffer[Double]() 82 | for (i <- point.indices) { 83 | // 每个维度的和值除以数据点个数得到每个维度的均值 84 | newPoint += point(i).toDouble / n 85 | } 86 | newPoint.toArray 87 | } 88 | }.collect() 89 | 90 | // 将旧的聚类中心替换为新的聚类中心 91 | for (i <- kPoints.indices) { 92 | kPoints(i) = newPoints(i) 93 | } 94 | 95 | // 如果是最后一次迭代,则输出聚类结果 96 | if (iter == iterateNum - 1) { 97 | closest.foreach(item => { 98 | for (i <- 0 until item._2._1.size - 1) { 99 | print(item._2._1(i) + ",") 100 | } 101 | print(item._2._1(item._2._1.size - 1) + " ") 102 | println((item._1 + 1).toDouble) 103 | }) 104 | } 105 | 106 | } 107 | 108 | /* 步骤3:关闭SparkContext */ 109 | sc.stop() 110 | } 111 | 112 | def main(args: Array[String]): Unit = { 113 | run(args) 114 | } 115 | } 116 | 
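// Optional sketch: the driver above runs a fixed number of iterations (iterateNum). An
// alternative is to stop early once the centroids stop moving. The helper below is only
// an illustration; KMeansConvergence and epsilon are illustrative names, not part of the
// original example, and it assumes a copy of the previous centers is kept between iterations.
object KMeansConvergence {
  // squared Euclidean distance between two centroids of the same dimension
  private def squaredDistance(a: Array[Double], b: Array[Double]): Double =
    a.zip(b).map { case (x, y) => math.pow(x - y, 2) }.sum

  // true when every centroid has moved by less than epsilon (in squared distance)
  def converged(oldCenters: Array[Array[Double]],
                newCenters: Array[Array[Double]],
                epsilon: Double): Boolean =
    oldCenters.zip(newCenters).forall { case (o, n) => squaredDistance(o, n) < epsilon }
}
// Usage idea (inside the iteration loop, assuming oldKPoints holds the previous centers):
//   if (KMeansConvergence.converged(oldKPoints, kPoints, 1e-6)) { /* stop iterating */ }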
-------------------------------------------------------------------------------- /Spark/src/main/scala/cn/edu/ecnu/spark/examples/scala/pagerank/PageRank.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.spark.examples.scala.pagerank 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | object PageRank { 6 | def run(args: Array[String]): Unit = { 7 | /* 步骤1:通过SparkConf设置配置信息,并创建SparkContext */ 8 | val conf = new SparkConf() 9 | .setAppName("PageRank") 10 | .setMaster("local") // 仅用于本地进行调试,如在集群中运行则删除本行 11 | val sc = new SparkContext(conf) 12 | 13 | /* 步骤2:按应用逻辑使用操作算子编写DAG,其中包括RDD的创建、转换和行动等 */ 14 | val iterateNum = 20 // 指定迭代次数 15 | val factor = 0.85 // 指定系数 16 | // 读取输入文本数据 17 | val text = sc.textFile("src/main/resources/input/pagerank/pagerank.txt") 18 | 19 | // 将文本数据转换成[网页, {链接列表}]键值对 20 | val links = text.map(line => { 21 | val tokens = line.split(" ") 22 | var list = List[String]() 23 | for (i <- 2 until tokens.size by 2) { 24 | list = list :+ tokens(i) 25 | } 26 | (tokens(0), list) 27 | }).cache() // 持久化到内存 28 | 29 | val N = args(0).toLong // 从输入中获取网页总数N 30 | 31 | // 初始化每个页面的排名值,得到[网页, 排名值]键值对 32 | var ranks = text.map(line => { 33 | val tokens = line.split(" ") 34 | (tokens(0), tokens(1).toDouble) 35 | }) 36 | 37 | // 执行iterateNum次迭代计算 38 | for (iter <- 1 to iterateNum) { 39 | val contributions = links 40 | // 将links和ranks做join,得到[网页, {{链接列表}, 排名值}] 41 | .join(ranks) 42 | // 计算出每个网页对其每个链接网页的贡献值 43 | .flatMap { 44 | case (pageId, (links, rank)) => 45 | // 网页排名值除以链接总数 46 | links.map(dest => (dest, rank / links.size)) 47 | } 48 | 49 | ranks = contributions 50 | // 聚合对相同网页的贡献值,求和得到对每个网页的总贡献值 51 | .reduceByKey(_ + _) 52 | // 根据公式计算得到每个网页的新排名值 53 | .mapValues(v => (1 - factor) * 1.0 / N + factor * v) 54 | 55 | } 56 | 57 | // 对排名值保留5位小数,并打印最终网页排名结果 58 | ranks.foreach(t => println(t._1 + " " + t._2.formatted("%.5f"))) 59 | 60 | /* 步骤3:关闭SparkContext */ 61 | sc.stop() 62 | } 63 | 64 | def main(args: Array[String]): Unit = { 65 | run(args) 66 | } 67 | } -------------------------------------------------------------------------------- /Spark/src/main/scala/cn/edu/ecnu/spark/examples/scala/wordcount/WordCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.spark.examples.scala.wordcount 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | object WordCount { 6 | 7 | def run(args: Array[String]): Unit = { 8 | /* 步骤1:通过SparkConf设置配置信息,并创建SparkContext */ 9 | val conf = new SparkConf() 10 | .setAppName("WordCount") 11 | .setMaster("local") // 仅用于本地进行调试,如在集群中运行则删除本行 12 | val sc = new SparkContext(conf) 13 | 14 | /* 步骤2:按应用逻辑使用操作算子编写DAG,其中包括RDD的创建、转换和行动等 */ 15 | // 读入文本数据,创建名为lines的RDD 16 | val lines = sc.textFile("src/main/resources/input/wordcount/words.txt") 17 | // 将lines中的每一个文本行按空格分割成单个单词 18 | val words = lines.flatMap { line => line.split(" ") } 19 | // 将每个单词的频数设置为1,即将每个单词映射为[单词, 1] 20 | val pairs = words.map { word => (word, 1) } 21 | 22 | // 按单词聚合,并对相同单词的频数使用sum进行累计 23 | val wordCounts = pairs.groupByKey().map(t => (t._1, t._2.sum)) 24 | // 如需使用合并机制则将第上一行替换为下行 25 | // val wordCounts = pairs.reduceByKey(_+_) 26 | 27 | // 输出词频统计结果 28 | wordCounts.foreach(t => { println(t._1 + " " + t._2) }) 29 | 30 | /* 步骤3:关闭SparkContext */ 31 | sc.stop() 32 | } 33 | 34 | def main(args: Array[String]): Unit = { 35 | run(args) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /SparkStreaming/pom.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | fortest 8 | sparktest 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 13 | org.apache.maven.plugins 14 | maven-compiler-plugin 15 | 16 | 8 17 | 8 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | org.apache.spark 28 | spark-core_2.11 29 | 2.4.0 30 | 31 | 32 | 33 | 34 | org.apache.spark 35 | spark-sql_2.11 36 | 2.4.0 37 | 38 | 39 | 40 | 41 | org.apache.spark 42 | spark-streaming_2.11 43 | 2.4.0 44 | 45 | 46 | 47 | 48 | org.scala-lang 49 | scala-library 50 | 2.11.12 51 | 52 | 53 | 54 | 55 | org.apache.hadoop 56 | hadoop-hdfs 57 | 3.1.0 58 | 59 | 60 | 61 | 62 | org.apache.hadoop 63 | hadoop-client 64 | 3.1.0 65 | 66 | 67 | 68 | 69 | org.apache.spark 70 | spark-streaming-kafka-0-10_2.11 71 | 2.4.0 72 | 73 | 74 | 75 | 76 | 77 | com.thoughtworks.paranamer 78 | paranamer 79 | 2.8 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /SparkStreaming/src/main/java/cn/edu/ecnu/sparkstreaming/examples/java/anomaly/AnomalyDetection.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.sparkstreaming.examples.java.anomaly; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.function.Function; 5 | import org.apache.spark.api.java.function.PairFunction; 6 | import org.apache.spark.streaming.Durations; 7 | import org.apache.spark.streaming.api.java.JavaPairDStream; 8 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 9 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 10 | import scala.Tuple2; 11 | 12 | public class AnomalyDetection { 13 | public static void run(String[] args) throws InterruptedException { 14 | /* 步骤1:通过SparkConf设置配置信息,并创建StreamingContext */ 15 | SparkConf sparkConf = 16 | new SparkConf() 17 | .setAppName("AnomalyDetection") 18 | .setMaster("local[*]"); // 仅用于本地进行调试,如在集群中运行则删除该行 19 | JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(1)); 20 | 21 | /* 步骤2:按应用逻辑使用操作算子编写DAG,包括DStream的输入、转换和输出等 */ 22 | // 模型参数 23 | Double w = 1.1; 24 | Double b = 2.2; 25 | Double delta = 0.5; 26 | 27 | // 从指定的主机名和端口号接收数据 28 | JavaReceiverInputDStream inputDStream = ssc.socketTextStream("localhost", 9999); 29 | 30 | // 按逗号分割解析每行数据,并转化为Double类型 31 | JavaPairDStream anomaly = 32 | inputDStream 33 | .mapToPair( 34 | new PairFunction() { 35 | @Override 36 | public Tuple2 call(String line) throws Exception { 37 | String[] tokens = line.split(","); 38 | return new Tuple2<>(Double.valueOf(tokens[0]), Double.valueOf(tokens[1])); 39 | } 40 | }) 41 | .filter( 42 | new Function, Boolean>() { // 使用线性模型检测异常 43 | @Override 44 | public Boolean call(Tuple2 t) throws Exception { 45 | return Math.abs(w * t._1 + b - t._2) > delta; 46 | } 47 | }); 48 | 49 | // 输出异常 50 | anomaly.print(); 51 | 52 | /* 步骤3:开启计算并等待计算结束 */ 53 | ssc.start(); 54 | ssc.awaitTermination(); 55 | } 56 | 57 | public static void main(String[] args) throws InterruptedException { 58 | run(args); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /SparkStreaming/src/main/java/cn/edu/ecnu/sparkstreaming/examples/java/window/Window.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.sparkstreaming.examples.java.window; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.function.FlatMapFunction; 5 | import 
org.apache.spark.api.java.function.Function2; 6 | import org.apache.spark.api.java.function.PairFunction; 7 | import org.apache.spark.streaming.Duration; 8 | import org.apache.spark.streaming.Durations; 9 | import org.apache.spark.streaming.api.java.JavaDStream; 10 | import org.apache.spark.streaming.api.java.JavaPairDStream; 11 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 12 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 13 | import org.apache.spark.streaming.dstream.DStream; 14 | import scala.Tuple2; 15 | 16 | import java.util.*; 17 | 18 | public class Window { 19 | public static void run(String[] args) throws InterruptedException { 20 | /* 步骤1:通过SparkConf设置配置信息,并创建StreamingContext */ 21 | SparkConf sparkConf = 22 | new SparkConf().setAppName("Window").setMaster("local[*]"); // 仅用于本地进行调试,如在集群中运行则删除该行 23 | JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(5)); 24 | 25 | // 如需使用增量式窗口操作则必须设置检查点路径 26 | // ssc.checkpoint("hdfs://localhost:9000/sparkstreaming/checkpoint") 27 | 28 | /* 步骤2:按应用逻辑使用操作算子编写DAG,包括DStream的输入、转换和输出等 */ 29 | // 从指定的主机名和端口号接收数据 30 | JavaReceiverInputDStream inputDStream = ssc.socketTextStream("localhost", 9999); 31 | 32 | // 将接收到的文本行数据按空格分割 33 | JavaDStream words = 34 | inputDStream.flatMap( 35 | new FlatMapFunction() { 36 | @Override 37 | public Iterator call(String line) throws Exception { 38 | return Arrays.asList(line.split(" ")).iterator(); 39 | } 40 | }); 41 | 42 | // 将每个单词映射为[word, 1]键值对 43 | JavaPairDStream mapToPairDStream = 44 | words.mapToPair( 45 | new PairFunction() { 46 | @Override 47 | public Tuple2 call(String word) throws Exception { 48 | return new Tuple2(word, 1); 49 | } 50 | }); 51 | 52 | // 按单词聚合,对相同单词的频数进行累计 53 | JavaPairDStream wordCounts = 54 | mapToPairDStream.reduceByKeyAndWindow( 55 | new Function2() { 56 | @Override 57 | public Integer call(Integer v1, Integer v2) throws Exception { 58 | return v1 + v2; 59 | } 60 | }, 61 | Durations.seconds(10), 62 | Durations.seconds(5)); 63 | 64 | // 如需使用增量式窗口操作则将上方的reduceByKeyAndWindow方法替换为下方的reduceByKeyAndWindow方法 65 | /*DStream> wordCounts = 66 | mapToPairDStream 67 | .reduceByKeyAndWindow( 68 | new Function2() { 69 | @Override 70 | public Integer call(Integer v1, Integer v2) throws Exception { 71 | return v1 + v2; 72 | } 73 | }, 74 | new Function2() { 75 | @Override 76 | public Integer call(Integer v1, Integer v2) throws Exception { 77 | return v1 - v2; 78 | } 79 | }, 80 | Durations.seconds(10), 81 | Durations.seconds(5)) 82 | .checkpoint(Durations.seconds(25));*/ 83 | 84 | // 打印结果 85 | wordCounts.print(); 86 | 87 | /* 步骤3:开启计算并等待计算结束 */ 88 | ssc.start(); 89 | ssc.awaitTermination(); 90 | } 91 | 92 | public static void main(String[] args) throws InterruptedException { 93 | run(args); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /SparkStreaming/src/main/java/cn/edu/ecnu/sparkstreaming/examples/java/wordcount/BatchWordCount.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.sparkstreaming.examples.java.wordcount; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.function.FlatMapFunction; 5 | import org.apache.spark.api.java.function.Function2; 6 | import org.apache.spark.api.java.function.PairFunction; 7 | import org.apache.spark.streaming.Durations; 8 | import org.apache.spark.streaming.api.java.JavaDStream; 9 | import org.apache.spark.streaming.api.java.JavaPairDStream; 10 | import 
org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 11 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 12 | import scala.Tuple2; 13 | 14 | import java.util.*; 15 | 16 | public class BatchWordCount { 17 | public static void run(String[] args) throws InterruptedException { 18 | /* 步骤1:通过SparkConf设置配置信息,并创建StreamingContext */ 19 | SparkConf sparkConf = 20 | new SparkConf() 21 | .setAppName("BatchWordCount") 22 | .setMaster("local[*]"); // 仅用于本地进行调试,如在集群中运行则删除该行 23 | JavaStreamingContext ssc = new JavaStreamingContext(sparkConf, Durations.seconds(5)); 24 | 25 | /* 步骤2:按应用逻辑使用操作算子编写DAG,包括DStream的输入、转换和输出等 */ 26 | // 从指定的主机名和端口号接收数据 27 | JavaReceiverInputDStream inputDStream = 28 | ssc.socketTextStream("localhost", 9999); 29 | 30 | // 将接收到的文本行数据按空格分割 31 | JavaDStream words = 32 | inputDStream.flatMap( 33 | new FlatMapFunction() { 34 | @Override 35 | public Iterator call(String line) throws Exception { 36 | return Arrays.asList(line.split(" ")).iterator(); 37 | } 38 | }); 39 | 40 | // 并将每个单词映射为[word, 1]键值对 41 | JavaPairDStream mapToPairDStream = 42 | words.mapToPair( 43 | new PairFunction() { 44 | @Override 45 | public Tuple2 call(String word) throws Exception { 46 | return new Tuple2(word, 1); 47 | } 48 | }); 49 | 50 | // 按单词聚合,对相同单词的频数进行累计 51 | JavaPairDStream wordCounts = 52 | mapToPairDStream.reduceByKey( 53 | new Function2() { 54 | @Override 55 | public Integer call(Integer i1, Integer i2) throws Exception { 56 | return i1 + i2; 57 | } 58 | }); 59 | 60 | // 打印结果 61 | wordCounts.print(); 62 | 63 | /* 步骤3:开启计算并等待计算结束 */ 64 | ssc.start(); 65 | ssc.awaitTermination(); 66 | } 67 | 68 | public static void main(String[] args) throws InterruptedException { 69 | run(args); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /SparkStreaming/src/main/java/cn/edu/ecnu/sparkstreaming/examples/java/wordcount/GlobalWordCount.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.sparkstreaming.examples.java.wordcount; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.Optional; 5 | import org.apache.spark.api.java.function.FlatMapFunction; 6 | import org.apache.spark.api.java.function.Function2; 7 | import org.apache.spark.api.java.function.PairFunction; 8 | import org.apache.spark.streaming.Durations; 9 | import org.apache.spark.streaming.api.java.JavaPairDStream; 10 | import org.apache.spark.streaming.api.java.JavaReceiverInputDStream; 11 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 12 | import org.apache.spark.streaming.dstream.DStream; 13 | import scala.Tuple2; 14 | 15 | import java.util.*; 16 | 17 | public class GlobalWordCount { 18 | public static void run(String[] args) throws InterruptedException { 19 | /* 步骤1:通过SparkConf设置配置信息,并创建StreamingContext */ 20 | SparkConf conf = 21 | new SparkConf() 22 | .setAppName("GlobalWordCount") 23 | .setMaster("local[*]"); // 仅用于本地进行调试,如在集群中运行则删除该行 24 | JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(5)); 25 | 26 | // 若使用了有状态算子,则必须设置checkpoint 27 | ssc.checkpoint("hdfs://localhost:9000/spark/checkpoint"); 28 | 29 | /* 步骤2:按应用逻辑使用操作算子编写DAG,包括DStream的输入、转换和输出等 */ 30 | // 将接收到的文本行数据按空格分割 31 | JavaReceiverInputDStream inputDStream = ssc.socketTextStream("localhost", 9999); 32 | 33 | // 将每个单词映射为[word, 1]键值对 34 | JavaPairDStream pairsDStream = 35 | inputDStream 36 | .flatMap( 37 | new FlatMapFunction() { 38 | @Override 39 | public Iterator call(String line) 
throws Exception { 40 | return Arrays.asList(line.split(" ")).iterator(); 41 | } 42 | }) 43 | .mapToPair( 44 | new PairFunction() { 45 | @Override 46 | public Tuple2 call(String word) throws Exception { 47 | return new Tuple2(word, 1); 48 | } 49 | }); 50 | 51 | // 使用updateStateByKey根据状态值和新到达数据统计词频 52 | DStream> wordCounts = 53 | pairsDStream 54 | .updateStateByKey( 55 | new Function2, Optional, Optional>() { 56 | @Override 57 | public Optional call(List values, Optional state) 58 | throws Exception { 59 | Integer updatedValue = 0; 60 | if (state.isPresent()) { 61 | updatedValue = state.get(); 62 | } 63 | for (Integer value : values) { 64 | updatedValue += value; 65 | } 66 | return Optional.of(updatedValue); 67 | } 68 | }) 69 | .checkpoint(Durations.seconds(25)); // 设置检查点间隔,最佳实践为批次间隔的5~10倍 70 | 71 | wordCounts.print(); // 打印结果 72 | 73 | /* 步骤3:开启计算并等待计算结束 */ 74 | ssc.start(); 75 | ssc.awaitTermination(); 76 | } 77 | 78 | public static void main(String[] args) throws InterruptedException { 79 | run(args); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /SparkStreaming/src/main/scala/cn/edu/ecnu/sparkstreaming/examples/scala/anomaly/AnomalyDetection.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.sparkstreaming.examples.scala.anomaly 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | 6 | object AnomalyDetection { 7 | def run(args: Array[String]): Unit = { 8 | /* 步骤1:通过SparkConf设置配置信息,并创建StreamingContext */ 9 | val sparkConf = new SparkConf() 10 | .setAppName("AnomalyDetection") 11 | .setMaster("local[*]") // 仅用于本地进行调试,如在集群中运行则删除该行 12 | val ssc = new StreamingContext(sparkConf, Seconds(1)) 13 | 14 | /* 步骤2:按应用逻辑使用操作算子编写DAG,包括DStream的输入、转换和输出等 */ 15 | // 模型参数 16 | val w = 1.1 17 | val b = 2.2 18 | val delta = 0.5 19 | 20 | // 从指定的主机名和端口号接收数据 21 | val inputDStream = ssc.socketTextStream("localhost", 9999) 22 | 23 | // 按逗号分割解析每行数据,并转化为Double类型 24 | val anomaly = inputDStream 25 | .map(line => { 26 | val tokens = line.split(",") 27 | (tokens(0).toDouble, tokens(1).toDouble) 28 | }) 29 | // 使用线性模型检测异常 30 | .filter(t => { 31 | Math.abs(w * t._1 + b - t._2) > delta 32 | }) 33 | 34 | // 输出异常 35 | anomaly.print() 36 | 37 | /* 步骤3:开启计算并等待计算结束 */ 38 | ssc.start() 39 | ssc.awaitTermination() 40 | } 41 | 42 | def main(args: Array[String]): Unit = { 43 | run(args) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /SparkStreaming/src/main/scala/cn/edu/ecnu/sparkstreaming/examples/scala/window/Window.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.sparkstreaming.examples.scala.window 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | 6 | object Window { 7 | def run(args: Array[String]): Unit = { 8 | /* 步骤1:通过SparkConf设置配置信息,并创建StreamingContext */ 9 | val sparkConf = new SparkConf() 10 | .setAppName("Window") 11 | .setMaster("local[*]") // 仅用于本地进行调试,如在集群中运行则删除该行 12 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 13 | 14 | // 如需使用增量式窗口操作则必须设置检查点路径 15 | // ssc.checkpoint("hdfs://localhost:9000/sparkstreaming/checkpoint") 16 | 17 | /* 步骤2:按应用逻辑使用操作算子编写DAG,包括DStream的输入、转换和输出等 */ 18 | // 从指定的主机名和端口号接收数据 19 | val inputDStream = ssc.socketTextStream("localhost", 9999) 20 | 21 | // 将接收到的文本行数据按空格分割,并将每个单词映射为[word, 1]键值对 22 | val pairsDStream = 
inputDStream.flatMap(_.split(" ")).map(x => (x, 1)) 23 | 24 | // 按单词聚合,对相同单词的频数进行累计 25 | val wordCounts = pairsDStream.reduceByKeyAndWindow((a: Int, b: Int) => (a + b), Seconds(10), Seconds(5)) 26 | // 如需使用增量式窗口操作则将上一行替换为下行 27 | // val wordCounts = pairsDStream.reduceByKeyAndWindow(_+_, _-_, Seconds(10), Seconds(5)).checkpoint(Seconds(25)) 28 | 29 | // 打印结果 30 | wordCounts.print() 31 | 32 | /* 步骤3:开启计算并等待计算结束 */ 33 | ssc.start() 34 | ssc.awaitTermination() 35 | } 36 | 37 | def main(args: Array[String]): Unit = { 38 | run(args) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /SparkStreaming/src/main/scala/cn/edu/ecnu/sparkstreaming/examples/scala/wordcount/BatchWordCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.sparkstreaming.examples.scala.wordcount 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | 6 | object BatchWordCount { 7 | def run(args: Array[String]): Unit = { 8 | /* 步骤1:通过SparkConf设置配置信息,并创建StreamingContext */ 9 | val sparkConf = new SparkConf() 10 | .setAppName("BatchWordCount") 11 | .setMaster("local[*]") // 仅用于本地进行调试,如在集群中运行则删除该行 12 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 13 | 14 | /* 步骤2:按应用逻辑使用操作算子编写DAG,包括DStream的输入、转换和输出等 */ 15 | // 从指定的主机名和端口号接收数据 16 | val inputDStream = ssc.socketTextStream("localhost", 9999) 17 | 18 | // 将接收到的文本行数据按空格分割,并将每个单词映射为[word, 1]键值对 19 | val pairsDStream = inputDStream.flatMap(_.split(" ")).map(x => (x, 1)) 20 | // 按单词聚合,对相同单词的频数进行累计 21 | val wordCounts = pairsDStream.reduceByKey((t1: Int, t2: Int) => t1 + t2) 22 | // 打印结果 23 | wordCounts.print() 24 | 25 | /* 步骤3:开启计算并等待计算结束 */ 26 | ssc.start() 27 | ssc.awaitTermination() 28 | } 29 | 30 | def main(args: Array[String]): Unit = { 31 | run(args) 32 | } 33 | } -------------------------------------------------------------------------------- /SparkStreaming/src/main/scala/cn/edu/ecnu/sparkstreaming/examples/scala/wordcount/GlobalWordCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.sparkstreaming.examples.scala.wordcount 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | 6 | object GlobalWordCount { 7 | def run(args: Array[String]): Unit = { 8 | /* 步骤1:通过SparkConf设置配置信息,并创建StreamingContext */ 9 | val conf = new SparkConf() 10 | .setAppName("GlobalWordCount") 11 | .setMaster("local[*]") // 仅用于本地进行调试,如在集群中运行则删除该行 12 | val ssc = new StreamingContext(conf, Seconds(5)) 13 | 14 | // 若使用了有状态算子,则必须设置checkpoint 15 | ssc.checkpoint("hdfs://localhost:9000/sparkstreaming/checkpoint") 16 | 17 | /* 步骤2:按应用逻辑使用操作算子编写DAG,包括DStream的输入、转换和输出等 */ 18 | // 从指定的主机名和端口号接收数据 19 | val inputDStream = ssc.socketTextStream("localhost", 9999) 20 | 21 | // 将接收到的文本行数据按空格分割,并将每个单词映射为[word, 1]键值对 22 | val pairsDStream = inputDStream.flatMap(_.split(" ")).map(word => (word, 1)) 23 | 24 | // 使用updateStateByKey根据状态值和新到达数据统计词频 25 | val wordCounts = pairsDStream.updateStateByKey( 26 | (curValues: Seq[Int], preValue: Option[Int]) => { 27 | val curValue = curValues.sum 28 | Some(curValue + preValue.getOrElse(0)) 29 | }) 30 | .checkpoint(Seconds(25)) // 设置检查点间隔,最佳实践为批次间隔的5~10倍 31 | 32 | wordCounts.print() // 打印结果 33 | 34 | /* 步骤3:开启计算并等待计算结束 */ 35 | ssc.start() 36 | ssc.awaitTermination() 37 | } 38 | 39 | def main(args: Array[String]): Unit = { 40 | run(args) 41 | } 42 | } 43 | 
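The two word-count variants above differ only in how state is kept: BatchWordCount counts within each micro-batch, while GlobalWordCount keeps a running total with updateStateByKey, which rewrites the state of every key on every batch. As a point of comparison, below is a minimal, hypothetical Java sketch (not part of this repository) of the same global count using mapWithState, which updates only the keys that appear in the current batch; the class name MapWithStateWordCount and the checkpoint path are assumptions.

```java
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function3;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.State;
import org.apache.spark.streaming.StateSpec;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

public class MapWithStateWordCount {
  public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setAppName("MapWithStateWordCount").setMaster("local[*]");
    JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(5));
    // stateful operators still require a checkpoint directory (path is an assumption)
    ssc.checkpoint("hdfs://localhost:9000/sparkstreaming/checkpoint");

    // same socket source and [word, 1] pairs as in the examples above
    JavaPairDStream<String, Integer> pairs =
        ssc.socketTextStream("localhost", 9999)
            .flatMap(line -> Arrays.asList(line.split(" ")).iterator())
            .mapToPair(word -> new Tuple2<>(word, 1));

    // for each key seen in the current batch, add its new count to the stored state
    Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
        (word, one, state) -> {
          int sum = one.orElse(0) + (state.exists() ? state.get() : 0);
          state.update(sum);
          return new Tuple2<>(word, sum);
        };

    pairs.mapWithState(StateSpec.function(mappingFunc)).print();

    ssc.start();
    ssc.awaitTermination();
  }
}
```

StateSpec also allows an idle timeout via StateSpec.timeout, so state for keys that stop arriving can be dropped instead of growing without bound.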
-------------------------------------------------------------------------------- /Storm/README.md: -------------------------------------------------------------------------------- 1 | # Storm Programming Examples 2 | 3 | This module contains four programming examples: 4 | 5 | 1. [Word count](src/main/java/cn/edu/ecnu/example/storm/wordcount/withoutAck) 6 | 2. [Word count with fault tolerance](src/main/java/cn/edu/ecnu/example/storm/wordcount/withAck) 7 | 3. [Simplified window operation](src/main/java/cn/edu/ecnu/example/storm/wordcount/window) 8 | 4. [Anomaly detection](src/main/java/cn/edu/ecnu/example/storm/detection)
-------------------------------------------------------------------------------- /Storm/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | 5 | <groupId>Storm</groupId> 6 | <artifactId>Storm</artifactId> 7 | <version>1.0-SNAPSHOT</version> 8 | <properties> 9 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> 10 | </properties> 11 | 12 | <repositories> 13 | <repository> 14 | <id>maven-ali</id> 15 | <url>http://maven.aliyun.com/nexus/content/groups/public//</url> 16 | <releases> 17 | <enabled>true</enabled> 18 | </releases> 19 | <snapshots> 20 | <enabled>false</enabled> 21 | </snapshots> 22 | </repository> 23 | </repositories> 24 | 25 | <dependencies> 26 | <dependency> 27 | <groupId>org.apache.storm</groupId> 28 | <artifactId>storm-core</artifactId> 29 | <version>1.2.3</version> 30 | </dependency> 31 | <dependency> 32 | <groupId>org.apache.storm</groupId> 33 | <artifactId>storm-client</artifactId> 34 | <version>2.1.0</version> 35 | <scope>provided</scope> 36 | </dependency> 37 | </dependencies> 38 | 39 | <build> 40 | <plugins> 41 | <plugin> 42 | <groupId>org.apache.maven.plugins</groupId> 43 | <artifactId>maven-compiler-plugin</artifactId> 44 | <configuration> 45 | <source>1.8</source> 46 | <target>1.8</target> 47 | </configuration> 48 | </plugin> 49 | </plugins> 50 | </build> 51 | </project>
-------------------------------------------------------------------------------- /Storm/src/main/java/cn/edu/ecnu/example/storm/common/SocketSpout.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.example.storm.common; 2 | 3 | import org.apache.storm.spout.SpoutOutputCollector; 4 | import org.apache.storm.task.TopologyContext; 5 | import org.apache.storm.topology.OutputFieldsDeclarer; 6 | import org.apache.storm.topology.base.BaseRichSpout; 7 | import org.apache.storm.tuple.Fields; 8 | import org.apache.storm.tuple.Values; 9 | 10 | import java.io.BufferedReader; 11 | import java.io.IOException; 12 | import java.io.InputStreamReader; 13 | import java.net.Socket; 14 | import java.util.Map; 15 | 16 | public class SocketSpout extends BaseRichSpout { 17 | SpoutOutputCollector collector; 18 | String ip; 19 | int port; 20 | BufferedReader br = null; 21 | Socket socket = null; 22 | 23 | public SocketSpout(String ip, String port) { 24 | this.ip = ip; 25 | this.port = Integer.parseInt(port); 26 | } 27 | 28 | /* Step 1: initialize the spout */ 29 | @Override 30 | public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector collector) { 31 | this.collector = collector; 32 | try { 33 | socket = new Socket(ip, port); 34 | br = new BufferedReader(new InputStreamReader(socket.getInputStream())); 35 | } catch (IOException e) { 36 | e.printStackTrace(); 37 | } 38 | } 39 | 40 | @Override 41 | public void close() { 42 | try { 43 | br.close(); 44 | socket.close(); 45 | } catch (IOException e) { 46 | e.printStackTrace(); 47 | } 48 | } 49 | 50 | /* Step 2: read and emit tuples */ 51 | @Override 52 | public void nextTuple() { 53 | try { 54 | String tuple; 55 | if ((tuple = br.readLine()) != null) { // read a line of input 56 | collector.emit(new Values(tuple)); // emit it as a tuple 57 | } 58 | } catch (IOException e) { 59 | e.printStackTrace(); 60 | } 61 | } 62 | 63 | /* Step 3: declare the field names of output tuples */ 64 | @Override 65 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 66 | // the output tuple has a single field named "sentence" 67 | declarer.declare(new Fields("sentence")); 68 | } 69 | } 70 |
-------------------------------------------------------------------------------- /Storm/src/main/java/cn/edu/ecnu/example/storm/detection/DetectionBolt.java: -------------------------------------------------------------------------------- 1 | package 
cn.edu.ecnu.example.storm.detection; 2 | 3 | import org.apache.storm.topology.BasicOutputCollector; 4 | import org.apache.storm.topology.OutputFieldsDeclarer; 5 | import org.apache.storm.topology.base.BaseBasicBolt; 6 | import org.apache.storm.tuple.Tuple; 7 | 8 | public class DetectionBolt extends BaseBasicBolt { 9 | 10 | double w, b, delta; // parameters of the linear model 11 | 12 | public DetectionBolt(double w, double b, double delta) { 13 | this.w = w; 14 | this.b = b; 15 | this.delta = delta; 16 | } 17 | 18 | /* Step 1: define the tuple processing logic */ 19 | @Override 20 | public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) { 21 | // extract the data point 22 | double x = tuple.getDoubleByField("x"); 23 | double y = tuple.getDoubleByField("y"); 24 | // report the point as an outlier if it deviates from the linear model by more than delta 25 | if (Math.abs(w * x + b - y) > delta) { 26 | System.out.println(x + " " + y); 27 | } 28 | } 29 | 30 | /* Step 2: declare the field names of output tuples */ 31 | @Override 32 | public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) { 33 | // empty: this bolt emits no tuples downstream 34 | } 35 | } 36 |
-------------------------------------------------------------------------------- /Storm/src/main/java/cn/edu/ecnu/example/storm/detection/OutlierTopology.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.example.storm.detection; 2 | 3 | import cn.edu.ecnu.example.storm.common.SocketSpout; 4 | import org.apache.storm.Config; 5 | import org.apache.storm.LocalCluster; 6 | import org.apache.storm.StormSubmitter; 7 | import org.apache.storm.generated.AlreadyAliveException; 8 | import org.apache.storm.generated.AuthorizationException; 9 | import org.apache.storm.generated.InvalidTopologyException; 10 | import org.apache.storm.topology.TopologyBuilder; 11 | 12 | public class OutlierTopology { 13 | public static void main(String[] args) 14 | throws InvalidTopologyException, AuthorizationException, AlreadyAliveException { 15 | if (args.length < 3) { 16 | System.exit(-1); 17 | return; 18 | } 19 | 20 | /* Step 1: build the topology */ 21 | TopologyBuilder builder = new TopologyBuilder(); 22 | // set the spout named "SPOUT" with a parallelism of 1 23 | builder.setSpout("SPOUT", new SocketSpout(args[1], args[2]), 1); 24 | // set the bolt named "DETECTION" with a parallelism of 2; it consumes the stream from "SPOUT" via shuffle grouping 25 | builder 26 | .setBolt("DETECTION", new DetectionBolt(1.5, 2.5, 0.5), 2) 27 | .setNumTasks(2) 28 | .shuffleGrouping("SPOUT"); 29 | 30 | /* Step 2: set the configuration */ 31 | Config conf = new Config(); 32 | conf.setDebug(false); 33 | conf.setNumWorkers(2); 34 | conf.setNumAckers(0); 35 | 36 | /* Step 3: choose how to run the program */ 37 | if (args[0].equals("cluster")) { // run on a cluster; the topology name is OUTLIERTOPOLOGY 38 | conf.setNumWorkers(2); 39 | conf.setNumAckers(2); 40 | StormSubmitter.submitTopology("OUTLIERTOPOLOGY", conf, builder.createTopology()); 41 | } else if (args[0].equals("local")) { 42 | // debug locally in the IDE; the topology name is OUTLIERTOPOLOGY 43 | LocalCluster cluster = new LocalCluster(); 44 | cluster.submitTopology("OUTLIERTOPOLOGY", conf, builder.createTopology()); 45 | } else { 46 | System.exit(-2); 47 | } 48 | } 49 | } 50 |
-------------------------------------------------------------------------------- /Storm/src/main/java/cn/edu/ecnu/example/storm/wordcount/CountBolt.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.example.storm.wordcount; 2 | 3 | import org.apache.storm.topology.BasicOutputCollector; 4 | import org.apache.storm.topology.OutputFieldsDeclarer; 5 | import org.apache.storm.topology.base.BaseBasicBolt; 6 | import org.apache.storm.tuple.Tuple; 7 | 8 | import java.util.HashMap; 9 | import java.util.Map; 10 | 11 | public 
class CountBolt extends BaseBasicBolt { 12 | // 保存单词的频数 13 | Map counts = new HashMap(); 14 | 15 | /* 步骤1:描述元组的处理逻辑 */ 16 | @Override 17 | public void execute(Tuple tuple, BasicOutputCollector collector) { 18 | // 从接收到的元组中按字段提取单词 19 | String word = tuple.getStringByField("word"); 20 | // 获取该单词对应的频数 21 | Integer count = counts.get(word); 22 | if (count == null) { 23 | count = 0; 24 | } 25 | // 计数增加,并将单词和对应的频数加入 map 中 26 | count++; 27 | counts.put(word, count); 28 | // 输出结果,也可采用写入文件等其它方式 29 | System.out.println(word + "," + count); 30 | } 31 | 32 | /* 步骤2:声明输出元组的字段名称 */ 33 | @Override 34 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 35 | // 为空 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /Storm/src/main/java/cn/edu/ecnu/example/storm/wordcount/SplitBolt.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.example.storm.wordcount; 2 | 3 | import org.apache.storm.task.TopologyContext; 4 | import org.apache.storm.topology.BasicOutputCollector; 5 | import org.apache.storm.topology.OutputFieldsDeclarer; 6 | import org.apache.storm.topology.base.BaseBasicBolt; 7 | import org.apache.storm.tuple.Fields; 8 | import org.apache.storm.tuple.Tuple; 9 | import org.apache.storm.tuple.Values; 10 | 11 | import java.util.Map; 12 | import java.util.StringTokenizer; 13 | 14 | public class SplitBolt extends BaseBasicBolt { 15 | @Override 16 | public void prepare(Map stormConf, TopologyContext context) { 17 | super.prepare(stormConf, context); 18 | } 19 | /* 步骤1:描述元组的处理逻辑 */ 20 | @Override 21 | public void execute(Tuple tuple, BasicOutputCollector collector) { 22 | String sentence = tuple.getStringByField("sentence"); 23 | StringTokenizer iter = new StringTokenizer(sentence); 24 | while (iter.hasMoreElements()) { 25 | collector.emit(new Values(iter.nextToken())); 26 | } 27 | } 28 | 29 | /* 步骤2:声明输出元组的字段名称 */ 30 | @Override 31 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 32 | // 该元组仅有一个字段 33 | declarer.declare(new Fields("word")); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /Storm/src/main/java/cn/edu/ecnu/example/storm/wordcount/window/WindowBolt.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.example.storm.wordcount.window; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.Map.Entry; 6 | import org.apache.storm.topology.BasicOutputCollector; 7 | import org.apache.storm.topology.OutputFieldsDeclarer; 8 | import org.apache.storm.topology.base.BaseBasicBolt; 9 | import org.apache.storm.tuple.Tuple; 10 | 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | public class WindowBolt extends BaseBasicBolt { 15 | 16 | // 窗口的元组 17 | private final List window = new ArrayList<>(); 18 | // 窗口的大小和间隔 19 | private static final int LENGTH_AND_INTERVAL = 3; 20 | 21 | /* 步骤1:描述元组的处理逻辑 */ 22 | @Override 23 | public void execute(Tuple tuple, BasicOutputCollector basicOutputCollector) { 24 | // 缓存接收到的单词元组 25 | String word = tuple.getStringByField("word"); 26 | window.add(word); 27 | // 接收的单词元组数量等于窗口间隔时,触发计数操作 28 | if (window.size() == LENGTH_AND_INTERVAL) { 29 | // 计数 30 | Map wordCounts = new HashMap<>(); 31 | for (String wordInWindow : window) { 32 | if (wordCounts.containsKey(wordInWindow)) { 33 | wordCounts.put(wordInWindow, wordCounts.get(wordInWindow) + 1); 34 | } else { 35 | wordCounts.put(wordInWindow, 1); 
36 | } 37 | } 38 | 39 | // 输出计数结果 40 | for (Entry entry : wordCounts.entrySet()) { 41 | System.out.println(entry.getKey() + " " + entry.getValue()); 42 | } 43 | 44 | // 清除窗口内容 45 | window.clear(); 46 | } 47 | } 48 | 49 | /* 步骤2:声明输出元组的字段名称 */ 50 | @Override 51 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 52 | // 为空 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /Storm/src/main/java/cn/edu/ecnu/example/storm/wordcount/window/WindowWordCountTopology.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.example.storm.wordcount.window; 2 | 3 | import cn.edu.ecnu.example.storm.common.SocketSpout; 4 | import cn.edu.ecnu.example.storm.wordcount.SplitBolt; 5 | import org.apache.storm.Config; 6 | import org.apache.storm.LocalCluster; 7 | import org.apache.storm.StormSubmitter; 8 | import org.apache.storm.topology.TopologyBuilder; 9 | import org.apache.storm.tuple.Fields; 10 | 11 | public class WindowWordCountTopology { 12 | public static void main(String[] args) throws Exception { 13 | if (args.length < 3) { 14 | System.exit(-1); 15 | return; 16 | } 17 | /* 步骤1:构建拓扑 */ 18 | TopologyBuilder builder = new TopologyBuilder(); 19 | // 设置Spout的名称为"SPOUT",executor数量为1,任务数量为1 20 | builder.setSpout("SPOUT", new SocketSpout(args[1], args[2]), 1); 21 | // 设置Bolt的名称为"SPLIT",executor数量为2,任务数量为2,与"SPOUT"之间的流分组策略为随机分组 22 | builder.setBolt("SPLIT", new SplitBolt(), 2).setNumTasks(2).shuffleGrouping("SPOUT"); 23 | // 设置Bolt取名"WINDOWCOUNT",executor数量为2,任务数量为2,订阅策略为fieldsGrouping 24 | builder.setBolt("WINDOWCOUNT", new WindowBolt(), 2).fieldsGrouping("SPLIT", new Fields("word")); 25 | 26 | /* 步骤2:设置配置信息 */ 27 | Config conf = new Config(); 28 | conf.setDebug(false); // 关闭调试模式 29 | conf.setNumWorkers(2); // 设置Worker数量为2 30 | conf.setNumAckers(0); // 设置Acker数量为0 31 | 32 | /* 步骤3:指定程序运行的方式 */ 33 | if (args[0].equals("cluster")) { 34 | // 在集群运行程序,拓扑的名称为WINDOWWORDCOUNT 35 | StormSubmitter.submitTopology("WINDOWWORDCOUNT", conf, builder.createTopology()); 36 | } else if (args[0].equals("local")) { 37 | // 在本地IDE调试程序,拓扑的名称为WINDOWWORDCOUNT 38 | LocalCluster cluster = new LocalCluster(); 39 | cluster.submitTopology("WINDOWWORDCOUNT", conf, builder.createTopology()); 40 | } else { 41 | System.exit(-2); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /Storm/src/main/java/cn/edu/ecnu/example/storm/wordcount/withAck/SocketSpoutWithAck.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.example.storm.wordcount.withAck; 2 | 3 | import org.apache.storm.spout.SpoutOutputCollector; 4 | import org.apache.storm.task.TopologyContext; 5 | import org.apache.storm.topology.OutputFieldsDeclarer; 6 | import org.apache.storm.topology.base.BaseRichSpout; 7 | import org.apache.storm.tuple.Fields; 8 | import org.apache.storm.tuple.Values; 9 | 10 | import java.io.BufferedReader; 11 | import java.io.IOException; 12 | import java.io.InputStreamReader; 13 | import java.net.Socket; 14 | import java.util.HashMap; 15 | import java.util.Map; 16 | import java.util.UUID; 17 | 18 | public class SocketSpoutWithAck extends BaseRichSpout { 19 | SpoutOutputCollector collector; 20 | String ip; 21 | int port; 22 | BufferedReader br = null; 23 | Socket socket = null; 24 | 25 | // 该Map的键为STid,值为源元组的值 26 | private HashMap waitAck = new HashMap(); 27 | 28 | SocketSpoutWithAck(String ip, String port) { 29 | this.ip = ip; 30 
| this.port = Integer.parseInt(port); 31 | } 32 | 33 | /* Step 1: initialize the spout */ 34 | @Override 35 | public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector collector) { 36 | this.collector = collector; 37 | try { 38 | socket = new Socket(ip, port); 39 | br = new BufferedReader(new InputStreamReader(socket.getInputStream())); 40 | } catch (IOException e) { 41 | e.printStackTrace(); 42 | } 43 | } 44 | 45 | @Override 46 | public void close() { 47 | try { 48 | br.close(); 49 | socket.close(); 50 | } catch (IOException e) { 51 | e.printStackTrace(); 52 | } 53 | } 54 | 55 | /* Step 2: receive tuples from the network, bind each tuple to a message id (STid), and emit it to the downstream bolt */ 56 | @Override 57 | public void nextTuple() { 58 | try { 59 | String tuple; 60 | String STid = UUID.randomUUID().toString(); 61 | if ((tuple = br.readLine()) != null) { 62 | waitAck.put(STid, tuple); 63 | collector.emit(new Values(tuple), STid); 64 | } 65 | } catch (IOException e) { 66 | e.printStackTrace(); 67 | } 68 | } 69 | 70 | /* Step 3: declare the field names of output tuples */ 71 | @Override 72 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 73 | declarer.declare(new Fields("sentence")); 74 | } 75 | 76 | /* Step 4: define the handling logic after a tuple tree is acked or failed */ 77 | @Override 78 | public void ack(Object STid) { 79 | // called when every tuple in the tuple tree identified by STid has been processed successfully 80 | // remove the corresponding source tuple from waitAck 81 | waitAck.remove(STid); 82 | } 83 | 84 | @Override 85 | public void fail(Object STid) { 86 | // called when the tuple tree identified by STid fails to be fully processed 87 | // re-emit the source tuple stored in waitAck for this STid 88 | collector.emit(new Values(waitAck.get(STid)), STid); 89 | } 90 | } 91 |
-------------------------------------------------------------------------------- /Storm/src/main/java/cn/edu/ecnu/example/storm/wordcount/withAck/WordCountTopologyWithAck.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.example.storm.wordcount.withAck; 2 | 3 | import cn.edu.ecnu.example.storm.wordcount.CountBolt; 4 | import cn.edu.ecnu.example.storm.wordcount.SplitBolt; 5 | import org.apache.storm.Config; 6 | import org.apache.storm.LocalCluster; 7 | import org.apache.storm.StormSubmitter; 8 | import org.apache.storm.topology.TopologyBuilder; 9 | import org.apache.storm.tuple.Fields; 10 | 11 | public class WordCountTopologyWithAck { 12 | public static void main(String[] args) throws Exception { 13 | if (args.length < 3) { 14 | System.exit(-1); 15 | return; 16 | } 17 | /* Step 1: build the topology */ 18 | TopologyBuilder builder = new TopologyBuilder(); 19 | // set the spout named "SPOUT" with 1 executor and 1 task 20 | builder.setSpout("SPOUT", new SocketSpoutWithAck(args[1], args[2]), 1); 21 | // set the bolt named "SPLIT" with 2 executors and 2 tasks; it subscribes to "SPOUT" with shuffle grouping 22 | builder.setBolt("SPLIT", new SplitBolt(), 2).setNumTasks(2).shuffleGrouping("SPOUT"); 23 | // set the bolt named "COUNT" with 2 executors and 2 tasks; it subscribes to "SPLIT" with fields grouping 24 | builder.setBolt("COUNT", new CountBolt(), 2).fieldsGrouping("SPLIT", new Fields("word")); 25 | 26 | /* Step 2: set the configuration */ 27 | Config conf = new Config(); 28 | conf.setDebug(false); // disable debug mode 29 | conf.setNumWorkers(2); // set the number of workers to 2 30 | conf.setNumAckers(2); // set the number of ackers to 2 so that tuple trees are tracked 31 | 32 | /* Step 3: choose how to run the program */ 33 | if (args[0].equals("cluster")) { // run on a cluster; the topology name is WORDCOUNTwithack 34 | StormSubmitter.submitTopology("WORDCOUNTwithack", conf, builder.createTopology()); 35 | } else if (args[0].equals("local")) { 36 | // debug locally in the IDE; the topology name is WORDCOUNTwithack 37 | LocalCluster cluster = new LocalCluster(); 38 | cluster.submitTopology("WORDCOUNTwithack", conf, builder.createTopology()); 39 | } else { 40 | System.exit(-2); 41 | } 42 | } 43 | } 44 |
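SplitBolt and CountBolt extend BaseBasicBolt, so anchoring and acking are handled for them automatically; only the spout above has to track its pending source tuples. For reference, below is a minimal, hypothetical sketch (not part of this repository) of what an explicitly acking split bolt would look like with BaseRichBolt; the class name SplitBoltWithExplicitAck is an assumption.

```java
package cn.edu.ecnu.example.storm.wordcount.withAck;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import java.util.Map;
import java.util.StringTokenizer;

public class SplitBoltWithExplicitAck extends BaseRichBolt {
  private OutputCollector collector;

  @Override
  public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
    this.collector = collector;
  }

  @Override
  public void execute(Tuple input) {
    StringTokenizer iter = new StringTokenizer(input.getStringByField("sentence"));
    while (iter.hasMoreElements()) {
      // anchor every emitted word to the input tuple so it joins the tuple tree
      collector.emit(input, new Values(iter.nextToken()));
    }
    collector.ack(input); // report success; on an error, call collector.fail(input) instead
  }

  @Override
  public void declareOutputFields(OutputFieldsDeclarer declarer) {
    declarer.declare(new Fields("word"));
  }
}
```

Emitting with the input tuple as the anchor is what links the word tuples into the tuple tree that SocketSpoutWithAck's ack and fail callbacks react to.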
-------------------------------------------------------------------------------- /Storm/src/main/java/cn/edu/ecnu/example/storm/wordcount/withoutAck/WordCountTopology.java: -------------------------------------------------------------------------------- 1 | package cn.edu.ecnu.example.storm.wordcount.withoutAck; 2 | 3 | import cn.edu.ecnu.example.storm.wordcount.CountBolt; 4 | import cn.edu.ecnu.example.storm.common.SocketSpout; 5 | import cn.edu.ecnu.example.storm.wordcount.SplitBolt; 6 | import org.apache.storm.Config; 7 | import org.apache.storm.LocalCluster; 8 | import org.apache.storm.StormSubmitter; 9 | import org.apache.storm.topology.TopologyBuilder; 10 | import org.apache.storm.tuple.Fields; 11 | 12 | public class WordCountTopology { 13 | public static void main(String[] args) throws Exception { 14 | if (args.length < 3) { 15 | System.exit(-1); 16 | return; 17 | } 18 | /* Step 1: build the topology */ 19 | TopologyBuilder builder = new TopologyBuilder(); 20 | // set the spout named "SPOUT" with 1 executor and 1 task 21 | builder.setSpout("SPOUT", new SocketSpout(args[1], args[2]), 1); 22 | // set the bolt named "SPLIT" with 2 executors and 2 tasks; it subscribes to "SPOUT" with shuffle grouping 23 | builder.setBolt("SPLIT", new SplitBolt(), 2).setNumTasks(2).shuffleGrouping("SPOUT"); 24 | // set the bolt named "COUNT" with 2 executors and 2 tasks; it subscribes to "SPLIT" with fields grouping 25 | builder.setBolt("COUNT", new CountBolt(), 2).fieldsGrouping("SPLIT", new Fields("word")); 26 | 27 | /* Step 2: set the configuration */ 28 | Config conf = new Config(); 29 | conf.setDebug(false); // disable debug mode 30 | conf.setNumWorkers(2); // set the number of workers to 2 31 | conf.setNumAckers(0); // set the number of ackers to 0 (no tuple tracking, no fault tolerance) 32 | 33 | /* Step 3: choose how to run the program */ 34 | if (args[0].equals("cluster")) { 35 | // run on a cluster; the topology name is WORDCOUNT 36 | StormSubmitter.submitTopology("WORDCOUNT", conf, builder.createTopology()); 37 | } else if (args[0].equals("local")) { 38 | // debug locally in the IDE; the topology name is WORDCOUNT 39 | LocalCluster cluster = new LocalCluster(); 40 | cluster.submitTopology("WORDCOUNT", conf, builder.createTopology()); 41 | } else { 42 | System.exit(-2); 43 | } 44 | } 45 | } 46 | --------------------------------------------------------------------------------
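All of the socket-based examples in this repository (the Spark Streaming receivers and the Storm SocketSpout) expect a plain TCP server that writes one record per line, which during debugging is usually provided with `nc -lk 9999`. The sketch below is a hypothetical stand-in for that command and is not part of the repository; the class name SentenceServer and the sample sentences are assumptions.

```java
import java.io.PrintWriter;
import java.net.ServerSocket;
import java.net.Socket;

// Minimal line server: listens on a port and repeatedly writes sample sentences,
// so the socket-based examples above have something to read during local debugging.
public class SentenceServer {
  public static void main(String[] args) throws Exception {
    int port = args.length > 0 ? Integer.parseInt(args[0]) : 9999;
    String[] sentences = {"hello storm", "hello spark streaming", "word count example"};
    try (ServerSocket server = new ServerSocket(port);
         Socket client = server.accept(); // wait for one consumer (the spout or receiver)
         PrintWriter out = new PrintWriter(client.getOutputStream(), true)) {
      int i = 0;
      while (!out.checkError()) {
        out.println(sentences[i++ % sentences.length]); // one record per line
        Thread.sleep(1000); // roughly one sentence per second
      }
    }
  }
}
```

Start it before submitting a topology or streaming job, then pass the matching host and port (for example `local localhost 9999`) as the program arguments.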