├── .gitignore ├── README.md ├── flink-train ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── nju │ │ ├── JavaWindowWordCount.java │ │ ├── SocketTextStreamWordCount.java │ │ ├── course04 │ │ ├── JavaCounterApp.java │ │ ├── JavaDataSetDataSourceApp.java │ │ ├── JavaDataSetSinkApp.java │ │ ├── JavaDataSetTransformationApp.java │ │ ├── JavaDistributedCacheApp.java │ │ └── Person.java │ │ ├── course05 │ │ ├── JavaCustomNonParallelSourceFunction.java │ │ ├── JavaCustomParallelSourceFunction.java │ │ ├── JavaCustomRichParallelSourceFunction.java │ │ ├── JavaCustomSinkToMySQL.java │ │ ├── JavaDataStreamSourceApp.java │ │ ├── JavaDataStreamTransformationApp.java │ │ ├── SinkToMySQL.java │ │ └── Student.java │ │ ├── course06 │ │ └── JavaTableSQLAPI.java │ │ ├── course07 │ │ ├── JavaWindowsApp.java │ │ ├── JavaWindowsProcessApp.java │ │ └── JavaWindowsReduceApp.java │ │ ├── hotItem │ │ ├── HotItems.java │ │ └── UserBehavior.java │ │ └── project │ │ └── MyKafkaProducer.java │ ├── resources │ └── log4j.properties │ └── scala │ └── cn │ └── edu │ └── nju │ ├── BatchJob.scala │ ├── BatchWCScalaApp.scala │ ├── StreamingJob.scala │ ├── StreamingWCScalaApp.scala │ ├── WindowWordCount.java │ ├── course04 │ ├── CounterApp.scala │ ├── DBUtils.scala │ ├── DataSetDataSourceApp.scala │ ├── DataSetSinkApp.scala │ ├── DataSetTransformationApp.scala │ └── DistributedCacheApp.scala │ ├── course05 │ ├── CustomNonParallelSourceFunction.scala │ ├── CustomParallelSourceFunction.scala │ ├── CustomRichParallelSourceFunction.scala │ ├── DataStreamSourceApp.scala │ └── DataStreamTransformationApp.scala │ ├── course06 │ └── TableSQLAPI.scala │ ├── course07 │ ├── WindowsApp.scala │ ├── WindowsProcessApp.scala │ └── WindowsReduceApp.scala │ ├── course08 │ ├── FileSystemSinkApp.scala │ ├── KafkaConnectorConsumerApp.scala │ └── KafkaConnectorProducerApp.scala │ └── project │ ├── LogAnalysis.scala │ ├── LogAnalysis02.scala │ ├── MyMySQLSource.scala │ └── MyMySQLSourceTest.scala ├── hadoop-train ├── pom.xml └── src │ ├── main │ └── java │ │ └── cn │ │ └── edu │ │ └── nju │ │ └── hadoop │ │ ├── mapreduce │ │ ├── CombinerApp.java │ │ ├── PartitionerApp.java │ │ ├── WordCount2App.java │ │ ├── WordCountApp.java │ │ ├── sort │ │ │ ├── GlobalSort.java │ │ │ ├── GlobalSortPartitioner.java │ │ │ ├── IntPair.java │ │ │ └── SecondarySort.java │ │ └── topk │ │ │ ├── IPTimes.java │ │ │ └── TopK.java │ │ └── project │ │ └── LogApp.java │ ├── resources │ ├── application.properties │ ├── beans.xml │ └── log.txt │ └── test │ └── java │ └── cn │ └── edu │ └── nju │ └── hadoop │ ├── hdfs │ └── HDFSApp.java │ ├── project │ └── UserAgentTest.java │ └── spring │ ├── SpringBootHDFSApp.java │ └── SpringHadoopHDFSApp.java ├── hbase-train ├── hbase-api-test │ ├── pom.xml │ └── src │ │ ├── main │ │ └── java │ │ │ └── cn │ │ │ └── edu │ │ │ └── nju │ │ │ ├── HBaseConn.java │ │ │ └── HBaseUtil.java │ │ └── test │ │ └── java │ │ └── cn │ │ └── edu │ │ └── nju │ │ ├── HBaseConnTest.java │ │ ├── HBaseFilterTest.java │ │ └── HBaseUtilTest.java ├── hbase-endpoint-test │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── cn │ │ │ └── edu │ │ │ └── nju │ │ │ ├── GetRowCount.java │ │ │ └── TestRowCountEndPoint.java │ │ └── proto │ │ └── RowCountTest.proto ├── hbase-observer-test │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── cn │ │ └── edu │ │ └── nju │ │ └── RegionObserverTest.java ├── pom.xml └── src │ ├── main │ └── java │ │ └── cn │ │ └── edu │ │ └── nju │ │ └── App.java │ └── test │ └── java │ └── cn │ └── edu │ └── nju │ 
└── AppTest.java ├── log-generator ├── generate_log.py ├── message.py └── message2.py ├── pyspark └── project │ ├── spark.py │ ├── spark_yarn.py │ ├── steam.py │ └── test.py ├── spark-data-visualization ├── .gitignore ├── pom.xml └── src │ ├── main │ ├── java │ │ └── cn │ │ │ └── edu │ │ │ └── nju │ │ │ ├── DataVisualizationApplication.java │ │ │ ├── dao │ │ │ └── CourseClickCountDAO.java │ │ │ ├── domain │ │ │ └── CourseClickCount.java │ │ │ ├── spark │ │ │ ├── HelloBoot.java │ │ │ └── ImoocStatApp.java │ │ │ └── utils │ │ │ └── HBaseUtils.java │ └── resources │ │ ├── application.properties │ │ ├── static │ │ └── js │ │ │ ├── echarts.min.js │ │ │ └── jquery.js │ │ └── templates │ │ ├── demo.html │ │ ├── echarts.html │ │ └── test.html │ └── test │ └── java │ └── cn │ └── edu │ └── nju │ └── DataVisualizationApplicationTests.java ├── spark-mllib ├── pom.xml └── src │ └── main │ ├── resources │ ├── house.csv │ ├── iris.data │ ├── neg.txt │ ├── pos.txt │ └── u.data │ └── scala │ └── cn │ └── edu │ └── nju │ ├── MovieRecommendation.scala │ ├── classification │ └── Iris.scala │ ├── cluster │ ├── KMeans.scala │ └── Lda.scala │ ├── dimensionalityReduction │ └── PCADimensionalityReduction.scala │ ├── emotionAnalysis │ └── EmotionAnalysis.scala │ └── regression │ └── HousePriceForecast.scala ├── spark-sql-train ├── .gitignore ├── pom.xml └── src │ └── main │ ├── resources │ ├── ipDatabase.csv │ └── ipRegion.xlsx │ └── scala │ └── cn │ └── edu │ └── nju │ ├── log │ ├── AccessConvertUtil.scala │ ├── DateUtils.scala │ ├── DayCityVideoAccessStat.scala │ ├── DayVideoAccessStat.scala │ ├── DayVideoTrafficsStat.scala │ ├── IpUtils.scala │ ├── MySQLUtils.scala │ ├── SparkStatCleanJob.scala │ ├── SparkStatFormatJob.scala │ ├── StatDAO.scala │ └── TopNStatJob.scala │ └── spark │ ├── DataFrameApp.scala │ ├── DataFrameCase.scala │ ├── DataFrameRDDApp.scala │ ├── DataSetApp.scala │ ├── HiveContextApp.scala │ ├── HiveMySQLApp.scala │ ├── ParquetApp.scala │ ├── SQLContextApp.scala │ ├── SparkSQLThriftServerApp.scala │ └── SparkSessionApp.scala ├── spark-sql-visualization ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── nju │ │ ├── dao │ │ └── VideoAccessTopNDAO.java │ │ ├── domain │ │ └── VideoAccessTopN.java │ │ ├── utils │ │ └── MySQLUtils.java │ │ └── web │ │ └── VideoAccessTopNServlet.java │ └── webapp │ ├── WEB-INF │ └── web.xml │ ├── js │ ├── echarts.min.js │ └── jquery.js │ ├── test.html │ └── topn.html ├── spark-train ├── pom.xml └── src │ ├── main │ ├── java │ │ └── cn │ │ │ └── edu │ │ │ └── nju │ │ │ └── spark │ │ │ ├── StreamingWordCountApp.java │ │ │ ├── WordCountApp.java │ │ │ ├── kafkas │ │ │ ├── KafkaClientApp.java │ │ │ ├── KafkaConsumer.java │ │ │ ├── KafkaProducer.java │ │ │ └── KafkaProperties.java │ │ │ └── project │ │ │ └── utils │ │ │ └── HBaseUtils.java │ └── scala │ │ └── cn │ │ └── edu │ │ └── nju │ │ └── spark │ │ ├── FlumePullWordCount.scala │ │ ├── FlumePushWordCount.scala │ │ ├── ForeachRDDApp.scala │ │ ├── KafkaDirectWordCount.scala │ │ ├── KafkaReceiverWordCount.scala │ │ ├── KafkaStreamingApp.scala │ │ ├── StatefulWordCount.scala │ │ ├── TransformApp.scala │ │ └── project │ │ ├── dao │ │ ├── CourseClickCountDAO.scala │ │ └── CourseSearchClickCountDAO.scala │ │ ├── domain │ │ ├── ClickLog.scala │ │ ├── CourseClickCount.scala │ │ └── CourseSearchClickCount.scala │ │ ├── spark │ │ └── ImoocStatStreamingApp.scala │ │ └── utils │ │ └── DateUtils.scala │ └── test │ ├── java │ └── LoggerGenerator.java │ └── resources │ └── log4j.properties ├── 
storm-data-visualization ├── .gitignore ├── pom.xml └── src │ ├── main │ ├── java │ │ └── cn │ │ │ └── edu │ │ │ └── nju │ │ │ ├── DataVisualizationApplication.java │ │ │ ├── controller │ │ │ └── StatApp.java │ │ │ ├── domain │ │ │ └── ResultBean.java │ │ │ └── service │ │ │ └── ResultBeanService.java │ └── resources │ │ ├── application.properties │ │ ├── static │ │ └── js │ │ │ └── jquery.js │ │ └── templates │ │ └── map.html │ └── test │ └── java │ └── cn │ └── edi │ └── nju │ └── DataVisualizationApplicationTests.java ├── storm-train ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── nju │ │ ├── ClusterSumAllGroupingStormTopology.java │ │ ├── ClusterSumFieldGroupingStormTopology.java │ │ ├── ClusterSumShuffleGroupingStormTopology.java │ │ ├── ClusterSumStormAckerTopology.java │ │ ├── ClusterSumStormExecutorsTopology.java │ │ ├── ClusterSumStormTasksTopology.java │ │ ├── ClusterSumStormTopology.java │ │ ├── ClusterSumStormWorkersTopology.java │ │ ├── LocalSumStormAckerTopology.java │ │ ├── LocalSumStormTopology.java │ │ ├── LocalWordCountStormTopology.java │ │ ├── drpc │ │ ├── LocalDRPCTopology.java │ │ ├── RPCClient.java │ │ ├── RPCServer.java │ │ ├── RemoteDRPCClient.java │ │ ├── RemoteDRPCTopology.java │ │ ├── UserService.java │ │ └── UserServiceImpl.java │ │ └── intergration │ │ ├── hbase │ │ └── LocalWordCountHBaseStormTopology.java │ │ ├── hdfs │ │ └── LocalWordCountHDFSStormTopology.java │ │ ├── jdbc │ │ ├── LocalWordCountJDBCStormTopology.java │ │ └── ddl.sql │ │ ├── kafka │ │ ├── DateUtils.java │ │ ├── LogProcessBolt.java │ │ └── StormKafkaTopo.java │ │ └── redis │ │ └── LocalWordCountRedisStormTopology.java │ └── resources │ └── log4j.properties └── 集群搭建.md /.gitignore: -------------------------------------------------------------------------------- 1 | /HadoopTrain/.idea 2 | /HadoopTrain/target 3 | /HadoopTrain/HadoopTrain.iml 4 | 5 | /SparkTrain/.idea 6 | /SparkTrain/target 7 | /SparkTrain/SparkTrain.iml 8 | 9 | /StormTrain/.idea 10 | /StormTrain/target 11 | /StormTrain/logs 12 | 13 | /StormTrain/StormTrain.iml 14 | /LogGenerator/.idea 15 | /DataVisualization/DataVisualization.iml 16 | /StormVisualization/DataVisualization.iml 17 | /SparkSQLVisualization/DataVisualization.iml 18 | 19 | /*/.idea 20 | /*/target 21 | 22 | *.iml 23 | *.idea 24 | target 25 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/JavaWindowWordCount.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.apache.flink.api.common.functions.FlatMapFunction; 4 | import org.apache.flink.api.java.tuple.Tuple2; 5 | import org.apache.flink.streaming.api.datastream.DataStream; 6 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 7 | import org.apache.flink.streaming.api.windowing.time.Time; 8 | import org.apache.flink.util.Collector; 9 | 10 | /** 11 | * Created by thpffcj on 2019-08-12. 
 */
public class JavaWindowWordCount {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<Tuple2<String, Integer>> dataStream = env
                .socketTextStream("localhost", 9999)
                .flatMap(new Splitter())
                .keyBy(0)
                .timeWindow(Time.seconds(5))
                .sum(1);

        dataStream.print();

        env.execute("Window WordCount");
    }

    public static final class Splitter implements FlatMapFunction<String, Tuple2<String, Integer>> {

        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            // normalize and split the line
            String[] tokens = value.toLowerCase().split("\\W+");

            // emit the pairs
            for (String token : tokens) {
                if (token.length() > 0) {
                    out.collect(new Tuple2<>(token, 1));
                }
            }
        }
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/SocketTextStreamWordCount.java: --------------------------------------------------------------------------------
package cn.edu.nju;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Created by thpffcj on 2019-08-04.
 */
public class SocketTextStreamWordCount {

    public static void main(String[] args) throws Exception {
//        if (args.length != 2) {
//            System.err.println("USAGE:\nSocketTextStreamWordCount <hostname> <port>");
//            return;
//        }
//        String hostName = args[0];
//        Integer port = Integer.parseInt(args[1]);

        String hostName = "127.0.0.1";
        Integer port = 9999;

        // set up the execution environment
        final StreamExecutionEnvironment env = StreamExecutionEnvironment
                .getExecutionEnvironment();

        // get input data
        DataStream<String> text = env.socketTextStream(hostName, port);

        text.flatMap(new LineSplitter()).setParallelism(1)
                // group by the tuple field "0" and sum up tuple field "1"
                .keyBy(0)
                .sum(1).setParallelism(1)
                .print();

        // execute program
        env.execute("Java WordCount from SocketTextStream Example");
    }

    /**
     * Implements the string tokenizer that splits sentences into words as a user-defined
     * FlatMapFunction. The function takes a line (String) and splits it into
     * multiple pairs in the form of "(word,1)" (Tuple2<String, Integer>).
     */
    public static final class LineSplitter implements FlatMapFunction<String, Tuple2<String, Integer>> {
        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            // normalize and split the line
            String[] tokens = value.toLowerCase().split("\\W+");
            // emit the pairs
            for (String token : tokens) {
                if (token.length() > 0) {
                    out.collect(new Tuple2<String, Integer>(token, 1));
                }
            }
        }
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course04/JavaCounterApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course04;

import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.accumulators.LongCounter;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FileSystem;

/**
 * Created by thpffcj on 2019-07-04.
 */
public class JavaCounterApp {

    public static void main(String[] args) throws Exception {

        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSource<String> data = env.fromElements("hadoop", "spark", "flink", "pyspark", "storm");

        DataSet<String> info = data.map(new RichMapFunction<String, String>() {

            LongCounter counter = new LongCounter();

            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                getRuntimeContext().addAccumulator("ele-count-java", counter);
            }

            @Override
            public String map(String value) throws Exception {
                counter.add(1);
                return value;
            }
        });

        String filePath = "file:///Users/thpffcj/Public/data/sink-java-count-out";
        info.writeAsText(filePath, FileSystem.WriteMode.OVERWRITE).setParallelism(3);
        JobExecutionResult jobResult = env.execute("JavaCounterApp");
        // step3: fetch the accumulator result (must use the same name it was registered under)
        long num = jobResult.getAccumulatorResult("ele-count-java");

        System.out.println("num: " + num);
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course04/JavaDataSetDataSourceApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course04;

import org.apache.flink.api.java.ExecutionEnvironment;

import java.util.ArrayList;
import java.util.List;

/**
 * Created by thpffcj on 2019-07-02.
10 | */ 11 | public class JavaDataSetDataSourceApp { 12 | 13 | public static void main(String[] args) throws Exception { 14 | ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); 15 | // fromCollection(env); 16 | textFile(env); 17 | } 18 | 19 | public static void textFile(ExecutionEnvironment env) throws Exception { 20 | String filePath = "file:///Users/thpffcj/Public/data/hello.txt"; 21 | env.readTextFile(filePath).print(); 22 | } 23 | 24 | public static void fromCollection(ExecutionEnvironment env) throws Exception { 25 | List list = new ArrayList<>(); 26 | for (int i = 1; i <= 10; i++) { 27 | list.add(i); 28 | } 29 | env.fromCollection(list).print(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course04/JavaDataSetSinkApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04; 2 | 3 | import org.apache.flink.api.java.ExecutionEnvironment; 4 | import org.apache.flink.api.java.operators.DataSource; 5 | import org.apache.flink.core.fs.FileSystem; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * Created by thpffcj on 2019-07-04. 11 | */ 12 | public class JavaDataSetSinkApp { 13 | 14 | public static void main(String[] args) throws Exception { 15 | 16 | ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); 17 | 18 | List info = new ArrayList<>(); 19 | for (int i = 1; i <= 10; i++) { 20 | info.add(i); 21 | } 22 | DataSource data = env.fromCollection(info); 23 | 24 | String filePath = "file:///Users/thpffcj/Public/data/sink-out"; 25 | 26 | data.writeAsText(filePath, FileSystem.WriteMode.OVERWRITE); 27 | 28 | env.execute("JavaDataSetSinkApp"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course04/JavaDistributedCacheApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04; 2 | 3 | import org.apache.commons.io.FileUtils; 4 | import org.apache.flink.api.common.functions.RichMapFunction; 5 | import org.apache.flink.api.java.ExecutionEnvironment; 6 | import org.apache.flink.api.java.operators.DataSource; 7 | import org.apache.flink.configuration.Configuration; 8 | 9 | import java.io.File; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | /** 14 | * Created by thpffcj on 2019-07-04. 
15 | */ 16 | public class JavaDistributedCacheApp { 17 | 18 | public static void main(String[] args) throws Exception { 19 | 20 | ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); 21 | 22 | String filePath = "file:///Users/thpffcj/Public/data/hello.txt"; 23 | 24 | // step1:注册一个本地/HDFS文件 25 | env.registerCachedFile(filePath, "java-dc"); 26 | 27 | DataSource data = env.fromElements("hadoop", "spark", "flink", "pyspark", "storm"); 28 | 29 | data.map(new RichMapFunction() { 30 | 31 | List list = new ArrayList<>(); 32 | 33 | @Override 34 | public void open(Configuration parameters) throws Exception { 35 | File file = getRuntimeContext().getDistributedCache().getFile("java-dc"); 36 | List lines = FileUtils.readLines(file); 37 | for (String line : lines) { 38 | list.add(line); 39 | System.out.println("line = " + line); 40 | } 41 | } 42 | 43 | @Override 44 | public String map(String value) throws Exception { 45 | return value; 46 | } 47 | }).print(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course04/Person.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04; 2 | 3 | /** 4 | * Created by thpffcj on 2019-07-02. 5 | */ 6 | public class Person { 7 | 8 | private String name; 9 | private int age; 10 | private String work; 11 | 12 | public Person() { 13 | } 14 | 15 | public String getName() { 16 | return name; 17 | } 18 | 19 | public void setName(String name) { 20 | this.name = name; 21 | } 22 | 23 | public int getAge() { 24 | return age; 25 | } 26 | 27 | public void setAge(int age) { 28 | this.age = age; 29 | } 30 | 31 | public String getWork() { 32 | return work; 33 | } 34 | 35 | public void setWork(String work) { 36 | this.work = work; 37 | } 38 | 39 | @Override 40 | public String toString() { 41 | return "Person{" + 42 | "name='" + name + '\'' + 43 | ", age=" + age + 44 | ", work='" + work + '\'' + 45 | '}'; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaCustomNonParallelSourceFunction.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05; 2 | 3 | import org.apache.flink.streaming.api.functions.source.SourceFunction; 4 | 5 | /** 6 | * Created by thpffcj on 2019-07-05. 7 | */ 8 | public class JavaCustomNonParallelSourceFunction implements SourceFunction { 9 | 10 | boolean isRunning = true; 11 | Long count = 1L; 12 | 13 | @Override 14 | public void run(SourceContext ctx) throws Exception { 15 | while (isRunning) { 16 | ctx.collect(count); 17 | count += 1; 18 | Thread.sleep(1000); 19 | } 20 | } 21 | 22 | @Override 23 | public void cancel() { 24 | isRunning = false; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaCustomParallelSourceFunction.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05; 2 | 3 | import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction; 4 | import org.apache.flink.streaming.api.functions.source.SourceFunction; 5 | 6 | /** 7 | * Created by thpffcj on 2019-07-05. 
 */
public class JavaCustomParallelSourceFunction implements ParallelSourceFunction<Long> {

    boolean isRunning = true;
    Long count = 1L;

    @Override
    public void run(SourceFunction.SourceContext<Long> ctx) throws Exception {
        while (isRunning) {
            ctx.collect(count);
            count += 1;
            Thread.sleep(1000);
        }
    }

    @Override
    public void cancel() {
        isRunning = false;
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaCustomRichParallelSourceFunction.java: --------------------------------------------------------------------------------
package cn.edu.nju.course05;

import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

/**
 * Created by thpffcj on 2019-07-05.
 */
public class JavaCustomRichParallelSourceFunction extends RichParallelSourceFunction<Long> {

    boolean isRunning = true;
    Long count = 1L;

    @Override
    public void run(SourceFunction.SourceContext<Long> ctx) throws Exception {
        while (isRunning) {
            ctx.collect(count);
            count += 1;
            Thread.sleep(1000);
        }
    }

    @Override
    public void cancel() {
        isRunning = false;
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaCustomSinkToMySQL.java: --------------------------------------------------------------------------------
package cn.edu.nju.course05;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Created by thpffcj on 2019-07-05.
 */
public class JavaCustomSinkToMySQL {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> source = env.socketTextStream("localhost", 7777);

        SingleOutputStreamOperator<Student> studentStream = source.map(new MapFunction<String, Student>() {
            @Override
            public Student map(String value) throws Exception {
                System.out.println(value);
                String[] splits = value.split(",");
                Student stu = new Student();
                stu.setId(Integer.parseInt(splits[0]));
                stu.setName(splits[1]);
                stu.setAge(Integer.parseInt(splits[2]));
                return stu;
            }
        });

        studentStream.addSink(new SinkToMySQL());

        env.execute("JavaCustomSinkToMySQL");
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaDataStreamSourceApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course05;

import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Created by thpffcj on 2019-07-04.
8 | */ 9 | public class JavaDataStreamSourceApp { 10 | 11 | public static void main(String[] args) throws Exception { 12 | 13 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 14 | 15 | // socketFunction(env); 16 | // nonParallelSourceFunction(env); 17 | // parallelSourceFunction(env); 18 | richParallelSourceFunction(env); 19 | 20 | env.execute("JavaDataStreamSourceApp"); 21 | } 22 | 23 | public static void richParallelSourceFunction(StreamExecutionEnvironment env) { 24 | DataStreamSource data = env.addSource(new JavaCustomRichParallelSourceFunction()).setParallelism(2); 25 | data.print().setParallelism(1); 26 | } 27 | 28 | public static void parallelSourceFunction(StreamExecutionEnvironment env) { 29 | DataStreamSource data = env.addSource(new JavaCustomParallelSourceFunction()).setParallelism(2); 30 | data.print().setParallelism(1); 31 | } 32 | 33 | public static void nonParallelSourceFunction(StreamExecutionEnvironment env) { 34 | DataStreamSource data = env.addSource(new JavaCustomNonParallelSourceFunction()); 35 | data.print().setParallelism(1); 36 | } 37 | 38 | public static void socketFunction(StreamExecutionEnvironment env) { 39 | DataStreamSource data = env.socketTextStream("localhost", 9999); 40 | data.print().setParallelism(1); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaDataStreamTransformationApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05; 2 | 3 | import org.apache.flink.api.common.functions.FilterFunction; 4 | import org.apache.flink.api.common.functions.MapFunction; 5 | import org.apache.flink.streaming.api.collector.selector.OutputSelector; 6 | import org.apache.flink.streaming.api.datastream.DataStreamSource; 7 | import org.apache.flink.streaming.api.datastream.SplitStream; 8 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | /** 14 | * Created by thpffcj on 2019-07-05. 
15 | */ 16 | public class JavaDataStreamTransformationApp { 17 | 18 | public static void main(String[] args) throws Exception { 19 | 20 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 21 | 22 | // filterFunction(env); 23 | // unionFunction(env); 24 | splitSelectFunction(env); 25 | 26 | env.execute("JavaDataStreamTransformationApp"); 27 | } 28 | 29 | public static void splitSelectFunction(StreamExecutionEnvironment env) { 30 | DataStreamSource data = env.addSource(new JavaCustomNonParallelSourceFunction()); 31 | 32 | SplitStream splits = data.split(new OutputSelector() { 33 | @Override 34 | public Iterable select(Long value) { 35 | List output = new ArrayList<>(); 36 | if (value % 2 == 0) { 37 | output.add("even"); 38 | } else { 39 | output.add("odd"); 40 | } 41 | return output; 42 | } 43 | }); 44 | 45 | splits.select("odd").print().setParallelism(1); 46 | } 47 | 48 | public static void unionFunction(StreamExecutionEnvironment env) { 49 | DataStreamSource data1 = env.addSource(new JavaCustomNonParallelSourceFunction()); 50 | DataStreamSource data2 = env.addSource(new JavaCustomNonParallelSourceFunction()); 51 | data1.union(data2).print().setParallelism(1); 52 | } 53 | 54 | public static void filterFunction(StreamExecutionEnvironment env) { 55 | DataStreamSource data = env.addSource(new JavaCustomNonParallelSourceFunction()); 56 | data.map(new MapFunction() { 57 | @Override 58 | public Long map(Long value) throws Exception { 59 | System.out.println("receive: " + value); 60 | return value; 61 | } 62 | }).filter(new FilterFunction() { 63 | @Override 64 | public boolean filter(Long value) throws Exception { 65 | return value % 2 == 0; 66 | } 67 | }).print().setParallelism(1); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/SinkToMySQL.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05; 2 | 3 | import org.apache.flink.configuration.Configuration; 4 | import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; 5 | 6 | import java.sql.Connection; 7 | import java.sql.DriverManager; 8 | import java.sql.PreparedStatement; 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-05. 
12 | */ 13 | public class SinkToMySQL extends RichSinkFunction { 14 | 15 | Connection connection; 16 | PreparedStatement preparedStatement; 17 | 18 | private Connection getConnection() { 19 | Connection conn = null; 20 | try { 21 | String url = "jdbc:mysql://localhost:3306/test"; 22 | conn = DriverManager.getConnection(url, "root", "00000000"); 23 | } catch (Exception e) { 24 | e.printStackTrace(); 25 | } 26 | return conn; 27 | } 28 | 29 | @Override 30 | public void open(Configuration parameters) throws Exception { 31 | super.open(parameters); 32 | 33 | connection = getConnection(); 34 | String sql = "insert into Student(id, name, age) values (?, ?, ?)"; 35 | preparedStatement = connection.prepareStatement(sql); 36 | } 37 | 38 | // 每条记录插入时调用一次 39 | public void invoke(Student value, Context context) throws Exception { 40 | 41 | // 为前面的占位符赋值 42 | preparedStatement.setInt(1, value.getId()); 43 | preparedStatement.setString(2, value.getName()); 44 | preparedStatement.setInt(3, value.getAge()); 45 | 46 | preparedStatement.executeUpdate(); 47 | } 48 | 49 | @Override 50 | public void close() throws Exception { 51 | if(connection != null) { 52 | try { 53 | connection.close(); 54 | } catch(Exception e) { 55 | e.printStackTrace(); 56 | } 57 | connection = null; 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/Student.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05; 2 | 3 | /** 4 | * Created by thpffcj on 2019-07-05. 5 | */ 6 | public class Student { 7 | 8 | private int id; 9 | private String name; 10 | private int age; 11 | 12 | public int getId() { 13 | return id; 14 | } 15 | 16 | public void setId(int id) { 17 | this.id = id; 18 | } 19 | 20 | public String getName() { 21 | return name; 22 | } 23 | 24 | public void setName(String name) { 25 | this.name = name; 26 | } 27 | 28 | public int getAge() { 29 | return age; 30 | } 31 | 32 | public void setAge(int age) { 33 | this.age = age; 34 | } 35 | 36 | @Override 37 | public String toString() { 38 | return "Student{" + 39 | "id=" + id + 40 | ", name='" + name + '\'' + 41 | ", age=" + age + 42 | '}'; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course06/JavaTableSQLAPI.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course06; 2 | 3 | import org.apache.flink.api.java.DataSet; 4 | import org.apache.flink.api.java.ExecutionEnvironment; 5 | import org.apache.flink.table.api.Table; 6 | import org.apache.flink.table.api.TableEnvironment; 7 | import org.apache.flink.table.api.java.BatchTableEnvironment; 8 | import org.apache.flink.types.Row; 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-06. 
 */
public class JavaTableSQLAPI {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnv = BatchTableEnvironment.create(env);

        String filePath = "file:///Users/thpffcj/Public/data/sales.csv";
        DataSet<Sales> csv = env.readCsvFile(filePath)
                .ignoreFirstLine()
                .pojoType(Sales.class, "transactionId", "customerId", "itemId", "amountPaid");

        Table sales = tableEnv.fromDataSet(csv);
        tableEnv.registerTable("sales", sales);
        Table resultTable = tableEnv.sqlQuery("select customerId, sum(amountPaid) money from sales group by customerId");

        DataSet<Row> result = tableEnv.toDataSet(resultTable, Row.class);
        result.print();
    }

    public static class Sales {
        public String transactionId;
        public String customerId;
        public String itemId;
        public Double amountPaid;
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course07/JavaWindowsApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course07;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

/**
 * Created by thpffcj on 2019-07-06.
 */
public class JavaWindowsApp {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> text = env.socketTextStream("localhost", 9999);

        text.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] tokens = value.toLowerCase().split(",");
                for (String token : tokens) {
                    if (token.length() > 0) {
                        out.collect(new Tuple2<String, Integer>(token, 1));
                    }
                }
            }
        }).keyBy(0).timeWindow(Time.seconds(5)).sum(1).print().setParallelism(1);

        env.execute("JavaWindowsApp");
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course07/JavaWindowsProcessApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course07;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

/**
 * Created by thpffcj on 2019-07-06.
 */
public class JavaWindowsProcessApp {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> text = env.socketTextStream("localhost", 9999);

        text.flatMap(new FlatMapFunction<String, Tuple2<Integer, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<Integer, Integer>> out) throws Exception {
                String[] tokens = value.toLowerCase().split(",");
                for (String token : tokens) {
                    if (token.length() > 0) {
                        out.collect(new Tuple2<Integer, Integer>(1, Integer.parseInt(token)));
                    }
                }
            }
        }).keyBy(0)
                .timeWindow(Time.seconds(5))
                .process(new ProcessWindowFunction<Tuple2<Integer, Integer>, Object, Tuple, TimeWindow>() {
                    @Override
                    public void process(Tuple tuple, Context context, Iterable<Tuple2<Integer, Integer>> elements, Collector<Object> out) throws Exception {
                        System.out.println("----------");
                        long count = 0;
                        for (Tuple2<Integer, Integer> in : elements) {
                            count++;
                        }
                        out.collect("Window: " + context.window() + "count: " + count);
                    }
                }).print().setParallelism(1);

        env.execute("JavaWindowsProcessApp");
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course07/JavaWindowsReduceApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course07;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

/**
 * Created by thpffcj on 2019-07-06.
 */
public class JavaWindowsReduceApp {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> text = env.socketTextStream("localhost", 9999);

        text.flatMap(new FlatMapFunction<String, Tuple2<Integer, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<Integer, Integer>> out) throws Exception {
                String[] tokens = value.toLowerCase().split(",");
                for (String token : tokens) {
                    if (token.length() > 0) {
                        out.collect(new Tuple2<Integer, Integer>(1, Integer.parseInt(token)));
                    }
                }
            }
        }).keyBy(0)
                .timeWindow(Time.seconds(5))
                .reduce(new ReduceFunction<Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> reduce(Tuple2<Integer, Integer> value1, Tuple2<Integer, Integer> value2) throws Exception {
                        System.out.println("value1 = [" + value1 + "], value2 = [" + value2 + "]");
                        return new Tuple2<>(value1.f0, value1.f1 + value2.f1);
                    }
                }).print().setParallelism(1);

        env.execute("JavaWindowsReduceApp");
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/hotItem/UserBehavior.java: --------------------------------------------------------------------------------
package cn.edu.nju.hotItem;

/**
 * Created by thpffcj on 2019-08-14.
5 | */ 6 | 7 | /** 用户行为数据结构 **/ 8 | public class UserBehavior { 9 | 10 | public long userId; // 用户 ID 11 | public long itemId; // 商品 ID 12 | public int categoryId; // 商品类目 ID 13 | public String behavior; // 用户行为, 包括("pv", "buy", "cart", "fav") 14 | public long timestamp; // 行为发生的时间戳,单位秒 15 | } 16 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | log4j.rootLogger=INFO, console 20 | 21 | log4j.appender.console=org.apache.log4j.ConsoleAppender 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 24 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/BatchJob.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package cn.edu.nju 20 | 21 | import org.apache.flink.api.scala._ 22 | 23 | /** 24 | * Skeleton for a Flink Batch Job. 25 | * 26 | * For a tutorial how to write a Flink batch application, check the 27 | * tutorials and examples on the Flink Website. 28 | * 29 | * To package your application into a JAR file for execution, 30 | * change the main class in the POM.xml file to this class (simply search for 'mainClass') 31 | * and run 'mvn clean package' on the command line. 
32 | */ 33 | object BatchJob { 34 | 35 | def main(args: Array[String]) { 36 | // set up the batch execution environment 37 | val env = ExecutionEnvironment.getExecutionEnvironment 38 | 39 | /* 40 | * Here, you can start creating your execution plan for Flink. 41 | * 42 | * Start with getting some data from the environment, like 43 | * env.readTextFile(textPath); 44 | * 45 | * then, transform the resulting DataSet[String] using operations 46 | * like 47 | * .filter() 48 | * .flatMap() 49 | * .join() 50 | * .group() 51 | * 52 | * and many more. 53 | * Have a look at the programming guide: 54 | * 55 | * http://flink.apache.org/docs/latest/apis/batch/index.html 56 | * 57 | * and the examples 58 | * 59 | * http://flink.apache.org/docs/latest/apis/batch/examples.html 60 | * 61 | */ 62 | 63 | // execute program 64 | env.execute("Flink Batch Scala API Skeleton") 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/BatchWCScalaApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.flink.api.scala.ExecutionEnvironment 4 | 5 | /** 6 | * 使用Scala开发Flink的批处理应用程序 7 | * Created by thpffcj on 2019-06-28. 8 | */ 9 | object BatchWCScalaApp { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val input = "file:///Users/thpffcj/Public/file/hello.txt" 14 | 15 | val env = ExecutionEnvironment.getExecutionEnvironment 16 | 17 | val text = env.readTextFile(input) 18 | 19 | // 引入隐式转换 20 | import org.apache.flink.api.scala._ 21 | 22 | text.flatMap(_.toLowerCase.split("\t")) 23 | .filter(_.nonEmpty) 24 | .map((_, 1)) 25 | .groupBy(0) 26 | .sum(1).print() 27 | } 28 | 29 | /** 30 | * hadoop welcome 31 | * hadoop hdfs mapreduce 32 | * hadoop hdfs 33 | * 34 | * hadoop 35 | * hdfs 36 | * hadoop 37 | * welcome 38 | * hadoop 39 | * hdfs 40 | * mapreduce 41 | * 42 | * hadoop 43 | * hdfs 44 | * hadoop 45 | * welcome 46 | * hadoop 47 | * hdfs 48 | * mapreduce 49 | * 50 | * (hadoop,1) 51 | * (hdfs,1) 52 | * (mapreduce,1) 53 | * (hadoop,1) 54 | * (welcome,1) 55 | * (hadoop,1) 56 | * (hdfs,1) 57 | * 58 | * (hdfs,2) 59 | * (hadoop,3) 60 | * (mapreduce,1) 61 | * (welcome,1) 62 | */ 63 | } 64 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/StreamingJob.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package cn.edu.nju 20 | 21 | import org.apache.flink.streaming.api.scala._ 22 | 23 | /** 24 | * Skeleton for a Flink Streaming Job. 
25 | * 26 | * For a tutorial how to write a Flink streaming application, check the 27 | * tutorials and examples on the Flink Website. 28 | * 29 | * To package your application into a JAR file for execution, run 30 | * 'mvn clean package' on the command line. 31 | * 32 | * If you change the name of the main class (with the public static void main(String[] args)) 33 | * method, change the respective entry in the POM.xml file (simply search for 'mainClass'). 34 | */ 35 | object StreamingJob { 36 | def main(args: Array[String]) { 37 | // set up the streaming execution environment 38 | val env = StreamExecutionEnvironment.getExecutionEnvironment 39 | 40 | /* 41 | * Here, you can start creating your execution plan for Flink. 42 | * 43 | * Start with getting some data from the environment, like 44 | * env.readTextFile(textPath); 45 | * 46 | * then, transform the resulting DataStream[String] using operations 47 | * like 48 | * .filter() 49 | * .flatMap() 50 | * .join() 51 | * .group() 52 | * 53 | * and many more. 54 | * Have a look at the programming guide: 55 | * 56 | * http://flink.apache.org/docs/latest/apis/streaming/index.html 57 | * 58 | */ 59 | 60 | // execute program 61 | env.execute("Flink Streaming Scala API Skeleton") 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/StreamingWCScalaApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.streaming.api.windowing.time.Time 5 | 6 | /** 7 | * 使用Scala开发Flink的实时处理应用程序 8 | * Created by thpffcj on 2019-06-29. 9 | */ 10 | object StreamingWCScalaApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val env = StreamExecutionEnvironment.getExecutionEnvironment 15 | 16 | val text = env.socketTextStream("localhost", 9999) 17 | 18 | import org.apache.flink.api.scala._ 19 | 20 | // text.flatMap(_.split(",")) 21 | // .filter(_.nonEmpty) 22 | // .map((_, 1)) 23 | // .keyBy(0) 24 | // .timeWindow(Time.seconds(5)) 25 | // .sum(1).print() 26 | 27 | text.flatMap(_.split(",")) 28 | .filter(_.nonEmpty) 29 | .map(x => WC(x, 1)) 30 | .keyBy("word") 31 | .timeWindow(Time.seconds(5)) 32 | .sum("count").print() 33 | 34 | env.execute("StreamingWCScalaApp") 35 | } 36 | 37 | case class WC(word: String, count:Int) 38 | } 39 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/WindowWordCount.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.apache.flink.api.common.functions.FlatMapFunction; 4 | import org.apache.flink.api.java.tuple.Tuple2; 5 | import org.apache.flink.api.java.utils.ParameterTool; 6 | import org.apache.flink.streaming.api.datastream.DataStream; 7 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 8 | import org.apache.flink.util.Collector; 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-28. 
 */
public class WindowWordCount {

    public static void main(String[] args) throws Exception {

        final ParameterTool params = ParameterTool.fromArgs(args);

        // set up the execution environment
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // get input data
        DataStream<String> text = env.readTextFile(params.get("input")).setParallelism(2);

        // make parameters available in the web interface
        env.getConfig().setGlobalJobParameters(params);

        final int windowSize = params.getInt("window", 10);
        final int slideSize = params.getInt("slide", 5);

        DataStream<Tuple2<String, Integer>> counts =
                // split up the lines in pairs (2-tuples) containing: (word,1)
                text.flatMap(new Tokenizer()).setParallelism(4).slotSharingGroup("flatMap_sg")
                        // create windows of windowSize records slided every slideSize records
                        .keyBy(0)
                        .countWindow(windowSize, slideSize)
                        // group by the tuple field "0" and sum up tuple field "1"
                        .sum(1).setParallelism(3).slotSharingGroup("sum_sg");

        // emit result
        counts.print().setParallelism(3);

        // execute program
        env.execute("WindowWordCount");
    }

    public static final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {

        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            // normalize and split the line
            String[] tokens = value.toLowerCase().split("\\W+");

            // emit the pairs
            for (String token : tokens) {
                if (token.length() > 0) {
                    out.collect(new Tuple2<>(token, 1));
                }
            }
        }
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course04/CounterApp.scala: --------------------------------------------------------------------------------
package cn.edu.nju.course04

import org.apache.flink.api.common.accumulators.LongCounter
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.configuration.Configuration
import org.apache.flink.core.fs.FileSystem.WriteMode
import org.apache.flink.api.scala._

/**
 * Three steps for developing a counter (accumulator) with Flink:
 * step1: define the counter
 * step2: register the counter
 * step3: fetch the counter result
 * Created by thpffcj on 2019-07-04.
16 | */ 17 | object CounterApp { 18 | 19 | def main(args: Array[String]): Unit = { 20 | 21 | val env = ExecutionEnvironment.getExecutionEnvironment 22 | 23 | val data = env.fromElements("hadoop", "spark", "flink", "pyspark", "storm") 24 | 25 | // data.map(new RichMapFunction[String, Long] { 26 | // var counter = 0l 27 | // override def map(value: String): Long = { 28 | // counter = counter + 1 29 | // println("counter : " + counter) 30 | // counter 31 | // } 32 | // }).setParallelism(5).print() 33 | 34 | val info = data.map(new RichMapFunction[String, String] { 35 | 36 | // step1:定义计数器 37 | var counter = new LongCounter() 38 | 39 | override def open(parameters: Configuration): Unit = { 40 | // step2:注册计数器 41 | getRuntimeContext.addAccumulator("ele-counts-scala", counter) 42 | } 43 | 44 | override def map(value: String): String = { 45 | counter.add(1) 46 | value 47 | } 48 | }).setParallelism(5) 49 | 50 | val filePath = "file:///Users/thpffcj/Public/data/sink-scala-count-out" 51 | info.writeAsText(filePath, WriteMode.OVERWRITE) 52 | val jobResult = env.execute("CounterApp") 53 | // step3:获取计数器 54 | val num = jobResult.getAccumulatorResult[Long]("ele-counts-scala") 55 | 56 | println("num: " + num) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course04/DBUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04 2 | 3 | import scala.util.Random 4 | 5 | /** 6 | * Created by thpffcj on 2019-07-02. 7 | */ 8 | object DBUtils { 9 | 10 | def getConnection()= { 11 | new Random().nextInt(10) + "" 12 | } 13 | 14 | def returnConnection(connection: String): Unit = { 15 | 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course04/DataSetDataSourceApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04 2 | 3 | import org.apache.flink.api.scala.ExecutionEnvironment 4 | import org.apache.flink.configuration.Configuration 5 | 6 | /** 7 | * Created by thpffcj on 2019-07-02. 
8 | */ 9 | object DataSetDataSourceApp { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val env = ExecutionEnvironment.getExecutionEnvironment 14 | 15 | // fromCollection(env) 16 | // textFile(env) 17 | // csvFile(env) 18 | // readRecursiveFiles(env) 19 | readCompressionFiles(env) 20 | } 21 | 22 | def readCompressionFiles(env: ExecutionEnvironment): Unit = { 23 | val filePath = "file:///Users/thpffcj/Public/data/compression" 24 | env.readTextFile(filePath).print() 25 | } 26 | 27 | def readRecursiveFiles(env: ExecutionEnvironment): Unit = { 28 | val filePath = "file:///Users/thpffcj/Public/data/nested" 29 | val parameters = new Configuration() 30 | parameters.setBoolean("recursive.file.enumeration", true) 31 | env.readTextFile(filePath).withParameters(parameters).print() 32 | } 33 | 34 | case class MyCaseClass(name:String, age:Int) 35 | 36 | def csvFile(env: ExecutionEnvironment): Unit = { 37 | 38 | import org.apache.flink.api.scala._ 39 | val filePath = "file:///Users/thpffcj/Public/data/people.csv" 40 | 41 | env.readCsvFile[(String, Int, String)](filePath, ignoreFirstLine = true).print() 42 | 43 | env.readCsvFile[(String, Int)](filePath, ignoreFirstLine = true, includedFields = Array(0, 1)).print() 44 | 45 | env.readCsvFile[MyCaseClass](filePath, ignoreFirstLine = true, includedFields = Array(0, 1)).print() 46 | 47 | env.readCsvFile[Person](filePath, ignoreFirstLine = true, pojoFields = Array("name", "age", "work")).print() 48 | } 49 | 50 | def textFile(env: ExecutionEnvironment): Unit = { 51 | val filePath = "file:///Users/thpffcj/Public/data/hello.txt" 52 | env.readTextFile(filePath).print() 53 | } 54 | 55 | def fromCollection(env: ExecutionEnvironment): Unit = { 56 | 57 | import org.apache.flink.api.scala._ 58 | val data = 1 to 10 59 | env.fromCollection(data).print() 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course04/DataSetSinkApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04 2 | 3 | import org.apache.flink.api.scala.ExecutionEnvironment 4 | import org.apache.flink.core.fs.FileSystem.WriteMode 5 | import org.apache.flink.api.scala._ 6 | 7 | /** 8 | * Created by thpffcj on 2019-07-04. 9 | */ 10 | object DataSetSinkApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val env = ExecutionEnvironment.getExecutionEnvironment 15 | 16 | val data = 1.to(10) 17 | val text = env.fromCollection(data) 18 | 19 | val filePath = "file:///Users/thpffcj/Public/data/sink-out" 20 | 21 | text.writeAsText(filePath, WriteMode.OVERWRITE).setParallelism(2) 22 | 23 | env.execute("DataSetSinkApp") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course04/DistributedCacheApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04 2 | 3 | import org.apache.commons.io.FileUtils 4 | import org.apache.flink.api.common.functions.RichMapFunction 5 | import org.apache.flink.api.scala.ExecutionEnvironment 6 | import org.apache.flink.configuration.Configuration 7 | import org.apache.flink.api.scala._ 8 | 9 | /** 10 | * step1:注册一个本地/HDFS文件 11 | * step2:在open方法中获取到分布式缓存的内容即可 12 | * Created by thpffcj on 2019-07-04. 
13 | */ 14 | object DistributedCacheApp { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val env = ExecutionEnvironment.getExecutionEnvironment 19 | 20 | val filePath = "file:///Users/thpffcj/Public/data/hello.txt" 21 | 22 | // step1:注册一个本地/HDFS文件 23 | env.registerCachedFile(filePath, "scala-dc") 24 | 25 | val data = env.fromElements("hadoop", "spark", "flink", "pyspark", "storm") 26 | 27 | data.map(new RichMapFunction[String, String] { 28 | 29 | // step2:在open方法中获取到分布式缓存的内容即可 30 | override def open(parameters: Configuration): Unit = { 31 | val dcFile = getRuntimeContext.getDistributedCache().getFile("scala-dc") 32 | val lines = FileUtils.readLines(dcFile) 33 | 34 | /** 35 | * 此时会出现一个异常,Java集合和Scala集合不兼容的问题 36 | */ 37 | import scala.collection.JavaConverters._ 38 | for (ele <- lines.asScala) { 39 | println(ele) 40 | } 41 | } 42 | 43 | override def map(value: String): String = { 44 | value 45 | } 46 | }).print() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course05/CustomNonParallelSourceFunction.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05 2 | 3 | import org.apache.flink.streaming.api.functions.source.SourceFunction 4 | 5 | /** 6 | * Created by thpffcj on 2019-07-05. 7 | */ 8 | class CustomNonParallelSourceFunction extends SourceFunction[Long]{ 9 | 10 | var count = 1L 11 | 12 | var isRunning = true 13 | 14 | override def run(ctx: SourceFunction.SourceContext[Long]): Unit = { 15 | while (isRunning) { 16 | ctx.collect(count) 17 | count += 1 18 | Thread.sleep(1000) 19 | } 20 | } 21 | 22 | override def cancel(): Unit = { 23 | isRunning = false 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course05/CustomParallelSourceFunction.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05 2 | 3 | import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction} 4 | 5 | /** 6 | * Created by thpffcj on 2019-07-05. 7 | */ 8 | class CustomParallelSourceFunction extends ParallelSourceFunction[Long] { 9 | 10 | var count = 1l 11 | var isRunning = true 12 | 13 | override def run(ctx: SourceFunction.SourceContext[Long]): Unit = { 14 | while (isRunning) { 15 | ctx.collect(count) 16 | count += 1 17 | Thread.sleep(1000) 18 | } 19 | } 20 | 21 | override def cancel(): Unit = { 22 | isRunning = false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course05/CustomRichParallelSourceFunction.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05 2 | 3 | import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction} 4 | 5 | /** 6 | * Created by thpffcj on 2019-07-05. 
7 | */ 8 | class CustomRichParallelSourceFunction extends RichParallelSourceFunction[Long] { 9 | 10 | var count = 1l 11 | var isRunning = true 12 | 13 | override def run(ctx: SourceFunction.SourceContext[Long]): Unit = { 14 | while (isRunning) { 15 | ctx.collect(count) 16 | count += 1 17 | Thread.sleep(1000) 18 | } 19 | } 20 | 21 | override def cancel(): Unit = { 22 | isRunning = false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course05/DataStreamSourceApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | 6 | /** 7 | * Created by thpffcj on 2019-07-04. 8 | */ 9 | object DataStreamSourceApp { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val env = StreamExecutionEnvironment.getExecutionEnvironment 14 | // socketFunction(env) 15 | 16 | // nonParallelSourceFunction(env) 17 | // parallelSourceFunction(env) 18 | richParallelSourceFunction(env) 19 | 20 | env.execute("DataStreamSourceApp") 21 | } 22 | 23 | def richParallelSourceFunction(env: StreamExecutionEnvironment): Unit = { 24 | val data = env.addSource(new CustomRichParallelSourceFunction).setParallelism(2) 25 | data.print() 26 | } 27 | 28 | def parallelSourceFunction(env: StreamExecutionEnvironment): Unit = { 29 | val data = env.addSource(new CustomParallelSourceFunction).setParallelism(2) 30 | data.print() 31 | } 32 | 33 | def nonParallelSourceFunction(env: StreamExecutionEnvironment): Unit = { 34 | val data = env.addSource(new CustomNonParallelSourceFunction) 35 | data.print() 36 | } 37 | 38 | def socketFunction(env: StreamExecutionEnvironment): Unit = { 39 | 40 | val data = env.socketTextStream("localhost", 9999) 41 | data.print() 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course05/DataStreamTransformationApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05 2 | 3 | import java.{lang, util} 4 | 5 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 6 | import org.apache.flink.api.scala._ 7 | import org.apache.flink.streaming.api.TimeCharacteristic 8 | import org.apache.flink.streaming.api.collector.selector.OutputSelector 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-05. 
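 * Demonstrates basic DataStream transformations: filter, union and split/select. For example, the
 * split/select pair used below tags each element as "even" or "odd" and then consumes only one side
 * (sketch of the code in splitSelectFunction):
 *
 *   val splits = data.split(selector)   // selector returns "even" or "odd" for each element
 *   splits.select("even").print()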
12 | */ 13 | object DataStreamTransformationApp { 14 | 15 | def main(args: Array[String]): Unit = { 16 | val env = StreamExecutionEnvironment.getExecutionEnvironment 17 | 18 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 19 | 20 | // filterFunction(env) 21 | // unionFunction(env) 22 | splitSelectFunction(env) 23 | 24 | env.execute("DataStreamTransformationApp") 25 | } 26 | 27 | def splitSelectFunction(env: StreamExecutionEnvironment): Unit = { 28 | val data = env.addSource(new CustomNonParallelSourceFunction) 29 | 30 | val splits = data.split(new OutputSelector[Long] { 31 | override def select(value: Long): lang.Iterable[String] = { 32 | val list = new util.ArrayList[String]() 33 | if (value % 2 == 0) { 34 | list.add("even") 35 | } else { 36 | list.add("odd") 37 | } 38 | list 39 | } 40 | }) 41 | 42 | splits.select("even").print().setParallelism(1) 43 | } 44 | 45 | def unionFunction(env: StreamExecutionEnvironment): Unit = { 46 | val data1 = env.addSource(new CustomNonParallelSourceFunction) 47 | val data2 = env.addSource(new CustomNonParallelSourceFunction) 48 | data1.union(data2).print().setParallelism(1) 49 | } 50 | 51 | def filterFunction(env: StreamExecutionEnvironment): Unit = { 52 | val data = env.addSource(new CustomNonParallelSourceFunction) 53 | 54 | data.map(x =>{ 55 | println("received: " + x) 56 | x 57 | }).filter(_%2 == 0).print().setParallelism(1) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course06/TableSQLAPI.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course06 2 | 3 | import org.apache.flink.api.scala.ExecutionEnvironment 4 | import org.apache.flink.table.api.TableEnvironment 5 | import org.apache.flink.api.scala._ 6 | import org.apache.flink.types.Row 7 | 8 | /** 9 | * Created by thpffcj on 2019-07-06. 10 | */ 11 | object TableSQLAPI { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val env = ExecutionEnvironment.getExecutionEnvironment 16 | val tableEnv = TableEnvironment.getTableEnvironment(env) 17 | 18 | val filePath = "file:///Users/thpffcj/Public/data/sales.csv" 19 | // 已经拿到DataSet 20 | val csv = env.readCsvFile[SalesLog](filePath, ignoreFirstLine = true) 21 | 22 | // DataSet => Table 23 | val salesTable = tableEnv.fromDataSet(csv) 24 | // Table => table 25 | tableEnv.registerTable("sales", salesTable) 26 | 27 | // sql 28 | val resultTable = tableEnv.sqlQuery("select customerId, sum(amountPaid) money from sales group by customerId") 29 | 30 | tableEnv.toDataSet[Row](resultTable).print() 31 | } 32 | 33 | case class SalesLog(transactionId: String, customerId: String, itemId: String, amountPaid: Double) 34 | } 35 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course07/WindowsApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course07 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | 7 | /** 8 | * Created by thpffcj on 2019-07-06. 
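 * Word count over socket input with time windows. timeWindow(Time.seconds(5)) is a 5-second tumbling
 * window (the commented-out variant below), while timeWindow(Time.seconds(10), Time.seconds(5)) is a
 * 10-second window sliding every 5 seconds, so each element is counted in two overlapping windows:
 *
 *   .keyBy(0).timeWindow(Time.seconds(10), Time.seconds(5)).sum(1)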
9 | */ 10 | object WindowsApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val env= StreamExecutionEnvironment.getExecutionEnvironment 15 | 16 | val text = env.socketTextStream("localhost", 9999) 17 | 18 | // text.flatMap(_.split(",")) 19 | // .map((_, 1)) 20 | // .keyBy(0) 21 | // .timeWindow(Time.seconds(5)) 22 | // .sum(1).print().setParallelism(1) 23 | 24 | text.flatMap(_.split(",")) 25 | .map((_, 1)) 26 | .keyBy(0) 27 | .timeWindow(Time.seconds(10), Time.seconds(5)) 28 | .sum(1).print().setParallelism(1) 29 | 30 | 31 | env.execute("WindowsApp") 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course07/WindowsProcessApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course07 2 | 3 | import org.apache.flink.api.java.tuple.Tuple 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 6 | import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction 7 | import org.apache.flink.streaming.api.windowing.time.Time 8 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 9 | import org.apache.flink.util.Collector 10 | 11 | /** 12 | * Created by thpffcj on 2019-07-06. 13 | */ 14 | object WindowsProcessApp { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val env = StreamExecutionEnvironment.getExecutionEnvironment 19 | 20 | val text = env.socketTextStream("localhost", 9999) 21 | 22 | text.flatMap(_.split(",")) 23 | .map(x => (1, x.toInt)) 24 | .keyBy(0) 25 | .timeWindow(Time.seconds(5)) 26 | .process(new MyProcessWindowFunction()) 27 | .print().setParallelism(1) 28 | 29 | env.execute("WindowsReduceApp") 30 | } 31 | 32 | class MyProcessWindowFunction extends ProcessWindowFunction[(Int, Int), String, Tuple, TimeWindow] { 33 | 34 | def process(key: Tuple, context: Context, input: Iterable[(Int, Int)], out: Collector[String]): Unit = { 35 | var count = 0L 36 | for (in <- input) { 37 | count = count + 1 38 | } 39 | out.collect(s"Window ${context.window} count: $count") 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course07/WindowsReduceApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course07 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | 7 | /** 8 | * Created by thpffcj on 2019-07-06. 9 | */ 10 | object WindowsReduceApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val env = StreamExecutionEnvironment.getExecutionEnvironment 15 | 16 | val text = env.socketTextStream("localhost", 9999) 17 | 18 | // 原来传递进来的是字符串,此处我们就使用数值类型,通过数值类型来演示增量的效果 19 | text.flatMap(_.split(",")) 20 | .map(x => (1, x.toInt)) // 1,2,3,4,5 => (1,1)(1,2)(1,3)(1,4)(1,5) 21 | .keyBy(0) // 因为key都是1,所以所有的元素都要一个task去执行 22 | .timeWindow(Time.seconds(5)) 23 | .reduce((v1, v2) => { // 不是等待窗口所有的数据进行一次性处理,而是数据两两处理 24 | println(v1 + "..." 
+ v2) 25 | (v1._1, v1._2 + v2._2) 26 | }).print().setParallelism(1) 27 | 28 | env.execute("WindowsReduceApp") 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course08/FileSystemSinkApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course08 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.streaming.connectors.fs.{SequenceFileWriter, StringWriter} 5 | import org.apache.flink.streaming.connectors.fs.bucketing.{BucketingSink, DateTimeBucketer} 6 | 7 | /** 8 | * Created by thpffcj on 2019-07-07. 9 | */ 10 | object FileSystemSinkApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val env = StreamExecutionEnvironment.getExecutionEnvironment 15 | 16 | val data = env.socketTextStream("localhost", 9999) 17 | 18 | val filePath = "file:///Users/thpffcj/Public/data" 19 | val sink = new BucketingSink[String](filePath) 20 | sink.setBucketer(new DateTimeBucketer("yyyy-MM-dd--HHmm")) 21 | sink.setWriter(new StringWriter[String]()) 22 | sink.setBatchSize(1024 * 1024 * 400) // this is 400 MB, 23 | sink.setBatchRolloverInterval(20 * 60 * 1000); // this is 20 mins 24 | 25 | data.addSink(sink) 26 | env.execute("FileSystemSinkApp") 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course08/KafkaConnectorConsumerApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course08 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.flink.api.common.serialization.SimpleStringSchema 6 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 7 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 8 | import org.apache.flink.api.scala._ 9 | import org.apache.flink.streaming.api.CheckpointingMode 10 | 11 | /** 12 | * Created by thpffcj on 2019-07-07. 
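 * Consumes a Kafka topic with checkpointing enabled (EXACTLY_ONCE mode, 4s interval, 10s timeout,
 * at most one concurrent checkpoint). The consumer itself only needs the bootstrap servers and a
 * group id, as in the code below:
 *
 *   properties.setProperty("bootstrap.servers", "localhost:9092")
 *   properties.setProperty("group.id", "test")
 *   env.addSource(new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), properties))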
13 | */ 14 | object KafkaConnectorConsumerApp { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val env = StreamExecutionEnvironment.getExecutionEnvironment 19 | 20 | // checkpoint常用设置参数 21 | env.enableCheckpointing(4000) 22 | env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE) 23 | env.getCheckpointConfig.setCheckpointTimeout(10000) 24 | env.getCheckpointConfig.setMaxConcurrentCheckpoints(1) 25 | 26 | val topic = "test" 27 | val properties = new Properties() 28 | properties.setProperty("bootstrap.servers", "localhost:9092") 29 | properties.setProperty("group.id", "test") 30 | val data = env.addSource(new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), properties)) 31 | 32 | data.print() 33 | 34 | env.execute("KafkaConnectorConsumerApp") 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course08/KafkaConnectorProducerApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course08 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.flink.api.common.serialization.SimpleStringSchema 6 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 7 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer 8 | import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-07. 12 | */ 13 | object KafkaConnectorProducerApp { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val env = StreamExecutionEnvironment.getExecutionEnvironment 18 | 19 | // 从socket接收数据,通过Flink,将数据sink到Kafka 20 | val data = env.socketTextStream("localhost", 9999) 21 | 22 | val topic = "test" 23 | val properties = new Properties() 24 | properties.setProperty("bootstrap.servers", "localhost:9092") 25 | 26 | // val kafkaSink = new FlinkKafkaProducer[String](topic, 27 | // new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()), properties) 28 | 29 | val kafkaSink = new FlinkKafkaProducer[String](topic, 30 | new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()), 31 | properties, 32 | FlinkKafkaProducer.Semantic.EXACTLY_ONCE) 33 | 34 | data.addSink(kafkaSink) 35 | 36 | env.execute("KafkaConnectorProducerApp") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/project/MyMySQLSource.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.project 2 | 3 | import java.sql.{Connection, DriverManager, PreparedStatement} 4 | 5 | import org.apache.flink.configuration.Configuration 6 | import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction} 7 | 8 | import scala.collection.mutable 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-11. 
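 * Custom source that loads the user_domain_config table into a mutable.HashMap (domain -> userId):
 * open() creates the JDBC connection and PreparedStatement, run() executes the query and emits the
 * map through ctx.collect, and close() releases the resources. Usage sketch (see MyMySQLSourceTest below):
 *
 *   val data = env.addSource(new MyMySQLSource)
 *   data.print()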
12 | */ 13 | class MyMySQLSource extends RichParallelSourceFunction[mutable.HashMap[String, String]] { 14 | 15 | var connection:Connection = null 16 | var ps:PreparedStatement = null 17 | 18 | // open:建立连接 19 | override def open(parameters: Configuration): Unit = { 20 | 21 | val url = "jdbc:mysql://localhost:3306/test" 22 | val user = "root" 23 | val password = "00000000" 24 | connection = DriverManager.getConnection(url, user, password) 25 | 26 | val sql = "select user_id, domain from user_domain_config" 27 | ps = connection.prepareStatement(sql) 28 | } 29 | 30 | // 释放资源 31 | override def close(): Unit = { 32 | if (ps != null) { 33 | ps.close() 34 | } 35 | 36 | if (connection != null) { 37 | connection.close() 38 | } 39 | } 40 | 41 | /** 42 | * 此处是代码的关键:要从MySQL表中把数据读取出来转成Map进行数据的封装 43 | * @param ctx 44 | */ 45 | override def run(ctx: SourceFunction.SourceContext[mutable.HashMap[String, String]]): Unit = { 46 | 47 | val resultMap = new mutable.HashMap[String, String]() 48 | 49 | val result = ps.executeQuery() 50 | while (result.next()) { 51 | val userId = result.getString(1) 52 | val domain = result.getString(2) 53 | resultMap.put(domain, userId) 54 | } 55 | ctx.collect(resultMap) 56 | } 57 | 58 | override def cancel(): Unit = {} 59 | } 60 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/project/MyMySQLSourceTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.project 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | 6 | /** 7 | * Created by thpffcj on 2019-07-11. 8 | */ 9 | object MyMySQLSourceTest { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val env = StreamExecutionEnvironment.getExecutionEnvironment 14 | 15 | val data = env.addSource(new MyMySQLSource) 16 | data.print() 17 | 18 | env.execute("MyMySQLSourceTest") 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /hadoop-train/src/main/java/cn/edu/nju/hadoop/mapreduce/sort/GlobalSortPartitioner.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.hadoop.mapreduce.sort; 2 | 3 | import org.apache.hadoop.conf.Configurable; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Partitioner; 8 | 9 | /** 10 | * Created by thpffcj on 2020/2/5. 11 | *

12 | * 让MapReduce产生一个全局排序的文件: 13 | *

14 | * 1. 最简单的方法是只使用一个分区(partition),这种在处理小规模文件时还行。但是在处理大型文件是效率极低,所有的数据都发送到一 15 | * 个Reduce进行排序,这样不能充分利用集群的计算资源,而且在数据量很大的情况下,很有可能会出现OOM问题。 16 | * 2. 首先创建一系列排好序的文件,其次串联这些文件,最后生成一个全局排序的文件。它主要的思路使用一个partitioner来描述输出的全 17 | * 局排序。该方案的重点在于分区方法,默认情况下根据hash值进行分区(默认的分区函数是HashPartitioner,其实现的原理是计算map输 18 | * 出key的 hashCode ,然后对Reduce个数 求余,余数相同的 key 都会发送到同一个Reduce);还可以根据用户自定义partitioner 19 | * (自定义一个类并且继承partitioner类,重写器getpartition方法) 20 | */ 21 | class GlobalSortPartitioner extends Partitioner implements Configurable { 22 | 23 | private Configuration configuration = null; 24 | private int indexRange = 0; 25 | 26 | public int getPartition(Text text, LongWritable longWritable, int numPartitions) { 27 | // 假如取值范围等于26的话,那么就意味着只需要根据第一个字母来划分索引 28 | int index = 0; 29 | if (indexRange == 26) { 30 | index = text.toString().toCharArray()[0] - 'a'; 31 | } else if (indexRange == 26 * 26) { 32 | //这里就是需要根据前两个字母进行划分索引了 33 | char[] chars = text.toString().toCharArray(); 34 | if (chars.length == 1) { 35 | index = (chars[0] - 'a') * 26; 36 | } 37 | index = (chars[0] - 'a') * 26 + (chars[1] - 'a'); 38 | } 39 | int perReducerCount = indexRange / numPartitions; 40 | if (indexRange < numPartitions) { 41 | return numPartitions; 42 | } 43 | 44 | for (int i = 0; i < numPartitions; i++) { 45 | int min = i * perReducerCount; 46 | int max = (i + 1) * perReducerCount - 1; 47 | if (index >= min && index <= max) { 48 | return i; 49 | } 50 | } 51 | //这里我们采用的是第一种不太科学的方法 52 | return numPartitions - 1; 53 | } 54 | 55 | public void setConf(Configuration conf) { 56 | this.configuration = conf; 57 | indexRange = configuration.getInt("key.indexRange", 26 * 26); 58 | } 59 | 60 | public Configuration getConf() { 61 | return configuration; 62 | } 63 | } -------------------------------------------------------------------------------- /hadoop-train/src/main/java/cn/edu/nju/hadoop/mapreduce/sort/IntPair.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.hadoop.mapreduce.sort; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.WritableComparable; 8 | 9 | /** 10 | * Created by thpffcj on 2020/2/12. 11 | * 12 | * 自定义key排序 13 | * 在mr中,所有的key是需要被比较和排序的,并且是二次,先根据partitioner,再根据大小。而本例中也是要比较两次。 14 | * 先按照第一字段排序,然后再对第一字段相同的按照第二字段排序。 15 | * 根据这一点,我们可以构造一个复合类IntPair,他有两个字段,先利用分区对第一字段排序,再利用分区内的比较对第二字段排序 16 | */ 17 | public class IntPair implements WritableComparable { 18 | 19 | int first; 20 | int second; 21 | 22 | public IntPair(){ 23 | } 24 | 25 | public IntPair(int first, int second){ 26 | this.first = first; 27 | this.second = second; 28 | } 29 | 30 | public int getFirst() { 31 | return first; 32 | } 33 | 34 | public int getSecond() { 35 | return second; 36 | } 37 | 38 | // 反序列化,从流中读进二进制转换成IntPair 39 | @Override 40 | public void readFields(DataInput in) throws IOException { 41 | this.first = in.readInt(); 42 | this.second = in.readInt(); 43 | } 44 | 45 | // 序列化,将IntPair转换成二进制输出 46 | @Override 47 | public void write(DataOutput out) throws IOException { 48 | out.writeInt(first); 49 | out.writeInt(second); 50 | } 51 | 52 | /* 53 | * 为什么要重写equal方法? 
54 | * 因为Object的equal方法默认是两个对象的引用的比较,意思就是指向同一内存,地址则相等,否则不相等; 55 | * 如果你现在需要利用对象里面的值来判断是否相等,则重载equal方法。 56 | */ 57 | @Override 58 | public boolean equals(Object obj) { 59 | if (obj == null) { 60 | return false; 61 | } 62 | if (this == obj) { 63 | return true; 64 | } 65 | if (obj instanceof IntPair) { 66 | IntPair r = (IntPair) obj; 67 | return r.first == first && r.second == second; 68 | } else { 69 | return false; 70 | } 71 | } 72 | 73 | /* 74 | * 重写equal 的同时为什么必须重写hashcode? 75 | * hashCode是编译器为不同对象产生的不同整数,根据equal方法的定义:如果两个对象是相等(equal)的,那么两个对象 76 | * 调用 hashCode必须产生相同的整数结果 77 | * 即:equal为true,hashCode必须为true,equal为false,hashCode也必须 为false,所以必须重写hashCode来保证 78 | * 与equal同步。 79 | */ 80 | @Override 81 | public int hashCode() { 82 | return first * 157 + second; 83 | } 84 | 85 | // 实现key的比较 86 | @Override 87 | public int compareTo(IntPair o) { 88 | if (first != o.first) { 89 | return first < o.first ? -1 : 1; 90 | } else if (second != o.second) { 91 | return second < o.second ? -1 : 1; 92 | } else { 93 | return 0; 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /hadoop-train/src/main/java/cn/edu/nju/hadoop/mapreduce/topk/IPTimes.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.hadoop.mapreduce.topk; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.io.WritableComparable; 6 | 7 | import java.io.DataInput; 8 | import java.io.DataOutput; 9 | import java.io.IOException; 10 | 11 | /** 12 | * Created by thpffcj on 2020/2/5. 13 | */ 14 | public class IPTimes implements WritableComparable { 15 | 16 | // IP 17 | private Text ip; 18 | // IP对应出现的次数 19 | private IntWritable count; 20 | 21 | // 无参构造函数(一定要有,反射机制会出错,另外要对定义的变量进行初始化否则会报空指针异常) 22 | public IPTimes() { 23 | this.ip = new Text(""); 24 | this.count = new IntWritable(1); 25 | } 26 | 27 | // 有参构造函数 28 | public IPTimes(Text ip, IntWritable count) { 29 | this.ip = ip; 30 | this.count = count; 31 | } 32 | 33 | // 反序列化 34 | public void readFields(DataInput in) throws IOException { 35 | ip.readFields(in); 36 | count.readFields(in); 37 | } 38 | 39 | // 序列化 40 | public void write(DataOutput out) throws IOException { 41 | ip.write(out); 42 | count.write(out); 43 | } 44 | 45 | public Text getIp() { 46 | return ip; 47 | } 48 | 49 | public void setIp(Text ip) { 50 | this.ip = ip; 51 | } 52 | 53 | public IntWritable getCount() { 54 | return count; 55 | } 56 | 57 | public void setCount(IntWritable count) { 58 | this.count = count; 59 | } 60 | 61 | // 这个方法是二次排序的关键 62 | public int compareTo(Object o) { 63 | // 强转 64 | IPTimes ipAndCount = (IPTimes) o; 65 | // 对第二列的count进行比较 66 | long minus = this.getCount().compareTo(ipAndCount.getCount()); 67 | // 第二列不相同时降序排列 68 | if (minus != 0) { 69 | return ipAndCount.getCount().compareTo(this.count); 70 | } else { // 第二列相同时第一列升序排列 71 | return this.ip.compareTo(ipAndCount.getIp()); 72 | } 73 | } 74 | 75 | // hashCode和equals()方法 76 | public int hashCode() { 77 | return ip.hashCode(); 78 | } 79 | 80 | public boolean equals(Object o) { 81 | if (!(o instanceof IPTimes)) { 82 | return false; 83 | } 84 | IPTimes other = (IPTimes) o; 85 | return ip.equals(other.ip) && count.equals(other.count); 86 | } 87 | 88 | // 重写toString()方法 89 | public String toString() { 90 | return this.ip + "\t" + this.count; 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /hadoop-train/src/resources/application.properties: 
-------------------------------------------------------------------------------- 1 | spring.hadoop.fsUri = hdfs://192.168.92.130:8020 -------------------------------------------------------------------------------- /hadoop-train/src/resources/beans.xml: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | 11 | 12 | 13 | fs.defaultFS=${spring.hadoop.fsUri} 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /hadoop-train/src/resources/log.txt: -------------------------------------------------------------------------------- 1 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:07:18 +0800] "GET /index.html HTTP/1.1" 304 0 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 2 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:07:21 +0800] "GET /index.html HTTP/1.1" 304 0 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 3 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:07:22 +0800] "GET /index.html HTTP/1.1" 304 0 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 4 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:07:46 +0800] "GET /index.html HTTP/1.1" 403 169 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 5 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:08:13 +0800] "GET /index.html HTTP/1.1" 200 612 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 6 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:13:44 +0800] "GET /index.html HTTP/1.1" 304 0 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 7 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:13:45 +0800] "GET /index.html HTTP/1.1" 304 0 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 8 | -------------------------------------------------------------------------------- /hadoop-train/src/test/java/cn/edu/nju/hadoop/spring/SpringBootHDFSApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.hadoop.spring; 2 | 3 | import org.apache.hadoop.fs.FileStatus; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | import org.springframework.boot.CommandLineRunner; 6 | import org.springframework.boot.SpringApplication; 7 | import org.springframework.boot.autoconfigure.SpringBootApplication; 8 | import org.springframework.data.hadoop.fs.FsShell; 9 | 10 | /** 11 | * Created by Thpffcj on 2018/1/9. 12 | */ 13 | @SpringBootApplication 14 | public class SpringBootHDFSApp implements CommandLineRunner { 15 | 16 | @Autowired 17 | FsShell fsShell; 18 | 19 | @Override 20 | public void run(String... 
strings) throws Exception { 21 | for (FileStatus fileStatus : fsShell.lsr("/springhdfs")) { 22 | System.out.println(">" + fileStatus.getPath()); 23 | } 24 | } 25 | 26 | public static void main(String[] args) { 27 | SpringApplication.run(SpringBootHDFSApp.class, args); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /hadoop-train/src/test/java/cn/edu/nju/hadoop/spring/SpringHadoopHDFSApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.hadoop.spring; 2 | 3 | import org.apache.hadoop.fs.FSDataInputStream; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.IOUtils; 7 | import org.junit.After; 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.context.support.ClassPathXmlApplicationContext; 12 | 13 | import java.io.IOException; 14 | 15 | /** 16 | * Created by Thpffcj on 2018/1/8. 17 | */ 18 | public class SpringHadoopHDFSApp { 19 | 20 | private ApplicationContext context; 21 | private FileSystem fileSystem; 22 | 23 | /** 24 | * 创建HDFS文件夹 25 | * @throws Exception 26 | */ 27 | @Test 28 | public void testMkdirs() throws Exception { 29 | fileSystem.mkdirs(new Path("/springhdfs/")); 30 | } 31 | 32 | /** 33 | * 读取HDFS文件内容 34 | * @throws Exception 35 | */ 36 | @Test 37 | public void testText() throws Exception { 38 | FSDataInputStream inputStream = fileSystem.open(new Path("/springhdfs/hello.txt")); 39 | IOUtils.copyBytes(inputStream, System.out, 1024); 40 | inputStream.close(); 41 | } 42 | 43 | @Before 44 | public void setUp() { 45 | context = new ClassPathXmlApplicationContext("beans.xml"); 46 | fileSystem = (FileSystem) context.getBean("fileSystem"); 47 | } 48 | 49 | @After 50 | public void tearDown() throws IOException { 51 | context = null; 52 | fileSystem = null; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /hbase-train/hbase-api-test/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | hbase-train 7 | cn.edu.nju 8 | 1.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | hbase-api-test 13 | 14 | 15 | 16 | org.apache.hbase 17 | hbase-client 18 | 1.2.0 19 | 20 | 21 | junit 22 | junit 23 | 4.12 24 | test 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /hbase-train/hbase-api-test/src/main/java/cn/edu/nju/HBaseConn.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.hbase.HBaseConfiguration; 5 | import org.apache.hadoop.hbase.TableName; 6 | import org.apache.hadoop.hbase.client.Connection; 7 | import org.apache.hadoop.hbase.client.ConnectionFactory; 8 | import org.apache.hadoop.hbase.client.Table; 9 | 10 | 11 | import java.io.IOException; 12 | 13 | /** 14 | * Created by thpffcj on 2019-04-05. 
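 * Lazily initialised singleton around the HBase Connection (Zookeeper quorum localhost:2181).
 * Typical usage, as exercised in HBaseConnTest below:
 *
 *   Connection conn = HBaseConn.getHBaseConn();
 *   Table table = HBaseConn.getTable("US_POPULATION");
 *   HBaseConn.closeConn();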
15 | */ 16 | public class HBaseConn { 17 | 18 | private static final HBaseConn INSTANCE = new HBaseConn(); 19 | private static Configuration configuration; 20 | private static Connection connection; 21 | 22 | private HBaseConn() { 23 | try { 24 | if (configuration == null) { 25 | configuration = HBaseConfiguration.create(); 26 | configuration.set("hbase.zookeeper.quorum", "localhost:2181"); 27 | } 28 | } catch (Exception e) { 29 | e.printStackTrace(); 30 | } 31 | } 32 | 33 | private Connection getConnection() { 34 | if (connection == null || connection.isClosed()) { 35 | try { 36 | connection = ConnectionFactory.createConnection(configuration); 37 | } catch (Exception e) { 38 | e.printStackTrace(); 39 | } 40 | } 41 | return connection; 42 | } 43 | 44 | public static Connection getHBaseConn() { 45 | return INSTANCE.getConnection(); 46 | } 47 | 48 | public static Table getTable(String tableName) throws IOException { 49 | return INSTANCE.getConnection().getTable(TableName.valueOf(tableName)); 50 | } 51 | 52 | public static void closeConn() { 53 | if (connection != null) { 54 | try { 55 | connection.close(); 56 | } catch (IOException ioe) { 57 | ioe.printStackTrace(); 58 | } 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /hbase-train/hbase-api-test/src/test/java/cn/edu/nju/HBaseConnTest.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.apache.hadoop.hbase.client.Connection; 4 | import org.apache.hadoop.hbase.client.Table; 5 | import org.junit.Test; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * Created by thpffcj on 2019-04-05. 11 | */ 12 | public class HBaseConnTest { 13 | 14 | @Test 15 | public void getConnTest() { 16 | Connection conn = HBaseConn.getHBaseConn(); 17 | System.out.println(conn.isClosed()); 18 | HBaseConn.closeConn(); 19 | System.out.println(conn.isClosed()); 20 | } 21 | 22 | @Test 23 | public void getTableTest() { 24 | try { 25 | Table table = HBaseConn.getTable("US_POPULATION"); 26 | System.out.println(table.getName().getNameAsString()); 27 | table.close(); 28 | } catch (IOException ioe) { 29 | ioe.printStackTrace(); 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /hbase-train/hbase-api-test/src/test/java/cn/edu/nju/HBaseUtilTest.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.apache.hadoop.hbase.client.Result; 4 | import org.apache.hadoop.hbase.client.ResultScanner; 5 | import org.apache.hadoop.hbase.util.Bytes; 6 | import org.junit.Test; 7 | 8 | /** 9 | * Created by thpffcj on 2019-04-06. 
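 * Exercises the HBaseUtil helpers against a "FileTable" table with the column families "fileInfo"
 * and "saveInfo", e.g.:
 *
 *   HBaseUtil.createTable("FileTable", new String[]{"fileInfo", "saveInfo"});
 *   HBaseUtil.putRow("FileTable", "rowkey1", "fileInfo", "name", "file1.txt");
 *   Result result = HBaseUtil.getRow("FileTable", "rowkey1");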
10 | */ 11 | public class HBaseUtilTest { 12 | 13 | @Test 14 | public void createTable() { 15 | HBaseUtil.createTable("FileTable", new String[]{"fileInfo", "saveInfo"}); 16 | } 17 | 18 | @Test 19 | public void addFileDetails() { 20 | HBaseUtil.putRow("FileTable", "rowkey1", "fileInfo", "name", "file1.txt"); 21 | HBaseUtil.putRow("FileTable", "rowkey1", "fileInfo", "type", "txt"); 22 | HBaseUtil.putRow("FileTable", "rowkey1", "fileInfo", "size", "1024"); 23 | HBaseUtil.putRow("FileTable", "rowkey1", "saveInfo", "creator", "thpffcj"); 24 | HBaseUtil.putRow("FileTable", "rowkey2", "fileInfo", "name", "file2.jpg"); 25 | HBaseUtil.putRow("FileTable", "rowkey2", "fileInfo", "type", "jpg"); 26 | HBaseUtil.putRow("FileTable", "rowkey2", "fileInfo", "size", "1024"); 27 | HBaseUtil.putRow("FileTable", "rowkey2", "saveInfo", "creator", "thpffcj"); 28 | 29 | } 30 | 31 | @Test 32 | public void getFileDetails() { 33 | Result result = HBaseUtil.getRow("FileTable", "rowkey1"); 34 | if (result != null) { 35 | System.out.println("rowkey=" + Bytes.toString(result.getRow())); 36 | System.out.println("fileName=" + Bytes 37 | .toString(result.getValue(Bytes.toBytes("fileInfo"), Bytes.toBytes("name")))); 38 | } 39 | } 40 | 41 | @Test 42 | public void scanFileDetails() { 43 | ResultScanner scanner = HBaseUtil.getScanner("FileTable", "rowkey2", "rowkey2"); 44 | if (scanner != null) { 45 | scanner.forEach(result -> { 46 | System.out.println("rowkey=" + Bytes.toString(result.getRow())); 47 | System.out.println("fileName=" + Bytes 48 | .toString(result.getValue(Bytes.toBytes("fileInfo"), Bytes.toBytes("name")))); 49 | }); 50 | scanner.close(); 51 | } 52 | } 53 | 54 | @Test 55 | public void deleteRow() { 56 | HBaseUtil.deleteRow("FileTable", "rowkey1"); 57 | } 58 | 59 | @Test 60 | public void deleteTable() { 61 | HBaseUtil.deleteTable("FileTable"); 62 | } 63 | } -------------------------------------------------------------------------------- /hbase-train/hbase-endpoint-test/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | hbase-train 6 | cn.edu.nju 7 | 1.0-SNAPSHOT 8 | 9 | 4.0.0 10 | 11 | hbase-endpoint-test 12 | 13 | 14 | 15 | org.apache.hbase 16 | hbase-server 17 | 1.2.0 18 | 19 | 20 | org.apache.hbase 21 | hbase-common 22 | 1.2.0 23 | 24 | 25 | com.google.protobuf 26 | protobuf-java 27 | 2.5.0 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /hbase-train/hbase-endpoint-test/src/main/proto/RowCountTest.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | option java_package = "cn.edu.nju"; 4 | 5 | option java_outer_classname = "GetRowCount"; 6 | option java_generic_services = true; 7 | option optimize_for = SPEED; 8 | 9 | message getRowCountRequest{ 10 | 11 | } 12 | 13 | message getRowCountResponse { 14 | optional int64 rowCount = 1; 15 | } 16 | 17 | 18 | service hbaseEndPointTestService { 19 | rpc getRowCount(getRowCountRequest) 20 | returns(getRowCountResponse); 21 | } -------------------------------------------------------------------------------- /hbase-train/hbase-observer-test/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | hbase-train 6 | cn.edu.nju 7 | 1.0-SNAPSHOT 8 | 9 | 4.0.0 10 | 11 | hbase-observer-test 12 | 13 | 14 | 15 | org.apache.hbase 16 | hbase-common 17 | 1.2.0 18 | 19 | 20 | org.apache.hbase 21 | hbase-server 22 | 1.2.0 23 | 24 | 25 | 26 | 
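The RowCountTest.proto above is compiled with protobuf 2.5 (the version declared in the hbase-endpoint-test pom); given java_package = "cn.edu.nju" and java_outer_classname = "GetRowCount", the generated outer class is cn.edu.nju.GetRowCount. A typical generation command, assuming protoc 2.5.x is on the PATH:

  protoc --proto_path=src/main/proto --java_out=src/main/java src/main/proto/RowCountTest.proto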
-------------------------------------------------------------------------------- /hbase-train/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | cn.edu.nju 8 | hbase-train 9 | pom 10 | 1.0-SNAPSHOT 11 | 12 | 13 | UTF-8 14 | 1.8 15 | 1.8 16 | 17 | 18 | 19 | hbase-api-test 20 | hbase-observer-test 21 | hbase-endpoint-test 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /hbase-train/src/main/java/cn/edu/nju/App.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" ); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /hbase-train/src/test/java/cn/edu/nju/AppTest.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import static org.junit.Assert.assertTrue; 4 | 5 | import org.junit.Test; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class AppTest 11 | { 12 | /** 13 | * Rigorous Test :-) 14 | */ 15 | @Test 16 | public void shouldAnswerWithTrue() 17 | { 18 | assertTrue( true ); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /log-generator/generate_log.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | __author__ = 'Thpffcj' 3 | 4 | import random 5 | import time 6 | 7 | url_path = [ 8 | "class/112.html", 9 | "class/128.html", 10 | "class/145.html", 11 | "class/146.html", 12 | "class/130.html", 13 | "learn/821", 14 | "course/list" 15 | ] 16 | 17 | ip_slices = [132, 156, 124, 10, 29, 143, 187, 30, 46, 55, 63, 72, 98, 168] 18 | 19 | http_referers = [ 20 | "http://www.baidu.com/s?wd={query}", 21 | "http://www.sogou.com/?web={query}", 22 | "http://cn.bing.com/search?q={query}", 23 | "https://search.yahoo.com/search?p={query}", 24 | ] 25 | 26 | search_keyword = [ 27 | "Spark SQL实战", 28 | "Hadoop基础", 29 | "Storm实战", 30 | "Spark Streaming实战", 31 | "大数据面试" 32 | ] 33 | 34 | status_code = ["200", "404", "500"] 35 | 36 | 37 | def sample_url(): 38 | return random.sample(url_path, 1)[0] 39 | 40 | 41 | def sample_ip(): 42 | slice = random.sample(ip_slices, 4) 43 | return ".".join([str(item) for item in slice]) 44 | 45 | 46 | def sample_referrer(): 47 | if random.uniform(0, 1) > 0.2: 48 | return "-" 49 | 50 | refer_str = random.sample(http_referers, 1) 51 | query_str = random.sample(search_keyword, 1) 52 | return refer_str[0].format(query=query_str[0]) 53 | 54 | 55 | def sample_status_code(): 56 | return random.sample(status_code, 1)[0] 57 | 58 | 59 | def generate_log(count=10): 60 | time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 61 | 62 | f = open("/home/thpffcj/data/project/logs/access.log", "a") 63 | 64 | while count >= 1: 65 | query_log = "{ip}\t{local_time}\t\"GET /{url} HTTP/1.1\"\t{status_code}\t{refer}".format( 66 | ip=sample_ip(), local_time=time_str, url=sample_url(), status_code=sample_status_code(), 67 | refer=sample_referrer()) 68 | print(query_log) 69 | 70 | f.write(query_log + "\n") 71 | 72 | count = count - 1 73 | 74 | 75 | if __name__ == '__main__': 76 | generate_log() 77 | -------------------------------------------------------------------------------- /log-generator/message.py: 
-------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | __author__ = 'Thpffcj' 3 | 4 | import random 5 | import time 6 | 7 | infos = [ 8 | "116.397026,39.918058", 9 | "116.410886,39.881949", 10 | "116.272876,39.99243", 11 | "116.544079,40.417555", 12 | "116.225404,40.258186", 13 | "116.38631,39.937209", 14 | "116.399466,39.989743" 15 | ] 16 | 17 | phones = [ 18 | "13888888888", "13877777777", "13866666666", 19 | "13988888888", "13977777777", "13966666666", 20 | "13788888888", "13777777777", "13766666666", 21 | "13688888888", "13677777777", "13666666666", 22 | ] 23 | 24 | def sample_phone(): 25 | return random.sample(phones, 1)[0] 26 | 27 | 28 | def sample_info(): 29 | return random.sample(infos, 1)[0] 30 | 31 | 32 | def generate_log(count = 3): 33 | time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 34 | f = open("/home/thpffcj/data/logs/access.log", "a+") 35 | while count >= 1: 36 | query_log = "{phone}\t{info}\t[{local_time}]".format(phone = sample_phone(), 37 | info = sample_info(), local_time = time_str) 38 | # print(query_log) 39 | f.write(query_log + "\n") 40 | count = count - 1 41 | 42 | 43 | if __name__ == '__main__': 44 | generate_log(10) -------------------------------------------------------------------------------- /log-generator/message2.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | __author__ = 'Thpffcj' 3 | 4 | import random 5 | import time 6 | 7 | url_path = [ 8 | "http://www.imooc.com/video/8701", 9 | "http://www.imooc.com/video/8702", 10 | "http://www.imooc.com/video/8703", 11 | "http://www.imooc.com/article/8701", 12 | "http://www.imooc.com/article/8704", 13 | "http://www.imooc.com/article/8705", 14 | "http://www.imooc.com/video/8709" 15 | ] 16 | 17 | ip_slices = [132, 156, 124, 10, 29, 143, 187, 30, 46, 55, 63, 72, 98, 168] 18 | 19 | 20 | def sample_traffic(): 21 | return random.randint(0, 100) 22 | 23 | 24 | def sample_url(): 25 | return random.sample(url_path, 1)[0] 26 | 27 | 28 | def sample_ip(): 29 | slice = random.sample(ip_slices, 4) 30 | return ".".join([str(item) for item in slice]) 31 | 32 | 33 | def generate_log(count=1000): 34 | time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 35 | while count >= 1: 36 | query_log = "{local_time}\t{url}\t{traffic}\t{ip}".format(local_time=time_str, url=sample_url(), traffic=sample_traffic(), ip=sample_ip()) 37 | print(query_log) 38 | count = count - 1 39 | 40 | 41 | if __name__ == '__main__': 42 | generate_log() -------------------------------------------------------------------------------- /pyspark/project/spark.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/10/19. 
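# Reads the Beijing hourly PM2.5 CSVs for 2015-2017, maps each Value to an air-quality grade with a
# UDF, and prints each grade's share per year. The core pattern (sketch of the code below):
#
#   grade_function_udf = udf(get_grade, StringType())
#   group2017 = data2017.withColumn("Grade", grade_function_udf(data2017["Value"])).groupBy("Grade").count()
#   group2017.select("Grade", "count", group2017["count"] / data2017.count()).show()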
3 | from pyspark.sql import SparkSession 4 | from pyspark.sql.types import * 5 | from pyspark.sql.functions import udf 6 | 7 | if __name__ == '__main__': 8 | spark = SparkSession.builder.appName("project").getOrCreate() 9 | 10 | data2015 = spark.read.format("csv")\ 11 | .option("header", "true")\ 12 | .option("inferSchema", "true")\ 13 | .load("file:///Users/thpffcj/Public/file/Beijing_2015_HourlyPM25_created20160201.csv")\ 14 | .select("Year", "Month", "Day", "Hour", "Value", "QC Name") 15 | 16 | data2016 = spark.read.format("csv")\ 17 | .option("header", "true")\ 18 | .option("inferSchema", "true")\ 19 | .load("file:///Users/thpffcj/Public/file/Beijing_2016_HourlyPM25_created20170201.csv")\ 20 | .select("Year", "Month", "Day", "Hour", "Value", "QC Name") 21 | 22 | data2017 = spark.read.format("csv")\ 23 | .option("header", "true")\ 24 | .option("inferSchema", "true")\ 25 | .load("file:///Users/thpffcj/Public/file/Beijing_2017_HourlyPM25_created20170803.csv")\ 26 | .select("Year", "Month", "Day", "Hour", "Value", "QC Name") 27 | 28 | 29 | def get_grade(value): 30 | if value <= 50 and value >= 0: 31 | return "健康" 32 | elif value <= 100: 33 | return "中等" 34 | elif value <= 150: 35 | return "对敏感人群不健康" 36 | elif value <= 200: 37 | return "不健康" 38 | elif value <= 300: 39 | return "非常不健康" 40 | elif value <= 500: 41 | return "危险" 42 | elif value > 500: 43 | return "爆表" 44 | else: 45 | return None 46 | 47 | grade_function_udf = udf(get_grade, StringType()) 48 | 49 | # 进来一个Value,出去一个Grade 50 | group2017 = data2017.withColumn("Grade", grade_function_udf(data2017['Value'])).groupBy("Grade").count() 51 | group2016 = data2016.withColumn("Grade", grade_function_udf(data2016['Value'])).groupBy("Grade").count() 52 | group2015 = data2015.withColumn("Grade", grade_function_udf(data2015['Value'])).groupBy("Grade").count() 53 | 54 | group2017.select("Grade", "count", group2017['count'] / data2017.count()).show() 55 | group2016.select("Grade", "count", group2016['count'] / data2016.count()).show() 56 | group2015.select("Grade", "count", group2015['count'] / data2015.count()).show() 57 | 58 | spark.stop() 59 | -------------------------------------------------------------------------------- /pyspark/project/steam.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/10/21. 
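# Buckets Steam "play" records for Dota 2 and Team Fortress 2 into play-time ranges with a UDF,
# computes each range's percentage, and writes both the raw and aggregated DataFrames to
# Elasticsearch. The write pattern (sketch of the code below; the ES node address is the one
# hard-coded in this script):
#
#   df.write.format("org.elasticsearch.spark.sql") \
#       .option("es.nodes", "172.19.170.131:9200").mode("overwrite").save("dota2/data")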
3 | from pyspark.sql import SparkSession 4 | from pyspark.sql.types import * 5 | from pyspark.sql.functions import udf 6 | 7 | if __name__ == '__main__': 8 | 9 | spark = SparkSession.builder.appName("steam").getOrCreate() 10 | 11 | steam_data = spark.read.format("csv")\ 12 | .option("header", "true")\ 13 | .option("inferSchema", "true")\ 14 | .load("/data/steam.csv")\ 15 | .select("userId", "gameName", "behavior", "duration") 16 | 17 | # 200000 18 | # print(steam_data.count()) 19 | 20 | def get_time(value): 21 | if value <= 10 and value >= 0: 22 | return "0 ~ 10小时" 23 | elif value <= 50: 24 | return "10 ~ 50小时" 25 | elif value <= 100: 26 | return "51 ~ 100小时" 27 | elif value <= 200: 28 | return "101 ~ 200小时" 29 | elif value <= 300: 30 | return "201 ~ 300小时" 31 | elif value <= 500: 32 | return "301 ~ 500小时" 33 | elif value > 500: 34 | return "大于500小时" 35 | else: 36 | return None 37 | 38 | grade_function_udf = udf(get_time, StringType()) 39 | 40 | dota2_data = steam_data.filter(steam_data["behavior"] == "play").filter(steam_data["gameName"] == "Dota 2")\ 41 | .withColumn("range", grade_function_udf(steam_data['duration'])) 42 | dota2_group = dota2_data.groupBy("range").count() 43 | dota2_result = dota2_group.select("range", "count")\ 44 | .withColumn("percent", dota2_group['count'] / dota2_data.count() * 100) 45 | 46 | team_fortress2_data = steam_data.filter(steam_data["behavior"] == "play").filter(steam_data["gameName"] == "Team Fortress 2")\ 47 | .withColumn("range", grade_function_udf(steam_data['duration'])) 48 | team_fortress2_group = team_fortress2_data.groupBy("range").count() 49 | team_fortress2_result = team_fortress2_group.select("range", "count")\ 50 | .withColumn("percent", team_fortress2_group['count'] / team_fortress2_data.count() * 100) 51 | 52 | # team_fortress2_result.show() 53 | 54 | dota2_data.write.format("org.elasticsearch.spark.sql").option("es.nodes", "172.19.170.131:9200").mode( 55 | "overwrite").save("dota2/data") 56 | 57 | team_fortress2_data.write.format("org.elasticsearch.spark.sql").option("es.nodes", "172.19.170.131:9200").mode( 58 | "overwrite").save("team_fortress2/data") 59 | 60 | dota2_result.write.format("org.elasticsearch.spark.sql").option("es.nodes", "172.19.170.131:9200").mode( 61 | "overwrite").save("aggregation_dota2/aggregation") 62 | 63 | team_fortress2_result.write.format("org.elasticsearch.spark.sql").option("es.nodes", "172.19.170.131:9200").mode( 64 | "overwrite").save("aggregation_fortress2/aggregation") 65 | 66 | 67 | spark.stop() 68 | -------------------------------------------------------------------------------- /pyspark/project/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/11/18. 
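# Counts the number of valid orderings of the given precedence pairs with a bitmask DP:
# dp[S] = number of orderings of the node subset S, and node i may be appended to S once its
# prerequisite mask pre[i] is fully contained in S, i.e.
#
#   if (s & pre[i]) == pre[i] and not (s & (1 << i)):
#       dp[s | (1 << i)] += dp[s]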
3 | 4 | def inputPre(): 5 | global name, pre, m, n 6 | for i in range(m): 7 | v = 0 8 | u = 0 9 | while u < n: 10 | if p[i][0] == name[u]: 11 | break 12 | else: 13 | u += 1 14 | if u == n: 15 | name.append(p[i][0]) 16 | n += 1 17 | while v < n: 18 | if p[i][1] == name[v]: 19 | break 20 | else: 21 | v += 1 22 | if v == n: 23 | name.append(p[i][1]) 24 | n += 1 25 | pre[v] |= (1 << u) 26 | 27 | 28 | def solve(): 29 | global dp, n 30 | dp[0] = 1 31 | for s in range(1 << n): 32 | if dp[s] != 0: 33 | for i in range(n): 34 | if ((s & pre[i]) == pre[i]) and not (s & (1 << i)): 35 | dp[s | (1 << i)] += dp[s] 36 | print(dp[(1 << n) - 1]) 37 | 38 | 39 | if __name__ == '__main__': 40 | N = int(input()) 41 | for k in range(N): 42 | pairs = list(map(str, input().split(","))) # 起点终点对集合 43 | m = len(pairs) 44 | n = 0 45 | p = [] # 存储起点终点对 46 | for i in range(m): 47 | pair = pairs[i].split() 48 | p.append(pair) 49 | name = [] 50 | size = 13 51 | pre = [0 for i in range(size)] 52 | dp = [0 for i in range(1 << size)] 53 | inputPre() 54 | solve() -------------------------------------------------------------------------------- /spark-data-visualization/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | 4 | ### STS ### 5 | .apt_generated 6 | .classpath 7 | .factorypath 8 | .project 9 | .settings 10 | .springBeans 11 | 12 | ### IntelliJ IDEA ### 13 | .idea 14 | *.iws 15 | *.iml 16 | *.ipr 17 | 18 | ### NetBeans ### 19 | nbproject/private/ 20 | build/ 21 | nbbuild/ 22 | dist/ 23 | nbdist/ 24 | .nb-gradle/ -------------------------------------------------------------------------------- /spark-data-visualization/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | cn.edu.nju 7 | DataVisualization 8 | 0.0.1 9 | jar 10 | 11 | spark-data-visualization 12 | Demo project for Spring Boot 13 | 14 | 15 | org.springframework.boot 16 | spring-boot-starter-parent 17 | 1.5.8.RELEASE 18 | 19 | 20 | 21 | 22 | UTF-8 23 | UTF-8 24 | 1.8 25 | 26 | 27 | 28 | 29 | cloudera 30 | https://repository.cloudera.com/artifactory/cloudera-repos/ 31 | 32 | 33 | 34 | 35 | 36 | org.springframework.boot 37 | spring-boot-starter-web 38 | 39 | 40 | 41 | org.springframework.boot 42 | spring-boot-starter-test 43 | test 44 | 45 | 46 | 47 | org.springframework.boot 48 | spring-boot-starter-thymeleaf 49 | 50 | 51 | 52 | org.apache.hbase 53 | hbase-client 54 | 1.2.0-cdh5.7.0 55 | 56 | 57 | 58 | net.sf.json-lib 59 | json-lib 60 | 2.4 61 | jdk15 62 | 63 | 64 | 65 | 66 | 67 | 68 | org.springframework.boot 69 | spring-boot-maven-plugin 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /spark-data-visualization/src/main/java/cn/edu/nju/DataVisualizationApplication.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | @SpringBootApplication 7 | public class DataVisualizationApplication { 8 | 9 | public static void main(String[] args) { 10 | SpringApplication.run(DataVisualizationApplication.class, args); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /spark-data-visualization/src/main/java/cn/edu/nju/dao/CourseClickCountDAO.java: 
-------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao; 2 | 3 | import cn.edu.nju.domain.CourseClickCount; 4 | import cn.edu.nju.utils.HBaseUtils; 5 | import org.springframework.stereotype.Component; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | /** 12 | * Created by Thpffcj on 2018/1/15. 13 | * 实战课程访问数量数据访问层 14 | */ 15 | @Component 16 | public class CourseClickCountDAO { 17 | 18 | /** 19 | * 根据天查询 20 | */ 21 | public List query(String day) throws Exception { 22 | 23 | List list = new ArrayList<>(); 24 | 25 | // 去HBase表中根据day获取实战课程对应的访问量 26 | Map map = HBaseUtils.getInstance().query("imooc_course_clickcount", day); 27 | 28 | for(Map.Entry entry: map.entrySet()) { 29 | CourseClickCount model = new CourseClickCount(); 30 | model.setName(entry.getKey()); 31 | model.setValue(entry.getValue()); 32 | 33 | list.add(model); 34 | } 35 | 36 | return list; 37 | } 38 | 39 | public static void main(String[] args) throws Exception{ 40 | CourseClickCountDAO dao = new CourseClickCountDAO(); 41 | List list = dao.query("20180115"); 42 | for(CourseClickCount model : list) { 43 | System.out.println(model.getName() + " : " + model.getValue()); 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /spark-data-visualization/src/main/java/cn/edu/nju/domain/CourseClickCount.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain; 2 | 3 | import org.springframework.stereotype.Component; 4 | 5 | /** 6 | * Created by Thpffcj on 2018/1/15. 7 | * 实战课程访问数量实体类 8 | */ 9 | @Component 10 | public class CourseClickCount { 11 | 12 | private String name; 13 | private long value; 14 | 15 | public String getName() { 16 | return name; 17 | } 18 | 19 | public void setName(String name) { 20 | this.name = name; 21 | } 22 | 23 | public long getValue() { 24 | return value; 25 | } 26 | 27 | public void setValue(long value) { 28 | this.value = value; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-data-visualization/src/main/java/cn/edu/nju/spark/HelloBoot.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark; 2 | 3 | import org.springframework.web.bind.annotation.RequestMapping; 4 | import org.springframework.web.bind.annotation.RequestMethod; 5 | import org.springframework.web.bind.annotation.RestController; 6 | import org.springframework.web.servlet.ModelAndView; 7 | 8 | /** 9 | * Created by Thpffcj on 2018/1/15. 
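 * A minimal REST controller with three mappings (see the methods below):
 *
 *   GET /hello             -> the plain string "Hello World Spring Boot..."
 *   GET /first             -> ModelAndView("test"), rendered from templates/test.html
 *   GET /course_clickcount -> ModelAndView("demo"), rendered from templates/demo.html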
10 | * 这是我们的第一个Boot应用 11 | */ 12 | @RestController 13 | public class HelloBoot { 14 | 15 | @RequestMapping(value = "/hello", method = RequestMethod.GET) 16 | public String sayHello() { 17 | 18 | return "Hello World Spring Boot..."; 19 | } 20 | 21 | @RequestMapping(value = "/first", method = RequestMethod.GET) 22 | public ModelAndView firstDemo() { 23 | return new ModelAndView("test"); 24 | } 25 | 26 | @RequestMapping(value = "/course_clickcount", method = RequestMethod.GET) 27 | public ModelAndView courseClickCountStat() { 28 | return new ModelAndView("demo"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-data-visualization/src/main/java/cn/edu/nju/spark/ImoocStatApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark; 2 | 3 | import cn.edu.nju.dao.CourseClickCountDAO; 4 | import cn.edu.nju.domain.CourseClickCount; 5 | import net.sf.json.JSONArray; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.stereotype.Component; 8 | import org.springframework.web.bind.annotation.RequestMapping; 9 | import org.springframework.web.bind.annotation.RequestMethod; 10 | import org.springframework.web.bind.annotation.ResponseBody; 11 | import org.springframework.web.bind.annotation.RestController; 12 | import org.springframework.web.servlet.ModelAndView; 13 | 14 | import java.util.HashMap; 15 | import java.util.List; 16 | import java.util.Map; 17 | 18 | /** 19 | * Created by Thpffcj on 2018/1/15. 20 | * web层 21 | */ 22 | @RestController 23 | public class ImoocStatApp { 24 | 25 | private static Map courses = new HashMap<>(); 26 | static { 27 | courses.put("112","Spark SQL慕课网日志分析"); 28 | courses.put("128","10小时入门大数据"); 29 | courses.put("145","深度学习之神经网络核心原理与算法"); 30 | courses.put("146","强大的Node.js在Web开发的应用"); 31 | courses.put("131","Vue+Django实战"); 32 | courses.put("130","Web前端性能优化"); 33 | } 34 | 35 | @Autowired 36 | CourseClickCountDAO courseClickCountDAO; 37 | 38 | // @RequestMapping(value = "/course_clickcount_dynamic", method = RequestMethod.GET) 39 | // public ModelAndView courseClickCount() throws Exception { 40 | // 41 | // ModelAndView view = new ModelAndView("index"); 42 | // 43 | // List list = courseClickCountDAO.query("20180115"); 44 | // for(CourseClickCount model : list) { 45 | // model.setName(courses.get(model.getName().substring(9))); 46 | // } 47 | // JSONArray json = JSONArray.fromObject(list); 48 | // 49 | // view.addObject("data_json", json); 50 | // 51 | // return view; 52 | // } 53 | 54 | @RequestMapping(value = "/course_clickcount_dynamic", method = RequestMethod.POST) 55 | @ResponseBody 56 | public List courseClickCount() throws Exception { 57 | 58 | List list = courseClickCountDAO.query("20180115"); 59 | for(CourseClickCount model : list) { 60 | model.setName(courses.get(model.getName().substring(9))); 61 | } 62 | 63 | return list; 64 | } 65 | 66 | @RequestMapping(value = "/echarts", method = RequestMethod.GET) 67 | public ModelAndView echarts(){ 68 | return new ModelAndView("echarts"); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /spark-data-visualization/src/main/resources/application.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/BigData-Getting-Started/5fe231ddaafb31504a41b4ec1c8f0008cd2f1ad2/spark-data-visualization/src/main/resources/application.properties 
-------------------------------------------------------------------------------- /spark-data-visualization/src/main/resources/templates/demo.html: --------------------------------------------------------------------------------
(template markup was not preserved in this export; page title: imooc_stat — the "demo" view returned by HelloBoot's /course_clickcount mapping)
-------------------------------------------------------------------------------- /spark-data-visualization/src/main/resources/templates/echarts.html: --------------------------------------------------------------------------------
(template markup was not preserved in this export; page title: 慕课网实战课程实时访问量统计, i.e. real-time click-count statistics for imooc courses — the "echarts" view returned by ImoocStatApp's /echarts mapping)
-------------------------------------------------------------------------------- /spark-data-visualization/src/main/resources/templates/test.html: --------------------------------------------------------------------------------
(template markup was not preserved in this export; page title: test — the "test" view returned by HelloBoot's /first mapping)
-------------------------------------------------------------------------------- /spark-data-visualization/src/test/java/cn/edu/nju/DataVisualizationApplicationTests.java: --------------------------------------------------------------------------------
1 | package cn.edu.nju;
2 | 
3 | import org.junit.Test;
4 | import org.junit.runner.RunWith;
5 | import org.springframework.boot.test.context.SpringBootTest;
6 | import org.springframework.test.context.junit4.SpringRunner;
7 | 
8 | @RunWith(SpringRunner.class)
9 | @SpringBootTest
10 | public class DataVisualizationApplicationTests {
11 | 
12 |     @Test
13 |     public void contextLoads() {
14 |     }
15 | 
16 | }
17 | 
-------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/MovieRecommendation.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju
2 | 
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.ml.evaluation.RegressionEvaluator
5 | import org.apache.spark.ml.recommendation.ALS
6 | import org.apache.spark.ml.recommendation.ALS.Rating
7 | import org.apache.spark.sql.SparkSession
8 | 
9 | /**
10 |  * Created by thpffcj on 2019/10/31.
11 |  */
12 | object MovieRecommendation {
13 | 
14 |   def main(args: Array[String]): Unit = {
15 | 
16 |     val conf = new SparkConf().setMaster("local").setAppName("MovieRecommendation")
17 |     val spark = SparkSession.builder().config(conf).getOrCreate()
18 |     spark.sparkContext.setLogLevel("WARN")
19 | 
20 |     val parseRating = (string: String) => {
21 |       // Split the line
22 |       val stringArray = string.split("\t")
23 |       // Wrap it into a Rating
24 |       Rating(stringArray(0).toInt, stringArray(1).toInt, stringArray(2).toFloat)
25 |     }
26 | 
27 |     import spark.implicits._
28 |     val data = spark.read.textFile("src/main/resources/u.data")
29 |       // Parse each line
30 |       .map(parseRating)
31 |       // Convert to a DataFrame
32 |       .toDF("userId", "itemId", "rating")
33 | 
34 |     // data.show()
35 | 
36 |     val Array(train, test) = data.randomSplit(Array(0.8, 0.2))
37 | 
38 |     val als = new ALS()
39 |       .setMaxIter(20)
40 |       .setUserCol("userId")
41 |       .setItemCol("itemId")
42 |       .setRatingCol("rating")
43 |       // Regularization parameter
44 |       .setRegParam(0.01)
45 | 
46 |     val model = als.fit(train)
47 | 
48 |     // Cold-start strategy
49 |     model.setColdStartStrategy("drop")
50 | 
51 |     val predictions = model.transform(test)
52 |     // Predict the rating for each (userId, itemId) pair
53 |     // predictions.show(false)
54 | 
55 |     // MovieLens dataset (a well-established academic benchmark); recommend 10 movies for user 196
56 |     val users = spark.createDataset(Array(196)).toDF("userID")
57 |     // users.show(false)
58 |     model.recommendForUserSubset(users, 10).show(false)
59 | 
60 |     // Model evaluation
61 |     val evaluator = new RegressionEvaluator()
62 |       .setMetricName("rmse")
63 |       .setLabelCol("rating")
64 |       .setPredictionCol("prediction")
65 | 
66 |     val rmse = evaluator.evaluate(predictions)
67 |     println(s"Root-mean-square error is $rmse \n")
68 | 
69 |     // Persisting Spark ML models
70 |     // Save the model
71 |     // model.save("./xxx")
72 |     // Load the model
73 |     // val model = ALS.load("xxxx")
74 |   }
75 | }
76 | 
-------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/classification/Iris.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.classification
2 | 
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.ml.classification.DecisionTreeClassifier
5 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
6 | import org.apache.spark.ml.feature.VectorAssembler
7 | import 
org.apache.spark.sql.SparkSession 8 | 9 | import scala.util.Random 10 | 11 | /** 12 | * Created by thpffcj on 2019/10/29. 13 | */ 14 | object Iris { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val conf = new SparkConf().setMaster("local").setAppName("Iris") 19 | val spark = SparkSession.builder().config(conf).getOrCreate() 20 | spark.sparkContext.setLogLevel("WARN") 21 | 22 | val file = spark.read.format("csv").load("src/main/resources/iris.data") 23 | 24 | val random = new Random() 25 | 26 | import spark.implicits._ 27 | val data = file.map(row => { 28 | val label = row.getString(4) match { 29 | case "Iris-setosa" => 0 30 | case "Iris-versicolor" => 1 31 | case "Iris-virginica" => 2 32 | } 33 | 34 | (row.getString(0).toDouble, row.getString(1).toDouble, 35 | row.getString(2).toDouble, row.getString(3).toDouble, 36 | label, random.nextDouble()) 37 | }).toDF("_c0", "_c1", "_c2", "_c3", "label", "rand").sort("rand") 38 | 39 | val assembler = new VectorAssembler().setInputCols(Array("_c0", "_c1", "_c2", "_c3")).setOutputCol("features") 40 | 41 | val dataset = assembler.transform(data) 42 | val Array(train, test) = dataset.randomSplit(Array(0.8, 0.2)) 43 | 44 | val dt = new DecisionTreeClassifier().setFeaturesCol("features").setLabelCol("label") 45 | val model = dt.fit(train) 46 | 47 | val result = model.transform(test) 48 | result.show() 49 | 50 | val evaluator = new MulticlassClassificationEvaluator() 51 | .setLabelCol("label") 52 | .setPredictionCol("prediction") 53 | .setMetricName("accuracy") 54 | 55 | val accuracy = evaluator.evaluate(result) 56 | println(s"""accuracy is $accuracy""") 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/cluster/KMeans.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.cluster 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.feature.VectorAssembler 5 | import org.apache.spark.ml.clustering.KMeans 6 | import org.apache.spark.sql.SparkSession 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/29. 
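 *  Clusters the four iris measurements with k-means (k = 3, 20 iterations) and prints the cluster assignments for the training split.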
12 | */ 13 | object KMeans { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val conf = new SparkConf().setMaster("local").setAppName("KMeans") 18 | val spark = SparkSession.builder().config(conf).getOrCreate() 19 | 20 | val file = spark.read.format("csv").load("src/main/resources/iris.data") 21 | 22 | val random = new Random() 23 | import spark.implicits._ 24 | val data= file.map(row => { 25 | val label = row.getString(4) match { 26 | case "Iris-setosa" => 0 27 | case "Iris-versicolor" => 1 28 | case "Iris-virginica" => 2 29 | } 30 | 31 | (row.getString(0).toDouble, 32 | row.getString(1).toDouble, 33 | row.getString(2).toDouble, 34 | row.getString(3).toDouble, 35 | label, 36 | random.nextDouble()) 37 | }).toDF("_c0", "_c1", "_c2", "_c3", "label", "rand").sort("rand") 38 | 39 | val assembler = new VectorAssembler() 40 | .setInputCols(Array("_c0", "_c1", "_c2", "_c3")) 41 | .setOutputCol("features") 42 | 43 | // 分割 44 | val dataset = assembler.transform(data) 45 | val Array(train, test) = dataset.randomSplit(Array(0.8, 0.2)) 46 | train.show() 47 | 48 | // kmeans 算法 49 | val kmeans = new KMeans().setFeaturesCol("features").setK(3).setMaxIter(20) 50 | val model = kmeans.fit(train) 51 | 52 | model.transform(train).show() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/cluster/Lda.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.cluster 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.clustering.LDA 5 | import org.apache.spark.ml.feature.VectorAssembler 6 | import org.apache.spark.sql.SparkSession 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/29. 12 | */ 13 | object Lda { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val conf = new SparkConf().setMaster("local").setAppName("LDA") 18 | val spark = SparkSession.builder().config(conf).getOrCreate() 19 | 20 | // 加载数据 21 | val file = spark.read.format("csv").load("src/main/resources/iris.data") 22 | 23 | val random = new Random() 24 | 25 | import spark.implicits._ 26 | val data = file.map(row => { 27 | val label = row.getString(4) match { 28 | case "Iris-setosa" => 0 29 | case "Iris-versicolor" => 1 30 | case "Iris-virginica" => 2 31 | } 32 | 33 | (row.getString(0).toDouble, row.getString(1).toDouble, 34 | row.getString(2).toDouble, row.getString(3).toDouble, 35 | label, random.nextDouble()) 36 | }).toDF("_c0", "_c1", "_c2", "_c3", "label", "rand").sort("rand") 37 | 38 | val assembler = new VectorAssembler() 39 | .setInputCols(Array("_c0", "_c1", "_c2", "_c3")).setOutputCol("features") 40 | 41 | val dataset = assembler.transform(data) 42 | val Array(train, test) = dataset.randomSplit(Array(0.8, 0.2)) 43 | 44 | // 训练一个LDA模型 45 | val lda = new LDA().setFeaturesCol("features").setK(3).setMaxIter(40) 46 | val model = lda.fit(train) 47 | 48 | // 展示结果 49 | val prediction = model.transform(test) 50 | prediction.show() 51 | 52 | val ll = model.logLikelihood(train) 53 | val lp = model.logPerplexity(train) 54 | 55 | // Describe topics. 
56 | val topics = model.describeTopics(3) 57 | prediction.select("label", "topicDistribution").show(false) 58 | println("The topics described by their top-weighted terms:") 59 | topics.show(false) 60 | println(s"The lower bound on the log likelihood of the entire corpus: $ll") 61 | println(s"The upper bound on perplexity: $lp") 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/dimensionalityReduction/PCADimensionalityReduction.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dimensionalityReduction 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.classification.DecisionTreeClassifier 5 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 6 | import org.apache.spark.ml.feature.{PCA, VectorAssembler} 7 | import org.apache.spark.sql.SparkSession 8 | 9 | import scala.util.Random 10 | 11 | /** 12 | * Created by thpffcj on 2019/10/30. 13 | */ 14 | object PCADimensionalityReduction { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val conf = new SparkConf().setMaster("local").setAppName("PCADimensionalityReduction") 19 | val spark = SparkSession.builder().config(conf).getOrCreate() 20 | 21 | // 日志级别 22 | spark.sparkContext.setLogLevel("WARN") 23 | 24 | val file = spark.read.format("csv").load("src/main/resources/iris.data") 25 | 26 | val random = new Random() 27 | import spark.implicits._ 28 | val data = file.map(row => { 29 | val label = row.getString(4) match { 30 | case "Iris-setosa" => 0 31 | case "Iris-versicolor" => 1 32 | case "Iris-virginica" => 2 33 | } 34 | 35 | (row.getString(0).toDouble, 36 | row.getString(1).toDouble, 37 | row.getString(2).toDouble, 38 | row.getString(3).toDouble, 39 | label, 40 | random.nextDouble()) 41 | }).toDF("_c0", "_c1", "_c2", "_c3", "label", "rand").sort("rand") 42 | 43 | val assembler = new VectorAssembler().setInputCols(Array("_c0", "_c1", "_c2", "_c3")).setOutputCol("features") 44 | 45 | val pca = new PCA() 46 | .setInputCol("features") 47 | .setOutputCol("features2") 48 | .setK(3) 49 | 50 | val dataset = assembler.transform(data) 51 | val pcaModel = pca.fit(dataset) 52 | 53 | val dataset2 = pcaModel.transform(dataset) 54 | val Array(train, test) = dataset2.randomSplit(Array(0.8, 0.2)) 55 | 56 | val dt = new DecisionTreeClassifier().setFeaturesCol("features").setLabelCol("label") 57 | val model = dt.fit(train) 58 | val result = model.transform(test) 59 | result.show(false) 60 | 61 | val evaluator = new MulticlassClassificationEvaluator() 62 | .setLabelCol("label") 63 | .setPredictionCol("prediction") 64 | .setMetricName("accuracy") 65 | val accuracy = evaluator.evaluate(result) 66 | println(s"""accuracy is $accuracy""") 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/emotionAnalysis/EmotionAnalysis.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.emotionAnalysis 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.classification.NaiveBayes 5 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 6 | import org.apache.spark.ml.feature.{HashingTF, IDF} 7 | import org.apache.spark.sql.SparkSession 8 | 9 | import scala.util.Random 10 | 11 | /** 12 | * Created by thpffcj on 2019/10/30. 
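 *  Sentiment classification of the pos.txt/neg.txt reviews: TF-IDF features feeding a Naive Bayes binary classifier, evaluated by accuracy.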
13 | */ 14 | object EmotionAnalysis { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val conf = new SparkConf().setMaster("local").setAppName("EmotionAnalysis") 19 | val spark = SparkSession.builder().config(conf).getOrCreate() 20 | // 日志级别 21 | spark.sparkContext.setLogLevel("WARN") 22 | 23 | val rand = new Random() 24 | 25 | import spark.implicits._ 26 | // 数据预处理 27 | val neg = spark.read.textFile("src/main/resources/neg.txt").map(line => { 28 | // 分词 29 | (line.split(" ").filter(!_.equals(" ")), 0, rand.nextDouble()) 30 | }).toDF("words", "value", "random") 31 | 32 | val pos = spark.read.textFile("src/main/resources/pos.txt").map(line => { 33 | (line.split(" ").filter(!_.equals(" ")), 1, rand.nextDouble()) 34 | }).toDF("words", "value", "random") // 思考:这里把inner function提出重用来如何操作 35 | 36 | // 合并乱序 37 | val data = neg.union(pos).sort("random") 38 | data.show(false) 39 | println(neg.count(), pos.count(), data.count()) // 合并 40 | 41 | // 文本特征抽取(TF-IDF) 42 | val hashingTf = new HashingTF() 43 | .setInputCol("words") 44 | .setOutputCol("hashing") 45 | .transform(data) 46 | 47 | val idfModel = new IDF() 48 | .setInputCol("hashing") 49 | .setOutputCol("tfidf") 50 | .fit(hashingTf) 51 | 52 | val transformedData = idfModel.transform(hashingTf) 53 | val Array(training, test) = transformedData 54 | .randomSplit(Array(0.7, 0.3)) 55 | 56 | // 根据抽取到的文本特征,使用分类器进行分类,这是一个二分类问题 57 | // 分类器是可替换的 58 | val bayes = new NaiveBayes() 59 | .setFeaturesCol("tfidf") // X 60 | .setLabelCol("value") // y 0:消极,1:积极 61 | .fit(training) 62 | 63 | // 交叉验证 64 | val result = bayes.transform(test) 65 | result.show(false) 66 | 67 | // 评估模型的准确率 68 | val evaluator = new MulticlassClassificationEvaluator() 69 | .setLabelCol("value") 70 | .setPredictionCol("prediction") 71 | .setMetricName("accuracy") 72 | 73 | val accuracy = evaluator.evaluate(result) 74 | println(s"""accuracy is $accuracy""") 75 | 76 | // 重构思考: 77 | // 尝试用pipeline重构代码 78 | // 尝试用模型预测随便属于一句话的情感,例如: 79 | // You are a bad girl,I hate you. ^_^ 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/regression/HousePriceForecast.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.regression 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.feature.VectorAssembler 5 | import org.apache.spark.ml.regression.LinearRegression 6 | import org.apache.spark.sql.SparkSession 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/29. 
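 *  Fits a linear regression of price on square from house.csv after shuffling the rows, then prints predictions for the held-out split.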
12 | */ 13 | object HousePriceForecast { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val conf = new SparkConf().setMaster("local").setAppName("HousePriceForecast") 18 | 19 | val spark = SparkSession.builder().config(conf).getOrCreate() 20 | 21 | // 加载文件 22 | val file = spark.read.format("csv") 23 | .option("sep", ";").option("header", "true") 24 | .load("src/main/resources/house.csv") 25 | 26 | import spark.implicits._ 27 | // 开始shuffle 28 | // 打乱顺序 29 | val rand = new Random() 30 | val data = file.select("square", "price").map(row => { 31 | (row.getAs[String](0).toDouble, row.getString(1).toDouble, rand.nextDouble()) 32 | }).toDF("square", "price", "rand").sort("rand") // 强制类型转换过程 33 | 34 | val assembler = new VectorAssembler().setInputCols(Array("square")).setOutputCol("features") 35 | 36 | // 特征包装 37 | val dataset = assembler.transform(data) 38 | 39 | // 训练集,测试集 40 | // 拆分成训练数据集和测试数据集 41 | val Array(train, test) = dataset.randomSplit(Array(0.8, 0.2)) 42 | 43 | val lr = new LinearRegression().setLabelCol("price").setFeaturesCol("features") 44 | .setRegParam(0.3).setElasticNetParam(0.8).setMaxIter(10) 45 | val model = lr.fit(train) 46 | 47 | model.transform(test).show() 48 | val s = model.summary.totalIterations 49 | println(s"iter: ${s}") 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /spark-sql-train/.gitignore: -------------------------------------------------------------------------------- 1 | /memetastore_db 2 | /spark-warehouse 3 | /derby.log 4 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/resources/ipRegion.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/BigData-Getting-Started/5fe231ddaafb31504a41b4ec1c8f0008cd2f1ad2/spark-sql-train/src/main/resources/ipRegion.xlsx -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/AccessConvertUtil.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} 5 | 6 | /** 7 | * Created by Thpffcj on 2018/5/7. 
8 | * 访问日志转换(输入==>输出)工具类 9 | */ 10 | object AccessConvertUtil { 11 | 12 | // 定义的输出的字段 13 | val struct = StructType( 14 | Array( 15 | StructField("url", StringType), 16 | StructField("cmsType", StringType), 17 | StructField("cmsId", LongType), 18 | StructField("traffic", LongType), 19 | StructField("ip", StringType), 20 | StructField("city", StringType), 21 | StructField("time", StringType), 22 | StructField("day", StringType) 23 | ) 24 | ) 25 | 26 | /** 27 | * 根据输入的每一行信息转换成输出的样式 28 | * 29 | * @param log 输入的每一行记录信息 30 | */ 31 | def parseLog(log: String) = { 32 | 33 | try { 34 | val splits = log.split("\t") 35 | 36 | val url = splits(1) 37 | val traffic = splits(2).toLong 38 | val ip = splits(3) 39 | 40 | val domain = "http://www.imooc.com/" 41 | val cms = url.substring(url.indexOf(domain) + domain.length) 42 | val cmsTypeId = cms.split("/") 43 | 44 | var cmsType = "" 45 | var cmsId = 0l 46 | if (cmsTypeId.length > 1) { 47 | cmsType = cmsTypeId(0) 48 | cmsId = cmsTypeId(1).toLong 49 | } 50 | 51 | val city = IpUtils.getCity(ip) 52 | val time = splits(0) 53 | val day = time.substring(0, 10).replaceAll("-", "") 54 | 55 | // 这个row里面的字段要和struct中的字段对应上 56 | Row(url, cmsType, cmsId, traffic, ip, city, time, day) 57 | } catch { 58 | case e: Exception => Row(0) 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/DateUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import java.util.{Date, Locale} 4 | 5 | import org.apache.commons.lang3.time.FastDateFormat 6 | 7 | /** 8 | * Created by Thpffcj on 2018/5/7. 9 | * 日期时间解析工具类: 10 | * 注意:SimpleDateFormat是线程不安全 11 | */ 12 | object DateUtils { 13 | 14 | //输入文件日期时间格式 15 | //10/Nov/2016:00:01:02 +0800 16 | val YYYYMMDDHHMM_TIME_FORMAT = FastDateFormat.getInstance("dd/MMM/yyyy:HH:mm:ss Z", Locale.ENGLISH) 17 | 18 | //目标日期格式 19 | val TARGET_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss") 20 | 21 | /** 22 | * 获取时间:yyyy-MM-dd HH:mm:ss 23 | */ 24 | def parse(time: String) = { 25 | TARGET_FORMAT.format(new Date(getTime(time))) 26 | } 27 | 28 | /** 29 | * 获取输入日志时间:long类型 30 | * 31 | * time: [10/Nov/2016:00:01:02 +0800] 32 | */ 33 | def getTime(time: String) = { 34 | try { 35 | YYYYMMDDHHMM_TIME_FORMAT.parse(time.substring(time.indexOf("[") + 1, 36 | time.lastIndexOf("]"))).getTime 37 | } catch { 38 | case e: Exception => { 39 | 0l 40 | } 41 | } 42 | } 43 | 44 | def main(args: Array[String]) { 45 | println(parse("[10/Nov/2016:00:01:02 +0800]")) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/DayCityVideoAccessStat.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | /** 4 | * Created by Thpffcj on 2018/5/7. 5 | */ 6 | case class DayCityVideoAccessStat(day:String, cmsId:Long, city:String,times:Long,timesRank:Int) 7 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/DayVideoAccessStat.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | /** 4 | * Created by Thpffcj on 2018/5/7. 
5 | * 每天课程访问次数实体类 6 | */ 7 | case class DayVideoAccessStat(day: String, cmsId: Long, times: Long) 8 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/DayVideoTrafficsStat.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | /** 4 | * Created by Thpffcj on 2018/5/7. 5 | */ 6 | case class DayVideoTrafficsStat(day:String,cmsId:Long,traffics:Long) 7 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/IpUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import com.ggstar.util.ip.IpHelper 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/7. 7 | * IP解析工具类 8 | */ 9 | object IpUtils { 10 | 11 | def getCity(ip:String) = { 12 | IpHelper.findRegionByIp(ip) 13 | } 14 | 15 | def main(args: Array[String]) { 16 | println(getCity("218.197.153.150")) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/MySQLUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import java.sql.DriverManager 4 | 5 | import java.sql.{Connection, PreparedStatement, DriverManager} 6 | 7 | /** 8 | * Created by Thpffcj on 2018/5/7. 9 | * MySQL操作工具类 10 | */ 11 | object MySQLUtils { 12 | 13 | /** 14 | * 获取数据库连接 15 | */ 16 | def getConnection() = { 17 | DriverManager.getConnection("jdbc:mysql://localhost:3306/sparksql?user=root&password=000000") 18 | } 19 | 20 | /** 21 | * 释放数据库连接等资源 22 | * @param connection 23 | * @param pstmt 24 | */ 25 | def release(connection: Connection, pstmt: PreparedStatement): Unit = { 26 | try { 27 | if (pstmt != null) { 28 | pstmt.close() 29 | } 30 | } catch { 31 | case e: Exception => e.printStackTrace() 32 | } finally { 33 | if (connection != null) { 34 | connection.close() 35 | } 36 | } 37 | } 38 | 39 | def main(args: Array[String]) { 40 | println(getConnection()) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/SparkStatCleanJob.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/7. 
7 | * 使用Spark完成我们的数据清洗操作 8 | */ 9 | object SparkStatCleanJob { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val spark = SparkSession.builder().appName("SparkStatCleanJob") 14 | .master("local[2]").getOrCreate() 15 | 16 | val accessRDD = spark.sparkContext.textFile("D:/access.log") 17 | // accessRDD.take(10).foreach(println) 18 | 19 | // RDD ==> DF 20 | val accessDF = spark.createDataFrame(accessRDD.map(x => AccessConvertUtil.parseLog(x)), 21 | AccessConvertUtil.struct) 22 | 23 | // accessDF.printSchema() 24 | // accessDF.show(false) 25 | 26 | accessDF.coalesce(1).write.format("parquet").mode(SaveMode.Overwrite) 27 | .partitionBy("day").save("D:/clean") 28 | 29 | spark.stop() 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/SparkStatFormatJob.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/7. 7 | * 第一步清洗:抽取出我们所需要的指定列的数据 8 | */ 9 | object SparkStatFormatJob { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val spark = SparkSession.builder().appName("SparkStatFormatJob") 14 | .master("local[2]").getOrCreate() 15 | 16 | val access = spark.sparkContext.textFile("D:/access.log") 17 | // access.take(10).foreach(println) 18 | 19 | access.map(line => { 20 | val splits = line.split(" ") 21 | val ip = splits(0) 22 | 23 | /** 24 | * 原始日志的第三个和第四个字段拼接起来就是完整的访问时间: 25 | * [10/Nov/2016:00:01:02 +0800] ==> yyyy-MM-dd HH:mm:ss 26 | */ 27 | val time = splits(3) + " " + splits(4) 28 | val url = splits(11).replaceAll("\"","") 29 | val traffic = splits(9) 30 | DateUtils.parse(time) + "\t" + url + "\t" + traffic + "\t" + ip 31 | }).saveAsTextFile("D:/output") 32 | 33 | spark.stop() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/StatDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import java.sql.{PreparedStatement, Connection} 4 | 5 | import scala.collection.mutable.ListBuffer 6 | 7 | /** 8 | * Created by Thpffcj on 2018/5/7. 9 | * 各个维度统计的DAO操作 10 | */ 11 | object StatDAO { 12 | 13 | /** 14 | * 批量保存DayVideoAccessStat到数据库 15 | */ 16 | def insertDayVideoAccessTopN(list: ListBuffer[DayVideoAccessStat]): Unit = { 17 | 18 | var connection: Connection = null 19 | var pstmt: PreparedStatement = null 20 | 21 | try { 22 | connection = MySQLUtils.getConnection() 23 | 24 | connection.setAutoCommit(false) //设置手动提交 25 | 26 | val sql = "insert into day_video_access_topn_stat(day, cms_id, times) values (?,?,?) 
" 27 | pstmt = connection.prepareStatement(sql) 28 | 29 | for (ele <- list) { 30 | pstmt.setString(1, ele.day) 31 | pstmt.setLong(2, ele.cmsId) 32 | pstmt.setLong(3, ele.times) 33 | 34 | pstmt.addBatch() 35 | } 36 | 37 | pstmt.executeBatch() // 执行批量处理 38 | connection.commit() //手工提交 39 | } catch { 40 | case e: Exception => e.printStackTrace() 41 | } finally { 42 | MySQLUtils.release(connection, pstmt) 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/TopNStatJob.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | 6 | import scala.collection.mutable.ListBuffer 7 | 8 | /** 9 | * Created by Thpffcj on 2018/5/7. 10 | * TopN统计Spark作业 11 | */ 12 | object TopNStatJob { 13 | 14 | def main(args: Array[String]): Unit = { 15 | val spark = SparkSession.builder().appName("TopNStatJob") 16 | .config("spark.sql.sources.partitionColumnTypeInference.enabled", "false") 17 | .master("local[2]").getOrCreate() 18 | 19 | val accessDF = spark.read.format("parquet").load("D:/clean") 20 | 21 | // accessDF.printSchema() 22 | // accessDF.show(false) 23 | 24 | val day = "20180507" 25 | 26 | // 最受欢迎的TopN课程 27 | videoAccessTopNStat(spark, accessDF, day) 28 | 29 | spark.stop() 30 | } 31 | 32 | /** 33 | * 按照地市进行统计TopN课程 34 | */ 35 | def videoAccessTopNStat(spark: SparkSession, accessDF: DataFrame, day: String): Unit = { 36 | 37 | /** 38 | * 使用DataFrame的方式进行统计 39 | */ 40 | import spark.implicits._ 41 | 42 | val videoAccessTopNDF = accessDF.filter($"day" === day && $"cmsType" === "video") 43 | .groupBy("day", "cmsId").agg(count("cmsId").as("times")).orderBy($"times".desc) 44 | 45 | videoAccessTopNDF.show(false) 46 | 47 | /** 48 | * 使用SQL的方式进行统计 49 | */ 50 | // accessDF.createOrReplaceTempView("access_logs") 51 | // val videoAccessTopNDF = spark.sql("select day,cmsId, count(1) as times from access_logs " + 52 | // "where day='20180507' and cmsType='video' " + 53 | // "group by day,cmsId order by times desc") 54 | // 55 | // videoAccessTopNDF.show(false) 56 | 57 | /** 58 | * 将统计结果写入到MySQL中 59 | */ 60 | try { 61 | videoAccessTopNDF.foreachPartition(partitionOfRecords => { 62 | val list = new ListBuffer[DayVideoAccessStat] 63 | 64 | partitionOfRecords.foreach(info => { 65 | val day = info.getAs[String]("day") 66 | val cmsId = info.getAs[Long]("cmsId") 67 | val times = info.getAs[Long]("times") 68 | 69 | /** 70 | * 不建议大家在此处进行数据库的数据插入 71 | */ 72 | list.append(DayVideoAccessStat(day, cmsId, times)) 73 | }) 74 | 75 | StatDAO.insertDayVideoAccessTopN(list) 76 | }) 77 | } catch { 78 | case e:Exception => e.printStackTrace() 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/DataFrameApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/3. 
7 | * DataFrame API基本操作 8 | */ 9 | object DataFrameApp { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val spark = SparkSession.builder().appName("DataFrameApp").master("local[2]").getOrCreate() 14 | 15 | // 将json文件加载成一个dataframe 16 | val peopleDF = spark.read.format("json").load("D:/people.json") 17 | 18 | // 输出dataframe对应的schema信息 19 | peopleDF.printSchema() 20 | 21 | // 输出数据集的前20条记录 22 | peopleDF.show() 23 | 24 | //查询某列所有的数据: select name from table 25 | peopleDF.select("name").show() 26 | 27 | // 查询某几列所有的数据,并对列进行计算: select name, age+10 as age2 from table 28 | peopleDF.select(peopleDF.col("name"), (peopleDF.col("age") + 10).as("age2")).show() 29 | 30 | //根据某一列的值进行过滤: select * from table where age>19 31 | peopleDF.filter(peopleDF.col("age") > 19).show() 32 | 33 | //根据某一列进行分组,然后再进行聚合操作: select age,count(1) from table group by age 34 | peopleDF.groupBy("age").count().show() 35 | 36 | spark.stop() 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/DataFrameCase.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/3. 7 | * DataFrame中的操作操作 8 | */ 9 | object DataFrameCase { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val spark = SparkSession.builder().appName("DataFrameRDDApp").master("local[2]").getOrCreate() 13 | 14 | // RDD ==> DataFrame 15 | val rdd = spark.sparkContext.textFile("D:/student.data") 16 | 17 | // 注意:需要导入隐式转换 18 | import spark.implicits._ 19 | val studentDF = rdd.map(_.split("\\|")).map(line => Student(line(0).toInt, line(1), line(2), line(3))).toDF() 20 | 21 | // show默认只显示前20条 22 | studentDF.show 23 | studentDF.show(30) 24 | studentDF.show(30, false) 25 | 26 | studentDF.take(10) 27 | studentDF.first() 28 | studentDF.head(3) 29 | 30 | studentDF.select("email").show(30,false) 31 | 32 | studentDF.filter("name=''").show 33 | studentDF.filter("name='' OR name='NULL'").show 34 | 35 | // name以M开头的人 36 | studentDF.filter("SUBSTR(name, 0, 1)='M'").show 37 | 38 | studentDF.sort(studentDF("name")).show 39 | studentDF.sort(studentDF("name").desc).show 40 | 41 | studentDF.sort("name","id").show 42 | studentDF.sort(studentDF("name").asc, studentDF("id").desc).show 43 | 44 | studentDF.select(studentDF("name").as("student_name")).show 45 | 46 | val studentDF2 = rdd.map(_.split("\\|")).map(line => Student(line(0).toInt, line(1), line(2), line(3))).toDF() 47 | 48 | studentDF.join(studentDF2, studentDF.col("id") === studentDF2.col("id")).show 49 | 50 | spark.stop() 51 | } 52 | 53 | case class Student(id: Int, name: String, phone: String, email: String) 54 | } 55 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/DataFrameRDDApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType} 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | 6 | /** 7 | * Created by Thpffcj on 2018/5/3. 
8 | * DataFrame和RDD的互操作 9 | */ 10 | object DataFrameRDDApp { 11 | 12 | def main(args: Array[String]) { 13 | 14 | val spark = SparkSession.builder().appName("DataFrameRDDApp").master("local[2]").getOrCreate() 15 | 16 | //inferReflection(spark) 17 | 18 | program(spark) 19 | 20 | spark.stop() 21 | } 22 | 23 | def program(spark: SparkSession): Unit = { 24 | // RDD ==> DataFrame 25 | val rdd = spark.sparkContext.textFile("D:/infos.txt") 26 | 27 | val infoRDD = rdd.map(_.split(",")).map(line => Row(line(0).toInt, line(1), line(2).toInt)) 28 | 29 | val structType = StructType(Array(StructField("id", IntegerType, true), 30 | StructField("name", StringType, true), 31 | StructField("age", IntegerType, true))) 32 | 33 | val infoDF = spark.createDataFrame(infoRDD, structType) 34 | infoDF.printSchema() 35 | infoDF.show() 36 | 37 | 38 | //通过df的api进行操作 39 | infoDF.filter(infoDF.col("age") > 30).show 40 | 41 | //通过sql的方式进行操作 42 | infoDF.createOrReplaceTempView("infos") 43 | spark.sql("select * from infos where age > 30").show() 44 | } 45 | 46 | def inferReflection(spark: SparkSession) { 47 | // RDD ==> DataFrame 48 | val rdd = spark.sparkContext.textFile("D:/infos.txt") 49 | 50 | //注意:需要导入隐式转换 51 | import spark.implicits._ 52 | val infoDF = rdd.map(_.split(",")).map(line => Info(line(0).toInt, line(1), line(2).toInt)).toDF() 53 | 54 | infoDF.show() 55 | 56 | infoDF.filter(infoDF.col("age") > 30).show 57 | 58 | infoDF.createOrReplaceTempView("infos") 59 | spark.sql("select * from infos where age > 30").show() 60 | } 61 | 62 | case class Info(id: Int, name: String, age: Int) 63 | } 64 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/DataSetApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/3. 7 | * DataSet操作 8 | */ 9 | object DataSetApp { 10 | 11 | def main(args: Array[String]) { 12 | val spark = SparkSession.builder().appName("DatasetApp") 13 | .master("local[2]").getOrCreate() 14 | 15 | //注意:需要导入隐式转换 16 | import spark.implicits._ 17 | 18 | val path = "/Users/thpffcj/Public/data/sales.csv" 19 | 20 | //spark如何解析csv文件? 21 | val df = spark.read.option("header","true").option("inferSchema","true").csv(path) 22 | df.show 23 | 24 | val ds = df.as[Sales] 25 | ds.map(line => line.itemId).show 26 | 27 | ds.map(line => line.itemId) 28 | 29 | spark.stop() 30 | } 31 | 32 | case class Sales(transactionId:Int,customerId:Int,itemId:Int,amountPaid:Double) 33 | } 34 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/HiveContextApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.hive.HiveContext 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | /** 7 | * Created by Thpffcj on 2018/4/29. 8 | * HiveContext的使用 9 | * 使用时需要通过--jars 把mysql的驱动传递到classpath 10 | * 不能直接运行,需要打包到服务器运行 11 | */ 12 | object HiveContextApp { 13 | 14 | def main(args: Array[String]) { 15 | // 1. 创建相应的Context 16 | val sparkConf = new SparkConf() 17 | 18 | // 在测试或者生产中,AppName和Master我们是通过脚本进行指定 19 | // sparkConf.setAppName("HiveContextApp").setMaster("local[2]") 20 | 21 | val sc = new SparkContext(sparkConf) 22 | val hiveContext = new HiveContext(sc) 23 | 24 | // 2. 相关的处理: 25 | hiveContext.table("emp").show 26 | 27 | // 3. 
关闭资源 28 | sc.stop() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/HiveMySQLApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/4. 7 | * 使用外部数据源综合查询Hive和MySQL的表数据 8 | * 不能直接运行,需要打包到服务器运行 9 | */ 10 | object HiveMySQLApp { 11 | 12 | def main(args: Array[String]) { 13 | 14 | val spark = SparkSession.builder() 15 | // .appName("HiveMySQLApp") 16 | // .master("local[2]") 17 | .getOrCreate() 18 | 19 | // 加载Hive表数据 20 | val hiveDF = spark.table("emp") 21 | 22 | // 加载MySQL表数据 23 | val mysqlDF = spark.read.format("jdbc") 24 | .option("url", "jdbc:mysql://localhost:3306") 25 | .option("dbtable", "spark.DEPT") 26 | .option("user", "root") 27 | .option("password", "000000") 28 | .option("driver", "com.mysql.jdbc.Driver").load() 29 | 30 | // JOIN 31 | val resultDF = hiveDF.join(mysqlDF, hiveDF.col("deptno") === mysqlDF.col("DEPTNO")) 32 | resultDF.show 33 | 34 | resultDF.select(hiveDF.col("empno"),hiveDF.col("ename"), 35 | mysqlDF.col("deptno"), mysqlDF.col("dname")).show 36 | 37 | spark.stop() 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/ParquetApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/4. 7 | */ 8 | object ParquetApp { 9 | 10 | def main(args: Array[String]) { 11 | 12 | val spark = SparkSession.builder().appName("ParquetApp") 13 | .master("local[2]").getOrCreate() 14 | 15 | /** 16 | * spark.read.format("parquet").load 这是标准写法 17 | */ 18 | val userDF = spark.read.format("parquet").load("file:///home/thpffcj/app/spark-2.2.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet") 19 | 20 | userDF.printSchema() 21 | userDF.show() 22 | 23 | userDF.select("name","favorite_color").show 24 | 25 | userDF.select("name","favorite_color").write.format("json").save("file:///home/thpffcj/tmp/jsonout") 26 | 27 | spark.read.load("file:///home/thpffcj/app/spark-2.2.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet").show 28 | 29 | // 会报错,因为sparksql默认处理的format就是parquet 30 | spark.read.load("file:///home/thpffcj/app/spark-2.2.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/people.json").show 31 | 32 | spark.read.format("parquet").option("path","file:///home/thpffcj/app/spark-2.2.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet").load().show 33 | spark.stop() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/SQLContextApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SQLContext 5 | 6 | /** 7 | * Created by Thpffcj on 2018/4/29. 8 | * SQLContext的使用 9 | * 注意:IDEA是在本地,而测试数据是在服务器上,能不能在本地进行开发测试的? 10 | */ 11 | object SQLContextApp { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val path = args(0) 16 | 17 | // 1. 
创建相应的Context 18 | val sparkConf = new SparkConf() 19 | 20 | //在测试或者生产中,AppName和Master我们是通过脚本进行指定 21 | //sparkConf.setAppName("SQLContextApp").setMaster("local[2]") 22 | 23 | val sc = new SparkContext(sparkConf) 24 | 25 | val sqlContext = new SQLContext(sc) 26 | 27 | // 2. 相关的处理: json 28 | val people = sqlContext.read.format("json").load(path) 29 | people.printSchema() 30 | people.show() 31 | 32 | // 3. 关闭资源 33 | sc.stop() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/SparkSQLThriftServerApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import java.sql.DriverManager 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/2. 7 | * 通过JDBC的方式访问 8 | */ 9 | object SparkSQLThriftServerApp { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | Class.forName("org.apache.hive.jdbc.HiveDriver") 14 | 15 | val conn = DriverManager.getConnection("jdbc:hive2://thpffcj:10000","thpffcj","") 16 | val pstmt = conn.prepareStatement("select empno, ename, sal from emp") 17 | val rs = pstmt.executeQuery() 18 | 19 | while (rs.next()) { 20 | println("empno:" + rs.getInt("empno") + 21 | " , ename:" + rs.getString("ename") + 22 | " , sal:" + rs.getDouble("sal")) 23 | 24 | } 25 | 26 | rs.close() 27 | pstmt.close() 28 | conn.close() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/SparkSessionApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/2. 7 | * SparkSession的使用 8 | */ 9 | object SparkSessionApp { 10 | 11 | def main(args: Array[String]) { 12 | 13 | val spark = SparkSession.builder().appName("SparkSessionApp") 14 | .master("local[2]").getOrCreate() 15 | 16 | val people = spark.read.json("D:/people.json") 17 | people.show() 18 | 19 | spark.stop() 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /spark-sql-visualization/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | cn.edu.nju 7 | 151250052 8 | 1.0-SNAPSHOT 9 | war 10 | 11 | 12 | 13 | javax.servlet 14 | servlet-api 15 | 2.5 16 | 17 | 18 | 19 | javax.servlet 20 | jsp-api 21 | 2.0 22 | 23 | 24 | 25 | mysql 26 | mysql-connector-java 27 | 5.1.38 28 | 29 | 30 | 31 | net.sf.json-lib 32 | json-lib 33 | 2.4 34 | jdk15 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /spark-sql-visualization/src/main/java/cn/edu/nju/dao/VideoAccessTopNDAO.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao; 2 | 3 | import cn.edu.nju.domain.VideoAccessTopN; 4 | import cn.edu.nju.utils.MySQLUtils; 5 | 6 | import java.sql.Connection; 7 | import java.sql.PreparedStatement; 8 | import java.sql.ResultSet; 9 | import java.util.ArrayList; 10 | import java.util.HashMap; 11 | import java.util.List; 12 | import java.util.Map; 13 | 14 | /** 15 | * Created by Thpffcj on 2018/5/8. 
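 *  DAO that reads a day's five most-viewed courses from day_video_access_topn_stat and maps each cms_id to a hard-coded course name.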
16 | */ 17 | public class VideoAccessTopNDAO { 18 | 19 | 20 | static Map courses = new HashMap(); 21 | static { 22 | courses.put("8701", "MySQL优化"); 23 | courses.put("8702", "神经网络"); 24 | courses.put("8703", "Swift"); 25 | courses.put("8709", "机器学习"); 26 | } 27 | 28 | /** 29 | * 根据课程编号查询课程名称 30 | */ 31 | public String getCourseName(String id) { 32 | return courses.get(id); 33 | } 34 | 35 | 36 | /** 37 | * 根据day查询当天的最受欢迎的Top5课程 38 | * @param day 39 | */ 40 | public List query(String day) { 41 | List list = new ArrayList(); 42 | 43 | Connection connection = null; 44 | PreparedStatement psmt = null; 45 | ResultSet rs = null; 46 | 47 | try { 48 | connection = MySQLUtils.getConnection(); 49 | String sql = "select cms_id ,times from day_video_access_topn_stat where day =? order by times desc limit 5"; 50 | psmt = connection.prepareStatement(sql); 51 | psmt.setString(1, day); 52 | 53 | rs = psmt.executeQuery(); 54 | 55 | VideoAccessTopN domain = null; 56 | while(rs.next()) { 57 | domain = new VideoAccessTopN(); 58 | /** 59 | * TODO... 在页面上应该显示的是课程名称,而我们此时拿到的是课程编号 60 | * 61 | * 如何根据课程编号去获取课程名称呢? 62 | * 编号和名称是有一个对应关系的,一般是存放在关系型数据库 63 | */ 64 | domain.setName(getCourseName(rs.getLong("cms_id")+"")); 65 | domain.setValue(rs.getLong("times")); 66 | 67 | list.add(domain); 68 | } 69 | 70 | }catch (Exception e) { 71 | e.printStackTrace(); 72 | } finally { 73 | MySQLUtils.release(connection, psmt, rs); 74 | } 75 | return list; 76 | } 77 | 78 | public static void main(String[] args) { 79 | VideoAccessTopNDAO dao = new VideoAccessTopNDAO(); 80 | List list = dao.query("20180507"); 81 | for(VideoAccessTopN result: list) { 82 | System.out.println(result.getName() + " , " + result.getValue()); 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /spark-sql-visualization/src/main/java/cn/edu/nju/domain/VideoAccessTopN.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/5/8. 5 | */ 6 | public class VideoAccessTopN { 7 | 8 | private String name; 9 | private long value ; 10 | 11 | public String getName() { 12 | return name; 13 | } 14 | 15 | public void setName(String name) { 16 | this.name = name; 17 | } 18 | 19 | public long getValue() { 20 | return value; 21 | } 22 | 23 | public void setValue(long value) { 24 | this.value = value; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /spark-sql-visualization/src/main/java/cn/edu/nju/utils/MySQLUtils.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils; 2 | 3 | import java.sql.*; 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/8. 
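 *  JDBC helper for the local sparksql database: opens connections and releases Connection/PreparedStatement/ResultSet resources.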
7 | */ 8 | public class MySQLUtils { 9 | 10 | private static final String USERNAME = "root"; 11 | 12 | private static final String PASSWORD = "000000"; 13 | 14 | private static final String DRIVERCLASS = "com.mysql.jdbc.Driver"; 15 | 16 | private static final String URL = "jdbc:mysql://localhost:3306/sparksql"; 17 | 18 | /** 19 | * 获取数据库连接 20 | */ 21 | public static Connection getConnection() { 22 | Connection connection = null; 23 | try { 24 | Class.forName(DRIVERCLASS); 25 | connection = DriverManager.getConnection(URL,USERNAME,PASSWORD); 26 | } catch (Exception e) { 27 | e.printStackTrace(); 28 | } 29 | 30 | return connection; 31 | } 32 | 33 | /** 34 | * 释放资源 35 | */ 36 | public static void release(Connection connection, PreparedStatement pstmt, ResultSet rs) { 37 | if(rs != null) { 38 | try { 39 | rs.close(); 40 | } catch (SQLException e) { 41 | e.printStackTrace(); 42 | } 43 | } 44 | 45 | if(pstmt != null) { 46 | try { 47 | pstmt.close(); 48 | } catch (SQLException e) { 49 | e.printStackTrace(); 50 | } 51 | } 52 | 53 | if(connection != null) { 54 | try { 55 | connection.close(); 56 | } catch (SQLException e) { 57 | e.printStackTrace(); 58 | } 59 | } 60 | } 61 | 62 | public static void main(String[] args) { 63 | System.out.println(getConnection()); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /spark-sql-visualization/src/main/java/cn/edu/nju/web/VideoAccessTopNServlet.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.web; 2 | 3 | import cn.edu.nju.dao.VideoAccessTopNDAO; 4 | import cn.edu.nju.domain.VideoAccessTopN; 5 | import net.sf.json.JSONArray; 6 | 7 | import javax.servlet.ServletException; 8 | import javax.servlet.http.HttpServlet; 9 | import javax.servlet.http.HttpServletRequest; 10 | import javax.servlet.http.HttpServletResponse; 11 | import java.io.IOException; 12 | import java.io.PrintWriter; 13 | import java.util.List; 14 | 15 | /** 16 | * Created by Thpffcj on 2018/5/8. 
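 *  Answers GET/POST requests with the given day's Top 5 courses as a JSON array (queried through VideoAccessTopNDAO); intended to back the ECharts pages under webapp/.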
17 |  */
18 | public class VideoAccessTopNServlet extends HttpServlet {
19 | 
20 |     private VideoAccessTopNDAO dao;
21 | 
22 |     @Override
23 |     public void init() throws ServletException {
24 |         dao = new VideoAccessTopNDAO();
25 |     }
26 | 
27 |     @Override
28 |     protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
29 |         String day = req.getParameter("day");
30 | 
31 |         List<VideoAccessTopN> results = dao.query(day);
32 |         JSONArray json = JSONArray.fromObject(results);
33 | 
34 |         resp.setContentType("text/html;charset=utf-8");
35 | 
36 |         PrintWriter writer = resp.getWriter();
37 |         writer.println(json);
38 |         writer.flush();
39 |         writer.close();
40 |     }
41 | 
42 |     @Override
43 |     protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
44 |         this.doGet(req, resp);
45 |     }
46 | }
47 | 
-------------------------------------------------------------------------------- /spark-sql-visualization/src/main/webapp/WEB-INF/web.xml: --------------------------------------------------------------------------------
1 | <!DOCTYPE web-app PUBLIC
2 |  "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
3 |  "http://java.sun.com/dtd/web-app_2_3.dtd" >
4 | <web-app>
5 | 
6 |   <display-name>Archetype Created Web Application</display-name>
7 | 
8 |   <servlet>
9 |     <servlet-name>stat</servlet-name>
10 |     <servlet-class>cn.edu.nju.web.VideoAccessTopNServlet</servlet-class>
11 |   </servlet>
12 | 
13 |   <servlet-mapping>
14 |     <servlet-name>stat</servlet-name>
15 |     <url-pattern>/stat</url-pattern>
16 |   </servlet-mapping>
17 | 
18 | </web-app>
-------------------------------------------------------------------------------- /spark-sql-visualization/src/main/webapp/test.html: --------------------------------------------------------------------------------
(page markup was not preserved in this export; page title: Echarts HelloWorld — a static ECharts demo page)
-------------------------------------------------------------------------------- /spark-sql-visualization/src/main/webapp/topn.html: --------------------------------------------------------------------------------
(page markup was not preserved in this export; page title: 主站最受欢迎的TopN课程, i.e. the site's most popular TopN courses — presumably renders the JSON served by the /stat servlet)
-------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/StreamingWordCountApp.java: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark;
2 | 
3 | import org.apache.spark.*;
4 | import org.apache.spark.streaming.*;
5 | import org.apache.spark.streaming.api.java.*;
6 | import scala.Tuple2;
7 | 
8 | import java.util.Arrays;
9 | 
10 | 
11 | /**
12 |  * Created by Thpffcj on 2018/1/16.
13 |  * A Spark Streaming application written in Java
14 |  */
15 | public class StreamingWordCountApp {
16 | 
17 |     public static void main(String[] args) throws InterruptedException {
18 | 
19 |         SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("StreamingWordCountApp");
20 |         JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
21 | 
22 |         // Create a DStream from a socket source (hostname + port)
23 |         JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);
24 | 
25 |         JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split(" ")).iterator());
26 | 
27 |         JavaPairDStream<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
28 |         JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey((i1, i2) -> i1 + i2);
29 | 
30 |         wordCounts.print();
31 | 
32 |         jssc.start();
33 |         jssc.awaitTermination();
34 |     }
35 | }
36 | 
-------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/WordCountApp.java: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark;
2 | 
3 | import org.apache.spark.api.java.JavaPairRDD;
4 | import org.apache.spark.api.java.JavaRDD;
5 | import org.apache.spark.sql.SparkSession;
6 | import scala.Tuple2;
7 | 
8 | import java.util.Arrays;
9 | import java.util.List;
10 | 
11 | /**
12 |  * Created by Thpffcj on 2018/1/16.
13 |  * A Spark batch application written in Java
14 |  */
15 | public class WordCountApp {
16 | 
17 |     public static void main(String[] args) {
18 | 
19 |         SparkSession spark = SparkSession.builder().appName("WordCountApp").master("local[2]").getOrCreate();
20 | 
21 |         JavaRDD<String> lines = spark.read().textFile("/Users/thpffcj/Public/file/hello.txt").javaRDD();
22 |         JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
23 |         JavaPairRDD<String, Integer> counts = words.mapToPair(word -> new Tuple2<>(word, 1)).reduceByKey((x, y) -> x + y);
24 | 
25 |         List<Tuple2<String, Integer>> output = counts.collect();
26 | 
27 |         for (Tuple2<String, Integer> tuple : output) {
28 |             System.out.println(tuple._1() + " : " + tuple._2());
29 |         }
30 | 
31 |         spark.stop();
32 |     }
33 | }
34 | 
-------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/kafkas/KafkaClientApp.java: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark.kafkas;
2 | 
3 | /**
4 |  * Created by Thpffcj on 2018/1/11. 
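 *  Starts one KafkaProducer thread and one KafkaConsumer thread against KafkaProperties.TOPIC as a simple end-to-end smoke test.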
5 | */ 6 | public class KafkaClientApp { 7 | 8 | public static void main(String[] args) { 9 | new KafkaProducer(KafkaProperties.TOPIC).start(); 10 | new KafkaConsumer(KafkaProperties.TOPIC).start(); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/kafkas/KafkaConsumer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.kafkas; 2 | 3 | import kafka.consumer.Consumer; 4 | import kafka.consumer.ConsumerConfig; 5 | import kafka.consumer.ConsumerIterator; 6 | import kafka.consumer.KafkaStream; 7 | import kafka.javaapi.consumer.ConsumerConnector; 8 | 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Properties; 13 | 14 | /** 15 | * Created by Thpffcj on 2018/1/11. 16 | */ 17 | public class KafkaConsumer extends Thread { 18 | 19 | private String topic; 20 | 21 | public KafkaConsumer(String topic) { 22 | this.topic = topic; 23 | } 24 | 25 | private ConsumerConnector createConnector() { 26 | 27 | Properties properties = new Properties(); 28 | properties.put("zookeeper.connect", KafkaProperties.ZOOKEEPER); 29 | properties.put("group.id", KafkaProperties.GROUP_ID); 30 | 31 | return Consumer.createJavaConsumerConnector(new ConsumerConfig(properties)); 32 | } 33 | 34 | @Override 35 | public void run() { 36 | ConsumerConnector consumer = createConnector(); 37 | 38 | Map topicCountMap = new HashMap(); 39 | topicCountMap.put(topic, 1); 40 | 41 | // String: topic 42 | // List> 对应的数据流 43 | Map>> messageStream = consumer.createMessageStreams(topicCountMap); 44 | 45 | KafkaStream stream = messageStream.get(topic).get(0); //获取我们每次接收到的数据 46 | 47 | ConsumerIterator iterator = stream.iterator(); 48 | 49 | while (iterator.hasNext()) { 50 | String message = new String(iterator.next().message()); 51 | System.out.println("rec: " + message); 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/kafkas/KafkaProducer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.kafkas; 2 | 3 | import kafka.javaapi.producer.Producer; 4 | import kafka.producer.KeyedMessage; 5 | import kafka.producer.ProducerConfig; 6 | 7 | import java.util.Properties; 8 | 9 | /** 10 | * Created by Thpffcj on 2018/1/11. 
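 *  Producer thread that sends "message_N" every two seconds using the old kafka.javaapi producer API (StringEncoder, request.required.acks=1).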
11 | */ 12 | public class KafkaProducer extends Thread { 13 | 14 | private String topic; 15 | 16 | private Producer producer; 17 | 18 | public KafkaProducer(String topic) { 19 | this.topic = topic; 20 | 21 | Properties properties = new Properties(); 22 | properties.put("metadata.broker.list", KafkaProperties.BROKER_LIST); 23 | properties.put("serializer.class", "kafka.serializer.StringEncoder"); 24 | properties.put("request.required.acks", "1"); 25 | 26 | producer = new Producer(new ProducerConfig(properties)); 27 | } 28 | 29 | @Override 30 | public void run() { 31 | 32 | int messageNo = 1; 33 | 34 | while (true) { 35 | String message = "message_" + messageNo; 36 | producer.send(new KeyedMessage(topic, message)); 37 | System.out.println("Sent:" + message); 38 | 39 | messageNo++; 40 | 41 | try { 42 | Thread.sleep(2000); 43 | } catch (Exception e) { 44 | e.printStackTrace(); 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/kafkas/KafkaProperties.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.kafkas; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/1/11. 5 | * Kafka常用配置文件 6 | */ 7 | public class KafkaProperties { 8 | 9 | public static final String ZOOKEEPER = "192.168.92.130:2181"; 10 | public static final String TOPIC = "kafka-topic"; 11 | public static final String BROKER_LIST = "192.168.92.130:9092"; 12 | public static final String GROUP_ID = "test_group1"; 13 | } 14 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/FlumePullWordCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.flume.FlumeUtils 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * Created by Thpffcj on 2018/1/13. 9 | * Spark Streaming整合Flume的第二种方式 10 | */ 11 | object FlumePullWordCount { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | if(args.length != 2) { 16 | System.err.println("Usage: FlumePullWordCount ") 17 | System.exit(1) 18 | } 19 | 20 | val Array(hostname, port) = args 21 | 22 | val sparkConf = new SparkConf().setMaster("local[2]") //.setAppName("FlumePullWordCount") 23 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 24 | 25 | val flumeStream = FlumeUtils.createPollingStream(ssc, hostname, port.toInt) 26 | 27 | flumeStream.map(x=> new String(x.event.getBody.array()).trim) 28 | .flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).print() 29 | 30 | ssc.start() 31 | ssc.awaitTermination() 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/FlumePushWordCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.flume.FlumeUtils 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * Created by Thpffcj on 2018/1/13. 
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/FlumePullWordCount.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.flume.FlumeUtils
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 |  * Created by Thpffcj on 2018/1/13.
9 |  * Spark Streaming integration with Flume, approach two (pull-based)
10 |  */
11 | object FlumePullWordCount {
12 |
13 |   def main(args: Array[String]): Unit = {
14 |
15 |     if (args.length != 2) {
16 |       System.err.println("Usage: FlumePullWordCount <hostname> <port>")
17 |       System.exit(1)
18 |     }
19 |
20 |     val Array(hostname, port) = args
21 |
22 |     val sparkConf = new SparkConf().setMaster("local[2]") //.setAppName("FlumePullWordCount")
23 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
24 |
25 |     val flumeStream = FlumeUtils.createPollingStream(ssc, hostname, port.toInt)
26 |
27 |     flumeStream.map(x => new String(x.event.getBody.array()).trim)
28 |       .flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
29 |
30 |     ssc.start()
31 |     ssc.awaitTermination()
32 |   }
33 | }
34 |
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/FlumePushWordCount.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.flume.FlumeUtils
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 |  * Created by Thpffcj on 2018/1/13.
9 |  * Spark Streaming integration with Flume, approach one (push-based)
10 |  */
11 | object FlumePushWordCount {
12 |
13 |   def main(args: Array[String]): Unit = {
14 |
15 |     if (args.length != 2) {
16 |       System.err.println("Usage: FlumePushWordCount <hostname> <port>")
17 |       System.exit(1)
18 |     }
19 |
20 |     val Array(hostname, port) = args
21 |
22 |     val sparkConf = new SparkConf().setMaster("local[2]").setAppName("FlumePushWordCount")
23 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
24 |
25 |     val flumeStream = FlumeUtils.createStream(ssc, hostname, port.toInt)
26 |
27 |     flumeStream.map(x => new String(x.event.getBody.array()).trim)
28 |       .flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
29 |
30 |     ssc.start()
31 |     ssc.awaitTermination()
32 |   }
33 | }
34 |
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/ForeachRDDApp.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import java.sql.DriverManager
4 |
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.streaming.{Seconds, StreamingContext}
7 |
8 | /**
9 |  * Created by Thpffcj on 2018/1/13.
10 |  * Word count with Spark Streaming, writing the results to MySQL
11 |  */
12 | object ForeachRDDApp {
13 |
14 |   def main(args: Array[String]): Unit = {
15 |
16 |     val sparkConf = new SparkConf().setAppName("ForeachRDDApp").setMaster("local[2]")
17 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
18 |
19 |     val lines = ssc.socketTextStream("192.168.92.130", 6789)
20 |
21 |     val result = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
22 |
23 |     // Here the counts are only printed to the console
24 |     result.print()
25 |
26 |     // Write the counts to MySQL
27 |     result.foreachRDD(rdd => {
28 |       rdd.foreachPartition(partitionOfRecords => {
29 |         val connection = createConnection()
30 |         partitionOfRecords.foreach(record => {
31 |           val sql = "insert into wordcount(word, wordcount) values('" + record._1 + "'," + record._2 + ")"
32 |           connection.createStatement().execute(sql)
33 |         })
34 |
35 |         connection.close()
36 |       })
37 |     })
38 |
39 |     ssc.start()
40 |     ssc.awaitTermination()
41 |   }
42 |
43 |   /**
44 |    * Get a MySQL connection
45 |    */
46 |   def createConnection() = {
47 |     Class.forName("com.mysql.jdbc.Driver")
48 |     DriverManager.getConnection("jdbc:mysql://localhost:3306/spark", "root", "000000")
49 |   }
50 | }
51 |
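One caveat in ForeachRDDApp above: the INSERT statement is built by string concatenation and a new Statement is created for every record, so a word containing a quote breaks the SQL. A sketch of the same write using a PreparedStatement, assuming the same wordcount table and the createConnection() helper defined above:

result.foreachRDD(rdd => {
  rdd.foreachPartition(partitionOfRecords => {
    val connection = createConnection()
    val statement = connection.prepareStatement("insert into wordcount(word, wordcount) values (?, ?)")
    partitionOfRecords.foreach { case (word, count) =>
      statement.setString(1, word)
      statement.setInt(2, count)
      statement.executeUpdate()
    }
    statement.close()
    connection.close()
  })
})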
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/KafkaDirectWordCount.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import kafka.serializer.StringDecoder
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.streaming.kafka.KafkaUtils
6 | import org.apache.spark.streaming.{Seconds, StreamingContext}
7 |
8 | /**
9 |  * Created by Thpffcj on 2018/1/13.
10 |  * Spark Streaming integration with Kafka, approach two (direct, receiver-less)
11 |  */
12 | object KafkaDirectWordCount {
13 |
14 |   def main(args: Array[String]): Unit = {
15 |
16 |     if (args.length != 2) {
17 |       System.err.println("Usage: KafkaDirectWordCount <brokers> <topics>")
18 |       System.exit(1)
19 |     }
20 |
21 |     val Array(brokers, topics) = args
22 |
23 |     val sparkConf = new SparkConf() //.setAppName("KafkaDirectWordCount").setMaster("local[2]")
24 |
25 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
26 |
27 |     val topicsSet = topics.split(",").toSet
28 |     val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
29 |
30 |     val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
31 |       ssc, kafkaParams, topicsSet
32 |     )
33 |
34 |     messages.map(_._2).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
35 |
36 |     ssc.start()
37 |     ssc.awaitTermination()
38 |   }
39 | }
40 |
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/KafkaReceiverWordCount.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.kafka.KafkaUtils
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 |  * Created by Thpffcj on 2018/1/13.
9 |  * Spark Streaming integration with Kafka, approach one (receiver-based)
10 |  */
11 | object KafkaReceiverWordCount {
12 |
13 |   def main(args: Array[String]): Unit = {
14 |
15 |     if (args.length != 4) {
16 |       System.err.println("Usage: KafkaReceiverWordCount <zkQuorum> <group> <topics> <numThreads>")
17 |       System.exit(1)
18 |     }
19 |
20 |     val Array(zkQuorum, group, topics, numThreads) = args
21 |
22 |     val sparkConf = new SparkConf().setAppName("KafkaReceiverWordCount").setMaster("local[2]")
23 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
24 |
25 |     val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
26 |
27 |     val messages = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
28 |
29 |     messages.map(_._2).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
30 |
31 |     ssc.start()
32 |     ssc.awaitTermination()
33 |   }
34 | }
35 |
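A note on the two integrations above: the receiver-based KafkaReceiverWordCount tracks consumed offsets in ZooKeeper, while the direct KafkaDirectWordCount manages offsets itself, so nothing is persisted unless the application does it. With the spark-streaming-kafka-0-8 connector used here, the offsets covered by each batch can be inspected roughly as in the sketch below; it builds on the messages stream of KafkaDirectWordCount, and the println is only illustrative of where offsets would be stored.

import org.apache.spark.streaming.kafka.{HasOffsetRanges, OffsetRange}

var offsetRanges = Array.empty[OffsetRange]

messages.transform { rdd =>
  // capture the Kafka offset ranges covered by this batch
  offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  rdd
}.map(_._2).foreachRDD { rdd =>
  offsetRanges.foreach(o => println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}"))
}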
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/KafkaStreamingApp.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.kafka.KafkaUtils
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 |  * Created by Thpffcj on 2018/1/14.
9 |  */
10 | object KafkaStreamingApp {
11 |
12 |   def main(args: Array[String]): Unit = {
13 |
14 |     if (args.length != 4) {
15 |       System.err.println("Usage: KafkaStreamingApp <zkQuorum> <group> <topics> <numThreads>")
16 |       System.exit(1)
17 |     }
18 |
19 |     val Array(zkQuorum, group, topics, numThreads) = args
20 |
21 |     val sparkConf = new SparkConf().setAppName("KafkaStreamingApp")
22 |       .setMaster("local[2]")
23 |
24 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
25 |
26 |     val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
27 |
28 |     val messages = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
29 |
30 |     messages.map(_._2).count().print()
31 |
32 |     ssc.start()
33 |     ssc.awaitTermination()
34 |   }
35 | }
36 |
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/StatefulWordCount.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.{Seconds, StreamingContext}
5 |
6 | /**
7 |  * Created by Thpffcj on 2018/1/12.
8 |  * Stateful word count with Spark Streaming
9 |  */
10 | object StatefulWordCount {
11 |
12 |   def main(args: Array[String]): Unit = {
13 |
14 |     val sparkConf = new SparkConf().setAppName("StatefulWordCount").setMaster("local[2]")
15 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
16 |
17 |     // Stateful operators require a checkpoint directory to be set
18 |     // In production it is recommended to put the checkpoint in a directory on HDFS
19 |     // "." means the current directory
20 |     ssc.checkpoint(".")
21 |
22 |     val lines = ssc.socketTextStream("localhost", 9999)
23 |
24 |     val result = lines.flatMap(_.split(" ")).map((_, 1))
25 |     val state = result.updateStateByKey[Int](updateFunction _)
26 |
27 |     state.print()
28 |
29 |     ssc.start()
30 |     ssc.awaitTermination()
31 |   }
32 |
33 |   /**
34 |    * Merge the values of the current batch into the existing (old) state
35 |    * @param currentValues the values that arrived in the current batch
36 |    * @param preValues the previously accumulated count
37 |    * @return the updated count for the key
38 |    */
39 |   def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = {
40 |     val current = currentValues.sum
41 |     val pre = preValues.getOrElse(0)
42 |
43 |     Some(current + pre)
44 |   }
45 | }
46 |
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/TransformApp.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.{Seconds, StreamingContext}
5 |
6 | /**
7 |  * Created by Thpffcj on 2018/1/13.
8 | * 黑名单过滤 9 | */ 10 | object TransformApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TransformApp") 15 | 16 | // 创建StreamingContext需要两个参数:SparkConf和batch interval 17 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 18 | 19 | // 构建黑名单 20 | val blacks = List("zs", "ls") 21 | val blacksRDD = ssc.sparkContext.parallelize(blacks).map(x => (x, true)) 22 | 23 | val lines = ssc.socketTextStream("192.168.92.130", 6789) 24 | val clicklog = lines.map(x => (x.split(",")(1), x)).transform(rdd => { 25 | rdd.leftOuterJoin(blacksRDD) 26 | .filter(x => x._2._2.getOrElse(false) != true) 27 | .map(x => x._2._1) 28 | }) 29 | 30 | clicklog.print() 31 | 32 | ssc.start() 33 | ssc.awaitTermination() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/dao/CourseClickCountDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.dao 2 | 3 | import cn.edu.nju.spark.project.domain.CourseClickCount 4 | import cn.edu.nju.spark.project.utils.HBaseUtils 5 | import org.apache.hadoop.hbase.client.Get 6 | import org.apache.hadoop.hbase.util.Bytes 7 | 8 | import scala.collection.mutable.ListBuffer 9 | 10 | /** 11 | * Created by Thpffcj on 2018/1/15. 12 | * 实战课程点击数-数据访问层 13 | */ 14 | object CourseClickCountDAO { 15 | 16 | val tableName = "imooc_course_clickcount" 17 | val cf = "info" 18 | val qualifer = "click_count" 19 | 20 | /** 21 | * 保存数据到HBase 22 | * @param list CourseClickCount集合 23 | */ 24 | def save(list: ListBuffer[CourseClickCount]): Unit = { 25 | 26 | val table = HBaseUtils.getInstance().getTable(tableName) 27 | 28 | for(ele <- list) { 29 | table.incrementColumnValue(Bytes.toBytes(ele.day_course), 30 | Bytes.toBytes(cf), 31 | Bytes.toBytes(qualifer), 32 | ele.click_count) 33 | } 34 | } 35 | 36 | /** 37 | * 根据rowkey查询值 38 | */ 39 | def count(day_course: String): Long = { 40 | val table = HBaseUtils.getInstance().getTable(tableName) 41 | 42 | val get = new Get(Bytes.toBytes(day_course)) 43 | val value = table.get(get).getValue(cf.getBytes, qualifer.getBytes) 44 | 45 | if(value == null) { 46 | 0L 47 | }else{ 48 | Bytes.toLong(value) 49 | } 50 | } 51 | 52 | def main(args: Array[String]): Unit = { 53 | 54 | 55 | val list = new ListBuffer[CourseClickCount] 56 | list.append(CourseClickCount("20171111_8",8)) 57 | list.append(CourseClickCount("20171111_9",9)) 58 | list.append(CourseClickCount("20171111_1",100)) 59 | 60 | save(list) 61 | 62 | println(count("20171111_8") + " : " + count("20171111_9")+ " : " + count("20171111_1")) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/dao/CourseSearchClickCountDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.dao 2 | 3 | import cn.edu.nju.spark.project.domain.CourseSearchClickCount 4 | import cn.edu.nju.spark.project.utils.HBaseUtils 5 | import org.apache.hadoop.hbase.client.Get 6 | import org.apache.hadoop.hbase.util.Bytes 7 | 8 | import scala.collection.mutable.ListBuffer 9 | 10 | /** 11 | * Created by Thpffcj on 2018/1/15. 
12 | * 从搜索引擎过来的实战课程点击数-数据访问层 13 | */ 14 | object CourseSearchClickCountDAO { 15 | 16 | val tableName = "imooc_course_search_clickcount" 17 | val cf = "info" 18 | val qualifer = "click_count" 19 | 20 | /** 21 | * 保存数据到HBase 22 | * 23 | * @param list CourseSearchClickCount集合 24 | */ 25 | def save(list: ListBuffer[CourseSearchClickCount]): Unit = { 26 | 27 | val table = HBaseUtils.getInstance().getTable(tableName) 28 | 29 | for(ele <- list) { 30 | table.incrementColumnValue(Bytes.toBytes(ele.day_search_course), 31 | Bytes.toBytes(cf), 32 | Bytes.toBytes(qualifer), 33 | ele.click_count) 34 | } 35 | 36 | } 37 | 38 | /** 39 | * 根据rowkey查询值 40 | */ 41 | def count(day_search_course: String):Long = { 42 | val table = HBaseUtils.getInstance().getTable(tableName) 43 | 44 | val get = new Get(Bytes.toBytes(day_search_course)) 45 | val value = table.get(get).getValue(cf.getBytes, qualifer.getBytes) 46 | 47 | if(value == null) { 48 | 0L 49 | }else{ 50 | Bytes.toLong(value) 51 | } 52 | } 53 | 54 | def main(args: Array[String]): Unit = { 55 | 56 | val list = new ListBuffer[CourseSearchClickCount] 57 | list.append(CourseSearchClickCount("20171111_www.baidu.com_8",8)) 58 | list.append(CourseSearchClickCount("20171111_cn.bing.com_9",9)) 59 | 60 | save(list) 61 | 62 | println(count("20171111_www.baidu.com_8") + " : " + count("20171111_cn.bing.com_9")) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/domain/ClickLog.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.domain 2 | 3 | /** 4 | * Created by Thpffcj on 2018/1/15. 5 | * 清洗后的日志信息 6 | * @param ip 日志访问的ip地址 7 | * @param time 日志访问的时间 8 | * @param courseId 日志访问的实战课程编号 9 | * @param statusCode 日志访问的状态码 10 | * @param referrer 日志访问的referrer 11 | */ 12 | case class ClickLog(ip:String, time:String, courseId:Int, statusCode:Int, referrer:String) 13 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/domain/CourseClickCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.domain 2 | 3 | /** 4 | * Created by Thpffcj on 2018/1/15. 5 | * 实战课程点击数实体类 6 | * @param day_course 对应的就是HBase中的rowkey,20171111_1 7 | * @param click_count 对应的20171111_1的访问总数 8 | */ 9 | case class CourseClickCount(day_course:String, click_count:Long) 10 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/domain/CourseSearchClickCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.domain 2 | 3 | /** 4 | * Created by Thpffcj on 2018/1/15. 5 | * 从搜索引擎过来的实战课程点击数实体类 6 | * @param day_search_course 7 | * @param click_count 8 | */ 9 | case class CourseSearchClickCount(day_search_course:String, click_count:Long) 10 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/utils/DateUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.utils 2 | 3 | import java.util.Date 4 | 5 | import org.apache.commons.lang3.time.FastDateFormat 6 | 7 | /** 8 | * Created by Thpffcj on 2018/1/15. 
9 | * 日期时间工具类 10 | */ 11 | object DateUtils { 12 | 13 | val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss") 14 | val TARGE_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmmss") 15 | 16 | 17 | def getTime(time: String) = { 18 | YYYYMMDDHHMMSS_FORMAT.parse(time).getTime 19 | } 20 | 21 | def parseToMinute(time :String) = { 22 | TARGE_FORMAT.format(new Date(getTime(time))) 23 | } 24 | 25 | def main(args: Array[String]): Unit = { 26 | 27 | println(parseToMinute("2017-10-22 14:46:01")) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /spark-train/src/test/java/LoggerGenerator.java: -------------------------------------------------------------------------------- 1 | import org.apache.log4j.Logger; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/1/14. 5 | * 模拟日志产生 6 | */ 7 | public class LoggerGenerator { 8 | 9 | private static Logger logger = Logger.getLogger(LoggerGenerator.class.getName()); 10 | 11 | public static void main(String[] args) throws Exception{ 12 | 13 | int index = 0; 14 | while(true) { 15 | Thread.sleep(1000); 16 | logger.info("value : " + index++); 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /spark-train/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO,stdout,flume 2 | 3 | log4j.appender.stdout = org.apache.log4j.ConsoleAppender 4 | log4j.appender.stdout.target = System.out 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c] [%p] - %m%n 7 | 8 | log4j.appender.flume = org.apache.flume.clients.log4jappender.Log4jAppender 9 | log4j.appender.flume.Hostname = 192.168.92.130 10 | log4j.appender.flume.Port = 41414 11 | log4j.appender.flume.UnsafeMode = true 12 | 13 | -------------------------------------------------------------------------------- /storm-data-visualization/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | 4 | ### STS ### 5 | .apt_generated 6 | .classpath 7 | .factorypath 8 | .project 9 | .settings 10 | .springBeans 11 | .sts4-cache 12 | 13 | ### IntelliJ IDEA ### 14 | .idea 15 | *.iws 16 | *.iml 17 | *.ipr 18 | 19 | ### NetBeans ### 20 | /nbproject/private/ 21 | /build/ 22 | /nbbuild/ 23 | /dist/ 24 | /nbdist/ 25 | /.nb-gradle/ -------------------------------------------------------------------------------- /storm-data-visualization/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | cn.edu.nju 7 | 151250052 8 | 0.0.1-SNAPSHOT 9 | jar 10 | 11 | storm-data-visualization 12 | 13 | 14 | 15 | org.springframework.boot 16 | spring-boot-starter-parent 17 | 2.0.1.RELEASE 18 | 19 | 20 | 21 | 22 | UTF-8 23 | UTF-8 24 | 1.8 25 | 26 | 27 | 28 | 29 | org.springframework.boot 30 | spring-boot-starter 31 | 32 | 33 | 34 | org.springframework.boot 35 | spring-boot-starter-web 36 | 37 | 38 | 39 | org.springframework.boot 40 | spring-boot-starter-test 41 | test 42 | 43 | 44 | 45 | org.springframework.boot 46 | spring-boot-starter-jdbc 47 | 48 | 49 | 50 | org.springframework.boot 51 | spring-boot-starter-thymeleaf 52 | 53 | 54 | 55 | mysql 56 | mysql-connector-java 57 | 5.1.38 58 | 59 | 60 | 61 | net.sf.json-lib 62 | json-lib 63 | 2.4 64 | jdk15 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 
org.springframework.boot 73 | spring-boot-maven-plugin 74 | 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /storm-data-visualization/src/main/java/cn/edu/nju/DataVisualizationApplication.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | @SpringBootApplication 7 | public class DataVisualizationApplication { 8 | 9 | public static void main(String[] args) { 10 | SpringApplication.run(DataVisualizationApplication.class, args); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /storm-data-visualization/src/main/java/cn/edu/nju/controller/StatApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.controller; 2 | 3 | import cn.edu.nju.domain.ResultBean; 4 | import cn.edu.nju.service.ResultBeanService; 5 | import net.sf.json.JSONArray; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.web.bind.annotation.RequestMapping; 8 | import org.springframework.web.bind.annotation.RequestMethod; 9 | import org.springframework.web.bind.annotation.ResponseBody; 10 | import org.springframework.web.bind.annotation.RestController; 11 | import org.springframework.web.servlet.ModelAndView; 12 | 13 | import java.util.List; 14 | 15 | /** 16 | * Created by Thpffcj on 2018/4/10. 17 | */ 18 | @RestController 19 | public class StatApp { 20 | 21 | @Autowired 22 | ResultBeanService resultBeanService; 23 | 24 | @RequestMapping(value = "/map", method = RequestMethod.GET) 25 | public ModelAndView map() { 26 | return new ModelAndView("map.html"); 27 | } 28 | 29 | @RequestMapping(value = "/map_stat", method = RequestMethod.POST) 30 | @ResponseBody 31 | public List mapStat() { 32 | List results = resultBeanService.query(); 33 | return results; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /storm-data-visualization/src/main/java/cn/edu/nju/domain/ResultBean.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/4/10. 
5 | */ 6 | public class ResultBean { 7 | 8 | private double lng; 9 | private double lat; 10 | private long count; 11 | 12 | public double getLng() { 13 | return lng; 14 | } 15 | 16 | public void setLng(double lng) { 17 | this.lng = lng; 18 | } 19 | 20 | public double getLat() { 21 | return lat; 22 | } 23 | 24 | public void setLat(double lat) { 25 | this.lat = lat; 26 | } 27 | 28 | public long getCount() { 29 | return count; 30 | } 31 | 32 | public void setCount(long count) { 33 | this.count = count; 34 | } 35 | 36 | @Override 37 | public String toString() { 38 | return "ResultBean{" + 39 | "lng=" + lng + 40 | ", lat=" + lat + 41 | ", count=" + count + 42 | '}'; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /storm-data-visualization/src/main/java/cn/edu/nju/service/ResultBeanService.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.service; 2 | 3 | import cn.edu.nju.domain.ResultBean; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | import org.springframework.jdbc.core.JdbcTemplate; 6 | import org.springframework.jdbc.core.RowMapper; 7 | import org.springframework.stereotype.Service; 8 | 9 | import java.sql.ResultSet; 10 | import java.sql.SQLException; 11 | import java.util.List; 12 | 13 | /** 14 | * Created by Thpffcj on 2018/4/10. 15 | */ 16 | @Service 17 | public class ResultBeanService { 18 | 19 | @Autowired 20 | JdbcTemplate jdbcTemplate; 21 | 22 | public List query() { 23 | 24 | String sql = "select longitude, latitude, count(1) as c from stat where time > unix_timestamp(date_sub(current_timestamp(), interval 10 hour)) * 1000 group by longitude, latitude"; 25 | 26 | return (List) jdbcTemplate.query(sql, new RowMapper() { 27 | 28 | @Override 29 | public ResultBean mapRow(ResultSet resultSet, int i) throws SQLException { 30 | ResultBean bean = new ResultBean(); 31 | bean.setLng(resultSet.getDouble("longitude")); 32 | bean.setLat(resultSet.getDouble("latitude")); 33 | bean.setCount(resultSet.getLong("c")); 34 | return bean; 35 | } 36 | }); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /storm-data-visualization/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | spring.datasource.driver-class-name: com.mysql.jdbc.Driver 2 | spring.datasource.url: jdbc:mysql://127.0.0.1:3306/storm?useSSL=false 3 | spring.datasource.username: root 4 | spring.datasource.password: 000000 -------------------------------------------------------------------------------- /storm-data-visualization/src/main/resources/templates/map.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 基于Storm的实时区域游客量热力图统计 5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 | 13 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /storm-data-visualization/src/test/java/cn/edi/nju/DataVisualizationApplicationTests.java: -------------------------------------------------------------------------------- 1 | package cn.edi.nju; 2 | 3 | import org.junit.Test; 4 | import org.junit.runner.RunWith; 5 | import org.springframework.boot.test.context.SpringBootTest; 6 | import org.springframework.test.context.junit4.SpringRunner; 7 | 8 | @RunWith(SpringRunner.class) 9 | @SpringBootTest 10 | public class DataVisualizationApplicationTests { 11 | 12 | @Test 13 | public void contextLoads() { 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/LocalDRPCTopology.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | import org.apache.storm.Config; 4 | import org.apache.storm.LocalCluster; 5 | import org.apache.storm.LocalDRPC; 6 | import org.apache.storm.drpc.LinearDRPCTopologyBuilder; 7 | import org.apache.storm.task.OutputCollector; 8 | import org.apache.storm.task.TopologyContext; 9 | import org.apache.storm.topology.OutputFieldsDeclarer; 10 | import org.apache.storm.topology.base.BaseRichBolt; 11 | import org.apache.storm.tuple.Fields; 12 | import org.apache.storm.tuple.Tuple; 13 | import org.apache.storm.tuple.Values; 14 | 15 | import java.util.Map; 16 | 17 | /** 18 | * Created by Thpffcj on 2018/4/6. 19 | * 本地的DRPC 20 | */ 21 | public class LocalDRPCTopology { 22 | 23 | public static class MyBolt extends BaseRichBolt { 24 | 25 | private OutputCollector outputCollector; 26 | 27 | public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { 28 | this.outputCollector = collector; 29 | } 30 | 31 | public void execute(Tuple input) { 32 | 33 | // 请求的id 34 | Object requestId = input.getValue(0); 35 | // 请求的参数 36 | String name = input.getString(1); 37 | 38 | /** 39 | * TODO... 业务逻辑处理 40 | */ 41 | String result = "add user: " + name; 42 | 43 | this.outputCollector.emit(new Values(requestId, result)); 44 | } 45 | 46 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 47 | declarer.declare(new Fields("id", "result")); 48 | } 49 | } 50 | 51 | public static void main(String[] args) { 52 | LinearDRPCTopologyBuilder builder = new LinearDRPCTopologyBuilder("addUser"); 53 | builder.addBolt(new MyBolt()); 54 | 55 | LocalCluster localCluster = new LocalCluster(); 56 | LocalDRPC drpc = new LocalDRPC(); 57 | localCluster.submitTopology("local-drpc", new Config(), 58 | builder.createLocalTopology(drpc)); 59 | 60 | String result = drpc.execute("addUser", "Thpffcj"); 61 | System.out.println("From client: " + result); 62 | 63 | localCluster.shutdown(); 64 | drpc.shutdown(); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/RPCClient.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.ipc.RPC; 5 | 6 | import java.io.IOException; 7 | import java.net.InetSocketAddress; 8 | 9 | /** 10 | * Created by Thpffcj on 2018/4/6. 
11 | * RPC 客户端 12 | */ 13 | public class RPCClient { 14 | 15 | public static void main(String[] args) throws IOException { 16 | 17 | Configuration configuration = new Configuration(); 18 | 19 | long clientVersion = 88888888; 20 | 21 | UserService userService = RPC.getProxy(UserService.class, clientVersion, 22 | new InetSocketAddress("localhost", 9999), 23 | configuration); 24 | 25 | userService.addUser("Thpffcj", 21); 26 | System.out.println("From client invoked"); 27 | 28 | RPC.stopProxy(userService); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/RPCServer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.ipc.RPC; 5 | 6 | import java.io.IOException; 7 | 8 | /** 9 | * Created by Thpffcj on 2018/4/6. 10 | * RPC Server服务 11 | */ 12 | public class RPCServer { 13 | 14 | public static void main(String[] args) throws IOException { 15 | 16 | Configuration configuration = new Configuration(); 17 | 18 | RPC.Builder builder = new RPC.Builder(configuration); 19 | 20 | // Java Builder 模式 21 | RPC.Server server = builder.setProtocol(UserService.class) 22 | .setInstance(new UserServiceImpl()) 23 | .setBindAddress("localhost").setPort(9999).build(); 24 | 25 | 26 | server.start(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/RemoteDRPCClient.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | import org.apache.storm.Config; 4 | import org.apache.storm.thrift.transport.TTransportException; 5 | import org.apache.storm.utils.DRPCClient; 6 | 7 | /** 8 | * Created by Thpffcj on 2018/4/6. 
9 | * Remote DRPC 客户端测试类 10 | */ 11 | public class RemoteDRPCClient { 12 | 13 | public static void main(String[] args) throws Exception { 14 | 15 | Config config = new Config(); 16 | config.put("storm.thrift.transport", "org.apache.storm.security.auth.SimpleTransportPlugin"); 17 | config.put(Config.STORM_NIMBUS_RETRY_TIMES, 3); 18 | config.put(Config.STORM_NIMBUS_RETRY_INTERVAL, 10); 19 | config.put(Config.STORM_NIMBUS_RETRY_INTERVAL_CEILING, 20); 20 | config.put(Config.DRPC_MAX_BUFFER_SIZE, 1048576); 21 | 22 | DRPCClient client = new DRPCClient(config, "thpffcj", 3772); 23 | String result = client.execute("addUser", "Thpffcj"); 24 | 25 | System.out.println("Client invoked: " + result); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/RemoteDRPCTopology.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | import org.apache.storm.Config; 4 | import org.apache.storm.LocalCluster; 5 | import org.apache.storm.LocalDRPC; 6 | import org.apache.storm.StormSubmitter; 7 | import org.apache.storm.drpc.LinearDRPCTopologyBuilder; 8 | import org.apache.storm.generated.AlreadyAliveException; 9 | import org.apache.storm.generated.AuthorizationException; 10 | import org.apache.storm.generated.InvalidTopologyException; 11 | import org.apache.storm.task.OutputCollector; 12 | import org.apache.storm.task.TopologyContext; 13 | import org.apache.storm.topology.OutputFieldsDeclarer; 14 | import org.apache.storm.topology.base.BaseRichBolt; 15 | import org.apache.storm.tuple.Fields; 16 | import org.apache.storm.tuple.Tuple; 17 | import org.apache.storm.tuple.Values; 18 | 19 | import java.util.Map; 20 | 21 | /** 22 | * Created by Thpffcj on 2018/4/6. 23 | * 远程的DRPC 24 | */ 25 | public class RemoteDRPCTopology { 26 | 27 | public static class MyBolt extends BaseRichBolt { 28 | 29 | private OutputCollector outputCollector; 30 | 31 | public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { 32 | this.outputCollector = collector; 33 | } 34 | 35 | public void execute(Tuple input) { 36 | 37 | // 请求的id 38 | Object requestId = input.getValue(0); 39 | // 请求的参数 40 | String name = input.getString(1); 41 | 42 | /** 43 | * TODO... 业务逻辑处理 44 | */ 45 | String result = "add user: " + name; 46 | 47 | this.outputCollector.emit(new Values(requestId, result)); 48 | } 49 | 50 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 51 | declarer.declare(new Fields("id", "result")); 52 | } 53 | } 54 | 55 | public static void main(String[] args) { 56 | LinearDRPCTopologyBuilder builder = new LinearDRPCTopologyBuilder("addUser"); 57 | builder.addBolt(new MyBolt()); 58 | 59 | try { 60 | StormSubmitter.submitTopology("drpc-topology", 61 | new Config(), 62 | builder.createRemoteTopology()); 63 | } catch (Exception e) { 64 | e.printStackTrace(); 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/UserService.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/4/6. 
5 | * 用户的服务 6 | */ 7 | public interface UserService { 8 | 9 | public static final long versionID = 88888888; 10 | 11 | /** 12 | * 添加用户 13 | * @param name 名字 14 | * @param age 年龄 15 | */ 16 | public void addUser(String name, int age); 17 | } 18 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/UserServiceImpl.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/4/6. 5 | * 用户的服务接口实现类 6 | */ 7 | public class UserServiceImpl implements UserService { 8 | 9 | public void addUser(String name, int age) { 10 | System.out.println("From Server Invoked: add user success, name is " + name); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/intergration/jdbc/ddl.sql: -------------------------------------------------------------------------------- 1 | create table wc( 2 | word varchar (20), 3 | word_count int 4 | ); -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/intergration/kafka/DateUtils.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.intergration.kafka; 2 | 3 | import org.apache.commons.lang3.time.FastDateFormat; 4 | 5 | import java.text.ParseException; 6 | 7 | /** 8 | * Created by Thpffcj on 2018/4/10. 9 | * 时间解析工具类 10 | */ 11 | public class DateUtils { 12 | 13 | private DateUtils(){} 14 | 15 | private static DateUtils instance; 16 | 17 | public static DateUtils getInstance() { 18 | if (instance == null) { 19 | instance = new DateUtils(); 20 | } 21 | 22 | return instance; 23 | } 24 | 25 | FastDateFormat format = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss"); 26 | 27 | public long getTime(String time) throws Exception { 28 | return format.parse(time.substring(1, time.length() - 1)).getTime(); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/intergration/kafka/LogProcessBolt.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.intergration.kafka; 2 | 3 | import org.apache.storm.task.OutputCollector; 4 | import org.apache.storm.task.TopologyContext; 5 | import org.apache.storm.topology.OutputFieldsDeclarer; 6 | import org.apache.storm.topology.base.BaseRichBolt; 7 | import org.apache.storm.tuple.Fields; 8 | import org.apache.storm.tuple.Tuple; 9 | import org.apache.storm.tuple.Values; 10 | 11 | import java.util.Map; 12 | 13 | /** 14 | * Created by Thpffcj on 2018/4/8. 
15 | * 接收kafka的数据进行处理的BOLT 16 | */ 17 | public class LogProcessBolt extends BaseRichBolt { 18 | 19 | private OutputCollector collector; 20 | 21 | public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { 22 | this.collector = collector; 23 | } 24 | 25 | public void execute(Tuple input) { 26 | 27 | try { 28 | byte[] binaryByField = input.getBinaryByField("bytes"); 29 | String value = new String(binaryByField); 30 | 31 | // 解析出来日志信息 32 | String[] splits = value.split("\t"); 33 | String phone = splits[0]; 34 | String[] temp = splits[1].split(","); 35 | String longitude = temp[0]; 36 | String latitude = temp[1]; 37 | long time = DateUtils.getInstance().getTime(splits[2]); 38 | 39 | System.out.println(phone + " " + longitude + " " + latitude + " " + time); 40 | 41 | collector.emit(new Values(time, Double.parseDouble(longitude), Double.parseDouble(latitude))); 42 | 43 | this.collector.ack(input); 44 | } catch (Exception e) { 45 | this.collector.fail(input); 46 | } 47 | } 48 | 49 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 50 | declarer.declare(new Fields("time", "longitude", "latitude")); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /storm-train/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | #log4j.rootLogger=WARN, stdout 2 | #log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | #log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | #log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n --------------------------------------------------------------------------------
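For reference, LogProcessBolt and the intergration.kafka DateUtils above expect a tab-separated line of the form phone, "longitude,latitude", bracketed timestamp. The exact format comes from the log generator, which is not shown here, so the sample value below is an assumption inferred from the split("\t") and substring(1, length - 1) calls; the class name LogLineParseSketch is likewise illustrative.

public class LogLineParseSketch {

    public static void main(String[] args) throws Exception {
        // Hypothetical log line: phone \t longitude,latitude \t [yyyy-MM-dd HH:mm:ss]
        String value = "13800000000\t116.404,39.915\t[2018-04-10 10:00:00]";

        String[] splits = value.split("\t");
        String phone = splits[0];
        String[] temp = splits[1].split(",");
        double longitude = Double.parseDouble(temp[0]);
        double latitude = Double.parseDouble(temp[1]);

        // DateUtils strips the surrounding brackets and parses "yyyy-MM-dd HH:mm:ss"
        long time = DateUtils.getInstance().getTime(splits[2]);

        System.out.println(phone + " " + longitude + " " + latitude + " " + time);
    }
}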