├── .gitignore ├── README.md ├── flink-train ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── nju │ │ ├── JavaWindowWordCount.java │ │ ├── SocketTextStreamWordCount.java │ │ ├── course04 │ │ ├── JavaCounterApp.java │ │ ├── JavaDataSetDataSourceApp.java │ │ ├── JavaDataSetSinkApp.java │ │ ├── JavaDataSetTransformationApp.java │ │ ├── JavaDistributedCacheApp.java │ │ └── Person.java │ │ ├── course05 │ │ ├── JavaCustomNonParallelSourceFunction.java │ │ ├── JavaCustomParallelSourceFunction.java │ │ ├── JavaCustomRichParallelSourceFunction.java │ │ ├── JavaCustomSinkToMySQL.java │ │ ├── JavaDataStreamSourceApp.java │ │ ├── JavaDataStreamTransformationApp.java │ │ ├── SinkToMySQL.java │ │ └── Student.java │ │ ├── course06 │ │ └── JavaTableSQLAPI.java │ │ ├── course07 │ │ ├── JavaWindowsApp.java │ │ ├── JavaWindowsProcessApp.java │ │ └── JavaWindowsReduceApp.java │ │ ├── hotItem │ │ ├── HotItems.java │ │ └── UserBehavior.java │ │ └── project │ │ └── MyKafkaProducer.java │ ├── resources │ └── log4j.properties │ └── scala │ └── cn │ └── edu │ └── nju │ ├── BatchJob.scala │ ├── BatchWCScalaApp.scala │ ├── StreamingJob.scala │ ├── StreamingWCScalaApp.scala │ ├── WindowWordCount.java │ ├── course04 │ ├── CounterApp.scala │ ├── DBUtils.scala │ ├── DataSetDataSourceApp.scala │ ├── DataSetSinkApp.scala │ ├── DataSetTransformationApp.scala │ └── DistributedCacheApp.scala │ ├── course05 │ ├── CustomNonParallelSourceFunction.scala │ ├── CustomParallelSourceFunction.scala │ ├── CustomRichParallelSourceFunction.scala │ ├── DataStreamSourceApp.scala │ └── DataStreamTransformationApp.scala │ ├── course06 │ └── TableSQLAPI.scala │ ├── course07 │ ├── WindowsApp.scala │ ├── WindowsProcessApp.scala │ └── WindowsReduceApp.scala │ ├── course08 │ ├── FileSystemSinkApp.scala │ ├── KafkaConnectorConsumerApp.scala │ └── KafkaConnectorProducerApp.scala │ └── project │ ├── LogAnalysis.scala │ ├── LogAnalysis02.scala │ ├── MyMySQLSource.scala │ └── MyMySQLSourceTest.scala ├── hadoop-train ├── pom.xml └── src │ ├── main │ └── java │ │ └── cn │ │ └── edu │ │ └── nju │ │ └── hadoop │ │ ├── mapreduce │ │ ├── CombinerApp.java │ │ ├── PartitionerApp.java │ │ ├── WordCount2App.java │ │ ├── WordCountApp.java │ │ ├── sort │ │ │ ├── GlobalSort.java │ │ │ ├── GlobalSortPartitioner.java │ │ │ ├── IntPair.java │ │ │ └── SecondarySort.java │ │ └── topk │ │ │ ├── IPTimes.java │ │ │ └── TopK.java │ │ └── project │ │ └── LogApp.java │ ├── resources │ ├── application.properties │ ├── beans.xml │ └── log.txt │ └── test │ └── java │ └── cn │ └── edu │ └── nju │ └── hadoop │ ├── hdfs │ └── HDFSApp.java │ ├── project │ └── UserAgentTest.java │ └── spring │ ├── SpringBootHDFSApp.java │ └── SpringHadoopHDFSApp.java ├── hbase-train ├── hbase-api-test │ ├── pom.xml │ └── src │ │ ├── main │ │ └── java │ │ │ └── cn │ │ │ └── edu │ │ │ └── nju │ │ │ ├── HBaseConn.java │ │ │ └── HBaseUtil.java │ │ └── test │ │ └── java │ │ └── cn │ │ └── edu │ │ └── nju │ │ ├── HBaseConnTest.java │ │ ├── HBaseFilterTest.java │ │ └── HBaseUtilTest.java ├── hbase-endpoint-test │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── cn │ │ │ └── edu │ │ │ └── nju │ │ │ ├── GetRowCount.java │ │ │ └── TestRowCountEndPoint.java │ │ └── proto │ │ └── RowCountTest.proto ├── hbase-observer-test │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── cn │ │ └── edu │ │ └── nju │ │ └── RegionObserverTest.java ├── pom.xml └── src │ ├── main │ └── java │ │ └── cn │ │ └── edu │ │ └── nju │ │ └── App.java │ └── test │ └── java │ └── cn │ └── edu │ └── nju │ 
└── AppTest.java ├── log-generator ├── generate_log.py ├── message.py └── message2.py ├── pyspark └── project │ ├── spark.py │ ├── spark_yarn.py │ ├── steam.py │ └── test.py ├── spark-data-visualization ├── .gitignore ├── pom.xml └── src │ ├── main │ ├── java │ │ └── cn │ │ │ └── edu │ │ │ └── nju │ │ │ ├── DataVisualizationApplication.java │ │ │ ├── dao │ │ │ └── CourseClickCountDAO.java │ │ │ ├── domain │ │ │ └── CourseClickCount.java │ │ │ ├── spark │ │ │ ├── HelloBoot.java │ │ │ └── ImoocStatApp.java │ │ │ └── utils │ │ │ └── HBaseUtils.java │ └── resources │ │ ├── application.properties │ │ ├── static │ │ └── js │ │ │ ├── echarts.min.js │ │ │ └── jquery.js │ │ └── templates │ │ ├── demo.html │ │ ├── echarts.html │ │ └── test.html │ └── test │ └── java │ └── cn │ └── edu │ └── nju │ └── DataVisualizationApplicationTests.java ├── spark-mllib ├── pom.xml └── src │ └── main │ ├── resources │ ├── house.csv │ ├── iris.data │ ├── neg.txt │ ├── pos.txt │ └── u.data │ └── scala │ └── cn │ └── edu │ └── nju │ ├── MovieRecommendation.scala │ ├── classification │ └── Iris.scala │ ├── cluster │ ├── KMeans.scala │ └── Lda.scala │ ├── dimensionalityReduction │ └── PCADimensionalityReduction.scala │ ├── emotionAnalysis │ └── EmotionAnalysis.scala │ └── regression │ └── HousePriceForecast.scala ├── spark-sql-train ├── .gitignore ├── pom.xml └── src │ └── main │ ├── resources │ ├── ipDatabase.csv │ └── ipRegion.xlsx │ └── scala │ └── cn │ └── edu │ └── nju │ ├── log │ ├── AccessConvertUtil.scala │ ├── DateUtils.scala │ ├── DayCityVideoAccessStat.scala │ ├── DayVideoAccessStat.scala │ ├── DayVideoTrafficsStat.scala │ ├── IpUtils.scala │ ├── MySQLUtils.scala │ ├── SparkStatCleanJob.scala │ ├── SparkStatFormatJob.scala │ ├── StatDAO.scala │ └── TopNStatJob.scala │ └── spark │ ├── DataFrameApp.scala │ ├── DataFrameCase.scala │ ├── DataFrameRDDApp.scala │ ├── DataSetApp.scala │ ├── HiveContextApp.scala │ ├── HiveMySQLApp.scala │ ├── ParquetApp.scala │ ├── SQLContextApp.scala │ ├── SparkSQLThriftServerApp.scala │ └── SparkSessionApp.scala ├── spark-sql-visualization ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── nju │ │ ├── dao │ │ └── VideoAccessTopNDAO.java │ │ ├── domain │ │ └── VideoAccessTopN.java │ │ ├── utils │ │ └── MySQLUtils.java │ │ └── web │ │ └── VideoAccessTopNServlet.java │ └── webapp │ ├── WEB-INF │ └── web.xml │ ├── js │ ├── echarts.min.js │ └── jquery.js │ ├── test.html │ └── topn.html ├── spark-train ├── pom.xml └── src │ ├── main │ ├── java │ │ └── cn │ │ │ └── edu │ │ │ └── nju │ │ │ └── spark │ │ │ ├── StreamingWordCountApp.java │ │ │ ├── WordCountApp.java │ │ │ ├── kafkas │ │ │ ├── KafkaClientApp.java │ │ │ ├── KafkaConsumer.java │ │ │ ├── KafkaProducer.java │ │ │ └── KafkaProperties.java │ │ │ └── project │ │ │ └── utils │ │ │ └── HBaseUtils.java │ └── scala │ │ └── cn │ │ └── edu │ │ └── nju │ │ └── spark │ │ ├── FlumePullWordCount.scala │ │ ├── FlumePushWordCount.scala │ │ ├── ForeachRDDApp.scala │ │ ├── KafkaDirectWordCount.scala │ │ ├── KafkaReceiverWordCount.scala │ │ ├── KafkaStreamingApp.scala │ │ ├── StatefulWordCount.scala │ │ ├── TransformApp.scala │ │ └── project │ │ ├── dao │ │ ├── CourseClickCountDAO.scala │ │ └── CourseSearchClickCountDAO.scala │ │ ├── domain │ │ ├── ClickLog.scala │ │ ├── CourseClickCount.scala │ │ └── CourseSearchClickCount.scala │ │ ├── spark │ │ └── ImoocStatStreamingApp.scala │ │ └── utils │ │ └── DateUtils.scala │ └── test │ ├── java │ └── LoggerGenerator.java │ └── resources │ └── log4j.properties ├── 
storm-data-visualization ├── .gitignore ├── pom.xml └── src │ ├── main │ ├── java │ │ └── cn │ │ │ └── edu │ │ │ └── nju │ │ │ ├── DataVisualizationApplication.java │ │ │ ├── controller │ │ │ └── StatApp.java │ │ │ ├── domain │ │ │ └── ResultBean.java │ │ │ └── service │ │ │ └── ResultBeanService.java │ └── resources │ │ ├── application.properties │ │ ├── static │ │ └── js │ │ │ └── jquery.js │ │ └── templates │ │ └── map.html │ └── test │ └── java │ └── cn │ └── edi │ └── nju │ └── DataVisualizationApplicationTests.java ├── storm-train ├── pom.xml └── src │ └── main │ ├── java │ └── cn │ │ └── edu │ │ └── nju │ │ ├── ClusterSumAllGroupingStormTopology.java │ │ ├── ClusterSumFieldGroupingStormTopology.java │ │ ├── ClusterSumShuffleGroupingStormTopology.java │ │ ├── ClusterSumStormAckerTopology.java │ │ ├── ClusterSumStormExecutorsTopology.java │ │ ├── ClusterSumStormTasksTopology.java │ │ ├── ClusterSumStormTopology.java │ │ ├── ClusterSumStormWorkersTopology.java │ │ ├── LocalSumStormAckerTopology.java │ │ ├── LocalSumStormTopology.java │ │ ├── LocalWordCountStormTopology.java │ │ ├── drpc │ │ ├── LocalDRPCTopology.java │ │ ├── RPCClient.java │ │ ├── RPCServer.java │ │ ├── RemoteDRPCClient.java │ │ ├── RemoteDRPCTopology.java │ │ ├── UserService.java │ │ └── UserServiceImpl.java │ │ └── intergration │ │ ├── hbase │ │ └── LocalWordCountHBaseStormTopology.java │ │ ├── hdfs │ │ └── LocalWordCountHDFSStormTopology.java │ │ ├── jdbc │ │ ├── LocalWordCountJDBCStormTopology.java │ │ └── ddl.sql │ │ ├── kafka │ │ ├── DateUtils.java │ │ ├── LogProcessBolt.java │ │ └── StormKafkaTopo.java │ │ └── redis │ │ └── LocalWordCountRedisStormTopology.java │ └── resources │ └── log4j.properties └── 集群搭建.md /.gitignore: -------------------------------------------------------------------------------- 1 | /HadoopTrain/.idea 2 | /HadoopTrain/target 3 | /HadoopTrain/HadoopTrain.iml 4 | 5 | /SparkTrain/.idea 6 | /SparkTrain/target 7 | /SparkTrain/SparkTrain.iml 8 | 9 | /StormTrain/.idea 10 | /StormTrain/target 11 | /StormTrain/logs 12 | 13 | /StormTrain/StormTrain.iml 14 | /LogGenerator/.idea 15 | /DataVisualization/DataVisualization.iml 16 | /StormVisualization/DataVisualization.iml 17 | /SparkSQLVisualization/DataVisualization.iml 18 | 19 | /*/.idea 20 | /*/target 21 | 22 | *.iml 23 | *.idea 24 | target 25 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/JavaWindowWordCount.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.apache.flink.api.common.functions.FlatMapFunction; 4 | import org.apache.flink.api.java.tuple.Tuple2; 5 | import org.apache.flink.streaming.api.datastream.DataStream; 6 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 7 | import org.apache.flink.streaming.api.windowing.time.Time; 8 | import org.apache.flink.util.Collector; 9 | 10 | /** 11 | * Created by thpffcj on 2019-08-12. 
 */
public class JavaWindowWordCount {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<Tuple2<String, Integer>> dataStream = env
                .socketTextStream("localhost", 9999)
                .flatMap(new Splitter())
                .keyBy(0)
                .timeWindow(Time.seconds(5))
                .sum(1);

        dataStream.print();

        env.execute("Window WordCount");
    }

    public static final class Splitter implements FlatMapFunction<String, Tuple2<String, Integer>> {

        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            // normalize and split the line
            String[] tokens = value.toLowerCase().split("\\W+");

            // emit the pairs
            for (String token : tokens) {
                if (token.length() > 0) {
                    out.collect(new Tuple2<>(token, 1));
                }
            }
        }
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/SocketTextStreamWordCount.java: --------------------------------------------------------------------------------
package cn.edu.nju;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

/**
 * Created by thpffcj on 2019-08-04.
 */
public class SocketTextStreamWordCount {

    public static void main(String[] args) throws Exception {
//        if (args.length != 2) {
//            System.err.println("USAGE:\nSocketTextStreamWordCount <hostname> <port>");
//            return;
//        }
//        String hostName = args[0];
//        Integer port = Integer.parseInt(args[1]);

        String hostName = "127.0.0.1";
        Integer port = 9999;

        // set up the execution environment
        final StreamExecutionEnvironment env = StreamExecutionEnvironment
                .getExecutionEnvironment();

        // get input data
        DataStream<String> text = env.socketTextStream(hostName, port);

        text.flatMap(new LineSplitter()).setParallelism(1)
                // group by the tuple field "0" and sum up tuple field "1"
                .keyBy(0)
                .sum(1).setParallelism(1)
                .print();

        // execute program
        env.execute("Java WordCount from SocketTextStream Example");
    }

    /**
     * Implements the string tokenizer that splits sentences into words as a user-defined
     * FlatMapFunction. The function takes a line (String) and splits it into
     * multiple pairs in the form of "(word,1)" (Tuple2<String, Integer>).
     */
    public static final class LineSplitter implements FlatMapFunction<String, Tuple2<String, Integer>> {
        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            // normalize and split the line
            String[] tokens = value.toLowerCase().split("\\W+");
            // emit the pairs
            for (String token : tokens) {
                if (token.length() > 0) {
                    out.collect(new Tuple2<String, Integer>(token, 1));
                }
            }
        }
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course04/JavaCounterApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course04;

import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.accumulators.LongCounter;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FileSystem;

/**
 * Created by thpffcj on 2019-07-04.
 */
public class JavaCounterApp {

    public static void main(String[] args) throws Exception {

        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        DataSource<String> data = env.fromElements("hadoop", "spark", "flink", "pyspark", "storm");

        DataSet<String> info = data.map(new RichMapFunction<String, String>() {

            LongCounter counter = new LongCounter();

            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                getRuntimeContext().addAccumulator("ele-count-java", counter);
            }

            @Override
            public String map(String value) throws Exception {
                counter.add(1);
                return value;
            }
        });

        String filePath = "file:///Users/thpffcj/Public/data/sink-java-count-out";
        info.writeAsText(filePath, FileSystem.WriteMode.OVERWRITE).setParallelism(3);
        JobExecutionResult jobResult = env.execute("JavaCounterApp");
        // step3: fetch the accumulator result (must use the same name it was registered under)
        long num = jobResult.getAccumulatorResult("ele-count-java");

        System.out.println("num: " + num);
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course04/JavaDataSetDataSourceApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course04;

import org.apache.flink.api.java.ExecutionEnvironment;

import java.util.ArrayList;
import java.util.List;

/**
 * Created by thpffcj on 2019-07-02.
10 | */ 11 | public class JavaDataSetDataSourceApp { 12 | 13 | public static void main(String[] args) throws Exception { 14 | ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); 15 | // fromCollection(env); 16 | textFile(env); 17 | } 18 | 19 | public static void textFile(ExecutionEnvironment env) throws Exception { 20 | String filePath = "file:///Users/thpffcj/Public/data/hello.txt"; 21 | env.readTextFile(filePath).print(); 22 | } 23 | 24 | public static void fromCollection(ExecutionEnvironment env) throws Exception { 25 | List list = new ArrayList<>(); 26 | for (int i = 1; i <= 10; i++) { 27 | list.add(i); 28 | } 29 | env.fromCollection(list).print(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course04/JavaDataSetSinkApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04; 2 | 3 | import org.apache.flink.api.java.ExecutionEnvironment; 4 | import org.apache.flink.api.java.operators.DataSource; 5 | import org.apache.flink.core.fs.FileSystem; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | /** 10 | * Created by thpffcj on 2019-07-04. 11 | */ 12 | public class JavaDataSetSinkApp { 13 | 14 | public static void main(String[] args) throws Exception { 15 | 16 | ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); 17 | 18 | List info = new ArrayList<>(); 19 | for (int i = 1; i <= 10; i++) { 20 | info.add(i); 21 | } 22 | DataSource data = env.fromCollection(info); 23 | 24 | String filePath = "file:///Users/thpffcj/Public/data/sink-out"; 25 | 26 | data.writeAsText(filePath, FileSystem.WriteMode.OVERWRITE); 27 | 28 | env.execute("JavaDataSetSinkApp"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course04/JavaDistributedCacheApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04; 2 | 3 | import org.apache.commons.io.FileUtils; 4 | import org.apache.flink.api.common.functions.RichMapFunction; 5 | import org.apache.flink.api.java.ExecutionEnvironment; 6 | import org.apache.flink.api.java.operators.DataSource; 7 | import org.apache.flink.configuration.Configuration; 8 | 9 | import java.io.File; 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | /** 14 | * Created by thpffcj on 2019-07-04. 
15 | */ 16 | public class JavaDistributedCacheApp { 17 | 18 | public static void main(String[] args) throws Exception { 19 | 20 | ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); 21 | 22 | String filePath = "file:///Users/thpffcj/Public/data/hello.txt"; 23 | 24 | // step1:注册一个本地/HDFS文件 25 | env.registerCachedFile(filePath, "java-dc"); 26 | 27 | DataSource data = env.fromElements("hadoop", "spark", "flink", "pyspark", "storm"); 28 | 29 | data.map(new RichMapFunction() { 30 | 31 | List list = new ArrayList<>(); 32 | 33 | @Override 34 | public void open(Configuration parameters) throws Exception { 35 | File file = getRuntimeContext().getDistributedCache().getFile("java-dc"); 36 | List lines = FileUtils.readLines(file); 37 | for (String line : lines) { 38 | list.add(line); 39 | System.out.println("line = " + line); 40 | } 41 | } 42 | 43 | @Override 44 | public String map(String value) throws Exception { 45 | return value; 46 | } 47 | }).print(); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course04/Person.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04; 2 | 3 | /** 4 | * Created by thpffcj on 2019-07-02. 5 | */ 6 | public class Person { 7 | 8 | private String name; 9 | private int age; 10 | private String work; 11 | 12 | public Person() { 13 | } 14 | 15 | public String getName() { 16 | return name; 17 | } 18 | 19 | public void setName(String name) { 20 | this.name = name; 21 | } 22 | 23 | public int getAge() { 24 | return age; 25 | } 26 | 27 | public void setAge(int age) { 28 | this.age = age; 29 | } 30 | 31 | public String getWork() { 32 | return work; 33 | } 34 | 35 | public void setWork(String work) { 36 | this.work = work; 37 | } 38 | 39 | @Override 40 | public String toString() { 41 | return "Person{" + 42 | "name='" + name + '\'' + 43 | ", age=" + age + 44 | ", work='" + work + '\'' + 45 | '}'; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaCustomNonParallelSourceFunction.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05; 2 | 3 | import org.apache.flink.streaming.api.functions.source.SourceFunction; 4 | 5 | /** 6 | * Created by thpffcj on 2019-07-05. 7 | */ 8 | public class JavaCustomNonParallelSourceFunction implements SourceFunction { 9 | 10 | boolean isRunning = true; 11 | Long count = 1L; 12 | 13 | @Override 14 | public void run(SourceContext ctx) throws Exception { 15 | while (isRunning) { 16 | ctx.collect(count); 17 | count += 1; 18 | Thread.sleep(1000); 19 | } 20 | } 21 | 22 | @Override 23 | public void cancel() { 24 | isRunning = false; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaCustomParallelSourceFunction.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05; 2 | 3 | import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction; 4 | import org.apache.flink.streaming.api.functions.source.SourceFunction; 5 | 6 | /** 7 | * Created by thpffcj on 2019-07-05. 
 */
public class JavaCustomParallelSourceFunction implements ParallelSourceFunction<Long> {

    boolean isRunning = true;
    Long count = 1L;

    @Override
    public void run(SourceFunction.SourceContext<Long> ctx) throws Exception {
        while (isRunning) {
            ctx.collect(count);
            count += 1;
            Thread.sleep(1000);
        }
    }

    @Override
    public void cancel() {
        isRunning = false;
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaCustomRichParallelSourceFunction.java: --------------------------------------------------------------------------------
package cn.edu.nju.course05;

import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

/**
 * Created by thpffcj on 2019-07-05.
 */
public class JavaCustomRichParallelSourceFunction extends RichParallelSourceFunction<Long> {

    boolean isRunning = true;
    Long count = 1L;

    @Override
    public void run(SourceFunction.SourceContext<Long> ctx) throws Exception {
        while (isRunning) {
            ctx.collect(count);
            count += 1;
            Thread.sleep(1000);
        }
    }

    @Override
    public void cancel() {
        isRunning = false;
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaCustomSinkToMySQL.java: --------------------------------------------------------------------------------
package cn.edu.nju.course05;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Created by thpffcj on 2019-07-05.
 */
public class JavaCustomSinkToMySQL {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> source = env.socketTextStream("localhost", 7777);

        SingleOutputStreamOperator<Student> studentStream = source.map(new MapFunction<String, Student>() {
            @Override
            public Student map(String value) throws Exception {
                System.out.println(value);
                String[] splits = value.split(",");
                Student stu = new Student();
                stu.setId(Integer.parseInt(splits[0]));
                stu.setName(splits[1]);
                stu.setAge(Integer.parseInt(splits[2]));
                return stu;
            }
        });

        studentStream.addSink(new SinkToMySQL());

        env.execute("JavaCustomSinkToMySQL");
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaDataStreamSourceApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course05;

import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Created by thpffcj on 2019-07-04.
8 | */ 9 | public class JavaDataStreamSourceApp { 10 | 11 | public static void main(String[] args) throws Exception { 12 | 13 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 14 | 15 | // socketFunction(env); 16 | // nonParallelSourceFunction(env); 17 | // parallelSourceFunction(env); 18 | richParallelSourceFunction(env); 19 | 20 | env.execute("JavaDataStreamSourceApp"); 21 | } 22 | 23 | public static void richParallelSourceFunction(StreamExecutionEnvironment env) { 24 | DataStreamSource data = env.addSource(new JavaCustomRichParallelSourceFunction()).setParallelism(2); 25 | data.print().setParallelism(1); 26 | } 27 | 28 | public static void parallelSourceFunction(StreamExecutionEnvironment env) { 29 | DataStreamSource data = env.addSource(new JavaCustomParallelSourceFunction()).setParallelism(2); 30 | data.print().setParallelism(1); 31 | } 32 | 33 | public static void nonParallelSourceFunction(StreamExecutionEnvironment env) { 34 | DataStreamSource data = env.addSource(new JavaCustomNonParallelSourceFunction()); 35 | data.print().setParallelism(1); 36 | } 37 | 38 | public static void socketFunction(StreamExecutionEnvironment env) { 39 | DataStreamSource data = env.socketTextStream("localhost", 9999); 40 | data.print().setParallelism(1); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/JavaDataStreamTransformationApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05; 2 | 3 | import org.apache.flink.api.common.functions.FilterFunction; 4 | import org.apache.flink.api.common.functions.MapFunction; 5 | import org.apache.flink.streaming.api.collector.selector.OutputSelector; 6 | import org.apache.flink.streaming.api.datastream.DataStreamSource; 7 | import org.apache.flink.streaming.api.datastream.SplitStream; 8 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | /** 14 | * Created by thpffcj on 2019-07-05. 
15 | */ 16 | public class JavaDataStreamTransformationApp { 17 | 18 | public static void main(String[] args) throws Exception { 19 | 20 | StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); 21 | 22 | // filterFunction(env); 23 | // unionFunction(env); 24 | splitSelectFunction(env); 25 | 26 | env.execute("JavaDataStreamTransformationApp"); 27 | } 28 | 29 | public static void splitSelectFunction(StreamExecutionEnvironment env) { 30 | DataStreamSource data = env.addSource(new JavaCustomNonParallelSourceFunction()); 31 | 32 | SplitStream splits = data.split(new OutputSelector() { 33 | @Override 34 | public Iterable select(Long value) { 35 | List output = new ArrayList<>(); 36 | if (value % 2 == 0) { 37 | output.add("even"); 38 | } else { 39 | output.add("odd"); 40 | } 41 | return output; 42 | } 43 | }); 44 | 45 | splits.select("odd").print().setParallelism(1); 46 | } 47 | 48 | public static void unionFunction(StreamExecutionEnvironment env) { 49 | DataStreamSource data1 = env.addSource(new JavaCustomNonParallelSourceFunction()); 50 | DataStreamSource data2 = env.addSource(new JavaCustomNonParallelSourceFunction()); 51 | data1.union(data2).print().setParallelism(1); 52 | } 53 | 54 | public static void filterFunction(StreamExecutionEnvironment env) { 55 | DataStreamSource data = env.addSource(new JavaCustomNonParallelSourceFunction()); 56 | data.map(new MapFunction() { 57 | @Override 58 | public Long map(Long value) throws Exception { 59 | System.out.println("receive: " + value); 60 | return value; 61 | } 62 | }).filter(new FilterFunction() { 63 | @Override 64 | public boolean filter(Long value) throws Exception { 65 | return value % 2 == 0; 66 | } 67 | }).print().setParallelism(1); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/SinkToMySQL.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05; 2 | 3 | import org.apache.flink.configuration.Configuration; 4 | import org.apache.flink.streaming.api.functions.sink.RichSinkFunction; 5 | 6 | import java.sql.Connection; 7 | import java.sql.DriverManager; 8 | import java.sql.PreparedStatement; 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-05. 
12 | */ 13 | public class SinkToMySQL extends RichSinkFunction { 14 | 15 | Connection connection; 16 | PreparedStatement preparedStatement; 17 | 18 | private Connection getConnection() { 19 | Connection conn = null; 20 | try { 21 | String url = "jdbc:mysql://localhost:3306/test"; 22 | conn = DriverManager.getConnection(url, "root", "00000000"); 23 | } catch (Exception e) { 24 | e.printStackTrace(); 25 | } 26 | return conn; 27 | } 28 | 29 | @Override 30 | public void open(Configuration parameters) throws Exception { 31 | super.open(parameters); 32 | 33 | connection = getConnection(); 34 | String sql = "insert into Student(id, name, age) values (?, ?, ?)"; 35 | preparedStatement = connection.prepareStatement(sql); 36 | } 37 | 38 | // 每条记录插入时调用一次 39 | public void invoke(Student value, Context context) throws Exception { 40 | 41 | // 为前面的占位符赋值 42 | preparedStatement.setInt(1, value.getId()); 43 | preparedStatement.setString(2, value.getName()); 44 | preparedStatement.setInt(3, value.getAge()); 45 | 46 | preparedStatement.executeUpdate(); 47 | } 48 | 49 | @Override 50 | public void close() throws Exception { 51 | if(connection != null) { 52 | try { 53 | connection.close(); 54 | } catch(Exception e) { 55 | e.printStackTrace(); 56 | } 57 | connection = null; 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course05/Student.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05; 2 | 3 | /** 4 | * Created by thpffcj on 2019-07-05. 5 | */ 6 | public class Student { 7 | 8 | private int id; 9 | private String name; 10 | private int age; 11 | 12 | public int getId() { 13 | return id; 14 | } 15 | 16 | public void setId(int id) { 17 | this.id = id; 18 | } 19 | 20 | public String getName() { 21 | return name; 22 | } 23 | 24 | public void setName(String name) { 25 | this.name = name; 26 | } 27 | 28 | public int getAge() { 29 | return age; 30 | } 31 | 32 | public void setAge(int age) { 33 | this.age = age; 34 | } 35 | 36 | @Override 37 | public String toString() { 38 | return "Student{" + 39 | "id=" + id + 40 | ", name='" + name + '\'' + 41 | ", age=" + age + 42 | '}'; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course06/JavaTableSQLAPI.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course06; 2 | 3 | import org.apache.flink.api.java.DataSet; 4 | import org.apache.flink.api.java.ExecutionEnvironment; 5 | import org.apache.flink.table.api.Table; 6 | import org.apache.flink.table.api.TableEnvironment; 7 | import org.apache.flink.table.api.java.BatchTableEnvironment; 8 | import org.apache.flink.types.Row; 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-06. 
 */
public class JavaTableSQLAPI {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnv = BatchTableEnvironment.create(env);

        String filePath = "file:///Users/thpffcj/Public/data/sales.csv";
        DataSet<Sales> csv = env.readCsvFile(filePath)
                .ignoreFirstLine()
                .pojoType(Sales.class, "transactionId", "customerId", "itemId", "amountPaid");

        Table sales = tableEnv.fromDataSet(csv);
        tableEnv.registerTable("sales", sales);
        Table resultTable = tableEnv.sqlQuery("select customerId, sum(amountPaid) money from sales group by customerId");

        DataSet<Row> result = tableEnv.toDataSet(resultTable, Row.class);
        result.print();
    }

    public static class Sales {
        public String transactionId;
        public String customerId;
        public String itemId;
        public Double amountPaid;
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course07/JavaWindowsApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course07;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

/**
 * Created by thpffcj on 2019-07-06.
 */
public class JavaWindowsApp {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> text = env.socketTextStream("localhost", 9999);

        text.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] tokens = value.toLowerCase().split(",");
                for (String token : tokens) {
                    if (token.length() > 0) {
                        out.collect(new Tuple2<String, Integer>(token, 1));
                    }
                }
            }
        }).keyBy(0).timeWindow(Time.seconds(5)).sum(1).print().setParallelism(1);

        env.execute("JavaWindowsApp");
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course07/JavaWindowsProcessApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course07;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

/**
 * Created by thpffcj on 2019-07-06.
 */
public class JavaWindowsProcessApp {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> text = env.socketTextStream("localhost", 9999);

        text.flatMap(new FlatMapFunction<String, Tuple2<Integer, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<Integer, Integer>> out) throws Exception {
                String[] tokens = value.toLowerCase().split(",");
                for (String token : tokens) {
                    if (token.length() > 0) {
                        out.collect(new Tuple2<Integer, Integer>(1, Integer.parseInt(token)));
                    }
                }
            }
        }).keyBy(0)
                .timeWindow(Time.seconds(5))
                .process(new ProcessWindowFunction<Tuple2<Integer, Integer>, Object, Tuple, TimeWindow>() {
                    @Override
                    public void process(Tuple tuple, Context context, Iterable<Tuple2<Integer, Integer>> elements, Collector<Object> out) throws Exception {
                        System.out.println("----------");
                        long count = 0;
                        for (Tuple2<Integer, Integer> in : elements) {
                            count++;
                        }
                        out.collect("Window: " + context.window() + "count: " + count);
                    }
                }).print().setParallelism(1);

        env.execute("JavaWindowsProcessApp");
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/course07/JavaWindowsReduceApp.java: --------------------------------------------------------------------------------
package cn.edu.nju.course07;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

/**
 * Created by thpffcj on 2019-07-06.
 */
public class JavaWindowsReduceApp {

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> text = env.socketTextStream("localhost", 9999);

        text.flatMap(new FlatMapFunction<String, Tuple2<Integer, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<Integer, Integer>> out) throws Exception {
                String[] tokens = value.toLowerCase().split(",");
                for (String token : tokens) {
                    if (token.length() > 0) {
                        out.collect(new Tuple2<Integer, Integer>(1, Integer.parseInt(token)));
                    }
                }
            }
        }).keyBy(0)
                .timeWindow(Time.seconds(5))
                .reduce(new ReduceFunction<Tuple2<Integer, Integer>>() {
                    @Override
                    public Tuple2<Integer, Integer> reduce(Tuple2<Integer, Integer> value1, Tuple2<Integer, Integer> value2) throws Exception {
                        System.out.println("value1 = [" + value1 + "], value2 = [" + value2 + "]");
                        return new Tuple2<>(value1.f0, value1.f1 + value2.f1);
                    }
                }).print().setParallelism(1);

        env.execute("JavaWindowsReduceApp");
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/java/cn/edu/nju/hotItem/UserBehavior.java: --------------------------------------------------------------------------------
package cn.edu.nju.hotItem;

/**
 * Created by thpffcj on 2019-08-14.
5 | */ 6 | 7 | /** 用户行为数据结构 **/ 8 | public class UserBehavior { 9 | 10 | public long userId; // 用户 ID 11 | public long itemId; // 商品 ID 12 | public int categoryId; // 商品类目 ID 13 | public String behavior; // 用户行为, 包括("pv", "buy", "cart", "fav") 14 | public long timestamp; // 行为发生的时间戳,单位秒 15 | } 16 | -------------------------------------------------------------------------------- /flink-train/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | log4j.rootLogger=INFO, console 20 | 21 | log4j.appender.console=org.apache.log4j.ConsoleAppender 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 24 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/BatchJob.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package cn.edu.nju 20 | 21 | import org.apache.flink.api.scala._ 22 | 23 | /** 24 | * Skeleton for a Flink Batch Job. 25 | * 26 | * For a tutorial how to write a Flink batch application, check the 27 | * tutorials and examples on the Flink Website. 28 | * 29 | * To package your application into a JAR file for execution, 30 | * change the main class in the POM.xml file to this class (simply search for 'mainClass') 31 | * and run 'mvn clean package' on the command line. 
32 | */ 33 | object BatchJob { 34 | 35 | def main(args: Array[String]) { 36 | // set up the batch execution environment 37 | val env = ExecutionEnvironment.getExecutionEnvironment 38 | 39 | /* 40 | * Here, you can start creating your execution plan for Flink. 41 | * 42 | * Start with getting some data from the environment, like 43 | * env.readTextFile(textPath); 44 | * 45 | * then, transform the resulting DataSet[String] using operations 46 | * like 47 | * .filter() 48 | * .flatMap() 49 | * .join() 50 | * .group() 51 | * 52 | * and many more. 53 | * Have a look at the programming guide: 54 | * 55 | * http://flink.apache.org/docs/latest/apis/batch/index.html 56 | * 57 | * and the examples 58 | * 59 | * http://flink.apache.org/docs/latest/apis/batch/examples.html 60 | * 61 | */ 62 | 63 | // execute program 64 | env.execute("Flink Batch Scala API Skeleton") 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/BatchWCScalaApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.flink.api.scala.ExecutionEnvironment 4 | 5 | /** 6 | * 使用Scala开发Flink的批处理应用程序 7 | * Created by thpffcj on 2019-06-28. 8 | */ 9 | object BatchWCScalaApp { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val input = "file:///Users/thpffcj/Public/file/hello.txt" 14 | 15 | val env = ExecutionEnvironment.getExecutionEnvironment 16 | 17 | val text = env.readTextFile(input) 18 | 19 | // 引入隐式转换 20 | import org.apache.flink.api.scala._ 21 | 22 | text.flatMap(_.toLowerCase.split("\t")) 23 | .filter(_.nonEmpty) 24 | .map((_, 1)) 25 | .groupBy(0) 26 | .sum(1).print() 27 | } 28 | 29 | /** 30 | * hadoop welcome 31 | * hadoop hdfs mapreduce 32 | * hadoop hdfs 33 | * 34 | * hadoop 35 | * hdfs 36 | * hadoop 37 | * welcome 38 | * hadoop 39 | * hdfs 40 | * mapreduce 41 | * 42 | * hadoop 43 | * hdfs 44 | * hadoop 45 | * welcome 46 | * hadoop 47 | * hdfs 48 | * mapreduce 49 | * 50 | * (hadoop,1) 51 | * (hdfs,1) 52 | * (mapreduce,1) 53 | * (hadoop,1) 54 | * (welcome,1) 55 | * (hadoop,1) 56 | * (hdfs,1) 57 | * 58 | * (hdfs,2) 59 | * (hadoop,3) 60 | * (mapreduce,1) 61 | * (welcome,1) 62 | */ 63 | } 64 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/StreamingJob.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package cn.edu.nju 20 | 21 | import org.apache.flink.streaming.api.scala._ 22 | 23 | /** 24 | * Skeleton for a Flink Streaming Job. 
25 | * 26 | * For a tutorial how to write a Flink streaming application, check the 27 | * tutorials and examples on the Flink Website. 28 | * 29 | * To package your application into a JAR file for execution, run 30 | * 'mvn clean package' on the command line. 31 | * 32 | * If you change the name of the main class (with the public static void main(String[] args)) 33 | * method, change the respective entry in the POM.xml file (simply search for 'mainClass'). 34 | */ 35 | object StreamingJob { 36 | def main(args: Array[String]) { 37 | // set up the streaming execution environment 38 | val env = StreamExecutionEnvironment.getExecutionEnvironment 39 | 40 | /* 41 | * Here, you can start creating your execution plan for Flink. 42 | * 43 | * Start with getting some data from the environment, like 44 | * env.readTextFile(textPath); 45 | * 46 | * then, transform the resulting DataStream[String] using operations 47 | * like 48 | * .filter() 49 | * .flatMap() 50 | * .join() 51 | * .group() 52 | * 53 | * and many more. 54 | * Have a look at the programming guide: 55 | * 56 | * http://flink.apache.org/docs/latest/apis/streaming/index.html 57 | * 58 | */ 59 | 60 | // execute program 61 | env.execute("Flink Streaming Scala API Skeleton") 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/StreamingWCScalaApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.streaming.api.windowing.time.Time 5 | 6 | /** 7 | * 使用Scala开发Flink的实时处理应用程序 8 | * Created by thpffcj on 2019-06-29. 9 | */ 10 | object StreamingWCScalaApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val env = StreamExecutionEnvironment.getExecutionEnvironment 15 | 16 | val text = env.socketTextStream("localhost", 9999) 17 | 18 | import org.apache.flink.api.scala._ 19 | 20 | // text.flatMap(_.split(",")) 21 | // .filter(_.nonEmpty) 22 | // .map((_, 1)) 23 | // .keyBy(0) 24 | // .timeWindow(Time.seconds(5)) 25 | // .sum(1).print() 26 | 27 | text.flatMap(_.split(",")) 28 | .filter(_.nonEmpty) 29 | .map(x => WC(x, 1)) 30 | .keyBy("word") 31 | .timeWindow(Time.seconds(5)) 32 | .sum("count").print() 33 | 34 | env.execute("StreamingWCScalaApp") 35 | } 36 | 37 | case class WC(word: String, count:Int) 38 | } 39 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/WindowWordCount.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.apache.flink.api.common.functions.FlatMapFunction; 4 | import org.apache.flink.api.java.tuple.Tuple2; 5 | import org.apache.flink.api.java.utils.ParameterTool; 6 | import org.apache.flink.streaming.api.datastream.DataStream; 7 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; 8 | import org.apache.flink.util.Collector; 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-28. 
 */
public class WindowWordCount {

    public static void main(String[] args) throws Exception {

        final ParameterTool params = ParameterTool.fromArgs(args);

        // set up the execution environment
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // get input data
        DataStream<String> text = env.readTextFile(params.get("input")).setParallelism(2);

        // make parameters available in the web interface
        env.getConfig().setGlobalJobParameters(params);

        final int windowSize = params.getInt("window", 10);
        final int slideSize = params.getInt("slide", 5);

        DataStream<Tuple2<String, Integer>> counts =
                // split up the lines in pairs (2-tuples) containing: (word,1)
                text.flatMap(new Tokenizer()).setParallelism(4).slotSharingGroup("flatMap_sg")
                        // create windows of windowSize records slided every slideSize records
                        .keyBy(0)
                        .countWindow(windowSize, slideSize)
                        // group by the tuple field "0" and sum up tuple field "1"
                        .sum(1).setParallelism(3).slotSharingGroup("sum_sg");

        // emit result
        counts.print().setParallelism(3);

        // execute program
        env.execute("WindowWordCount");
    }

    public static final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {

        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            // normalize and split the line
            String[] tokens = value.toLowerCase().split("\\W+");

            // emit the pairs
            for (String token : tokens) {
                if (token.length() > 0) {
                    out.collect(new Tuple2<>(token, 1));
                }
            }
        }
    }
}
-------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course04/CounterApp.scala: --------------------------------------------------------------------------------
package cn.edu.nju.course04

import org.apache.flink.api.common.accumulators.LongCounter
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.configuration.Configuration
import org.apache.flink.core.fs.FileSystem.WriteMode
import org.apache.flink.api.scala._

/**
 * Three steps for developing a counter (accumulator) with Flink:
 * step1: define the counter
 * step2: register the counter
 * step3: fetch the counter result
 * Created by thpffcj on 2019-07-04.
16 | */ 17 | object CounterApp { 18 | 19 | def main(args: Array[String]): Unit = { 20 | 21 | val env = ExecutionEnvironment.getExecutionEnvironment 22 | 23 | val data = env.fromElements("hadoop", "spark", "flink", "pyspark", "storm") 24 | 25 | // data.map(new RichMapFunction[String, Long] { 26 | // var counter = 0l 27 | // override def map(value: String): Long = { 28 | // counter = counter + 1 29 | // println("counter : " + counter) 30 | // counter 31 | // } 32 | // }).setParallelism(5).print() 33 | 34 | val info = data.map(new RichMapFunction[String, String] { 35 | 36 | // step1:定义计数器 37 | var counter = new LongCounter() 38 | 39 | override def open(parameters: Configuration): Unit = { 40 | // step2:注册计数器 41 | getRuntimeContext.addAccumulator("ele-counts-scala", counter) 42 | } 43 | 44 | override def map(value: String): String = { 45 | counter.add(1) 46 | value 47 | } 48 | }).setParallelism(5) 49 | 50 | val filePath = "file:///Users/thpffcj/Public/data/sink-scala-count-out" 51 | info.writeAsText(filePath, WriteMode.OVERWRITE) 52 | val jobResult = env.execute("CounterApp") 53 | // step3:获取计数器 54 | val num = jobResult.getAccumulatorResult[Long]("ele-counts-scala") 55 | 56 | println("num: " + num) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course04/DBUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04 2 | 3 | import scala.util.Random 4 | 5 | /** 6 | * Created by thpffcj on 2019-07-02. 7 | */ 8 | object DBUtils { 9 | 10 | def getConnection()= { 11 | new Random().nextInt(10) + "" 12 | } 13 | 14 | def returnConnection(connection: String): Unit = { 15 | 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course04/DataSetDataSourceApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04 2 | 3 | import org.apache.flink.api.scala.ExecutionEnvironment 4 | import org.apache.flink.configuration.Configuration 5 | 6 | /** 7 | * Created by thpffcj on 2019-07-02. 
8 | */ 9 | object DataSetDataSourceApp { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val env = ExecutionEnvironment.getExecutionEnvironment 14 | 15 | // fromCollection(env) 16 | // textFile(env) 17 | // csvFile(env) 18 | // readRecursiveFiles(env) 19 | readCompressionFiles(env) 20 | } 21 | 22 | def readCompressionFiles(env: ExecutionEnvironment): Unit = { 23 | val filePath = "file:///Users/thpffcj/Public/data/compression" 24 | env.readTextFile(filePath).print() 25 | } 26 | 27 | def readRecursiveFiles(env: ExecutionEnvironment): Unit = { 28 | val filePath = "file:///Users/thpffcj/Public/data/nested" 29 | val parameters = new Configuration() 30 | parameters.setBoolean("recursive.file.enumeration", true) 31 | env.readTextFile(filePath).withParameters(parameters).print() 32 | } 33 | 34 | case class MyCaseClass(name:String, age:Int) 35 | 36 | def csvFile(env: ExecutionEnvironment): Unit = { 37 | 38 | import org.apache.flink.api.scala._ 39 | val filePath = "file:///Users/thpffcj/Public/data/people.csv" 40 | 41 | env.readCsvFile[(String, Int, String)](filePath, ignoreFirstLine = true).print() 42 | 43 | env.readCsvFile[(String, Int)](filePath, ignoreFirstLine = true, includedFields = Array(0, 1)).print() 44 | 45 | env.readCsvFile[MyCaseClass](filePath, ignoreFirstLine = true, includedFields = Array(0, 1)).print() 46 | 47 | env.readCsvFile[Person](filePath, ignoreFirstLine = true, pojoFields = Array("name", "age", "work")).print() 48 | } 49 | 50 | def textFile(env: ExecutionEnvironment): Unit = { 51 | val filePath = "file:///Users/thpffcj/Public/data/hello.txt" 52 | env.readTextFile(filePath).print() 53 | } 54 | 55 | def fromCollection(env: ExecutionEnvironment): Unit = { 56 | 57 | import org.apache.flink.api.scala._ 58 | val data = 1 to 10 59 | env.fromCollection(data).print() 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course04/DataSetSinkApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04 2 | 3 | import org.apache.flink.api.scala.ExecutionEnvironment 4 | import org.apache.flink.core.fs.FileSystem.WriteMode 5 | import org.apache.flink.api.scala._ 6 | 7 | /** 8 | * Created by thpffcj on 2019-07-04. 9 | */ 10 | object DataSetSinkApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val env = ExecutionEnvironment.getExecutionEnvironment 15 | 16 | val data = 1.to(10) 17 | val text = env.fromCollection(data) 18 | 19 | val filePath = "file:///Users/thpffcj/Public/data/sink-out" 20 | 21 | text.writeAsText(filePath, WriteMode.OVERWRITE).setParallelism(2) 22 | 23 | env.execute("DataSetSinkApp") 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course04/DistributedCacheApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course04 2 | 3 | import org.apache.commons.io.FileUtils 4 | import org.apache.flink.api.common.functions.RichMapFunction 5 | import org.apache.flink.api.scala.ExecutionEnvironment 6 | import org.apache.flink.configuration.Configuration 7 | import org.apache.flink.api.scala._ 8 | 9 | /** 10 | * step1:注册一个本地/HDFS文件 11 | * step2:在open方法中获取到分布式缓存的内容即可 12 | * Created by thpffcj on 2019-07-04. 
13 | */ 14 | object DistributedCacheApp { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val env = ExecutionEnvironment.getExecutionEnvironment 19 | 20 | val filePath = "file:///Users/thpffcj/Public/data/hello.txt" 21 | 22 | // step1:注册一个本地/HDFS文件 23 | env.registerCachedFile(filePath, "scala-dc") 24 | 25 | val data = env.fromElements("hadoop", "spark", "flink", "pyspark", "storm") 26 | 27 | data.map(new RichMapFunction[String, String] { 28 | 29 | // step2:在open方法中获取到分布式缓存的内容即可 30 | override def open(parameters: Configuration): Unit = { 31 | val dcFile = getRuntimeContext.getDistributedCache().getFile("scala-dc") 32 | val lines = FileUtils.readLines(dcFile) 33 | 34 | /** 35 | * 此时会出现一个异常,Java集合和Scala集合不兼容的问题 36 | */ 37 | import scala.collection.JavaConverters._ 38 | for (ele <- lines.asScala) { 39 | println(ele) 40 | } 41 | } 42 | 43 | override def map(value: String): String = { 44 | value 45 | } 46 | }).print() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course05/CustomNonParallelSourceFunction.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05 2 | 3 | import org.apache.flink.streaming.api.functions.source.SourceFunction 4 | 5 | /** 6 | * Created by thpffcj on 2019-07-05. 7 | */ 8 | class CustomNonParallelSourceFunction extends SourceFunction[Long]{ 9 | 10 | var count = 1L 11 | 12 | var isRunning = true 13 | 14 | override def run(ctx: SourceFunction.SourceContext[Long]): Unit = { 15 | while (isRunning) { 16 | ctx.collect(count) 17 | count += 1 18 | Thread.sleep(1000) 19 | } 20 | } 21 | 22 | override def cancel(): Unit = { 23 | isRunning = false 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course05/CustomParallelSourceFunction.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05 2 | 3 | import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction} 4 | 5 | /** 6 | * Created by thpffcj on 2019-07-05. 7 | */ 8 | class CustomParallelSourceFunction extends ParallelSourceFunction[Long] { 9 | 10 | var count = 1l 11 | var isRunning = true 12 | 13 | override def run(ctx: SourceFunction.SourceContext[Long]): Unit = { 14 | while (isRunning) { 15 | ctx.collect(count) 16 | count += 1 17 | Thread.sleep(1000) 18 | } 19 | } 20 | 21 | override def cancel(): Unit = { 22 | isRunning = false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course05/CustomRichParallelSourceFunction.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05 2 | 3 | import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction} 4 | 5 | /** 6 | * Created by thpffcj on 2019-07-05. 
7 | */ 8 | class CustomRichParallelSourceFunction extends RichParallelSourceFunction[Long] { 9 | 10 | var count = 1l 11 | var isRunning = true 12 | 13 | override def run(ctx: SourceFunction.SourceContext[Long]): Unit = { 14 | while (isRunning) { 15 | ctx.collect(count) 16 | count += 1 17 | Thread.sleep(1000) 18 | } 19 | } 20 | 21 | override def cancel(): Unit = { 22 | isRunning = false 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course05/DataStreamSourceApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | 6 | /** 7 | * Created by thpffcj on 2019-07-04. 8 | */ 9 | object DataStreamSourceApp { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val env = StreamExecutionEnvironment.getExecutionEnvironment 14 | // socketFunction(env) 15 | 16 | // nonParallelSourceFunction(env) 17 | // parallelSourceFunction(env) 18 | richParallelSourceFunction(env) 19 | 20 | env.execute("DataStreamSourceApp") 21 | } 22 | 23 | def richParallelSourceFunction(env: StreamExecutionEnvironment): Unit = { 24 | val data = env.addSource(new CustomRichParallelSourceFunction).setParallelism(2) 25 | data.print() 26 | } 27 | 28 | def parallelSourceFunction(env: StreamExecutionEnvironment): Unit = { 29 | val data = env.addSource(new CustomParallelSourceFunction).setParallelism(2) 30 | data.print() 31 | } 32 | 33 | def nonParallelSourceFunction(env: StreamExecutionEnvironment): Unit = { 34 | val data = env.addSource(new CustomNonParallelSourceFunction) 35 | data.print() 36 | } 37 | 38 | def socketFunction(env: StreamExecutionEnvironment): Unit = { 39 | 40 | val data = env.socketTextStream("localhost", 9999) 41 | data.print() 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course05/DataStreamTransformationApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course05 2 | 3 | import java.{lang, util} 4 | 5 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 6 | import org.apache.flink.api.scala._ 7 | import org.apache.flink.streaming.api.TimeCharacteristic 8 | import org.apache.flink.streaming.api.collector.selector.OutputSelector 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-05. 
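 * Demonstrates basic DataStream transformations: filter, union and split/select. For example, the
 * split/select pair used below tags each element as "even" or "odd" and then consumes only one side
 * (sketch of the code in splitSelectFunction):
 *
 *   val splits = data.split(selector)   // selector returns "even" or "odd" for each element
 *   splits.select("even").print()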
12 | */ 13 | object DataStreamTransformationApp { 14 | 15 | def main(args: Array[String]): Unit = { 16 | val env = StreamExecutionEnvironment.getExecutionEnvironment 17 | 18 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 19 | 20 | // filterFunction(env) 21 | // unionFunction(env) 22 | splitSelectFunction(env) 23 | 24 | env.execute("DataStreamTransformationApp") 25 | } 26 | 27 | def splitSelectFunction(env: StreamExecutionEnvironment): Unit = { 28 | val data = env.addSource(new CustomNonParallelSourceFunction) 29 | 30 | val splits = data.split(new OutputSelector[Long] { 31 | override def select(value: Long): lang.Iterable[String] = { 32 | val list = new util.ArrayList[String]() 33 | if (value % 2 == 0) { 34 | list.add("even") 35 | } else { 36 | list.add("odd") 37 | } 38 | list 39 | } 40 | }) 41 | 42 | splits.select("even").print().setParallelism(1) 43 | } 44 | 45 | def unionFunction(env: StreamExecutionEnvironment): Unit = { 46 | val data1 = env.addSource(new CustomNonParallelSourceFunction) 47 | val data2 = env.addSource(new CustomNonParallelSourceFunction) 48 | data1.union(data2).print().setParallelism(1) 49 | } 50 | 51 | def filterFunction(env: StreamExecutionEnvironment): Unit = { 52 | val data = env.addSource(new CustomNonParallelSourceFunction) 53 | 54 | data.map(x =>{ 55 | println("received: " + x) 56 | x 57 | }).filter(_%2 == 0).print().setParallelism(1) 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course06/TableSQLAPI.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course06 2 | 3 | import org.apache.flink.api.scala.ExecutionEnvironment 4 | import org.apache.flink.table.api.TableEnvironment 5 | import org.apache.flink.api.scala._ 6 | import org.apache.flink.types.Row 7 | 8 | /** 9 | * Created by thpffcj on 2019-07-06. 10 | */ 11 | object TableSQLAPI { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val env = ExecutionEnvironment.getExecutionEnvironment 16 | val tableEnv = TableEnvironment.getTableEnvironment(env) 17 | 18 | val filePath = "file:///Users/thpffcj/Public/data/sales.csv" 19 | // 已经拿到DataSet 20 | val csv = env.readCsvFile[SalesLog](filePath, ignoreFirstLine = true) 21 | 22 | // DataSet => Table 23 | val salesTable = tableEnv.fromDataSet(csv) 24 | // Table => table 25 | tableEnv.registerTable("sales", salesTable) 26 | 27 | // sql 28 | val resultTable = tableEnv.sqlQuery("select customerId, sum(amountPaid) money from sales group by customerId") 29 | 30 | tableEnv.toDataSet[Row](resultTable).print() 31 | } 32 | 33 | case class SalesLog(transactionId: String, customerId: String, itemId: String, amountPaid: Double) 34 | } 35 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course07/WindowsApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course07 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | 7 | /** 8 | * Created by thpffcj on 2019-07-06. 
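 * Word count over socket input with time windows. timeWindow(Time.seconds(5)) is a 5-second tumbling
 * window (the commented-out variant below), while timeWindow(Time.seconds(10), Time.seconds(5)) is a
 * 10-second window sliding every 5 seconds, so each element is counted in two overlapping windows:
 *
 *   .keyBy(0).timeWindow(Time.seconds(10), Time.seconds(5)).sum(1)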
9 | */ 10 | object WindowsApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val env= StreamExecutionEnvironment.getExecutionEnvironment 15 | 16 | val text = env.socketTextStream("localhost", 9999) 17 | 18 | // text.flatMap(_.split(",")) 19 | // .map((_, 1)) 20 | // .keyBy(0) 21 | // .timeWindow(Time.seconds(5)) 22 | // .sum(1).print().setParallelism(1) 23 | 24 | text.flatMap(_.split(",")) 25 | .map((_, 1)) 26 | .keyBy(0) 27 | .timeWindow(Time.seconds(10), Time.seconds(5)) 28 | .sum(1).print().setParallelism(1) 29 | 30 | 31 | env.execute("WindowsApp") 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course07/WindowsProcessApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course07 2 | 3 | import org.apache.flink.api.java.tuple.Tuple 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 6 | import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction 7 | import org.apache.flink.streaming.api.windowing.time.Time 8 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 9 | import org.apache.flink.util.Collector 10 | 11 | /** 12 | * Created by thpffcj on 2019-07-06. 13 | */ 14 | object WindowsProcessApp { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val env = StreamExecutionEnvironment.getExecutionEnvironment 19 | 20 | val text = env.socketTextStream("localhost", 9999) 21 | 22 | text.flatMap(_.split(",")) 23 | .map(x => (1, x.toInt)) 24 | .keyBy(0) 25 | .timeWindow(Time.seconds(5)) 26 | .process(new MyProcessWindowFunction()) 27 | .print().setParallelism(1) 28 | 29 | env.execute("WindowsReduceApp") 30 | } 31 | 32 | class MyProcessWindowFunction extends ProcessWindowFunction[(Int, Int), String, Tuple, TimeWindow] { 33 | 34 | def process(key: Tuple, context: Context, input: Iterable[(Int, Int)], out: Collector[String]): Unit = { 35 | var count = 0L 36 | for (in <- input) { 37 | count = count + 1 38 | } 39 | out.collect(s"Window ${context.window} count: $count") 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course07/WindowsReduceApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course07 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | import org.apache.flink.streaming.api.windowing.time.Time 6 | 7 | /** 8 | * Created by thpffcj on 2019-07-06. 9 | */ 10 | object WindowsReduceApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val env = StreamExecutionEnvironment.getExecutionEnvironment 15 | 16 | val text = env.socketTextStream("localhost", 9999) 17 | 18 | // 原来传递进来的是字符串,此处我们就使用数值类型,通过数值类型来演示增量的效果 19 | text.flatMap(_.split(",")) 20 | .map(x => (1, x.toInt)) // 1,2,3,4,5 => (1,1)(1,2)(1,3)(1,4)(1,5) 21 | .keyBy(0) // 因为key都是1,所以所有的元素都要一个task去执行 22 | .timeWindow(Time.seconds(5)) 23 | .reduce((v1, v2) => { // 不是等待窗口所有的数据进行一次性处理,而是数据两两处理 24 | println(v1 + "..." 
+ v2) 25 | (v1._1, v1._2 + v2._2) 26 | }).print().setParallelism(1) 27 | 28 | env.execute("WindowsReduceApp") 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course08/FileSystemSinkApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course08 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.streaming.connectors.fs.{SequenceFileWriter, StringWriter} 5 | import org.apache.flink.streaming.connectors.fs.bucketing.{BucketingSink, DateTimeBucketer} 6 | 7 | /** 8 | * Created by thpffcj on 2019-07-07. 9 | */ 10 | object FileSystemSinkApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val env = StreamExecutionEnvironment.getExecutionEnvironment 15 | 16 | val data = env.socketTextStream("localhost", 9999) 17 | 18 | val filePath = "file:///Users/thpffcj/Public/data" 19 | val sink = new BucketingSink[String](filePath) 20 | sink.setBucketer(new DateTimeBucketer("yyyy-MM-dd--HHmm")) 21 | sink.setWriter(new StringWriter[String]()) 22 | sink.setBatchSize(1024 * 1024 * 400) // this is 400 MB, 23 | sink.setBatchRolloverInterval(20 * 60 * 1000); // this is 20 mins 24 | 25 | data.addSink(sink) 26 | env.execute("FileSystemSinkApp") 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course08/KafkaConnectorConsumerApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course08 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.flink.api.common.serialization.SimpleStringSchema 6 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 7 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer 8 | import org.apache.flink.api.scala._ 9 | import org.apache.flink.streaming.api.CheckpointingMode 10 | 11 | /** 12 | * Created by thpffcj on 2019-07-07. 
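 * Consumes a Kafka topic with checkpointing enabled (EXACTLY_ONCE mode, 4s interval, 10s timeout,
 * at most one concurrent checkpoint). The consumer itself only needs the bootstrap servers and a
 * group id, as in the code below:
 *
 *   properties.setProperty("bootstrap.servers", "localhost:9092")
 *   properties.setProperty("group.id", "test")
 *   env.addSource(new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), properties))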
13 | */ 14 | object KafkaConnectorConsumerApp { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val env = StreamExecutionEnvironment.getExecutionEnvironment 19 | 20 | // checkpoint常用设置参数 21 | env.enableCheckpointing(4000) 22 | env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE) 23 | env.getCheckpointConfig.setCheckpointTimeout(10000) 24 | env.getCheckpointConfig.setMaxConcurrentCheckpoints(1) 25 | 26 | val topic = "test" 27 | val properties = new Properties() 28 | properties.setProperty("bootstrap.servers", "localhost:9092") 29 | properties.setProperty("group.id", "test") 30 | val data = env.addSource(new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), properties)) 31 | 32 | data.print() 33 | 34 | env.execute("KafkaConnectorConsumerApp") 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/course08/KafkaConnectorProducerApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.course08 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.flink.api.common.serialization.SimpleStringSchema 6 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 7 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer 8 | import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-07. 12 | */ 13 | object KafkaConnectorProducerApp { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val env = StreamExecutionEnvironment.getExecutionEnvironment 18 | 19 | // 从socket接收数据,通过Flink,将数据sink到Kafka 20 | val data = env.socketTextStream("localhost", 9999) 21 | 22 | val topic = "test" 23 | val properties = new Properties() 24 | properties.setProperty("bootstrap.servers", "localhost:9092") 25 | 26 | // val kafkaSink = new FlinkKafkaProducer[String](topic, 27 | // new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()), properties) 28 | 29 | val kafkaSink = new FlinkKafkaProducer[String](topic, 30 | new KeyedSerializationSchemaWrapper[String](new SimpleStringSchema()), 31 | properties, 32 | FlinkKafkaProducer.Semantic.EXACTLY_ONCE) 33 | 34 | data.addSink(kafkaSink) 35 | 36 | env.execute("KafkaConnectorProducerApp") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/project/MyMySQLSource.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.project 2 | 3 | import java.sql.{Connection, DriverManager, PreparedStatement} 4 | 5 | import org.apache.flink.configuration.Configuration 6 | import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction} 7 | 8 | import scala.collection.mutable 9 | 10 | /** 11 | * Created by thpffcj on 2019-07-11. 
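 * Custom source that loads the user_domain_config table into a mutable.HashMap (domain -> userId):
 * open() creates the JDBC connection and PreparedStatement, run() executes the query and emits the
 * map through ctx.collect, and close() releases the resources. Usage sketch (see MyMySQLSourceTest below):
 *
 *   val data = env.addSource(new MyMySQLSource)
 *   data.print()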
12 | */ 13 | class MyMySQLSource extends RichParallelSourceFunction[mutable.HashMap[String, String]] { 14 | 15 | var connection:Connection = null 16 | var ps:PreparedStatement = null 17 | 18 | // open:建立连接 19 | override def open(parameters: Configuration): Unit = { 20 | 21 | val url = "jdbc:mysql://localhost:3306/test" 22 | val user = "root" 23 | val password = "00000000" 24 | connection = DriverManager.getConnection(url, user, password) 25 | 26 | val sql = "select user_id, domain from user_domain_config" 27 | ps = connection.prepareStatement(sql) 28 | } 29 | 30 | // 释放资源 31 | override def close(): Unit = { 32 | if (ps != null) { 33 | ps.close() 34 | } 35 | 36 | if (connection != null) { 37 | connection.close() 38 | } 39 | } 40 | 41 | /** 42 | * 此处是代码的关键:要从MySQL表中把数据读取出来转成Map进行数据的封装 43 | * @param ctx 44 | */ 45 | override def run(ctx: SourceFunction.SourceContext[mutable.HashMap[String, String]]): Unit = { 46 | 47 | val resultMap = new mutable.HashMap[String, String]() 48 | 49 | val result = ps.executeQuery() 50 | while (result.next()) { 51 | val userId = result.getString(1) 52 | val domain = result.getString(2) 53 | resultMap.put(domain, userId) 54 | } 55 | ctx.collect(resultMap) 56 | } 57 | 58 | override def cancel(): Unit = {} 59 | } 60 | -------------------------------------------------------------------------------- /flink-train/src/main/scala/cn/edu/nju/project/MyMySQLSourceTest.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.project 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.api.scala._ 5 | 6 | /** 7 | * Created by thpffcj on 2019-07-11. 8 | */ 9 | object MyMySQLSourceTest { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val env = StreamExecutionEnvironment.getExecutionEnvironment 14 | 15 | val data = env.addSource(new MyMySQLSource) 16 | data.print() 17 | 18 | env.execute("MyMySQLSourceTest") 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /hadoop-train/src/main/java/cn/edu/nju/hadoop/mapreduce/sort/GlobalSortPartitioner.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.hadoop.mapreduce.sort; 2 | 3 | import org.apache.hadoop.conf.Configurable; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.apache.hadoop.io.LongWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Partitioner; 8 | 9 | /** 10 | * Created by thpffcj on 2020/2/5. 11 | *

12 | * 让MapReduce产生一个全局排序的文件: 13 | *

14 | * 1. 最简单的方法是只使用一个分区(partition),这种在处理小规模文件时还行。但是在处理大型文件是效率极低,所有的数据都发送到一 15 | * 个Reduce进行排序,这样不能充分利用集群的计算资源,而且在数据量很大的情况下,很有可能会出现OOM问题。 16 | * 2. 首先创建一系列排好序的文件,其次串联这些文件,最后生成一个全局排序的文件。它主要的思路使用一个partitioner来描述输出的全 17 | * 局排序。该方案的重点在于分区方法,默认情况下根据hash值进行分区(默认的分区函数是HashPartitioner,其实现的原理是计算map输 18 | * 出key的 hashCode ,然后对Reduce个数 求余,余数相同的 key 都会发送到同一个Reduce);还可以根据用户自定义partitioner 19 | * (自定义一个类并且继承partitioner类,重写器getpartition方法) 20 | */ 21 | class GlobalSortPartitioner extends Partitioner implements Configurable { 22 | 23 | private Configuration configuration = null; 24 | private int indexRange = 0; 25 | 26 | public int getPartition(Text text, LongWritable longWritable, int numPartitions) { 27 | // 假如取值范围等于26的话,那么就意味着只需要根据第一个字母来划分索引 28 | int index = 0; 29 | if (indexRange == 26) { 30 | index = text.toString().toCharArray()[0] - 'a'; 31 | } else if (indexRange == 26 * 26) { 32 | //这里就是需要根据前两个字母进行划分索引了 33 | char[] chars = text.toString().toCharArray(); 34 | if (chars.length == 1) { 35 | index = (chars[0] - 'a') * 26; 36 | } 37 | index = (chars[0] - 'a') * 26 + (chars[1] - 'a'); 38 | } 39 | int perReducerCount = indexRange / numPartitions; 40 | if (indexRange < numPartitions) { 41 | return numPartitions; 42 | } 43 | 44 | for (int i = 0; i < numPartitions; i++) { 45 | int min = i * perReducerCount; 46 | int max = (i + 1) * perReducerCount - 1; 47 | if (index >= min && index <= max) { 48 | return i; 49 | } 50 | } 51 | //这里我们采用的是第一种不太科学的方法 52 | return numPartitions - 1; 53 | } 54 | 55 | public void setConf(Configuration conf) { 56 | this.configuration = conf; 57 | indexRange = configuration.getInt("key.indexRange", 26 * 26); 58 | } 59 | 60 | public Configuration getConf() { 61 | return configuration; 62 | } 63 | } -------------------------------------------------------------------------------- /hadoop-train/src/main/java/cn/edu/nju/hadoop/mapreduce/sort/IntPair.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.hadoop.mapreduce.sort; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.WritableComparable; 8 | 9 | /** 10 | * Created by thpffcj on 2020/2/12. 11 | * 12 | * 自定义key排序 13 | * 在mr中,所有的key是需要被比较和排序的,并且是二次,先根据partitioner,再根据大小。而本例中也是要比较两次。 14 | * 先按照第一字段排序,然后再对第一字段相同的按照第二字段排序。 15 | * 根据这一点,我们可以构造一个复合类IntPair,他有两个字段,先利用分区对第一字段排序,再利用分区内的比较对第二字段排序 16 | */ 17 | public class IntPair implements WritableComparable { 18 | 19 | int first; 20 | int second; 21 | 22 | public IntPair(){ 23 | } 24 | 25 | public IntPair(int first, int second){ 26 | this.first = first; 27 | this.second = second; 28 | } 29 | 30 | public int getFirst() { 31 | return first; 32 | } 33 | 34 | public int getSecond() { 35 | return second; 36 | } 37 | 38 | // 反序列化,从流中读进二进制转换成IntPair 39 | @Override 40 | public void readFields(DataInput in) throws IOException { 41 | this.first = in.readInt(); 42 | this.second = in.readInt(); 43 | } 44 | 45 | // 序列化,将IntPair转换成二进制输出 46 | @Override 47 | public void write(DataOutput out) throws IOException { 48 | out.writeInt(first); 49 | out.writeInt(second); 50 | } 51 | 52 | /* 53 | * 为什么要重写equal方法? 
54 | * 因为Object的equal方法默认是两个对象的引用的比较,意思就是指向同一内存,地址则相等,否则不相等; 55 | * 如果你现在需要利用对象里面的值来判断是否相等,则重载equal方法。 56 | */ 57 | @Override 58 | public boolean equals(Object obj) { 59 | if (obj == null) { 60 | return false; 61 | } 62 | if (this == obj) { 63 | return true; 64 | } 65 | if (obj instanceof IntPair) { 66 | IntPair r = (IntPair) obj; 67 | return r.first == first && r.second == second; 68 | } else { 69 | return false; 70 | } 71 | } 72 | 73 | /* 74 | * 重写equal 的同时为什么必须重写hashcode? 75 | * hashCode是编译器为不同对象产生的不同整数,根据equal方法的定义:如果两个对象是相等(equal)的,那么两个对象 76 | * 调用 hashCode必须产生相同的整数结果 77 | * 即:equal为true,hashCode必须为true,equal为false,hashCode也必须 为false,所以必须重写hashCode来保证 78 | * 与equal同步。 79 | */ 80 | @Override 81 | public int hashCode() { 82 | return first * 157 + second; 83 | } 84 | 85 | // 实现key的比较 86 | @Override 87 | public int compareTo(IntPair o) { 88 | if (first != o.first) { 89 | return first < o.first ? -1 : 1; 90 | } else if (second != o.second) { 91 | return second < o.second ? -1 : 1; 92 | } else { 93 | return 0; 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /hadoop-train/src/main/java/cn/edu/nju/hadoop/mapreduce/topk/IPTimes.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.hadoop.mapreduce.topk; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.io.WritableComparable; 6 | 7 | import java.io.DataInput; 8 | import java.io.DataOutput; 9 | import java.io.IOException; 10 | 11 | /** 12 | * Created by thpffcj on 2020/2/5. 13 | */ 14 | public class IPTimes implements WritableComparable { 15 | 16 | // IP 17 | private Text ip; 18 | // IP对应出现的次数 19 | private IntWritable count; 20 | 21 | // 无参构造函数(一定要有,反射机制会出错,另外要对定义的变量进行初始化否则会报空指针异常) 22 | public IPTimes() { 23 | this.ip = new Text(""); 24 | this.count = new IntWritable(1); 25 | } 26 | 27 | // 有参构造函数 28 | public IPTimes(Text ip, IntWritable count) { 29 | this.ip = ip; 30 | this.count = count; 31 | } 32 | 33 | // 反序列化 34 | public void readFields(DataInput in) throws IOException { 35 | ip.readFields(in); 36 | count.readFields(in); 37 | } 38 | 39 | // 序列化 40 | public void write(DataOutput out) throws IOException { 41 | ip.write(out); 42 | count.write(out); 43 | } 44 | 45 | public Text getIp() { 46 | return ip; 47 | } 48 | 49 | public void setIp(Text ip) { 50 | this.ip = ip; 51 | } 52 | 53 | public IntWritable getCount() { 54 | return count; 55 | } 56 | 57 | public void setCount(IntWritable count) { 58 | this.count = count; 59 | } 60 | 61 | // 这个方法是二次排序的关键 62 | public int compareTo(Object o) { 63 | // 强转 64 | IPTimes ipAndCount = (IPTimes) o; 65 | // 对第二列的count进行比较 66 | long minus = this.getCount().compareTo(ipAndCount.getCount()); 67 | // 第二列不相同时降序排列 68 | if (minus != 0) { 69 | return ipAndCount.getCount().compareTo(this.count); 70 | } else { // 第二列相同时第一列升序排列 71 | return this.ip.compareTo(ipAndCount.getIp()); 72 | } 73 | } 74 | 75 | // hashCode和equals()方法 76 | public int hashCode() { 77 | return ip.hashCode(); 78 | } 79 | 80 | public boolean equals(Object o) { 81 | if (!(o instanceof IPTimes)) { 82 | return false; 83 | } 84 | IPTimes other = (IPTimes) o; 85 | return ip.equals(other.ip) && count.equals(other.count); 86 | } 87 | 88 | // 重写toString()方法 89 | public String toString() { 90 | return this.ip + "\t" + this.count; 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /hadoop-train/src/resources/application.properties: 
-------------------------------------------------------------------------------- 1 | spring.hadoop.fsUri = hdfs://192.168.92.130:8020 -------------------------------------------------------------------------------- /hadoop-train/src/resources/beans.xml: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | 11 | 12 | 13 | fs.defaultFS=${spring.hadoop.fsUri} 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /hadoop-train/src/resources/log.txt: -------------------------------------------------------------------------------- 1 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:07:18 +0800] "GET /index.html HTTP/1.1" 304 0 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 2 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:07:21 +0800] "GET /index.html HTTP/1.1" 304 0 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 3 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:07:22 +0800] "GET /index.html HTTP/1.1" 304 0 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 4 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:07:46 +0800] "GET /index.html HTTP/1.1" 403 169 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 5 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:08:13 +0800] "GET /index.html HTTP/1.1" 200 612 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 6 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:13:44 +0800] "GET /index.html HTTP/1.1" 304 0 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 7 | Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0192.168.242.129 - - [27/Dec/2017:23:13:45 +0800] "GET /index.html HTTP/1.1" 304 0 "-" "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0" "-" 8 | -------------------------------------------------------------------------------- /hadoop-train/src/test/java/cn/edu/nju/hadoop/spring/SpringBootHDFSApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.hadoop.spring; 2 | 3 | import org.apache.hadoop.fs.FileStatus; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | import org.springframework.boot.CommandLineRunner; 6 | import org.springframework.boot.SpringApplication; 7 | import org.springframework.boot.autoconfigure.SpringBootApplication; 8 | import org.springframework.data.hadoop.fs.FsShell; 9 | 10 | /** 11 | * Created by Thpffcj on 2018/1/9. 12 | */ 13 | @SpringBootApplication 14 | public class SpringBootHDFSApp implements CommandLineRunner { 15 | 16 | @Autowired 17 | FsShell fsShell; 18 | 19 | @Override 20 | public void run(String... 
strings) throws Exception { 21 | for (FileStatus fileStatus : fsShell.lsr("/springhdfs")) { 22 | System.out.println(">" + fileStatus.getPath()); 23 | } 24 | } 25 | 26 | public static void main(String[] args) { 27 | SpringApplication.run(SpringBootHDFSApp.class, args); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /hadoop-train/src/test/java/cn/edu/nju/hadoop/spring/SpringHadoopHDFSApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.hadoop.spring; 2 | 3 | import org.apache.hadoop.fs.FSDataInputStream; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.IOUtils; 7 | import org.junit.After; 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | import org.springframework.context.ApplicationContext; 11 | import org.springframework.context.support.ClassPathXmlApplicationContext; 12 | 13 | import java.io.IOException; 14 | 15 | /** 16 | * Created by Thpffcj on 2018/1/8. 17 | */ 18 | public class SpringHadoopHDFSApp { 19 | 20 | private ApplicationContext context; 21 | private FileSystem fileSystem; 22 | 23 | /** 24 | * 创建HDFS文件夹 25 | * @throws Exception 26 | */ 27 | @Test 28 | public void testMkdirs() throws Exception { 29 | fileSystem.mkdirs(new Path("/springhdfs/")); 30 | } 31 | 32 | /** 33 | * 读取HDFS文件内容 34 | * @throws Exception 35 | */ 36 | @Test 37 | public void testText() throws Exception { 38 | FSDataInputStream inputStream = fileSystem.open(new Path("/springhdfs/hello.txt")); 39 | IOUtils.copyBytes(inputStream, System.out, 1024); 40 | inputStream.close(); 41 | } 42 | 43 | @Before 44 | public void setUp() { 45 | context = new ClassPathXmlApplicationContext("beans.xml"); 46 | fileSystem = (FileSystem) context.getBean("fileSystem"); 47 | } 48 | 49 | @After 50 | public void tearDown() throws IOException { 51 | context = null; 52 | fileSystem = null; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /hbase-train/hbase-api-test/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | hbase-train 7 | cn.edu.nju 8 | 1.0-SNAPSHOT 9 | 10 | 4.0.0 11 | 12 | hbase-api-test 13 | 14 | 15 | 16 | org.apache.hbase 17 | hbase-client 18 | 1.2.0 19 | 20 | 21 | junit 22 | junit 23 | 4.12 24 | test 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /hbase-train/hbase-api-test/src/main/java/cn/edu/nju/HBaseConn.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.hbase.HBaseConfiguration; 5 | import org.apache.hadoop.hbase.TableName; 6 | import org.apache.hadoop.hbase.client.Connection; 7 | import org.apache.hadoop.hbase.client.ConnectionFactory; 8 | import org.apache.hadoop.hbase.client.Table; 9 | 10 | 11 | import java.io.IOException; 12 | 13 | /** 14 | * Created by thpffcj on 2019-04-05. 
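 * Lazily initialised singleton around the HBase Connection (Zookeeper quorum localhost:2181).
 * Typical usage, as exercised in HBaseConnTest below:
 *
 *   Connection conn = HBaseConn.getHBaseConn();
 *   Table table = HBaseConn.getTable("US_POPULATION");
 *   HBaseConn.closeConn();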
15 | */ 16 | public class HBaseConn { 17 | 18 | private static final HBaseConn INSTANCE = new HBaseConn(); 19 | private static Configuration configuration; 20 | private static Connection connection; 21 | 22 | private HBaseConn() { 23 | try { 24 | if (configuration == null) { 25 | configuration = HBaseConfiguration.create(); 26 | configuration.set("hbase.zookeeper.quorum", "localhost:2181"); 27 | } 28 | } catch (Exception e) { 29 | e.printStackTrace(); 30 | } 31 | } 32 | 33 | private Connection getConnection() { 34 | if (connection == null || connection.isClosed()) { 35 | try { 36 | connection = ConnectionFactory.createConnection(configuration); 37 | } catch (Exception e) { 38 | e.printStackTrace(); 39 | } 40 | } 41 | return connection; 42 | } 43 | 44 | public static Connection getHBaseConn() { 45 | return INSTANCE.getConnection(); 46 | } 47 | 48 | public static Table getTable(String tableName) throws IOException { 49 | return INSTANCE.getConnection().getTable(TableName.valueOf(tableName)); 50 | } 51 | 52 | public static void closeConn() { 53 | if (connection != null) { 54 | try { 55 | connection.close(); 56 | } catch (IOException ioe) { 57 | ioe.printStackTrace(); 58 | } 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /hbase-train/hbase-api-test/src/test/java/cn/edu/nju/HBaseConnTest.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.apache.hadoop.hbase.client.Connection; 4 | import org.apache.hadoop.hbase.client.Table; 5 | import org.junit.Test; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * Created by thpffcj on 2019-04-05. 11 | */ 12 | public class HBaseConnTest { 13 | 14 | @Test 15 | public void getConnTest() { 16 | Connection conn = HBaseConn.getHBaseConn(); 17 | System.out.println(conn.isClosed()); 18 | HBaseConn.closeConn(); 19 | System.out.println(conn.isClosed()); 20 | } 21 | 22 | @Test 23 | public void getTableTest() { 24 | try { 25 | Table table = HBaseConn.getTable("US_POPULATION"); 26 | System.out.println(table.getName().getNameAsString()); 27 | table.close(); 28 | } catch (IOException ioe) { 29 | ioe.printStackTrace(); 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /hbase-train/hbase-api-test/src/test/java/cn/edu/nju/HBaseUtilTest.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.apache.hadoop.hbase.client.Result; 4 | import org.apache.hadoop.hbase.client.ResultScanner; 5 | import org.apache.hadoop.hbase.util.Bytes; 6 | import org.junit.Test; 7 | 8 | /** 9 | * Created by thpffcj on 2019-04-06. 
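 * Exercises the HBaseUtil helpers against a "FileTable" table with the column families "fileInfo"
 * and "saveInfo", e.g.:
 *
 *   HBaseUtil.createTable("FileTable", new String[]{"fileInfo", "saveInfo"});
 *   HBaseUtil.putRow("FileTable", "rowkey1", "fileInfo", "name", "file1.txt");
 *   Result result = HBaseUtil.getRow("FileTable", "rowkey1");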
10 | */ 11 | public class HBaseUtilTest { 12 | 13 | @Test 14 | public void createTable() { 15 | HBaseUtil.createTable("FileTable", new String[]{"fileInfo", "saveInfo"}); 16 | } 17 | 18 | @Test 19 | public void addFileDetails() { 20 | HBaseUtil.putRow("FileTable", "rowkey1", "fileInfo", "name", "file1.txt"); 21 | HBaseUtil.putRow("FileTable", "rowkey1", "fileInfo", "type", "txt"); 22 | HBaseUtil.putRow("FileTable", "rowkey1", "fileInfo", "size", "1024"); 23 | HBaseUtil.putRow("FileTable", "rowkey1", "saveInfo", "creator", "thpffcj"); 24 | HBaseUtil.putRow("FileTable", "rowkey2", "fileInfo", "name", "file2.jpg"); 25 | HBaseUtil.putRow("FileTable", "rowkey2", "fileInfo", "type", "jpg"); 26 | HBaseUtil.putRow("FileTable", "rowkey2", "fileInfo", "size", "1024"); 27 | HBaseUtil.putRow("FileTable", "rowkey2", "saveInfo", "creator", "thpffcj"); 28 | 29 | } 30 | 31 | @Test 32 | public void getFileDetails() { 33 | Result result = HBaseUtil.getRow("FileTable", "rowkey1"); 34 | if (result != null) { 35 | System.out.println("rowkey=" + Bytes.toString(result.getRow())); 36 | System.out.println("fileName=" + Bytes 37 | .toString(result.getValue(Bytes.toBytes("fileInfo"), Bytes.toBytes("name")))); 38 | } 39 | } 40 | 41 | @Test 42 | public void scanFileDetails() { 43 | ResultScanner scanner = HBaseUtil.getScanner("FileTable", "rowkey2", "rowkey2"); 44 | if (scanner != null) { 45 | scanner.forEach(result -> { 46 | System.out.println("rowkey=" + Bytes.toString(result.getRow())); 47 | System.out.println("fileName=" + Bytes 48 | .toString(result.getValue(Bytes.toBytes("fileInfo"), Bytes.toBytes("name")))); 49 | }); 50 | scanner.close(); 51 | } 52 | } 53 | 54 | @Test 55 | public void deleteRow() { 56 | HBaseUtil.deleteRow("FileTable", "rowkey1"); 57 | } 58 | 59 | @Test 60 | public void deleteTable() { 61 | HBaseUtil.deleteTable("FileTable"); 62 | } 63 | } -------------------------------------------------------------------------------- /hbase-train/hbase-endpoint-test/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | hbase-train 6 | cn.edu.nju 7 | 1.0-SNAPSHOT 8 | 9 | 4.0.0 10 | 11 | hbase-endpoint-test 12 | 13 | 14 | 15 | org.apache.hbase 16 | hbase-server 17 | 1.2.0 18 | 19 | 20 | org.apache.hbase 21 | hbase-common 22 | 1.2.0 23 | 24 | 25 | com.google.protobuf 26 | protobuf-java 27 | 2.5.0 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /hbase-train/hbase-endpoint-test/src/main/proto/RowCountTest.proto: -------------------------------------------------------------------------------- 1 | syntax = "proto2"; 2 | 3 | option java_package = "cn.edu.nju"; 4 | 5 | option java_outer_classname = "GetRowCount"; 6 | option java_generic_services = true; 7 | option optimize_for = SPEED; 8 | 9 | message getRowCountRequest{ 10 | 11 | } 12 | 13 | message getRowCountResponse { 14 | optional int64 rowCount = 1; 15 | } 16 | 17 | 18 | service hbaseEndPointTestService { 19 | rpc getRowCount(getRowCountRequest) 20 | returns(getRowCountResponse); 21 | } -------------------------------------------------------------------------------- /hbase-train/hbase-observer-test/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | hbase-train 6 | cn.edu.nju 7 | 1.0-SNAPSHOT 8 | 9 | 4.0.0 10 | 11 | hbase-observer-test 12 | 13 | 14 | 15 | org.apache.hbase 16 | hbase-common 17 | 1.2.0 18 | 19 | 20 | org.apache.hbase 21 | hbase-server 22 | 1.2.0 23 | 24 | 25 | 26 | 
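The RowCountTest.proto above is compiled with protobuf 2.5 (the version declared in the hbase-endpoint-test pom); given java_package = "cn.edu.nju" and java_outer_classname = "GetRowCount", the generated outer class is cn.edu.nju.GetRowCount. A typical generation command, assuming protoc 2.5.x is on the PATH:

  protoc --proto_path=src/main/proto --java_out=src/main/java src/main/proto/RowCountTest.proto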
-------------------------------------------------------------------------------- /hbase-train/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 4.0.0 6 | 7 | cn.edu.nju 8 | hbase-train 9 | pom 10 | 1.0-SNAPSHOT 11 | 12 | 13 | UTF-8 14 | 1.8 15 | 1.8 16 | 17 | 18 | 19 | hbase-api-test 20 | hbase-observer-test 21 | hbase-endpoint-test 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /hbase-train/src/main/java/cn/edu/nju/App.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | /** 4 | * Hello world! 5 | * 6 | */ 7 | public class App 8 | { 9 | public static void main( String[] args ) 10 | { 11 | System.out.println( "Hello World!" ); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /hbase-train/src/test/java/cn/edu/nju/AppTest.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import static org.junit.Assert.assertTrue; 4 | 5 | import org.junit.Test; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class AppTest 11 | { 12 | /** 13 | * Rigorous Test :-) 14 | */ 15 | @Test 16 | public void shouldAnswerWithTrue() 17 | { 18 | assertTrue( true ); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /log-generator/generate_log.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | __author__ = 'Thpffcj' 3 | 4 | import random 5 | import time 6 | 7 | url_path = [ 8 | "class/112.html", 9 | "class/128.html", 10 | "class/145.html", 11 | "class/146.html", 12 | "class/130.html", 13 | "learn/821", 14 | "course/list" 15 | ] 16 | 17 | ip_slices = [132, 156, 124, 10, 29, 143, 187, 30, 46, 55, 63, 72, 98, 168] 18 | 19 | http_referers = [ 20 | "http://www.baidu.com/s?wd={query}", 21 | "http://www.sogou.com/?web={query}", 22 | "http://cn.bing.com/search?q={query}", 23 | "https://search.yahoo.com/search?p={query}", 24 | ] 25 | 26 | search_keyword = [ 27 | "Spark SQL实战", 28 | "Hadoop基础", 29 | "Storm实战", 30 | "Spark Streaming实战", 31 | "大数据面试" 32 | ] 33 | 34 | status_code = ["200", "404", "500"] 35 | 36 | 37 | def sample_url(): 38 | return random.sample(url_path, 1)[0] 39 | 40 | 41 | def sample_ip(): 42 | slice = random.sample(ip_slices, 4) 43 | return ".".join([str(item) for item in slice]) 44 | 45 | 46 | def sample_referrer(): 47 | if random.uniform(0, 1) > 0.2: 48 | return "-" 49 | 50 | refer_str = random.sample(http_referers, 1) 51 | query_str = random.sample(search_keyword, 1) 52 | return refer_str[0].format(query=query_str[0]) 53 | 54 | 55 | def sample_status_code(): 56 | return random.sample(status_code, 1)[0] 57 | 58 | 59 | def generate_log(count=10): 60 | time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 61 | 62 | f = open("/home/thpffcj/data/project/logs/access.log", "a") 63 | 64 | while count >= 1: 65 | query_log = "{ip}\t{local_time}\t\"GET /{url} HTTP/1.1\"\t{status_code}\t{refer}".format( 66 | ip=sample_ip(), local_time=time_str, url=sample_url(), status_code=sample_status_code(), 67 | refer=sample_referrer()) 68 | print(query_log) 69 | 70 | f.write(query_log + "\n") 71 | 72 | count = count - 1 73 | 74 | 75 | if __name__ == '__main__': 76 | generate_log() 77 | -------------------------------------------------------------------------------- /log-generator/message.py: 
-------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | __author__ = 'Thpffcj' 3 | 4 | import random 5 | import time 6 | 7 | infos = [ 8 | "116.397026,39.918058", 9 | "116.410886,39.881949", 10 | "116.272876,39.99243", 11 | "116.544079,40.417555", 12 | "116.225404,40.258186", 13 | "116.38631,39.937209", 14 | "116.399466,39.989743" 15 | ] 16 | 17 | phones = [ 18 | "13888888888", "13877777777", "13866666666", 19 | "13988888888", "13977777777", "13966666666", 20 | "13788888888", "13777777777", "13766666666", 21 | "13688888888", "13677777777", "13666666666", 22 | ] 23 | 24 | def sample_phone(): 25 | return random.sample(phones, 1)[0] 26 | 27 | 28 | def sample_info(): 29 | return random.sample(infos, 1)[0] 30 | 31 | 32 | def generate_log(count = 3): 33 | time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 34 | f = open("/home/thpffcj/data/logs/access.log", "a+") 35 | while count >= 1: 36 | query_log = "{phone}\t{info}\t[{local_time}]".format(phone = sample_phone(), 37 | info = sample_info(), local_time = time_str) 38 | # print(query_log) 39 | f.write(query_log + "\n") 40 | count = count - 1 41 | 42 | 43 | if __name__ == '__main__': 44 | generate_log(10) -------------------------------------------------------------------------------- /log-generator/message2.py: -------------------------------------------------------------------------------- 1 | # _*_ coding: utf-8 _*_ 2 | __author__ = 'Thpffcj' 3 | 4 | import random 5 | import time 6 | 7 | url_path = [ 8 | "http://www.imooc.com/video/8701", 9 | "http://www.imooc.com/video/8702", 10 | "http://www.imooc.com/video/8703", 11 | "http://www.imooc.com/article/8701", 12 | "http://www.imooc.com/article/8704", 13 | "http://www.imooc.com/article/8705", 14 | "http://www.imooc.com/video/8709" 15 | ] 16 | 17 | ip_slices = [132, 156, 124, 10, 29, 143, 187, 30, 46, 55, 63, 72, 98, 168] 18 | 19 | 20 | def sample_traffic(): 21 | return random.randint(0, 100) 22 | 23 | 24 | def sample_url(): 25 | return random.sample(url_path, 1)[0] 26 | 27 | 28 | def sample_ip(): 29 | slice = random.sample(ip_slices, 4) 30 | return ".".join([str(item) for item in slice]) 31 | 32 | 33 | def generate_log(count=1000): 34 | time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 35 | while count >= 1: 36 | query_log = "{local_time}\t{url}\t{traffic}\t{ip}".format(local_time=time_str, url=sample_url(), traffic=sample_traffic(), ip=sample_ip()) 37 | print(query_log) 38 | count = count - 1 39 | 40 | 41 | if __name__ == '__main__': 42 | generate_log() -------------------------------------------------------------------------------- /pyspark/project/spark.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/10/19. 
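# Reads the Beijing hourly PM2.5 CSVs for 2015-2017, maps each Value to an air-quality grade with a
# UDF, and prints each grade's share per year. The core pattern (sketch of the code below):
#
#   grade_function_udf = udf(get_grade, StringType())
#   group2017 = data2017.withColumn("Grade", grade_function_udf(data2017["Value"])).groupBy("Grade").count()
#   group2017.select("Grade", "count", group2017["count"] / data2017.count()).show()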
3 | from pyspark.sql import SparkSession 4 | from pyspark.sql.types import * 5 | from pyspark.sql.functions import udf 6 | 7 | if __name__ == '__main__': 8 | spark = SparkSession.builder.appName("project").getOrCreate() 9 | 10 | data2015 = spark.read.format("csv")\ 11 | .option("header", "true")\ 12 | .option("inferSchema", "true")\ 13 | .load("file:///Users/thpffcj/Public/file/Beijing_2015_HourlyPM25_created20160201.csv")\ 14 | .select("Year", "Month", "Day", "Hour", "Value", "QC Name") 15 | 16 | data2016 = spark.read.format("csv")\ 17 | .option("header", "true")\ 18 | .option("inferSchema", "true")\ 19 | .load("file:///Users/thpffcj/Public/file/Beijing_2016_HourlyPM25_created20170201.csv")\ 20 | .select("Year", "Month", "Day", "Hour", "Value", "QC Name") 21 | 22 | data2017 = spark.read.format("csv")\ 23 | .option("header", "true")\ 24 | .option("inferSchema", "true")\ 25 | .load("file:///Users/thpffcj/Public/file/Beijing_2017_HourlyPM25_created20170803.csv")\ 26 | .select("Year", "Month", "Day", "Hour", "Value", "QC Name") 27 | 28 | 29 | def get_grade(value): 30 | if value <= 50 and value >= 0: 31 | return "健康" 32 | elif value <= 100: 33 | return "中等" 34 | elif value <= 150: 35 | return "对敏感人群不健康" 36 | elif value <= 200: 37 | return "不健康" 38 | elif value <= 300: 39 | return "非常不健康" 40 | elif value <= 500: 41 | return "危险" 42 | elif value > 500: 43 | return "爆表" 44 | else: 45 | return None 46 | 47 | grade_function_udf = udf(get_grade, StringType()) 48 | 49 | # 进来一个Value,出去一个Grade 50 | group2017 = data2017.withColumn("Grade", grade_function_udf(data2017['Value'])).groupBy("Grade").count() 51 | group2016 = data2016.withColumn("Grade", grade_function_udf(data2016['Value'])).groupBy("Grade").count() 52 | group2015 = data2015.withColumn("Grade", grade_function_udf(data2015['Value'])).groupBy("Grade").count() 53 | 54 | group2017.select("Grade", "count", group2017['count'] / data2017.count()).show() 55 | group2016.select("Grade", "count", group2016['count'] / data2016.count()).show() 56 | group2015.select("Grade", "count", group2015['count'] / data2015.count()).show() 57 | 58 | spark.stop() 59 | -------------------------------------------------------------------------------- /pyspark/project/steam.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/10/21. 
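# Buckets Steam "play" records for Dota 2 and Team Fortress 2 into play-time ranges with a UDF,
# computes each range's percentage, and writes both the raw and aggregated DataFrames to
# Elasticsearch. The write pattern (sketch of the code below; the ES node address is the one
# hard-coded in this script):
#
#   df.write.format("org.elasticsearch.spark.sql") \
#       .option("es.nodes", "172.19.170.131:9200").mode("overwrite").save("dota2/data")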
3 | from pyspark.sql import SparkSession 4 | from pyspark.sql.types import * 5 | from pyspark.sql.functions import udf 6 | 7 | if __name__ == '__main__': 8 | 9 | spark = SparkSession.builder.appName("steam").getOrCreate() 10 | 11 | steam_data = spark.read.format("csv")\ 12 | .option("header", "true")\ 13 | .option("inferSchema", "true")\ 14 | .load("/data/steam.csv")\ 15 | .select("userId", "gameName", "behavior", "duration") 16 | 17 | # 200000 18 | # print(steam_data.count()) 19 | 20 | def get_time(value): 21 | if value <= 10 and value >= 0: 22 | return "0 ~ 10小时" 23 | elif value <= 50: 24 | return "10 ~ 50小时" 25 | elif value <= 100: 26 | return "51 ~ 100小时" 27 | elif value <= 200: 28 | return "101 ~ 200小时" 29 | elif value <= 300: 30 | return "201 ~ 300小时" 31 | elif value <= 500: 32 | return "301 ~ 500小时" 33 | elif value > 500: 34 | return "大于500小时" 35 | else: 36 | return None 37 | 38 | grade_function_udf = udf(get_time, StringType()) 39 | 40 | dota2_data = steam_data.filter(steam_data["behavior"] == "play").filter(steam_data["gameName"] == "Dota 2")\ 41 | .withColumn("range", grade_function_udf(steam_data['duration'])) 42 | dota2_group = dota2_data.groupBy("range").count() 43 | dota2_result = dota2_group.select("range", "count")\ 44 | .withColumn("percent", dota2_group['count'] / dota2_data.count() * 100) 45 | 46 | team_fortress2_data = steam_data.filter(steam_data["behavior"] == "play").filter(steam_data["gameName"] == "Team Fortress 2")\ 47 | .withColumn("range", grade_function_udf(steam_data['duration'])) 48 | team_fortress2_group = team_fortress2_data.groupBy("range").count() 49 | team_fortress2_result = team_fortress2_group.select("range", "count")\ 50 | .withColumn("percent", team_fortress2_group['count'] / team_fortress2_data.count() * 100) 51 | 52 | # team_fortress2_result.show() 53 | 54 | dota2_data.write.format("org.elasticsearch.spark.sql").option("es.nodes", "172.19.170.131:9200").mode( 55 | "overwrite").save("dota2/data") 56 | 57 | team_fortress2_data.write.format("org.elasticsearch.spark.sql").option("es.nodes", "172.19.170.131:9200").mode( 58 | "overwrite").save("team_fortress2/data") 59 | 60 | dota2_result.write.format("org.elasticsearch.spark.sql").option("es.nodes", "172.19.170.131:9200").mode( 61 | "overwrite").save("aggregation_dota2/aggregation") 62 | 63 | team_fortress2_result.write.format("org.elasticsearch.spark.sql").option("es.nodes", "172.19.170.131:9200").mode( 64 | "overwrite").save("aggregation_fortress2/aggregation") 65 | 66 | 67 | spark.stop() 68 | -------------------------------------------------------------------------------- /pyspark/project/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # Created by thpffcj on 2019/11/18. 
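# Counts the number of valid orderings of the given precedence pairs with a bitmask DP:
# dp[S] = number of orderings of the node subset S, and node i may be appended to S once its
# prerequisite mask pre[i] is fully contained in S, i.e.
#
#   if (s & pre[i]) == pre[i] and not (s & (1 << i)):
#       dp[s | (1 << i)] += dp[s]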
3 | 4 | def inputPre(): 5 | global name, pre, m, n 6 | for i in range(m): 7 | v = 0 8 | u = 0 9 | while u < n: 10 | if p[i][0] == name[u]: 11 | break 12 | else: 13 | u += 1 14 | if u == n: 15 | name.append(p[i][0]) 16 | n += 1 17 | while v < n: 18 | if p[i][1] == name[v]: 19 | break 20 | else: 21 | v += 1 22 | if v == n: 23 | name.append(p[i][1]) 24 | n += 1 25 | pre[v] |= (1 << u) 26 | 27 | 28 | def solve(): 29 | global dp, n 30 | dp[0] = 1 31 | for s in range(1 << n): 32 | if dp[s] != 0: 33 | for i in range(n): 34 | if ((s & pre[i]) == pre[i]) and not (s & (1 << i)): 35 | dp[s | (1 << i)] += dp[s] 36 | print(dp[(1 << n) - 1]) 37 | 38 | 39 | if __name__ == '__main__': 40 | N = int(input()) 41 | for k in range(N): 42 | pairs = list(map(str, input().split(","))) # 起点终点对集合 43 | m = len(pairs) 44 | n = 0 45 | p = [] # 存储起点终点对 46 | for i in range(m): 47 | pair = pairs[i].split() 48 | p.append(pair) 49 | name = [] 50 | size = 13 51 | pre = [0 for i in range(size)] 52 | dp = [0 for i in range(1 << size)] 53 | inputPre() 54 | solve() -------------------------------------------------------------------------------- /spark-data-visualization/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | 4 | ### STS ### 5 | .apt_generated 6 | .classpath 7 | .factorypath 8 | .project 9 | .settings 10 | .springBeans 11 | 12 | ### IntelliJ IDEA ### 13 | .idea 14 | *.iws 15 | *.iml 16 | *.ipr 17 | 18 | ### NetBeans ### 19 | nbproject/private/ 20 | build/ 21 | nbbuild/ 22 | dist/ 23 | nbdist/ 24 | .nb-gradle/ -------------------------------------------------------------------------------- /spark-data-visualization/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | cn.edu.nju 7 | DataVisualization 8 | 0.0.1 9 | jar 10 | 11 | spark-data-visualization 12 | Demo project for Spring Boot 13 | 14 | 15 | org.springframework.boot 16 | spring-boot-starter-parent 17 | 1.5.8.RELEASE 18 | 19 | 20 | 21 | 22 | UTF-8 23 | UTF-8 24 | 1.8 25 | 26 | 27 | 28 | 29 | cloudera 30 | https://repository.cloudera.com/artifactory/cloudera-repos/ 31 | 32 | 33 | 34 | 35 | 36 | org.springframework.boot 37 | spring-boot-starter-web 38 | 39 | 40 | 41 | org.springframework.boot 42 | spring-boot-starter-test 43 | test 44 | 45 | 46 | 47 | org.springframework.boot 48 | spring-boot-starter-thymeleaf 49 | 50 | 51 | 52 | org.apache.hbase 53 | hbase-client 54 | 1.2.0-cdh5.7.0 55 | 56 | 57 | 58 | net.sf.json-lib 59 | json-lib 60 | 2.4 61 | jdk15 62 | 63 | 64 | 65 | 66 | 67 | 68 | org.springframework.boot 69 | spring-boot-maven-plugin 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /spark-data-visualization/src/main/java/cn/edu/nju/DataVisualizationApplication.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | @SpringBootApplication 7 | public class DataVisualizationApplication { 8 | 9 | public static void main(String[] args) { 10 | SpringApplication.run(DataVisualizationApplication.class, args); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /spark-data-visualization/src/main/java/cn/edu/nju/dao/CourseClickCountDAO.java: 
-------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao; 2 | 3 | import cn.edu.nju.domain.CourseClickCount; 4 | import cn.edu.nju.utils.HBaseUtils; 5 | import org.springframework.stereotype.Component; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | /** 12 | * Created by Thpffcj on 2018/1/15. 13 | * 实战课程访问数量数据访问层 14 | */ 15 | @Component 16 | public class CourseClickCountDAO { 17 | 18 | /** 19 | * 根据天查询 20 | */ 21 | public List query(String day) throws Exception { 22 | 23 | List list = new ArrayList<>(); 24 | 25 | // 去HBase表中根据day获取实战课程对应的访问量 26 | Map map = HBaseUtils.getInstance().query("imooc_course_clickcount", day); 27 | 28 | for(Map.Entry entry: map.entrySet()) { 29 | CourseClickCount model = new CourseClickCount(); 30 | model.setName(entry.getKey()); 31 | model.setValue(entry.getValue()); 32 | 33 | list.add(model); 34 | } 35 | 36 | return list; 37 | } 38 | 39 | public static void main(String[] args) throws Exception{ 40 | CourseClickCountDAO dao = new CourseClickCountDAO(); 41 | List list = dao.query("20180115"); 42 | for(CourseClickCount model : list) { 43 | System.out.println(model.getName() + " : " + model.getValue()); 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /spark-data-visualization/src/main/java/cn/edu/nju/domain/CourseClickCount.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain; 2 | 3 | import org.springframework.stereotype.Component; 4 | 5 | /** 6 | * Created by Thpffcj on 2018/1/15. 7 | * 实战课程访问数量实体类 8 | */ 9 | @Component 10 | public class CourseClickCount { 11 | 12 | private String name; 13 | private long value; 14 | 15 | public String getName() { 16 | return name; 17 | } 18 | 19 | public void setName(String name) { 20 | this.name = name; 21 | } 22 | 23 | public long getValue() { 24 | return value; 25 | } 26 | 27 | public void setValue(long value) { 28 | this.value = value; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-data-visualization/src/main/java/cn/edu/nju/spark/HelloBoot.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark; 2 | 3 | import org.springframework.web.bind.annotation.RequestMapping; 4 | import org.springframework.web.bind.annotation.RequestMethod; 5 | import org.springframework.web.bind.annotation.RestController; 6 | import org.springframework.web.servlet.ModelAndView; 7 | 8 | /** 9 | * Created by Thpffcj on 2018/1/15. 
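 * A minimal REST controller with three mappings (see the methods below):
 *
 *   GET /hello             -> the plain string "Hello World Spring Boot..."
 *   GET /first             -> ModelAndView("test"), rendered from templates/test.html
 *   GET /course_clickcount -> ModelAndView("demo"), rendered from templates/demo.html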
10 | * 这是我们的第一个Boot应用 11 | */ 12 | @RestController 13 | public class HelloBoot { 14 | 15 | @RequestMapping(value = "/hello", method = RequestMethod.GET) 16 | public String sayHello() { 17 | 18 | return "Hello World Spring Boot..."; 19 | } 20 | 21 | @RequestMapping(value = "/first", method = RequestMethod.GET) 22 | public ModelAndView firstDemo() { 23 | return new ModelAndView("test"); 24 | } 25 | 26 | @RequestMapping(value = "/course_clickcount", method = RequestMethod.GET) 27 | public ModelAndView courseClickCountStat() { 28 | return new ModelAndView("demo"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-data-visualization/src/main/java/cn/edu/nju/spark/ImoocStatApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark; 2 | 3 | import cn.edu.nju.dao.CourseClickCountDAO; 4 | import cn.edu.nju.domain.CourseClickCount; 5 | import net.sf.json.JSONArray; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.stereotype.Component; 8 | import org.springframework.web.bind.annotation.RequestMapping; 9 | import org.springframework.web.bind.annotation.RequestMethod; 10 | import org.springframework.web.bind.annotation.ResponseBody; 11 | import org.springframework.web.bind.annotation.RestController; 12 | import org.springframework.web.servlet.ModelAndView; 13 | 14 | import java.util.HashMap; 15 | import java.util.List; 16 | import java.util.Map; 17 | 18 | /** 19 | * Created by Thpffcj on 2018/1/15. 20 | * web层 21 | */ 22 | @RestController 23 | public class ImoocStatApp { 24 | 25 | private static Map courses = new HashMap<>(); 26 | static { 27 | courses.put("112","Spark SQL慕课网日志分析"); 28 | courses.put("128","10小时入门大数据"); 29 | courses.put("145","深度学习之神经网络核心原理与算法"); 30 | courses.put("146","强大的Node.js在Web开发的应用"); 31 | courses.put("131","Vue+Django实战"); 32 | courses.put("130","Web前端性能优化"); 33 | } 34 | 35 | @Autowired 36 | CourseClickCountDAO courseClickCountDAO; 37 | 38 | // @RequestMapping(value = "/course_clickcount_dynamic", method = RequestMethod.GET) 39 | // public ModelAndView courseClickCount() throws Exception { 40 | // 41 | // ModelAndView view = new ModelAndView("index"); 42 | // 43 | // List list = courseClickCountDAO.query("20180115"); 44 | // for(CourseClickCount model : list) { 45 | // model.setName(courses.get(model.getName().substring(9))); 46 | // } 47 | // JSONArray json = JSONArray.fromObject(list); 48 | // 49 | // view.addObject("data_json", json); 50 | // 51 | // return view; 52 | // } 53 | 54 | @RequestMapping(value = "/course_clickcount_dynamic", method = RequestMethod.POST) 55 | @ResponseBody 56 | public List courseClickCount() throws Exception { 57 | 58 | List list = courseClickCountDAO.query("20180115"); 59 | for(CourseClickCount model : list) { 60 | model.setName(courses.get(model.getName().substring(9))); 61 | } 62 | 63 | return list; 64 | } 65 | 66 | @RequestMapping(value = "/echarts", method = RequestMethod.GET) 67 | public ModelAndView echarts(){ 68 | return new ModelAndView("echarts"); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /spark-data-visualization/src/main/resources/application.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/BigData-Getting-Started/5fe231ddaafb31504a41b4ec1c8f0008cd2f1ad2/spark-data-visualization/src/main/resources/application.properties 
-------------------------------------------------------------------------------- /spark-data-visualization/src/main/resources/templates/demo.html: --------------------------------------------------------------------------------
(template markup was not preserved in this export; page title: imooc_stat — the "demo" view returned by HelloBoot's /course_clickcount mapping)
-------------------------------------------------------------------------------- /spark-data-visualization/src/main/resources/templates/echarts.html: --------------------------------------------------------------------------------
(template markup was not preserved in this export; page title: 慕课网实战课程实时访问量统计, i.e. real-time click-count statistics for imooc courses — the "echarts" view returned by ImoocStatApp's /echarts mapping)
-------------------------------------------------------------------------------- /spark-data-visualization/src/main/resources/templates/test.html: --------------------------------------------------------------------------------
(template markup was not preserved in this export; page title: test — the "test" view returned by HelloBoot's /first mapping)
-------------------------------------------------------------------------------- /spark-data-visualization/src/test/java/cn/edu/nju/DataVisualizationApplicationTests.java: --------------------------------------------------------------------------------
1 | package cn.edu.nju;
2 | 
3 | import org.junit.Test;
4 | import org.junit.runner.RunWith;
5 | import org.springframework.boot.test.context.SpringBootTest;
6 | import org.springframework.test.context.junit4.SpringRunner;
7 | 
8 | @RunWith(SpringRunner.class)
9 | @SpringBootTest
10 | public class DataVisualizationApplicationTests {
11 | 
12 |     @Test
13 |     public void contextLoads() {
14 |     }
15 | 
16 | }
17 | 
-------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/MovieRecommendation.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju
2 | 
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.ml.evaluation.RegressionEvaluator
5 | import org.apache.spark.ml.recommendation.ALS
6 | import org.apache.spark.ml.recommendation.ALS.Rating
7 | import org.apache.spark.sql.SparkSession
8 | 
9 | /**
10 |  * Created by thpffcj on 2019/10/31.
11 |  */
12 | object MovieRecommendation {
13 | 
14 |   def main(args: Array[String]): Unit = {
15 | 
16 |     val conf = new SparkConf().setMaster("local").setAppName("MovieRecommendation")
17 |     val spark = SparkSession.builder().config(conf).getOrCreate()
18 |     spark.sparkContext.setLogLevel("WARN")
19 | 
20 |     val parseRating = (string: String) => {
21 |       // Split the line
22 |       val stringArray = string.split("\t")
23 |       // Wrap it into a Rating
24 |       Rating(stringArray(0).toInt, stringArray(1).toInt, stringArray(2).toFloat)
25 |     }
26 | 
27 |     import spark.implicits._
28 |     val data = spark.read.textFile("src/main/resources/u.data")
29 |       // Parse each line
30 |       .map(parseRating)
31 |       // Convert to a DataFrame
32 |       .toDF("userId", "itemId", "rating")
33 | 
34 |     // data.show()
35 | 
36 |     val Array(train, test) = data.randomSplit(Array(0.8, 0.2))
37 | 
38 |     val als = new ALS()
39 |       .setMaxIter(20)
40 |       .setUserCol("userId")
41 |       .setItemCol("itemId")
42 |       .setRatingCol("rating")
43 |       // Regularization parameter
44 |       .setRegParam(0.01)
45 | 
46 |     val model = als.fit(train)
47 | 
48 |     // Cold-start strategy
49 |     model.setColdStartStrategy("drop")
50 | 
51 |     val predictions = model.transform(test)
52 |     // Predict the rating for each (userId, itemId) pair
53 |     // predictions.show(false)
54 | 
55 |     // MovieLens dataset (a well-established academic benchmark); recommend 10 movies for user 196
56 |     val users = spark.createDataset(Array(196)).toDF("userID")
57 |     // users.show(false)
58 |     model.recommendForUserSubset(users, 10).show(false)
59 | 
60 |     // Model evaluation
61 |     val evaluator = new RegressionEvaluator()
62 |       .setMetricName("rmse")
63 |       .setLabelCol("rating")
64 |       .setPredictionCol("prediction")
65 | 
66 |     val rmse = evaluator.evaluate(predictions)
67 |     println(s"Root-mean-square error is $rmse \n")
68 | 
69 |     // Persisting Spark ML models
70 |     // Save the model
71 |     // model.save("./xxx")
72 |     // Load the model
73 |     // val model = ALS.load("xxxx")
74 |   }
75 | }
76 | 
-------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/classification/Iris.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.classification
2 | 
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.ml.classification.DecisionTreeClassifier
5 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
6 | import org.apache.spark.ml.feature.VectorAssembler
7 | import 
org.apache.spark.sql.SparkSession 8 | 9 | import scala.util.Random 10 | 11 | /** 12 | * Created by thpffcj on 2019/10/29. 13 | */ 14 | object Iris { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val conf = new SparkConf().setMaster("local").setAppName("Iris") 19 | val spark = SparkSession.builder().config(conf).getOrCreate() 20 | spark.sparkContext.setLogLevel("WARN") 21 | 22 | val file = spark.read.format("csv").load("src/main/resources/iris.data") 23 | 24 | val random = new Random() 25 | 26 | import spark.implicits._ 27 | val data = file.map(row => { 28 | val label = row.getString(4) match { 29 | case "Iris-setosa" => 0 30 | case "Iris-versicolor" => 1 31 | case "Iris-virginica" => 2 32 | } 33 | 34 | (row.getString(0).toDouble, row.getString(1).toDouble, 35 | row.getString(2).toDouble, row.getString(3).toDouble, 36 | label, random.nextDouble()) 37 | }).toDF("_c0", "_c1", "_c2", "_c3", "label", "rand").sort("rand") 38 | 39 | val assembler = new VectorAssembler().setInputCols(Array("_c0", "_c1", "_c2", "_c3")).setOutputCol("features") 40 | 41 | val dataset = assembler.transform(data) 42 | val Array(train, test) = dataset.randomSplit(Array(0.8, 0.2)) 43 | 44 | val dt = new DecisionTreeClassifier().setFeaturesCol("features").setLabelCol("label") 45 | val model = dt.fit(train) 46 | 47 | val result = model.transform(test) 48 | result.show() 49 | 50 | val evaluator = new MulticlassClassificationEvaluator() 51 | .setLabelCol("label") 52 | .setPredictionCol("prediction") 53 | .setMetricName("accuracy") 54 | 55 | val accuracy = evaluator.evaluate(result) 56 | println(s"""accuracy is $accuracy""") 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/cluster/KMeans.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.cluster 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.feature.VectorAssembler 5 | import org.apache.spark.ml.clustering.KMeans 6 | import org.apache.spark.sql.SparkSession 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/29. 
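 *  Clusters the four iris measurements with k-means (k = 3, 20 iterations) and prints the cluster assignments for the training split.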
12 | */ 13 | object KMeans { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val conf = new SparkConf().setMaster("local").setAppName("KMeans") 18 | val spark = SparkSession.builder().config(conf).getOrCreate() 19 | 20 | val file = spark.read.format("csv").load("src/main/resources/iris.data") 21 | 22 | val random = new Random() 23 | import spark.implicits._ 24 | val data= file.map(row => { 25 | val label = row.getString(4) match { 26 | case "Iris-setosa" => 0 27 | case "Iris-versicolor" => 1 28 | case "Iris-virginica" => 2 29 | } 30 | 31 | (row.getString(0).toDouble, 32 | row.getString(1).toDouble, 33 | row.getString(2).toDouble, 34 | row.getString(3).toDouble, 35 | label, 36 | random.nextDouble()) 37 | }).toDF("_c0", "_c1", "_c2", "_c3", "label", "rand").sort("rand") 38 | 39 | val assembler = new VectorAssembler() 40 | .setInputCols(Array("_c0", "_c1", "_c2", "_c3")) 41 | .setOutputCol("features") 42 | 43 | // 分割 44 | val dataset = assembler.transform(data) 45 | val Array(train, test) = dataset.randomSplit(Array(0.8, 0.2)) 46 | train.show() 47 | 48 | // kmeans 算法 49 | val kmeans = new KMeans().setFeaturesCol("features").setK(3).setMaxIter(20) 50 | val model = kmeans.fit(train) 51 | 52 | model.transform(train).show() 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/cluster/Lda.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.cluster 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.clustering.LDA 5 | import org.apache.spark.ml.feature.VectorAssembler 6 | import org.apache.spark.sql.SparkSession 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/29. 12 | */ 13 | object Lda { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val conf = new SparkConf().setMaster("local").setAppName("LDA") 18 | val spark = SparkSession.builder().config(conf).getOrCreate() 19 | 20 | // 加载数据 21 | val file = spark.read.format("csv").load("src/main/resources/iris.data") 22 | 23 | val random = new Random() 24 | 25 | import spark.implicits._ 26 | val data = file.map(row => { 27 | val label = row.getString(4) match { 28 | case "Iris-setosa" => 0 29 | case "Iris-versicolor" => 1 30 | case "Iris-virginica" => 2 31 | } 32 | 33 | (row.getString(0).toDouble, row.getString(1).toDouble, 34 | row.getString(2).toDouble, row.getString(3).toDouble, 35 | label, random.nextDouble()) 36 | }).toDF("_c0", "_c1", "_c2", "_c3", "label", "rand").sort("rand") 37 | 38 | val assembler = new VectorAssembler() 39 | .setInputCols(Array("_c0", "_c1", "_c2", "_c3")).setOutputCol("features") 40 | 41 | val dataset = assembler.transform(data) 42 | val Array(train, test) = dataset.randomSplit(Array(0.8, 0.2)) 43 | 44 | // 训练一个LDA模型 45 | val lda = new LDA().setFeaturesCol("features").setK(3).setMaxIter(40) 46 | val model = lda.fit(train) 47 | 48 | // 展示结果 49 | val prediction = model.transform(test) 50 | prediction.show() 51 | 52 | val ll = model.logLikelihood(train) 53 | val lp = model.logPerplexity(train) 54 | 55 | // Describe topics. 
56 | val topics = model.describeTopics(3) 57 | prediction.select("label", "topicDistribution").show(false) 58 | println("The topics described by their top-weighted terms:") 59 | topics.show(false) 60 | println(s"The lower bound on the log likelihood of the entire corpus: $ll") 61 | println(s"The upper bound on perplexity: $lp") 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/dimensionalityReduction/PCADimensionalityReduction.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dimensionalityReduction 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.classification.DecisionTreeClassifier 5 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 6 | import org.apache.spark.ml.feature.{PCA, VectorAssembler} 7 | import org.apache.spark.sql.SparkSession 8 | 9 | import scala.util.Random 10 | 11 | /** 12 | * Created by thpffcj on 2019/10/30. 13 | */ 14 | object PCADimensionalityReduction { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val conf = new SparkConf().setMaster("local").setAppName("PCADimensionalityReduction") 19 | val spark = SparkSession.builder().config(conf).getOrCreate() 20 | 21 | // 日志级别 22 | spark.sparkContext.setLogLevel("WARN") 23 | 24 | val file = spark.read.format("csv").load("src/main/resources/iris.data") 25 | 26 | val random = new Random() 27 | import spark.implicits._ 28 | val data = file.map(row => { 29 | val label = row.getString(4) match { 30 | case "Iris-setosa" => 0 31 | case "Iris-versicolor" => 1 32 | case "Iris-virginica" => 2 33 | } 34 | 35 | (row.getString(0).toDouble, 36 | row.getString(1).toDouble, 37 | row.getString(2).toDouble, 38 | row.getString(3).toDouble, 39 | label, 40 | random.nextDouble()) 41 | }).toDF("_c0", "_c1", "_c2", "_c3", "label", "rand").sort("rand") 42 | 43 | val assembler = new VectorAssembler().setInputCols(Array("_c0", "_c1", "_c2", "_c3")).setOutputCol("features") 44 | 45 | val pca = new PCA() 46 | .setInputCol("features") 47 | .setOutputCol("features2") 48 | .setK(3) 49 | 50 | val dataset = assembler.transform(data) 51 | val pcaModel = pca.fit(dataset) 52 | 53 | val dataset2 = pcaModel.transform(dataset) 54 | val Array(train, test) = dataset2.randomSplit(Array(0.8, 0.2)) 55 | 56 | val dt = new DecisionTreeClassifier().setFeaturesCol("features").setLabelCol("label") 57 | val model = dt.fit(train) 58 | val result = model.transform(test) 59 | result.show(false) 60 | 61 | val evaluator = new MulticlassClassificationEvaluator() 62 | .setLabelCol("label") 63 | .setPredictionCol("prediction") 64 | .setMetricName("accuracy") 65 | val accuracy = evaluator.evaluate(result) 66 | println(s"""accuracy is $accuracy""") 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/emotionAnalysis/EmotionAnalysis.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.emotionAnalysis 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.classification.NaiveBayes 5 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 6 | import org.apache.spark.ml.feature.{HashingTF, IDF} 7 | import org.apache.spark.sql.SparkSession 8 | 9 | import scala.util.Random 10 | 11 | /** 12 | * Created by thpffcj on 2019/10/30. 
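 *  Sentiment classification of the pos.txt/neg.txt reviews: TF-IDF features feeding a Naive Bayes binary classifier, evaluated by accuracy.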
13 | */ 14 | object EmotionAnalysis { 15 | 16 | def main(args: Array[String]): Unit = { 17 | 18 | val conf = new SparkConf().setMaster("local").setAppName("EmotionAnalysis") 19 | val spark = SparkSession.builder().config(conf).getOrCreate() 20 | // 日志级别 21 | spark.sparkContext.setLogLevel("WARN") 22 | 23 | val rand = new Random() 24 | 25 | import spark.implicits._ 26 | // 数据预处理 27 | val neg = spark.read.textFile("src/main/resources/neg.txt").map(line => { 28 | // 分词 29 | (line.split(" ").filter(!_.equals(" ")), 0, rand.nextDouble()) 30 | }).toDF("words", "value", "random") 31 | 32 | val pos = spark.read.textFile("src/main/resources/pos.txt").map(line => { 33 | (line.split(" ").filter(!_.equals(" ")), 1, rand.nextDouble()) 34 | }).toDF("words", "value", "random") // 思考:这里把inner function提出重用来如何操作 35 | 36 | // 合并乱序 37 | val data = neg.union(pos).sort("random") 38 | data.show(false) 39 | println(neg.count(), pos.count(), data.count()) // 合并 40 | 41 | // 文本特征抽取(TF-IDF) 42 | val hashingTf = new HashingTF() 43 | .setInputCol("words") 44 | .setOutputCol("hashing") 45 | .transform(data) 46 | 47 | val idfModel = new IDF() 48 | .setInputCol("hashing") 49 | .setOutputCol("tfidf") 50 | .fit(hashingTf) 51 | 52 | val transformedData = idfModel.transform(hashingTf) 53 | val Array(training, test) = transformedData 54 | .randomSplit(Array(0.7, 0.3)) 55 | 56 | // 根据抽取到的文本特征,使用分类器进行分类,这是一个二分类问题 57 | // 分类器是可替换的 58 | val bayes = new NaiveBayes() 59 | .setFeaturesCol("tfidf") // X 60 | .setLabelCol("value") // y 0:消极,1:积极 61 | .fit(training) 62 | 63 | // 交叉验证 64 | val result = bayes.transform(test) 65 | result.show(false) 66 | 67 | // 评估模型的准确率 68 | val evaluator = new MulticlassClassificationEvaluator() 69 | .setLabelCol("value") 70 | .setPredictionCol("prediction") 71 | .setMetricName("accuracy") 72 | 73 | val accuracy = evaluator.evaluate(result) 74 | println(s"""accuracy is $accuracy""") 75 | 76 | // 重构思考: 77 | // 尝试用pipeline重构代码 78 | // 尝试用模型预测随便属于一句话的情感,例如: 79 | // You are a bad girl,I hate you. ^_^ 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/cn/edu/nju/regression/HousePriceForecast.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.regression 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.ml.feature.VectorAssembler 5 | import org.apache.spark.ml.regression.LinearRegression 6 | import org.apache.spark.sql.SparkSession 7 | 8 | import scala.util.Random 9 | 10 | /** 11 | * Created by thpffcj on 2019/10/29. 
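 *  Fits a linear regression of price on square from house.csv after shuffling the rows, then prints predictions for the held-out split.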
12 | */ 13 | object HousePriceForecast { 14 | 15 | def main(args: Array[String]): Unit = { 16 | 17 | val conf = new SparkConf().setMaster("local").setAppName("HousePriceForecast") 18 | 19 | val spark = SparkSession.builder().config(conf).getOrCreate() 20 | 21 | // 加载文件 22 | val file = spark.read.format("csv") 23 | .option("sep", ";").option("header", "true") 24 | .load("src/main/resources/house.csv") 25 | 26 | import spark.implicits._ 27 | // 开始shuffle 28 | // 打乱顺序 29 | val rand = new Random() 30 | val data = file.select("square", "price").map(row => { 31 | (row.getAs[String](0).toDouble, row.getString(1).toDouble, rand.nextDouble()) 32 | }).toDF("square", "price", "rand").sort("rand") // 强制类型转换过程 33 | 34 | val assembler = new VectorAssembler().setInputCols(Array("square")).setOutputCol("features") 35 | 36 | // 特征包装 37 | val dataset = assembler.transform(data) 38 | 39 | // 训练集,测试集 40 | // 拆分成训练数据集和测试数据集 41 | val Array(train, test) = dataset.randomSplit(Array(0.8, 0.2)) 42 | 43 | val lr = new LinearRegression().setLabelCol("price").setFeaturesCol("features") 44 | .setRegParam(0.3).setElasticNetParam(0.8).setMaxIter(10) 45 | val model = lr.fit(train) 46 | 47 | model.transform(test).show() 48 | val s = model.summary.totalIterations 49 | println(s"iter: ${s}") 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /spark-sql-train/.gitignore: -------------------------------------------------------------------------------- 1 | /memetastore_db 2 | /spark-warehouse 3 | /derby.log 4 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/resources/ipRegion.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Thpffcj/BigData-Getting-Started/5fe231ddaafb31504a41b4ec1c8f0008cd2f1ad2/spark-sql-train/src/main/resources/ipRegion.xlsx -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/AccessConvertUtil.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} 5 | 6 | /** 7 | * Created by Thpffcj on 2018/5/7. 
8 | * 访问日志转换(输入==>输出)工具类 9 | */ 10 | object AccessConvertUtil { 11 | 12 | // 定义的输出的字段 13 | val struct = StructType( 14 | Array( 15 | StructField("url", StringType), 16 | StructField("cmsType", StringType), 17 | StructField("cmsId", LongType), 18 | StructField("traffic", LongType), 19 | StructField("ip", StringType), 20 | StructField("city", StringType), 21 | StructField("time", StringType), 22 | StructField("day", StringType) 23 | ) 24 | ) 25 | 26 | /** 27 | * 根据输入的每一行信息转换成输出的样式 28 | * 29 | * @param log 输入的每一行记录信息 30 | */ 31 | def parseLog(log: String) = { 32 | 33 | try { 34 | val splits = log.split("\t") 35 | 36 | val url = splits(1) 37 | val traffic = splits(2).toLong 38 | val ip = splits(3) 39 | 40 | val domain = "http://www.imooc.com/" 41 | val cms = url.substring(url.indexOf(domain) + domain.length) 42 | val cmsTypeId = cms.split("/") 43 | 44 | var cmsType = "" 45 | var cmsId = 0l 46 | if (cmsTypeId.length > 1) { 47 | cmsType = cmsTypeId(0) 48 | cmsId = cmsTypeId(1).toLong 49 | } 50 | 51 | val city = IpUtils.getCity(ip) 52 | val time = splits(0) 53 | val day = time.substring(0, 10).replaceAll("-", "") 54 | 55 | // 这个row里面的字段要和struct中的字段对应上 56 | Row(url, cmsType, cmsId, traffic, ip, city, time, day) 57 | } catch { 58 | case e: Exception => Row(0) 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/DateUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import java.util.{Date, Locale} 4 | 5 | import org.apache.commons.lang3.time.FastDateFormat 6 | 7 | /** 8 | * Created by Thpffcj on 2018/5/7. 9 | * 日期时间解析工具类: 10 | * 注意:SimpleDateFormat是线程不安全 11 | */ 12 | object DateUtils { 13 | 14 | //输入文件日期时间格式 15 | //10/Nov/2016:00:01:02 +0800 16 | val YYYYMMDDHHMM_TIME_FORMAT = FastDateFormat.getInstance("dd/MMM/yyyy:HH:mm:ss Z", Locale.ENGLISH) 17 | 18 | //目标日期格式 19 | val TARGET_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss") 20 | 21 | /** 22 | * 获取时间:yyyy-MM-dd HH:mm:ss 23 | */ 24 | def parse(time: String) = { 25 | TARGET_FORMAT.format(new Date(getTime(time))) 26 | } 27 | 28 | /** 29 | * 获取输入日志时间:long类型 30 | * 31 | * time: [10/Nov/2016:00:01:02 +0800] 32 | */ 33 | def getTime(time: String) = { 34 | try { 35 | YYYYMMDDHHMM_TIME_FORMAT.parse(time.substring(time.indexOf("[") + 1, 36 | time.lastIndexOf("]"))).getTime 37 | } catch { 38 | case e: Exception => { 39 | 0l 40 | } 41 | } 42 | } 43 | 44 | def main(args: Array[String]) { 45 | println(parse("[10/Nov/2016:00:01:02 +0800]")) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/DayCityVideoAccessStat.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | /** 4 | * Created by Thpffcj on 2018/5/7. 5 | */ 6 | case class DayCityVideoAccessStat(day:String, cmsId:Long, city:String,times:Long,timesRank:Int) 7 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/DayVideoAccessStat.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | /** 4 | * Created by Thpffcj on 2018/5/7. 
5 | * 每天课程访问次数实体类 6 | */ 7 | case class DayVideoAccessStat(day: String, cmsId: Long, times: Long) 8 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/DayVideoTrafficsStat.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | /** 4 | * Created by Thpffcj on 2018/5/7. 5 | */ 6 | case class DayVideoTrafficsStat(day:String,cmsId:Long,traffics:Long) 7 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/IpUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import com.ggstar.util.ip.IpHelper 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/7. 7 | * IP解析工具类 8 | */ 9 | object IpUtils { 10 | 11 | def getCity(ip:String) = { 12 | IpHelper.findRegionByIp(ip) 13 | } 14 | 15 | def main(args: Array[String]) { 16 | println(getCity("218.197.153.150")) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/MySQLUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import java.sql.DriverManager 4 | 5 | import java.sql.{Connection, PreparedStatement, DriverManager} 6 | 7 | /** 8 | * Created by Thpffcj on 2018/5/7. 9 | * MySQL操作工具类 10 | */ 11 | object MySQLUtils { 12 | 13 | /** 14 | * 获取数据库连接 15 | */ 16 | def getConnection() = { 17 | DriverManager.getConnection("jdbc:mysql://localhost:3306/sparksql?user=root&password=000000") 18 | } 19 | 20 | /** 21 | * 释放数据库连接等资源 22 | * @param connection 23 | * @param pstmt 24 | */ 25 | def release(connection: Connection, pstmt: PreparedStatement): Unit = { 26 | try { 27 | if (pstmt != null) { 28 | pstmt.close() 29 | } 30 | } catch { 31 | case e: Exception => e.printStackTrace() 32 | } finally { 33 | if (connection != null) { 34 | connection.close() 35 | } 36 | } 37 | } 38 | 39 | def main(args: Array[String]) { 40 | println(getConnection()) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/SparkStatCleanJob.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import org.apache.spark.sql.{SaveMode, SparkSession} 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/7. 
7 | * 使用Spark完成我们的数据清洗操作 8 | */ 9 | object SparkStatCleanJob { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val spark = SparkSession.builder().appName("SparkStatCleanJob") 14 | .master("local[2]").getOrCreate() 15 | 16 | val accessRDD = spark.sparkContext.textFile("D:/access.log") 17 | // accessRDD.take(10).foreach(println) 18 | 19 | // RDD ==> DF 20 | val accessDF = spark.createDataFrame(accessRDD.map(x => AccessConvertUtil.parseLog(x)), 21 | AccessConvertUtil.struct) 22 | 23 | // accessDF.printSchema() 24 | // accessDF.show(false) 25 | 26 | accessDF.coalesce(1).write.format("parquet").mode(SaveMode.Overwrite) 27 | .partitionBy("day").save("D:/clean") 28 | 29 | spark.stop() 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/SparkStatFormatJob.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/7. 7 | * 第一步清洗:抽取出我们所需要的指定列的数据 8 | */ 9 | object SparkStatFormatJob { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val spark = SparkSession.builder().appName("SparkStatFormatJob") 14 | .master("local[2]").getOrCreate() 15 | 16 | val access = spark.sparkContext.textFile("D:/access.log") 17 | // access.take(10).foreach(println) 18 | 19 | access.map(line => { 20 | val splits = line.split(" ") 21 | val ip = splits(0) 22 | 23 | /** 24 | * 原始日志的第三个和第四个字段拼接起来就是完整的访问时间: 25 | * [10/Nov/2016:00:01:02 +0800] ==> yyyy-MM-dd HH:mm:ss 26 | */ 27 | val time = splits(3) + " " + splits(4) 28 | val url = splits(11).replaceAll("\"","") 29 | val traffic = splits(9) 30 | DateUtils.parse(time) + "\t" + url + "\t" + traffic + "\t" + ip 31 | }).saveAsTextFile("D:/output") 32 | 33 | spark.stop() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/StatDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import java.sql.{PreparedStatement, Connection} 4 | 5 | import scala.collection.mutable.ListBuffer 6 | 7 | /** 8 | * Created by Thpffcj on 2018/5/7. 9 | * 各个维度统计的DAO操作 10 | */ 11 | object StatDAO { 12 | 13 | /** 14 | * 批量保存DayVideoAccessStat到数据库 15 | */ 16 | def insertDayVideoAccessTopN(list: ListBuffer[DayVideoAccessStat]): Unit = { 17 | 18 | var connection: Connection = null 19 | var pstmt: PreparedStatement = null 20 | 21 | try { 22 | connection = MySQLUtils.getConnection() 23 | 24 | connection.setAutoCommit(false) //设置手动提交 25 | 26 | val sql = "insert into day_video_access_topn_stat(day, cms_id, times) values (?,?,?) 
" 27 | pstmt = connection.prepareStatement(sql) 28 | 29 | for (ele <- list) { 30 | pstmt.setString(1, ele.day) 31 | pstmt.setLong(2, ele.cmsId) 32 | pstmt.setLong(3, ele.times) 33 | 34 | pstmt.addBatch() 35 | } 36 | 37 | pstmt.executeBatch() // 执行批量处理 38 | connection.commit() //手工提交 39 | } catch { 40 | case e: Exception => e.printStackTrace() 41 | } finally { 42 | MySQLUtils.release(connection, pstmt) 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/log/TopNStatJob.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.log 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | 6 | import scala.collection.mutable.ListBuffer 7 | 8 | /** 9 | * Created by Thpffcj on 2018/5/7. 10 | * TopN统计Spark作业 11 | */ 12 | object TopNStatJob { 13 | 14 | def main(args: Array[String]): Unit = { 15 | val spark = SparkSession.builder().appName("TopNStatJob") 16 | .config("spark.sql.sources.partitionColumnTypeInference.enabled", "false") 17 | .master("local[2]").getOrCreate() 18 | 19 | val accessDF = spark.read.format("parquet").load("D:/clean") 20 | 21 | // accessDF.printSchema() 22 | // accessDF.show(false) 23 | 24 | val day = "20180507" 25 | 26 | // 最受欢迎的TopN课程 27 | videoAccessTopNStat(spark, accessDF, day) 28 | 29 | spark.stop() 30 | } 31 | 32 | /** 33 | * 按照地市进行统计TopN课程 34 | */ 35 | def videoAccessTopNStat(spark: SparkSession, accessDF: DataFrame, day: String): Unit = { 36 | 37 | /** 38 | * 使用DataFrame的方式进行统计 39 | */ 40 | import spark.implicits._ 41 | 42 | val videoAccessTopNDF = accessDF.filter($"day" === day && $"cmsType" === "video") 43 | .groupBy("day", "cmsId").agg(count("cmsId").as("times")).orderBy($"times".desc) 44 | 45 | videoAccessTopNDF.show(false) 46 | 47 | /** 48 | * 使用SQL的方式进行统计 49 | */ 50 | // accessDF.createOrReplaceTempView("access_logs") 51 | // val videoAccessTopNDF = spark.sql("select day,cmsId, count(1) as times from access_logs " + 52 | // "where day='20180507' and cmsType='video' " + 53 | // "group by day,cmsId order by times desc") 54 | // 55 | // videoAccessTopNDF.show(false) 56 | 57 | /** 58 | * 将统计结果写入到MySQL中 59 | */ 60 | try { 61 | videoAccessTopNDF.foreachPartition(partitionOfRecords => { 62 | val list = new ListBuffer[DayVideoAccessStat] 63 | 64 | partitionOfRecords.foreach(info => { 65 | val day = info.getAs[String]("day") 66 | val cmsId = info.getAs[Long]("cmsId") 67 | val times = info.getAs[Long]("times") 68 | 69 | /** 70 | * 不建议大家在此处进行数据库的数据插入 71 | */ 72 | list.append(DayVideoAccessStat(day, cmsId, times)) 73 | }) 74 | 75 | StatDAO.insertDayVideoAccessTopN(list) 76 | }) 77 | } catch { 78 | case e:Exception => e.printStackTrace() 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/DataFrameApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/3. 
7 | * DataFrame API基本操作 8 | */ 9 | object DataFrameApp { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val spark = SparkSession.builder().appName("DataFrameApp").master("local[2]").getOrCreate() 14 | 15 | // 将json文件加载成一个dataframe 16 | val peopleDF = spark.read.format("json").load("D:/people.json") 17 | 18 | // 输出dataframe对应的schema信息 19 | peopleDF.printSchema() 20 | 21 | // 输出数据集的前20条记录 22 | peopleDF.show() 23 | 24 | //查询某列所有的数据: select name from table 25 | peopleDF.select("name").show() 26 | 27 | // 查询某几列所有的数据,并对列进行计算: select name, age+10 as age2 from table 28 | peopleDF.select(peopleDF.col("name"), (peopleDF.col("age") + 10).as("age2")).show() 29 | 30 | //根据某一列的值进行过滤: select * from table where age>19 31 | peopleDF.filter(peopleDF.col("age") > 19).show() 32 | 33 | //根据某一列进行分组,然后再进行聚合操作: select age,count(1) from table group by age 34 | peopleDF.groupBy("age").count().show() 35 | 36 | spark.stop() 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/DataFrameCase.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/3. 7 | * DataFrame中的操作操作 8 | */ 9 | object DataFrameCase { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val spark = SparkSession.builder().appName("DataFrameRDDApp").master("local[2]").getOrCreate() 13 | 14 | // RDD ==> DataFrame 15 | val rdd = spark.sparkContext.textFile("D:/student.data") 16 | 17 | // 注意:需要导入隐式转换 18 | import spark.implicits._ 19 | val studentDF = rdd.map(_.split("\\|")).map(line => Student(line(0).toInt, line(1), line(2), line(3))).toDF() 20 | 21 | // show默认只显示前20条 22 | studentDF.show 23 | studentDF.show(30) 24 | studentDF.show(30, false) 25 | 26 | studentDF.take(10) 27 | studentDF.first() 28 | studentDF.head(3) 29 | 30 | studentDF.select("email").show(30,false) 31 | 32 | studentDF.filter("name=''").show 33 | studentDF.filter("name='' OR name='NULL'").show 34 | 35 | // name以M开头的人 36 | studentDF.filter("SUBSTR(name, 0, 1)='M'").show 37 | 38 | studentDF.sort(studentDF("name")).show 39 | studentDF.sort(studentDF("name").desc).show 40 | 41 | studentDF.sort("name","id").show 42 | studentDF.sort(studentDF("name").asc, studentDF("id").desc).show 43 | 44 | studentDF.select(studentDF("name").as("student_name")).show 45 | 46 | val studentDF2 = rdd.map(_.split("\\|")).map(line => Student(line(0).toInt, line(1), line(2), line(3))).toDF() 47 | 48 | studentDF.join(studentDF2, studentDF.col("id") === studentDF2.col("id")).show 49 | 50 | spark.stop() 51 | } 52 | 53 | case class Student(id: Int, name: String, phone: String, email: String) 54 | } 55 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/DataFrameRDDApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.types.{StringType, IntegerType, StructField, StructType} 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | 6 | /** 7 | * Created by Thpffcj on 2018/5/3. 
8 | * DataFrame和RDD的互操作 9 | */ 10 | object DataFrameRDDApp { 11 | 12 | def main(args: Array[String]) { 13 | 14 | val spark = SparkSession.builder().appName("DataFrameRDDApp").master("local[2]").getOrCreate() 15 | 16 | //inferReflection(spark) 17 | 18 | program(spark) 19 | 20 | spark.stop() 21 | } 22 | 23 | def program(spark: SparkSession): Unit = { 24 | // RDD ==> DataFrame 25 | val rdd = spark.sparkContext.textFile("D:/infos.txt") 26 | 27 | val infoRDD = rdd.map(_.split(",")).map(line => Row(line(0).toInt, line(1), line(2).toInt)) 28 | 29 | val structType = StructType(Array(StructField("id", IntegerType, true), 30 | StructField("name", StringType, true), 31 | StructField("age", IntegerType, true))) 32 | 33 | val infoDF = spark.createDataFrame(infoRDD, structType) 34 | infoDF.printSchema() 35 | infoDF.show() 36 | 37 | 38 | //通过df的api进行操作 39 | infoDF.filter(infoDF.col("age") > 30).show 40 | 41 | //通过sql的方式进行操作 42 | infoDF.createOrReplaceTempView("infos") 43 | spark.sql("select * from infos where age > 30").show() 44 | } 45 | 46 | def inferReflection(spark: SparkSession) { 47 | // RDD ==> DataFrame 48 | val rdd = spark.sparkContext.textFile("D:/infos.txt") 49 | 50 | //注意:需要导入隐式转换 51 | import spark.implicits._ 52 | val infoDF = rdd.map(_.split(",")).map(line => Info(line(0).toInt, line(1), line(2).toInt)).toDF() 53 | 54 | infoDF.show() 55 | 56 | infoDF.filter(infoDF.col("age") > 30).show 57 | 58 | infoDF.createOrReplaceTempView("infos") 59 | spark.sql("select * from infos where age > 30").show() 60 | } 61 | 62 | case class Info(id: Int, name: String, age: Int) 63 | } 64 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/DataSetApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/3. 7 | * DataSet操作 8 | */ 9 | object DataSetApp { 10 | 11 | def main(args: Array[String]) { 12 | val spark = SparkSession.builder().appName("DatasetApp") 13 | .master("local[2]").getOrCreate() 14 | 15 | //注意:需要导入隐式转换 16 | import spark.implicits._ 17 | 18 | val path = "/Users/thpffcj/Public/data/sales.csv" 19 | 20 | //spark如何解析csv文件? 21 | val df = spark.read.option("header","true").option("inferSchema","true").csv(path) 22 | df.show 23 | 24 | val ds = df.as[Sales] 25 | ds.map(line => line.itemId).show 26 | 27 | ds.map(line => line.itemId) 28 | 29 | spark.stop() 30 | } 31 | 32 | case class Sales(transactionId:Int,customerId:Int,itemId:Int,amountPaid:Double) 33 | } 34 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/HiveContextApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.hive.HiveContext 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | /** 7 | * Created by Thpffcj on 2018/4/29. 8 | * HiveContext的使用 9 | * 使用时需要通过--jars 把mysql的驱动传递到classpath 10 | * 不能直接运行,需要打包到服务器运行 11 | */ 12 | object HiveContextApp { 13 | 14 | def main(args: Array[String]) { 15 | // 1. 创建相应的Context 16 | val sparkConf = new SparkConf() 17 | 18 | // 在测试或者生产中,AppName和Master我们是通过脚本进行指定 19 | // sparkConf.setAppName("HiveContextApp").setMaster("local[2]") 20 | 21 | val sc = new SparkContext(sparkConf) 22 | val hiveContext = new HiveContext(sc) 23 | 24 | // 2. 相关的处理: 25 | hiveContext.table("emp").show 26 | 27 | // 3. 
关闭资源 28 | sc.stop() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/HiveMySQLApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/4. 7 | * 使用外部数据源综合查询Hive和MySQL的表数据 8 | * 不能直接运行,需要打包到服务器运行 9 | */ 10 | object HiveMySQLApp { 11 | 12 | def main(args: Array[String]) { 13 | 14 | val spark = SparkSession.builder() 15 | // .appName("HiveMySQLApp") 16 | // .master("local[2]") 17 | .getOrCreate() 18 | 19 | // 加载Hive表数据 20 | val hiveDF = spark.table("emp") 21 | 22 | // 加载MySQL表数据 23 | val mysqlDF = spark.read.format("jdbc") 24 | .option("url", "jdbc:mysql://localhost:3306") 25 | .option("dbtable", "spark.DEPT") 26 | .option("user", "root") 27 | .option("password", "000000") 28 | .option("driver", "com.mysql.jdbc.Driver").load() 29 | 30 | // JOIN 31 | val resultDF = hiveDF.join(mysqlDF, hiveDF.col("deptno") === mysqlDF.col("DEPTNO")) 32 | resultDF.show 33 | 34 | resultDF.select(hiveDF.col("empno"),hiveDF.col("ename"), 35 | mysqlDF.col("deptno"), mysqlDF.col("dname")).show 36 | 37 | spark.stop() 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/ParquetApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/4. 7 | */ 8 | object ParquetApp { 9 | 10 | def main(args: Array[String]) { 11 | 12 | val spark = SparkSession.builder().appName("ParquetApp") 13 | .master("local[2]").getOrCreate() 14 | 15 | /** 16 | * spark.read.format("parquet").load 这是标准写法 17 | */ 18 | val userDF = spark.read.format("parquet").load("file:///home/thpffcj/app/spark-2.2.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet") 19 | 20 | userDF.printSchema() 21 | userDF.show() 22 | 23 | userDF.select("name","favorite_color").show 24 | 25 | userDF.select("name","favorite_color").write.format("json").save("file:///home/thpffcj/tmp/jsonout") 26 | 27 | spark.read.load("file:///home/thpffcj/app/spark-2.2.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet").show 28 | 29 | // 会报错,因为sparksql默认处理的format就是parquet 30 | spark.read.load("file:///home/thpffcj/app/spark-2.2.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/people.json").show 31 | 32 | spark.read.format("parquet").option("path","file:///home/thpffcj/app/spark-2.2.0-bin-2.6.0-cdh5.7.0/examples/src/main/resources/users.parquet").load().show 33 | spark.stop() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/SQLContextApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SQLContext 5 | 6 | /** 7 | * Created by Thpffcj on 2018/4/29. 8 | * SQLContext的使用 9 | * 注意:IDEA是在本地,而测试数据是在服务器上,能不能在本地进行开发测试的? 10 | */ 11 | object SQLContextApp { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val path = args(0) 16 | 17 | // 1. 
创建相应的Context 18 | val sparkConf = new SparkConf() 19 | 20 | //在测试或者生产中,AppName和Master我们是通过脚本进行指定 21 | //sparkConf.setAppName("SQLContextApp").setMaster("local[2]") 22 | 23 | val sc = new SparkContext(sparkConf) 24 | 25 | val sqlContext = new SQLContext(sc) 26 | 27 | // 2. 相关的处理: json 28 | val people = sqlContext.read.format("json").load(path) 29 | people.printSchema() 30 | people.show() 31 | 32 | // 3. 关闭资源 33 | sc.stop() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/SparkSQLThriftServerApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import java.sql.DriverManager 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/2. 7 | * 通过JDBC的方式访问 8 | */ 9 | object SparkSQLThriftServerApp { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | Class.forName("org.apache.hive.jdbc.HiveDriver") 14 | 15 | val conn = DriverManager.getConnection("jdbc:hive2://thpffcj:10000","thpffcj","") 16 | val pstmt = conn.prepareStatement("select empno, ename, sal from emp") 17 | val rs = pstmt.executeQuery() 18 | 19 | while (rs.next()) { 20 | println("empno:" + rs.getInt("empno") + 21 | " , ename:" + rs.getString("ename") + 22 | " , sal:" + rs.getDouble("sal")) 23 | 24 | } 25 | 26 | rs.close() 27 | pstmt.close() 28 | conn.close() 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /spark-sql-train/src/main/scala/cn/edu/nju/spark/SparkSessionApp.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/2. 7 | * SparkSession的使用 8 | */ 9 | object SparkSessionApp { 10 | 11 | def main(args: Array[String]) { 12 | 13 | val spark = SparkSession.builder().appName("SparkSessionApp") 14 | .master("local[2]").getOrCreate() 15 | 16 | val people = spark.read.json("D:/people.json") 17 | people.show() 18 | 19 | spark.stop() 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /spark-sql-visualization/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | cn.edu.nju 7 | 151250052 8 | 1.0-SNAPSHOT 9 | war 10 | 11 | 12 | 13 | javax.servlet 14 | servlet-api 15 | 2.5 16 | 17 | 18 | 19 | javax.servlet 20 | jsp-api 21 | 2.0 22 | 23 | 24 | 25 | mysql 26 | mysql-connector-java 27 | 5.1.38 28 | 29 | 30 | 31 | net.sf.json-lib 32 | json-lib 33 | 2.4 34 | jdk15 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /spark-sql-visualization/src/main/java/cn/edu/nju/dao/VideoAccessTopNDAO.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.dao; 2 | 3 | import cn.edu.nju.domain.VideoAccessTopN; 4 | import cn.edu.nju.utils.MySQLUtils; 5 | 6 | import java.sql.Connection; 7 | import java.sql.PreparedStatement; 8 | import java.sql.ResultSet; 9 | import java.util.ArrayList; 10 | import java.util.HashMap; 11 | import java.util.List; 12 | import java.util.Map; 13 | 14 | /** 15 | * Created by Thpffcj on 2018/5/8. 
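 *  DAO that reads a day's five most-viewed courses from day_video_access_topn_stat and maps each cms_id to a hard-coded course name.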
16 | */ 17 | public class VideoAccessTopNDAO { 18 | 19 | 20 | static Map courses = new HashMap(); 21 | static { 22 | courses.put("8701", "MySQL优化"); 23 | courses.put("8702", "神经网络"); 24 | courses.put("8703", "Swift"); 25 | courses.put("8709", "机器学习"); 26 | } 27 | 28 | /** 29 | * 根据课程编号查询课程名称 30 | */ 31 | public String getCourseName(String id) { 32 | return courses.get(id); 33 | } 34 | 35 | 36 | /** 37 | * 根据day查询当天的最受欢迎的Top5课程 38 | * @param day 39 | */ 40 | public List query(String day) { 41 | List list = new ArrayList(); 42 | 43 | Connection connection = null; 44 | PreparedStatement psmt = null; 45 | ResultSet rs = null; 46 | 47 | try { 48 | connection = MySQLUtils.getConnection(); 49 | String sql = "select cms_id ,times from day_video_access_topn_stat where day =? order by times desc limit 5"; 50 | psmt = connection.prepareStatement(sql); 51 | psmt.setString(1, day); 52 | 53 | rs = psmt.executeQuery(); 54 | 55 | VideoAccessTopN domain = null; 56 | while(rs.next()) { 57 | domain = new VideoAccessTopN(); 58 | /** 59 | * TODO... 在页面上应该显示的是课程名称,而我们此时拿到的是课程编号 60 | * 61 | * 如何根据课程编号去获取课程名称呢? 62 | * 编号和名称是有一个对应关系的,一般是存放在关系型数据库 63 | */ 64 | domain.setName(getCourseName(rs.getLong("cms_id")+"")); 65 | domain.setValue(rs.getLong("times")); 66 | 67 | list.add(domain); 68 | } 69 | 70 | }catch (Exception e) { 71 | e.printStackTrace(); 72 | } finally { 73 | MySQLUtils.release(connection, psmt, rs); 74 | } 75 | return list; 76 | } 77 | 78 | public static void main(String[] args) { 79 | VideoAccessTopNDAO dao = new VideoAccessTopNDAO(); 80 | List list = dao.query("20180507"); 81 | for(VideoAccessTopN result: list) { 82 | System.out.println(result.getName() + " , " + result.getValue()); 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /spark-sql-visualization/src/main/java/cn/edu/nju/domain/VideoAccessTopN.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/5/8. 5 | */ 6 | public class VideoAccessTopN { 7 | 8 | private String name; 9 | private long value ; 10 | 11 | public String getName() { 12 | return name; 13 | } 14 | 15 | public void setName(String name) { 16 | this.name = name; 17 | } 18 | 19 | public long getValue() { 20 | return value; 21 | } 22 | 23 | public void setValue(long value) { 24 | this.value = value; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /spark-sql-visualization/src/main/java/cn/edu/nju/utils/MySQLUtils.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.utils; 2 | 3 | import java.sql.*; 4 | 5 | /** 6 | * Created by Thpffcj on 2018/5/8. 
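 *  JDBC helper for the local sparksql database: opens connections and releases Connection/PreparedStatement/ResultSet resources.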
7 | */ 8 | public class MySQLUtils { 9 | 10 | private static final String USERNAME = "root"; 11 | 12 | private static final String PASSWORD = "000000"; 13 | 14 | private static final String DRIVERCLASS = "com.mysql.jdbc.Driver"; 15 | 16 | private static final String URL = "jdbc:mysql://localhost:3306/sparksql"; 17 | 18 | /** 19 | * 获取数据库连接 20 | */ 21 | public static Connection getConnection() { 22 | Connection connection = null; 23 | try { 24 | Class.forName(DRIVERCLASS); 25 | connection = DriverManager.getConnection(URL,USERNAME,PASSWORD); 26 | } catch (Exception e) { 27 | e.printStackTrace(); 28 | } 29 | 30 | return connection; 31 | } 32 | 33 | /** 34 | * 释放资源 35 | */ 36 | public static void release(Connection connection, PreparedStatement pstmt, ResultSet rs) { 37 | if(rs != null) { 38 | try { 39 | rs.close(); 40 | } catch (SQLException e) { 41 | e.printStackTrace(); 42 | } 43 | } 44 | 45 | if(pstmt != null) { 46 | try { 47 | pstmt.close(); 48 | } catch (SQLException e) { 49 | e.printStackTrace(); 50 | } 51 | } 52 | 53 | if(connection != null) { 54 | try { 55 | connection.close(); 56 | } catch (SQLException e) { 57 | e.printStackTrace(); 58 | } 59 | } 60 | } 61 | 62 | public static void main(String[] args) { 63 | System.out.println(getConnection()); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /spark-sql-visualization/src/main/java/cn/edu/nju/web/VideoAccessTopNServlet.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.web; 2 | 3 | import cn.edu.nju.dao.VideoAccessTopNDAO; 4 | import cn.edu.nju.domain.VideoAccessTopN; 5 | import net.sf.json.JSONArray; 6 | 7 | import javax.servlet.ServletException; 8 | import javax.servlet.http.HttpServlet; 9 | import javax.servlet.http.HttpServletRequest; 10 | import javax.servlet.http.HttpServletResponse; 11 | import java.io.IOException; 12 | import java.io.PrintWriter; 13 | import java.util.List; 14 | 15 | /** 16 | * Created by Thpffcj on 2018/5/8. 
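 *  Answers GET/POST requests with the given day's Top 5 courses as a JSON array (queried through VideoAccessTopNDAO); intended to back the ECharts pages under webapp/.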
17 |  */
18 | public class VideoAccessTopNServlet extends HttpServlet {
19 | 
20 |     private VideoAccessTopNDAO dao;
21 | 
22 |     @Override
23 |     public void init() throws ServletException {
24 |         dao = new VideoAccessTopNDAO();
25 |     }
26 | 
27 |     @Override
28 |     protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
29 |         String day = req.getParameter("day");
30 | 
31 |         List<VideoAccessTopN> results = dao.query(day);
32 |         JSONArray json = JSONArray.fromObject(results);
33 | 
34 |         resp.setContentType("text/html;charset=utf-8");
35 | 
36 |         PrintWriter writer = resp.getWriter();
37 |         writer.println(json);
38 |         writer.flush();
39 |         writer.close();
40 |     }
41 | 
42 |     @Override
43 |     protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws ServletException, IOException {
44 |         this.doGet(req, resp);
45 |     }
46 | }
47 | 
-------------------------------------------------------------------------------- /spark-sql-visualization/src/main/webapp/WEB-INF/web.xml: --------------------------------------------------------------------------------
1 | <!DOCTYPE web-app PUBLIC
2 |  "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
3 |  "http://java.sun.com/dtd/web-app_2_3.dtd" >
4 | <web-app>
5 | 
6 |   <display-name>Archetype Created Web Application</display-name>
7 | 
8 |   <servlet>
9 |     <servlet-name>stat</servlet-name>
10 |     <servlet-class>cn.edu.nju.web.VideoAccessTopNServlet</servlet-class>
11 |   </servlet>
12 | 
13 |   <servlet-mapping>
14 |     <servlet-name>stat</servlet-name>
15 |     <url-pattern>/stat</url-pattern>
16 |   </servlet-mapping>
17 | 
18 | </web-app>
-------------------------------------------------------------------------------- /spark-sql-visualization/src/main/webapp/test.html: --------------------------------------------------------------------------------
(page markup was not preserved in this export; page title: Echarts HelloWorld — a static ECharts demo page)
-------------------------------------------------------------------------------- /spark-sql-visualization/src/main/webapp/topn.html: --------------------------------------------------------------------------------
(page markup was not preserved in this export; page title: 主站最受欢迎的TopN课程, i.e. the site's most popular TopN courses — presumably renders the JSON served by the /stat servlet)
-------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/StreamingWordCountApp.java: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark;
2 | 
3 | import org.apache.spark.*;
4 | import org.apache.spark.streaming.*;
5 | import org.apache.spark.streaming.api.java.*;
6 | import scala.Tuple2;
7 | 
8 | import java.util.Arrays;
9 | 
10 | 
11 | /**
12 |  * Created by Thpffcj on 2018/1/16.
13 |  * A Spark Streaming application written in Java
14 |  */
15 | public class StreamingWordCountApp {
16 | 
17 |     public static void main(String[] args) throws InterruptedException {
18 | 
19 |         SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("StreamingWordCountApp");
20 |         JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
21 | 
22 |         // Create a DStream from a socket source (hostname + port)
23 |         JavaReceiverInputDStream<String> lines = jssc.socketTextStream("localhost", 9999);
24 | 
25 |         JavaDStream<String> words = lines.flatMap(x -> Arrays.asList(x.split(" ")).iterator());
26 | 
27 |         JavaPairDStream<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
28 |         JavaPairDStream<String, Integer> wordCounts = pairs.reduceByKey((i1, i2) -> i1 + i2);
29 | 
30 |         wordCounts.print();
31 | 
32 |         jssc.start();
33 |         jssc.awaitTermination();
34 |     }
35 | }
36 | 
-------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/WordCountApp.java: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark;
2 | 
3 | import org.apache.spark.api.java.JavaPairRDD;
4 | import org.apache.spark.api.java.JavaRDD;
5 | import org.apache.spark.sql.SparkSession;
6 | import scala.Tuple2;
7 | 
8 | import java.util.Arrays;
9 | import java.util.List;
10 | 
11 | /**
12 |  * Created by Thpffcj on 2018/1/16.
13 |  * A Spark batch application written in Java
14 |  */
15 | public class WordCountApp {
16 | 
17 |     public static void main(String[] args) {
18 | 
19 |         SparkSession spark = SparkSession.builder().appName("WordCountApp").master("local[2]").getOrCreate();
20 | 
21 |         JavaRDD<String> lines = spark.read().textFile("/Users/thpffcj/Public/file/hello.txt").javaRDD();
22 |         JavaRDD<String> words = lines.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
23 |         JavaPairRDD<String, Integer> counts = words.mapToPair(word -> new Tuple2<>(word, 1)).reduceByKey((x, y) -> x + y);
24 | 
25 |         List<Tuple2<String, Integer>> output = counts.collect();
26 | 
27 |         for (Tuple2<String, Integer> tuple : output) {
28 |             System.out.println(tuple._1() + " : " + tuple._2());
29 |         }
30 | 
31 |         spark.stop();
32 |     }
33 | }
34 | 
-------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/kafkas/KafkaClientApp.java: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark.kafkas;
2 | 
3 | /**
4 |  * Created by Thpffcj on 2018/1/11. 
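 *  Starts one KafkaProducer thread and one KafkaConsumer thread against KafkaProperties.TOPIC as a simple end-to-end smoke test.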
5 | */ 6 | public class KafkaClientApp { 7 | 8 | public static void main(String[] args) { 9 | new KafkaProducer(KafkaProperties.TOPIC).start(); 10 | new KafkaConsumer(KafkaProperties.TOPIC).start(); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/kafkas/KafkaConsumer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.kafkas; 2 | 3 | import kafka.consumer.Consumer; 4 | import kafka.consumer.ConsumerConfig; 5 | import kafka.consumer.ConsumerIterator; 6 | import kafka.consumer.KafkaStream; 7 | import kafka.javaapi.consumer.ConsumerConnector; 8 | 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Properties; 13 | 14 | /** 15 | * Created by Thpffcj on 2018/1/11. 16 | */ 17 | public class KafkaConsumer extends Thread { 18 | 19 | private String topic; 20 | 21 | public KafkaConsumer(String topic) { 22 | this.topic = topic; 23 | } 24 | 25 | private ConsumerConnector createConnector() { 26 | 27 | Properties properties = new Properties(); 28 | properties.put("zookeeper.connect", KafkaProperties.ZOOKEEPER); 29 | properties.put("group.id", KafkaProperties.GROUP_ID); 30 | 31 | return Consumer.createJavaConsumerConnector(new ConsumerConfig(properties)); 32 | } 33 | 34 | @Override 35 | public void run() { 36 | ConsumerConnector consumer = createConnector(); 37 | 38 | Map topicCountMap = new HashMap(); 39 | topicCountMap.put(topic, 1); 40 | 41 | // String: topic 42 | // List> 对应的数据流 43 | Map>> messageStream = consumer.createMessageStreams(topicCountMap); 44 | 45 | KafkaStream stream = messageStream.get(topic).get(0); //获取我们每次接收到的数据 46 | 47 | ConsumerIterator iterator = stream.iterator(); 48 | 49 | while (iterator.hasNext()) { 50 | String message = new String(iterator.next().message()); 51 | System.out.println("rec: " + message); 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/kafkas/KafkaProducer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.kafkas; 2 | 3 | import kafka.javaapi.producer.Producer; 4 | import kafka.producer.KeyedMessage; 5 | import kafka.producer.ProducerConfig; 6 | 7 | import java.util.Properties; 8 | 9 | /** 10 | * Created by Thpffcj on 2018/1/11. 
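 *  Producer thread that sends "message_N" every two seconds using the old kafka.javaapi producer API (StringEncoder, request.required.acks=1).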
11 | */ 12 | public class KafkaProducer extends Thread { 13 | 14 | private String topic; 15 | 16 | private Producer producer; 17 | 18 | public KafkaProducer(String topic) { 19 | this.topic = topic; 20 | 21 | Properties properties = new Properties(); 22 | properties.put("metadata.broker.list", KafkaProperties.BROKER_LIST); 23 | properties.put("serializer.class", "kafka.serializer.StringEncoder"); 24 | properties.put("request.required.acks", "1"); 25 | 26 | producer = new Producer(new ProducerConfig(properties)); 27 | } 28 | 29 | @Override 30 | public void run() { 31 | 32 | int messageNo = 1; 33 | 34 | while (true) { 35 | String message = "message_" + messageNo; 36 | producer.send(new KeyedMessage(topic, message)); 37 | System.out.println("Sent:" + message); 38 | 39 | messageNo++; 40 | 41 | try { 42 | Thread.sleep(2000); 43 | } catch (Exception e) { 44 | e.printStackTrace(); 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /spark-train/src/main/java/cn/edu/nju/spark/kafkas/KafkaProperties.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.kafkas; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/1/11. 5 | * Kafka常用配置文件 6 | */ 7 | public class KafkaProperties { 8 | 9 | public static final String ZOOKEEPER = "192.168.92.130:2181"; 10 | public static final String TOPIC = "kafka-topic"; 11 | public static final String BROKER_LIST = "192.168.92.130:9092"; 12 | public static final String GROUP_ID = "test_group1"; 13 | } 14 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/FlumePullWordCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.flume.FlumeUtils 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * Created by Thpffcj on 2018/1/13. 9 | * Spark Streaming整合Flume的第二种方式 10 | */ 11 | object FlumePullWordCount { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | if(args.length != 2) { 16 | System.err.println("Usage: FlumePullWordCount ") 17 | System.exit(1) 18 | } 19 | 20 | val Array(hostname, port) = args 21 | 22 | val sparkConf = new SparkConf().setMaster("local[2]") //.setAppName("FlumePullWordCount") 23 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 24 | 25 | val flumeStream = FlumeUtils.createPollingStream(ssc, hostname, port.toInt) 26 | 27 | flumeStream.map(x=> new String(x.event.getBody.array()).trim) 28 | .flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).print() 29 | 30 | ssc.start() 31 | ssc.awaitTermination() 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/FlumePushWordCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.streaming.flume.FlumeUtils 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * Created by Thpffcj on 2018/1/13. 
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/FlumePullWordCount.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.flume.FlumeUtils
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 |  * Created by Thpffcj on 2018/1/13.
9 |  * Spark Streaming integration with Flume, approach two (pull-based)
10 |  */
11 | object FlumePullWordCount {
12 |
13 |   def main(args: Array[String]): Unit = {
14 |
15 |     if (args.length != 2) {
16 |       System.err.println("Usage: FlumePullWordCount <hostname> <port>")
17 |       System.exit(1)
18 |     }
19 |
20 |     val Array(hostname, port) = args
21 |
22 |     val sparkConf = new SparkConf().setMaster("local[2]") //.setAppName("FlumePullWordCount")
23 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
24 |
25 |     val flumeStream = FlumeUtils.createPollingStream(ssc, hostname, port.toInt)
26 |
27 |     flumeStream.map(x => new String(x.event.getBody.array()).trim)
28 |       .flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
29 |
30 |     ssc.start()
31 |     ssc.awaitTermination()
32 |   }
33 | }
34 |
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/FlumePushWordCount.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.flume.FlumeUtils
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 |  * Created by Thpffcj on 2018/1/13.
9 |  * Spark Streaming integration with Flume, approach one (push-based)
10 |  */
11 | object FlumePushWordCount {
12 |
13 |   def main(args: Array[String]): Unit = {
14 |
15 |     if (args.length != 2) {
16 |       System.err.println("Usage: FlumePushWordCount <hostname> <port>")
17 |       System.exit(1)
18 |     }
19 |
20 |     val Array(hostname, port) = args
21 |
22 |     val sparkConf = new SparkConf().setMaster("local[2]").setAppName("FlumePushWordCount")
23 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
24 |
25 |     val flumeStream = FlumeUtils.createStream(ssc, hostname, port.toInt)
26 |
27 |     flumeStream.map(x => new String(x.event.getBody.array()).trim)
28 |       .flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
29 |
30 |     ssc.start()
31 |     ssc.awaitTermination()
32 |   }
33 | }
34 |
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/ForeachRDDApp.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import java.sql.DriverManager
4 |
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.streaming.{Seconds, StreamingContext}
7 |
8 | /**
9 |  * Created by Thpffcj on 2018/1/13.
10 |  * Word count with Spark Streaming, writing the results to MySQL
11 |  */
12 | object ForeachRDDApp {
13 |
14 |   def main(args: Array[String]): Unit = {
15 |
16 |     val sparkConf = new SparkConf().setAppName("ForeachRDDApp").setMaster("local[2]")
17 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
18 |
19 |     val lines = ssc.socketTextStream("192.168.92.130", 6789)
20 |
21 |     val result = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
22 |
23 |     // Here the counts are only printed to the console
24 |     result.print()
25 |
26 |     // Write the counts to MySQL
27 |     result.foreachRDD(rdd => {
28 |       rdd.foreachPartition(partitionOfRecords => {
29 |         val connection = createConnection()
30 |         partitionOfRecords.foreach(record => {
31 |           val sql = "insert into wordcount(word, wordcount) values('" + record._1 + "'," + record._2 + ")"
32 |           connection.createStatement().execute(sql)
33 |         })
34 |
35 |         connection.close()
36 |       })
37 |     })
38 |
39 |     ssc.start()
40 |     ssc.awaitTermination()
41 |   }
42 |
43 |   /**
44 |    * Get a MySQL connection
45 |    */
46 |   def createConnection() = {
47 |     Class.forName("com.mysql.jdbc.Driver")
48 |     DriverManager.getConnection("jdbc:mysql://localhost:3306/spark", "root", "000000")
49 |   }
50 | }
51 |
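One caveat in ForeachRDDApp above: the INSERT statement is built by string concatenation and a new Statement is created for every record, so a word containing a quote breaks the SQL. A sketch of the same write using a PreparedStatement, assuming the same wordcount table and the createConnection() helper defined above:

result.foreachRDD(rdd => {
  rdd.foreachPartition(partitionOfRecords => {
    val connection = createConnection()
    val statement = connection.prepareStatement("insert into wordcount(word, wordcount) values (?, ?)")
    partitionOfRecords.foreach { case (word, count) =>
      statement.setString(1, word)
      statement.setInt(2, count)
      statement.executeUpdate()
    }
    statement.close()
    connection.close()
  })
})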
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/KafkaDirectWordCount.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import kafka.serializer.StringDecoder
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.streaming.kafka.KafkaUtils
6 | import org.apache.spark.streaming.{Seconds, StreamingContext}
7 |
8 | /**
9 |  * Created by Thpffcj on 2018/1/13.
10 |  * Spark Streaming integration with Kafka, approach two (direct, receiver-less)
11 |  */
12 | object KafkaDirectWordCount {
13 |
14 |   def main(args: Array[String]): Unit = {
15 |
16 |     if (args.length != 2) {
17 |       System.err.println("Usage: KafkaDirectWordCount <brokers> <topics>")
18 |       System.exit(1)
19 |     }
20 |
21 |     val Array(brokers, topics) = args
22 |
23 |     val sparkConf = new SparkConf() //.setAppName("KafkaDirectWordCount").setMaster("local[2]")
24 |
25 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
26 |
27 |     val topicsSet = topics.split(",").toSet
28 |     val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
29 |
30 |     val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
31 |       ssc, kafkaParams, topicsSet
32 |     )
33 |
34 |     messages.map(_._2).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
35 |
36 |     ssc.start()
37 |     ssc.awaitTermination()
38 |   }
39 | }
40 |
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/KafkaReceiverWordCount.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.kafka.KafkaUtils
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 |  * Created by Thpffcj on 2018/1/13.
9 |  * Spark Streaming integration with Kafka, approach one (receiver-based)
10 |  */
11 | object KafkaReceiverWordCount {
12 |
13 |   def main(args: Array[String]): Unit = {
14 |
15 |     if (args.length != 4) {
16 |       System.err.println("Usage: KafkaReceiverWordCount <zkQuorum> <group> <topics> <numThreads>")
17 |       System.exit(1)
18 |     }
19 |
20 |     val Array(zkQuorum, group, topics, numThreads) = args
21 |
22 |     val sparkConf = new SparkConf().setAppName("KafkaReceiverWordCount").setMaster("local[2]")
23 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
24 |
25 |     val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
26 |
27 |     val messages = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
28 |
29 |     messages.map(_._2).flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()
30 |
31 |     ssc.start()
32 |     ssc.awaitTermination()
33 |   }
34 | }
35 |
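A note on the two integrations above: the receiver-based KafkaReceiverWordCount tracks consumed offsets in ZooKeeper, while the direct KafkaDirectWordCount manages offsets itself, so nothing is persisted unless the application does it. With the spark-streaming-kafka-0-8 connector used here, the offsets covered by each batch can be inspected roughly as in the sketch below; it builds on the messages stream of KafkaDirectWordCount, and the println is only illustrative of where offsets would be stored.

import org.apache.spark.streaming.kafka.{HasOffsetRanges, OffsetRange}

var offsetRanges = Array.empty[OffsetRange]

messages.transform { rdd =>
  // capture the Kafka offset ranges covered by this batch
  offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  rdd
}.map(_._2).foreachRDD { rdd =>
  offsetRanges.foreach(o => println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}"))
}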
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/KafkaStreamingApp.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.kafka.KafkaUtils
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 |  * Created by Thpffcj on 2018/1/14.
9 |  */
10 | object KafkaStreamingApp {
11 |
12 |   def main(args: Array[String]): Unit = {
13 |
14 |     if (args.length != 4) {
15 |       System.err.println("Usage: KafkaStreamingApp <zkQuorum> <group> <topics> <numThreads>")
16 |       System.exit(1)
17 |     }
18 |
19 |     val Array(zkQuorum, group, topics, numThreads) = args
20 |
21 |     val sparkConf = new SparkConf().setAppName("KafkaStreamingApp")
22 |       .setMaster("local[2]")
23 |
24 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
25 |
26 |     val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
27 |
28 |     val messages = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap)
29 |
30 |     messages.map(_._2).count().print()
31 |
32 |     ssc.start()
33 |     ssc.awaitTermination()
34 |   }
35 | }
36 |
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/StatefulWordCount.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.{Seconds, StreamingContext}
5 |
6 | /**
7 |  * Created by Thpffcj on 2018/1/12.
8 |  * Stateful word count with Spark Streaming
9 |  */
10 | object StatefulWordCount {
11 |
12 |   def main(args: Array[String]): Unit = {
13 |
14 |     val sparkConf = new SparkConf().setAppName("StatefulWordCount").setMaster("local[2]")
15 |     val ssc = new StreamingContext(sparkConf, Seconds(5))
16 |
17 |     // Stateful operators require a checkpoint directory to be set
18 |     // In production it is recommended to put the checkpoint in a directory on HDFS
19 |     // "." means the current directory
20 |     ssc.checkpoint(".")
21 |
22 |     val lines = ssc.socketTextStream("localhost", 9999)
23 |
24 |     val result = lines.flatMap(_.split(" ")).map((_, 1))
25 |     val state = result.updateStateByKey[Int](updateFunction _)
26 |
27 |     state.print()
28 |
29 |     ssc.start()
30 |     ssc.awaitTermination()
31 |   }
32 |
33 |   /**
34 |    * Merge the values of the current batch into the existing (old) state
35 |    * @param currentValues the values that arrived in the current batch
36 |    * @param preValues the previously accumulated count
37 |    * @return the updated count for the key
38 |    */
39 |   def updateFunction(currentValues: Seq[Int], preValues: Option[Int]): Option[Int] = {
40 |     val current = currentValues.sum
41 |     val pre = preValues.getOrElse(0)
42 |
43 |     Some(current + pre)
44 |   }
45 | }
46 |
-------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/TransformApp.scala: --------------------------------------------------------------------------------
1 | package cn.edu.nju.spark
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.{Seconds, StreamingContext}
5 |
6 | /**
7 |  * Created by Thpffcj on 2018/1/13.
8 | * 黑名单过滤 9 | */ 10 | object TransformApp { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TransformApp") 15 | 16 | // 创建StreamingContext需要两个参数:SparkConf和batch interval 17 | val ssc = new StreamingContext(sparkConf, Seconds(5)) 18 | 19 | // 构建黑名单 20 | val blacks = List("zs", "ls") 21 | val blacksRDD = ssc.sparkContext.parallelize(blacks).map(x => (x, true)) 22 | 23 | val lines = ssc.socketTextStream("192.168.92.130", 6789) 24 | val clicklog = lines.map(x => (x.split(",")(1), x)).transform(rdd => { 25 | rdd.leftOuterJoin(blacksRDD) 26 | .filter(x => x._2._2.getOrElse(false) != true) 27 | .map(x => x._2._1) 28 | }) 29 | 30 | clicklog.print() 31 | 32 | ssc.start() 33 | ssc.awaitTermination() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/dao/CourseClickCountDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.dao 2 | 3 | import cn.edu.nju.spark.project.domain.CourseClickCount 4 | import cn.edu.nju.spark.project.utils.HBaseUtils 5 | import org.apache.hadoop.hbase.client.Get 6 | import org.apache.hadoop.hbase.util.Bytes 7 | 8 | import scala.collection.mutable.ListBuffer 9 | 10 | /** 11 | * Created by Thpffcj on 2018/1/15. 12 | * 实战课程点击数-数据访问层 13 | */ 14 | object CourseClickCountDAO { 15 | 16 | val tableName = "imooc_course_clickcount" 17 | val cf = "info" 18 | val qualifer = "click_count" 19 | 20 | /** 21 | * 保存数据到HBase 22 | * @param list CourseClickCount集合 23 | */ 24 | def save(list: ListBuffer[CourseClickCount]): Unit = { 25 | 26 | val table = HBaseUtils.getInstance().getTable(tableName) 27 | 28 | for(ele <- list) { 29 | table.incrementColumnValue(Bytes.toBytes(ele.day_course), 30 | Bytes.toBytes(cf), 31 | Bytes.toBytes(qualifer), 32 | ele.click_count) 33 | } 34 | } 35 | 36 | /** 37 | * 根据rowkey查询值 38 | */ 39 | def count(day_course: String): Long = { 40 | val table = HBaseUtils.getInstance().getTable(tableName) 41 | 42 | val get = new Get(Bytes.toBytes(day_course)) 43 | val value = table.get(get).getValue(cf.getBytes, qualifer.getBytes) 44 | 45 | if(value == null) { 46 | 0L 47 | }else{ 48 | Bytes.toLong(value) 49 | } 50 | } 51 | 52 | def main(args: Array[String]): Unit = { 53 | 54 | 55 | val list = new ListBuffer[CourseClickCount] 56 | list.append(CourseClickCount("20171111_8",8)) 57 | list.append(CourseClickCount("20171111_9",9)) 58 | list.append(CourseClickCount("20171111_1",100)) 59 | 60 | save(list) 61 | 62 | println(count("20171111_8") + " : " + count("20171111_9")+ " : " + count("20171111_1")) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/dao/CourseSearchClickCountDAO.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.dao 2 | 3 | import cn.edu.nju.spark.project.domain.CourseSearchClickCount 4 | import cn.edu.nju.spark.project.utils.HBaseUtils 5 | import org.apache.hadoop.hbase.client.Get 6 | import org.apache.hadoop.hbase.util.Bytes 7 | 8 | import scala.collection.mutable.ListBuffer 9 | 10 | /** 11 | * Created by Thpffcj on 2018/1/15. 
12 | * 从搜索引擎过来的实战课程点击数-数据访问层 13 | */ 14 | object CourseSearchClickCountDAO { 15 | 16 | val tableName = "imooc_course_search_clickcount" 17 | val cf = "info" 18 | val qualifer = "click_count" 19 | 20 | /** 21 | * 保存数据到HBase 22 | * 23 | * @param list CourseSearchClickCount集合 24 | */ 25 | def save(list: ListBuffer[CourseSearchClickCount]): Unit = { 26 | 27 | val table = HBaseUtils.getInstance().getTable(tableName) 28 | 29 | for(ele <- list) { 30 | table.incrementColumnValue(Bytes.toBytes(ele.day_search_course), 31 | Bytes.toBytes(cf), 32 | Bytes.toBytes(qualifer), 33 | ele.click_count) 34 | } 35 | 36 | } 37 | 38 | /** 39 | * 根据rowkey查询值 40 | */ 41 | def count(day_search_course: String):Long = { 42 | val table = HBaseUtils.getInstance().getTable(tableName) 43 | 44 | val get = new Get(Bytes.toBytes(day_search_course)) 45 | val value = table.get(get).getValue(cf.getBytes, qualifer.getBytes) 46 | 47 | if(value == null) { 48 | 0L 49 | }else{ 50 | Bytes.toLong(value) 51 | } 52 | } 53 | 54 | def main(args: Array[String]): Unit = { 55 | 56 | val list = new ListBuffer[CourseSearchClickCount] 57 | list.append(CourseSearchClickCount("20171111_www.baidu.com_8",8)) 58 | list.append(CourseSearchClickCount("20171111_cn.bing.com_9",9)) 59 | 60 | save(list) 61 | 62 | println(count("20171111_www.baidu.com_8") + " : " + count("20171111_cn.bing.com_9")) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/domain/ClickLog.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.domain 2 | 3 | /** 4 | * Created by Thpffcj on 2018/1/15. 5 | * 清洗后的日志信息 6 | * @param ip 日志访问的ip地址 7 | * @param time 日志访问的时间 8 | * @param courseId 日志访问的实战课程编号 9 | * @param statusCode 日志访问的状态码 10 | * @param referrer 日志访问的referrer 11 | */ 12 | case class ClickLog(ip:String, time:String, courseId:Int, statusCode:Int, referrer:String) 13 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/domain/CourseClickCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.domain 2 | 3 | /** 4 | * Created by Thpffcj on 2018/1/15. 5 | * 实战课程点击数实体类 6 | * @param day_course 对应的就是HBase中的rowkey,20171111_1 7 | * @param click_count 对应的20171111_1的访问总数 8 | */ 9 | case class CourseClickCount(day_course:String, click_count:Long) 10 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/domain/CourseSearchClickCount.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.domain 2 | 3 | /** 4 | * Created by Thpffcj on 2018/1/15. 5 | * 从搜索引擎过来的实战课程点击数实体类 6 | * @param day_search_course 7 | * @param click_count 8 | */ 9 | case class CourseSearchClickCount(day_search_course:String, click_count:Long) 10 | -------------------------------------------------------------------------------- /spark-train/src/main/scala/cn/edu/nju/spark/project/utils/DateUtils.scala: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.spark.project.utils 2 | 3 | import java.util.Date 4 | 5 | import org.apache.commons.lang3.time.FastDateFormat 6 | 7 | /** 8 | * Created by Thpffcj on 2018/1/15. 
9 | * 日期时间工具类 10 | */ 11 | object DateUtils { 12 | 13 | val YYYYMMDDHHMMSS_FORMAT = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss") 14 | val TARGE_FORMAT = FastDateFormat.getInstance("yyyyMMddHHmmss") 15 | 16 | 17 | def getTime(time: String) = { 18 | YYYYMMDDHHMMSS_FORMAT.parse(time).getTime 19 | } 20 | 21 | def parseToMinute(time :String) = { 22 | TARGE_FORMAT.format(new Date(getTime(time))) 23 | } 24 | 25 | def main(args: Array[String]): Unit = { 26 | 27 | println(parseToMinute("2017-10-22 14:46:01")) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /spark-train/src/test/java/LoggerGenerator.java: -------------------------------------------------------------------------------- 1 | import org.apache.log4j.Logger; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/1/14. 5 | * 模拟日志产生 6 | */ 7 | public class LoggerGenerator { 8 | 9 | private static Logger logger = Logger.getLogger(LoggerGenerator.class.getName()); 10 | 11 | public static void main(String[] args) throws Exception{ 12 | 13 | int index = 0; 14 | while(true) { 15 | Thread.sleep(1000); 16 | logger.info("value : " + index++); 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /spark-train/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootLogger=INFO,stdout,flume 2 | 3 | log4j.appender.stdout = org.apache.log4j.ConsoleAppender 4 | log4j.appender.stdout.target = System.out 5 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS} [%t] [%c] [%p] - %m%n 7 | 8 | log4j.appender.flume = org.apache.flume.clients.log4jappender.Log4jAppender 9 | log4j.appender.flume.Hostname = 192.168.92.130 10 | log4j.appender.flume.Port = 41414 11 | log4j.appender.flume.UnsafeMode = true 12 | 13 | -------------------------------------------------------------------------------- /storm-data-visualization/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | !.mvn/wrapper/maven-wrapper.jar 3 | 4 | ### STS ### 5 | .apt_generated 6 | .classpath 7 | .factorypath 8 | .project 9 | .settings 10 | .springBeans 11 | .sts4-cache 12 | 13 | ### IntelliJ IDEA ### 14 | .idea 15 | *.iws 16 | *.iml 17 | *.ipr 18 | 19 | ### NetBeans ### 20 | /nbproject/private/ 21 | /build/ 22 | /nbbuild/ 23 | /dist/ 24 | /nbdist/ 25 | /.nb-gradle/ -------------------------------------------------------------------------------- /storm-data-visualization/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | cn.edu.nju 7 | 151250052 8 | 0.0.1-SNAPSHOT 9 | jar 10 | 11 | storm-data-visualization 12 | 13 | 14 | 15 | org.springframework.boot 16 | spring-boot-starter-parent 17 | 2.0.1.RELEASE 18 | 19 | 20 | 21 | 22 | UTF-8 23 | UTF-8 24 | 1.8 25 | 26 | 27 | 28 | 29 | org.springframework.boot 30 | spring-boot-starter 31 | 32 | 33 | 34 | org.springframework.boot 35 | spring-boot-starter-web 36 | 37 | 38 | 39 | org.springframework.boot 40 | spring-boot-starter-test 41 | test 42 | 43 | 44 | 45 | org.springframework.boot 46 | spring-boot-starter-jdbc 47 | 48 | 49 | 50 | org.springframework.boot 51 | spring-boot-starter-thymeleaf 52 | 53 | 54 | 55 | mysql 56 | mysql-connector-java 57 | 5.1.38 58 | 59 | 60 | 61 | net.sf.json-lib 62 | json-lib 63 | 2.4 64 | jdk15 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 
org.springframework.boot 73 | spring-boot-maven-plugin 74 | 75 | 76 | 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /storm-data-visualization/src/main/java/cn/edu/nju/DataVisualizationApplication.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | @SpringBootApplication 7 | public class DataVisualizationApplication { 8 | 9 | public static void main(String[] args) { 10 | SpringApplication.run(DataVisualizationApplication.class, args); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /storm-data-visualization/src/main/java/cn/edu/nju/controller/StatApp.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.controller; 2 | 3 | import cn.edu.nju.domain.ResultBean; 4 | import cn.edu.nju.service.ResultBeanService; 5 | import net.sf.json.JSONArray; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.web.bind.annotation.RequestMapping; 8 | import org.springframework.web.bind.annotation.RequestMethod; 9 | import org.springframework.web.bind.annotation.ResponseBody; 10 | import org.springframework.web.bind.annotation.RestController; 11 | import org.springframework.web.servlet.ModelAndView; 12 | 13 | import java.util.List; 14 | 15 | /** 16 | * Created by Thpffcj on 2018/4/10. 17 | */ 18 | @RestController 19 | public class StatApp { 20 | 21 | @Autowired 22 | ResultBeanService resultBeanService; 23 | 24 | @RequestMapping(value = "/map", method = RequestMethod.GET) 25 | public ModelAndView map() { 26 | return new ModelAndView("map.html"); 27 | } 28 | 29 | @RequestMapping(value = "/map_stat", method = RequestMethod.POST) 30 | @ResponseBody 31 | public List mapStat() { 32 | List results = resultBeanService.query(); 33 | return results; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /storm-data-visualization/src/main/java/cn/edu/nju/domain/ResultBean.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.domain; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/4/10. 
5 | */ 6 | public class ResultBean { 7 | 8 | private double lng; 9 | private double lat; 10 | private long count; 11 | 12 | public double getLng() { 13 | return lng; 14 | } 15 | 16 | public void setLng(double lng) { 17 | this.lng = lng; 18 | } 19 | 20 | public double getLat() { 21 | return lat; 22 | } 23 | 24 | public void setLat(double lat) { 25 | this.lat = lat; 26 | } 27 | 28 | public long getCount() { 29 | return count; 30 | } 31 | 32 | public void setCount(long count) { 33 | this.count = count; 34 | } 35 | 36 | @Override 37 | public String toString() { 38 | return "ResultBean{" + 39 | "lng=" + lng + 40 | ", lat=" + lat + 41 | ", count=" + count + 42 | '}'; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /storm-data-visualization/src/main/java/cn/edu/nju/service/ResultBeanService.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.service; 2 | 3 | import cn.edu.nju.domain.ResultBean; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | import org.springframework.jdbc.core.JdbcTemplate; 6 | import org.springframework.jdbc.core.RowMapper; 7 | import org.springframework.stereotype.Service; 8 | 9 | import java.sql.ResultSet; 10 | import java.sql.SQLException; 11 | import java.util.List; 12 | 13 | /** 14 | * Created by Thpffcj on 2018/4/10. 15 | */ 16 | @Service 17 | public class ResultBeanService { 18 | 19 | @Autowired 20 | JdbcTemplate jdbcTemplate; 21 | 22 | public List query() { 23 | 24 | String sql = "select longitude, latitude, count(1) as c from stat where time > unix_timestamp(date_sub(current_timestamp(), interval 10 hour)) * 1000 group by longitude, latitude"; 25 | 26 | return (List) jdbcTemplate.query(sql, new RowMapper() { 27 | 28 | @Override 29 | public ResultBean mapRow(ResultSet resultSet, int i) throws SQLException { 30 | ResultBean bean = new ResultBean(); 31 | bean.setLng(resultSet.getDouble("longitude")); 32 | bean.setLat(resultSet.getDouble("latitude")); 33 | bean.setCount(resultSet.getLong("c")); 34 | return bean; 35 | } 36 | }); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /storm-data-visualization/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | spring.datasource.driver-class-name: com.mysql.jdbc.Driver 2 | spring.datasource.url: jdbc:mysql://127.0.0.1:3306/storm?useSSL=false 3 | spring.datasource.username: root 4 | spring.datasource.password: 000000 -------------------------------------------------------------------------------- /storm-data-visualization/src/main/resources/templates/map.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 基于Storm的实时区域游客量热力图统计 5 | 6 | 7 | 8 | 9 | 10 | 11 |
12 | 13 | 64 | 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /storm-data-visualization/src/test/java/cn/edi/nju/DataVisualizationApplicationTests.java: -------------------------------------------------------------------------------- 1 | package cn.edi.nju; 2 | 3 | import org.junit.Test; 4 | import org.junit.runner.RunWith; 5 | import org.springframework.boot.test.context.SpringBootTest; 6 | import org.springframework.test.context.junit4.SpringRunner; 7 | 8 | @RunWith(SpringRunner.class) 9 | @SpringBootTest 10 | public class DataVisualizationApplicationTests { 11 | 12 | @Test 13 | public void contextLoads() { 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/LocalDRPCTopology.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | import org.apache.storm.Config; 4 | import org.apache.storm.LocalCluster; 5 | import org.apache.storm.LocalDRPC; 6 | import org.apache.storm.drpc.LinearDRPCTopologyBuilder; 7 | import org.apache.storm.task.OutputCollector; 8 | import org.apache.storm.task.TopologyContext; 9 | import org.apache.storm.topology.OutputFieldsDeclarer; 10 | import org.apache.storm.topology.base.BaseRichBolt; 11 | import org.apache.storm.tuple.Fields; 12 | import org.apache.storm.tuple.Tuple; 13 | import org.apache.storm.tuple.Values; 14 | 15 | import java.util.Map; 16 | 17 | /** 18 | * Created by Thpffcj on 2018/4/6. 19 | * 本地的DRPC 20 | */ 21 | public class LocalDRPCTopology { 22 | 23 | public static class MyBolt extends BaseRichBolt { 24 | 25 | private OutputCollector outputCollector; 26 | 27 | public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { 28 | this.outputCollector = collector; 29 | } 30 | 31 | public void execute(Tuple input) { 32 | 33 | // 请求的id 34 | Object requestId = input.getValue(0); 35 | // 请求的参数 36 | String name = input.getString(1); 37 | 38 | /** 39 | * TODO... 业务逻辑处理 40 | */ 41 | String result = "add user: " + name; 42 | 43 | this.outputCollector.emit(new Values(requestId, result)); 44 | } 45 | 46 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 47 | declarer.declare(new Fields("id", "result")); 48 | } 49 | } 50 | 51 | public static void main(String[] args) { 52 | LinearDRPCTopologyBuilder builder = new LinearDRPCTopologyBuilder("addUser"); 53 | builder.addBolt(new MyBolt()); 54 | 55 | LocalCluster localCluster = new LocalCluster(); 56 | LocalDRPC drpc = new LocalDRPC(); 57 | localCluster.submitTopology("local-drpc", new Config(), 58 | builder.createLocalTopology(drpc)); 59 | 60 | String result = drpc.execute("addUser", "Thpffcj"); 61 | System.out.println("From client: " + result); 62 | 63 | localCluster.shutdown(); 64 | drpc.shutdown(); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/RPCClient.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.ipc.RPC; 5 | 6 | import java.io.IOException; 7 | import java.net.InetSocketAddress; 8 | 9 | /** 10 | * Created by Thpffcj on 2018/4/6. 
11 | * RPC 客户端 12 | */ 13 | public class RPCClient { 14 | 15 | public static void main(String[] args) throws IOException { 16 | 17 | Configuration configuration = new Configuration(); 18 | 19 | long clientVersion = 88888888; 20 | 21 | UserService userService = RPC.getProxy(UserService.class, clientVersion, 22 | new InetSocketAddress("localhost", 9999), 23 | configuration); 24 | 25 | userService.addUser("Thpffcj", 21); 26 | System.out.println("From client invoked"); 27 | 28 | RPC.stopProxy(userService); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/RPCServer.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.ipc.RPC; 5 | 6 | import java.io.IOException; 7 | 8 | /** 9 | * Created by Thpffcj on 2018/4/6. 10 | * RPC Server服务 11 | */ 12 | public class RPCServer { 13 | 14 | public static void main(String[] args) throws IOException { 15 | 16 | Configuration configuration = new Configuration(); 17 | 18 | RPC.Builder builder = new RPC.Builder(configuration); 19 | 20 | // Java Builder 模式 21 | RPC.Server server = builder.setProtocol(UserService.class) 22 | .setInstance(new UserServiceImpl()) 23 | .setBindAddress("localhost").setPort(9999).build(); 24 | 25 | 26 | server.start(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/RemoteDRPCClient.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | import org.apache.storm.Config; 4 | import org.apache.storm.thrift.transport.TTransportException; 5 | import org.apache.storm.utils.DRPCClient; 6 | 7 | /** 8 | * Created by Thpffcj on 2018/4/6. 
9 | * Remote DRPC 客户端测试类 10 | */ 11 | public class RemoteDRPCClient { 12 | 13 | public static void main(String[] args) throws Exception { 14 | 15 | Config config = new Config(); 16 | config.put("storm.thrift.transport", "org.apache.storm.security.auth.SimpleTransportPlugin"); 17 | config.put(Config.STORM_NIMBUS_RETRY_TIMES, 3); 18 | config.put(Config.STORM_NIMBUS_RETRY_INTERVAL, 10); 19 | config.put(Config.STORM_NIMBUS_RETRY_INTERVAL_CEILING, 20); 20 | config.put(Config.DRPC_MAX_BUFFER_SIZE, 1048576); 21 | 22 | DRPCClient client = new DRPCClient(config, "thpffcj", 3772); 23 | String result = client.execute("addUser", "Thpffcj"); 24 | 25 | System.out.println("Client invoked: " + result); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/RemoteDRPCTopology.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | import org.apache.storm.Config; 4 | import org.apache.storm.LocalCluster; 5 | import org.apache.storm.LocalDRPC; 6 | import org.apache.storm.StormSubmitter; 7 | import org.apache.storm.drpc.LinearDRPCTopologyBuilder; 8 | import org.apache.storm.generated.AlreadyAliveException; 9 | import org.apache.storm.generated.AuthorizationException; 10 | import org.apache.storm.generated.InvalidTopologyException; 11 | import org.apache.storm.task.OutputCollector; 12 | import org.apache.storm.task.TopologyContext; 13 | import org.apache.storm.topology.OutputFieldsDeclarer; 14 | import org.apache.storm.topology.base.BaseRichBolt; 15 | import org.apache.storm.tuple.Fields; 16 | import org.apache.storm.tuple.Tuple; 17 | import org.apache.storm.tuple.Values; 18 | 19 | import java.util.Map; 20 | 21 | /** 22 | * Created by Thpffcj on 2018/4/6. 23 | * 远程的DRPC 24 | */ 25 | public class RemoteDRPCTopology { 26 | 27 | public static class MyBolt extends BaseRichBolt { 28 | 29 | private OutputCollector outputCollector; 30 | 31 | public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { 32 | this.outputCollector = collector; 33 | } 34 | 35 | public void execute(Tuple input) { 36 | 37 | // 请求的id 38 | Object requestId = input.getValue(0); 39 | // 请求的参数 40 | String name = input.getString(1); 41 | 42 | /** 43 | * TODO... 业务逻辑处理 44 | */ 45 | String result = "add user: " + name; 46 | 47 | this.outputCollector.emit(new Values(requestId, result)); 48 | } 49 | 50 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 51 | declarer.declare(new Fields("id", "result")); 52 | } 53 | } 54 | 55 | public static void main(String[] args) { 56 | LinearDRPCTopologyBuilder builder = new LinearDRPCTopologyBuilder("addUser"); 57 | builder.addBolt(new MyBolt()); 58 | 59 | try { 60 | StormSubmitter.submitTopology("drpc-topology", 61 | new Config(), 62 | builder.createRemoteTopology()); 63 | } catch (Exception e) { 64 | e.printStackTrace(); 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/UserService.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/4/6. 
5 | * 用户的服务 6 | */ 7 | public interface UserService { 8 | 9 | public static final long versionID = 88888888; 10 | 11 | /** 12 | * 添加用户 13 | * @param name 名字 14 | * @param age 年龄 15 | */ 16 | public void addUser(String name, int age); 17 | } 18 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/drpc/UserServiceImpl.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.drpc; 2 | 3 | /** 4 | * Created by Thpffcj on 2018/4/6. 5 | * 用户的服务接口实现类 6 | */ 7 | public class UserServiceImpl implements UserService { 8 | 9 | public void addUser(String name, int age) { 10 | System.out.println("From Server Invoked: add user success, name is " + name); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/intergration/jdbc/ddl.sql: -------------------------------------------------------------------------------- 1 | create table wc( 2 | word varchar (20), 3 | word_count int 4 | ); -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/intergration/kafka/DateUtils.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.intergration.kafka; 2 | 3 | import org.apache.commons.lang3.time.FastDateFormat; 4 | 5 | import java.text.ParseException; 6 | 7 | /** 8 | * Created by Thpffcj on 2018/4/10. 9 | * 时间解析工具类 10 | */ 11 | public class DateUtils { 12 | 13 | private DateUtils(){} 14 | 15 | private static DateUtils instance; 16 | 17 | public static DateUtils getInstance() { 18 | if (instance == null) { 19 | instance = new DateUtils(); 20 | } 21 | 22 | return instance; 23 | } 24 | 25 | FastDateFormat format = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss"); 26 | 27 | public long getTime(String time) throws Exception { 28 | return format.parse(time.substring(1, time.length() - 1)).getTime(); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /storm-train/src/main/java/cn/edu/nju/intergration/kafka/LogProcessBolt.java: -------------------------------------------------------------------------------- 1 | package cn.edu.nju.intergration.kafka; 2 | 3 | import org.apache.storm.task.OutputCollector; 4 | import org.apache.storm.task.TopologyContext; 5 | import org.apache.storm.topology.OutputFieldsDeclarer; 6 | import org.apache.storm.topology.base.BaseRichBolt; 7 | import org.apache.storm.tuple.Fields; 8 | import org.apache.storm.tuple.Tuple; 9 | import org.apache.storm.tuple.Values; 10 | 11 | import java.util.Map; 12 | 13 | /** 14 | * Created by Thpffcj on 2018/4/8. 
15 | * 接收kafka的数据进行处理的BOLT 16 | */ 17 | public class LogProcessBolt extends BaseRichBolt { 18 | 19 | private OutputCollector collector; 20 | 21 | public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) { 22 | this.collector = collector; 23 | } 24 | 25 | public void execute(Tuple input) { 26 | 27 | try { 28 | byte[] binaryByField = input.getBinaryByField("bytes"); 29 | String value = new String(binaryByField); 30 | 31 | // 解析出来日志信息 32 | String[] splits = value.split("\t"); 33 | String phone = splits[0]; 34 | String[] temp = splits[1].split(","); 35 | String longitude = temp[0]; 36 | String latitude = temp[1]; 37 | long time = DateUtils.getInstance().getTime(splits[2]); 38 | 39 | System.out.println(phone + " " + longitude + " " + latitude + " " + time); 40 | 41 | collector.emit(new Values(time, Double.parseDouble(longitude), Double.parseDouble(latitude))); 42 | 43 | this.collector.ack(input); 44 | } catch (Exception e) { 45 | this.collector.fail(input); 46 | } 47 | } 48 | 49 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 50 | declarer.declare(new Fields("time", "longitude", "latitude")); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /storm-train/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | #log4j.rootLogger=WARN, stdout 2 | #log4j.appender.stdout=org.apache.log4j.ConsoleAppender 3 | #log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 4 | #log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n --------------------------------------------------------------------------------
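For reference, LogProcessBolt and the intergration.kafka DateUtils above expect a tab-separated line of the form phone, "longitude,latitude", bracketed timestamp. The exact format comes from the log generator, which is not shown here, so the sample value below is an assumption inferred from the split("\t") and substring(1, length - 1) calls; the class name LogLineParseSketch is likewise illustrative.

public class LogLineParseSketch {

    public static void main(String[] args) throws Exception {
        // Hypothetical log line: phone \t longitude,latitude \t [yyyy-MM-dd HH:mm:ss]
        String value = "13800000000\t116.404,39.915\t[2018-04-10 10:00:00]";

        String[] splits = value.split("\t");
        String phone = splits[0];
        String[] temp = splits[1].split(",");
        double longitude = Double.parseDouble(temp[0]);
        double latitude = Double.parseDouble(temp[1]);

        // DateUtils strips the surrounding brackets and parses "yyyy-MM-dd HH:mm:ss"
        long time = DateUtils.getInstance().getTime(splits[2]);

        System.out.println(phone + " " + longitude + " " + latitude + " " + time);
    }
}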