7 | * Purpose : Offline (batch) processing
8 | */
9 | public class WordCountJava {
10 | public static void main(String[] args) {
11 |
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/bigdata-flink/src/main/scala/com/libin/data/flink/base/FlinkStreamingTrait.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.flink.base
2 |
3 | /**
4 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved.
5 | * Authors: libin<2578858653@qq.com>
6 | *
7 | * Purpose :
8 | */
9 | trait FlinkStreamingTrait {
10 |
11 | /**
12 | * Application Name
13 | */
14 | def appName: String = this.getClass.getSimpleName
15 | }
16 |
--------------------------------------------------------------------------------
/bigdata-flink/src/main/scala/com/libin/data/flink/base/client/KafkaFlinkStreamingTrait.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.flink.base.client
2 |
3 | /**
4 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved.
5 | * Authors: libin<2578858653@qq.com>
6 | *
7 | * Purpose : Trait for combining Kafka with Flink
8 | */
9 | trait KafkaFlinkStreamingTrait {
10 |
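  // A minimal sketch (an assumption, not from the original file) of how a Kafka source
  // could be wired into a Flink job, assuming the universal Kafka connector
  // (flink-connector-kafka) is on the classpath; topic, brokers and groupId are placeholders.
  import java.util.Properties

  import org.apache.flink.api.common.serialization.SimpleStringSchema
  import org.apache.flink.streaming.api.scala._
  import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer

  def createKafkaSource(env: StreamExecutionEnvironment,
                        topic: String,
                        brokers: String,
                        groupId: String): DataStream[String] = {
    val props = new Properties()
    props.setProperty("bootstrap.servers", brokers)
    props.setProperty("group.id", groupId)
    // addSource picks up the implicit TypeInformation[String] from the scala API import
    env.addSource(new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), props))
  }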
11 | }
12 |
--------------------------------------------------------------------------------
/bigdata-flink/src/main/scala/com/libin/data/flink/batch/WordCount.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.flink.batch
2 |
3 | /**
4 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved.
5 | * Authors: libin<2578858653@qq.com>
6 | *
7 | * Purpose :
8 | */
9 | object WordCount {
10 |
11 | }
12 |
--------------------------------------------------------------------------------
/bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/GenCodeFromMysql.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.flink.streaming.jobs
2 |
3 | /**
4 | * Copyright (c) 2020/9/6. libin Inc. All Rights Reserved.
5 | * Authors: libin
7 | * Purpose : Write the processed data into MySQL
8 | */
9 | object GenCodeFromMysql {
10 |
11 | }
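// A minimal, hypothetical sketch of a JDBC sink this job could use (not part of the
// original file): a RichSinkFunction that opens one connection per parallel subtask
// and inserts each (word, count) record. Table and column names below are placeholders.
import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction

class MysqlSinkSketch(url: String, user: String, password: String)
  extends RichSinkFunction[(String, Int)] {

  private var conn: Connection = _
  private var stmt: PreparedStatement = _

  override def open(parameters: Configuration): Unit = {
    // One connection and prepared statement per subtask, reused across records.
    conn = DriverManager.getConnection(url, user, password)
    stmt = conn.prepareStatement("INSERT INTO word_count (word, cnt) VALUES (?, ?)")
  }

  override def invoke(value: (String, Int)): Unit = {
    stmt.setString(1, value._1)
    stmt.setInt(2, value._2)
    stmt.executeUpdate()
  }

  override def close(): Unit = {
    if (stmt != null) stmt.close()
    if (conn != null) conn.close()
  }
}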
12 |
--------------------------------------------------------------------------------
/bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/GenCodeFromState.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.flink.streaming.jobs
2 |
3 | /**
4 | * Copyright (c) 2020/9/3. libin Inc. All Rights Reserved.
5 | * Authors: libin
7 | * Purpose :
8 | */
9 | object GenCodeFromState {
10 |
11 | }
12 |
--------------------------------------------------------------------------------
/bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/GenCodeFromWindow.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.flink.streaming.jobs
2 |
3 | import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment, createTypeInformation}
4 | import org.apache.flink.streaming.api.windowing.time.Time
5 |
6 | /**
7 | * Copyright (c) 2020/4/2 libin Inc. All Rights Reserved.
8 | * Authors: libin<2578858653@qq.com>
9 | *
10 | * Purpose : nc -lk 9999
11 | */
12 | object GenCodeFromWindow {
13 |
14 | implicit val inTypeInfo = createTypeInformation[String]
15 |
16 | def main(args: Array[String]) {
17 | // create env
18 | val env = StreamExecutionEnvironment.getExecutionEnvironment
19 | // data source
20 | val text = env.socketTextStream("localhost", 9999)
21 |
22 | val counts: DataStream[(String, Int)] = text
23 | .flatMap {
24 | line =>
25 | line.toLowerCase.split("\\W+") filter {
26 | x => x.nonEmpty
27 | }
28 | }
29 | .map {
30 | x =>
31 | (x, 1)
32 | }
33 | .keyBy(0)
34 | .timeWindow(Time.seconds(5))
35 | //.countWindowAll(5, 2)
36 | //.timeWindowAll(Time.minutes(1),Time.seconds(30))
37 | .sum(1)
38 |
39 | counts.print()
40 |
41 | env.execute("Window Stream WordCount")
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/GenCodeFromWordCount.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.flink.streaming.jobs
2 |
3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
4 | import org.apache.flink.streaming.api.windowing.time.Time
5 |
6 | /**
7 | * Copyright (c) 2019/05/18. xixi Inc. All Rights Reserved.
8 | * Authors: libin <2578858653@qq.com>
9 | *
10 | * Purpose : Run nc -l 9000 on the server, then run this code
11 | */
12 | object GenCodeFromWordCount {
13 |
14 | case class wc(word: String, count: Long)
15 |
16 | def main(args: Array[String]): Unit = {
17 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
18 | val hostname = "localhost"
19 | val port = 9000
20 | val stream = env.socketTextStream(hostname, port, '\n')
21 |
22 | import org.apache.flink.api.scala._
23 | val wcStream =
24 | stream
25 | .flatMap(x => x.split("\t"))
26 | .map(w => wc(w, 1))
27 | .keyBy("word")
28 | .timeWindow(Time.seconds(2), Time.seconds(1))
29 | //.sum("count")
30 | .reduce((a, b) => wc(a.word, a.count + b.count))
31 |
32 | wcStream.print().setParallelism(1)
33 | env.execute("socket wc")
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/config/GenCodeFromBucketingSink.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.flink.streaming.jobs.config
2 |
3 | import org.apache.flink.streaming.api.scala.DataStream
4 | import org.apache.flink.streaming.connectors.fs.StringWriter
5 | import org.apache.flink.streaming.connectors.fs.bucketing.{BucketingSink, DateTimeBucketer}
6 |
7 | /**
8 | * Copyright (c) 2020/4/2 libin Inc. All Rights Reserved.
9 | * Authors: libin<2578858653@qq.com>
10 | *
11 | * Purpose : Use BucketingSink to write the stream's data out to files
12 | */
13 | object GenCodeFromBucketingSink {
14 | def main(args: Array[String]): Unit = {
15 |
16 | val resultDS: DataStream[Long] = null // placeholder: replace with a real DataStream before running
17 |
18 | val sink = new BucketingSink[Long]("output path")
19 | sink.setBucketer(new DateTimeBucketer[Long]("yyyy-MM-dd--HHmm"))
20 | sink.setWriter(new StringWriter[Long]())
21 | // sink.setBatchSize(1024 * 1024 * 100) // this is 100 MB
22 | sink.setBatchSize(1024 * 1024 * 1) // this is 1 MB
23 | // sink.setBatchRolloverInterval(60 * 1000) // this is 60 seconds
24 | resultDS.addSink(sink)
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/config/GenCodeFromCheckpoint.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.flink.streaming.jobs.config
2 |
3 | import org.apache.flink.runtime.state.filesystem.FsStateBackend
4 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
5 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
6 | import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
7 |
8 | /**
9 | * Copyright (c) 2020/4/2 libin Inc. All Rights Reserved.
10 | * Authors: libin<2578858653@qq.com>
11 | *
12 | * Purpose : Common environment and checkpoint configuration
13 | */
14 | object GenCodeFromCheckpoint {
15 | def main(args: Array[String]): Unit = {
16 | // create env
17 | val env = StreamExecutionEnvironment.getExecutionEnvironment
18 |
19 | // Environment-level settings
20 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
21 | env.setParallelism(16)
22 | env.enableCheckpointing(20000)
23 | env.setStateBackend(new FsStateBackend("checkpoint path")) // kafka offset,确保 exactly-once
24 |
25 | // Checkpoint configuration settings
26 | val config = env.getCheckpointConfig
27 | config.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
28 | config.setCheckpointingMode(CheckpointingMode.AT_LEAST_ONCE)
29 | // config.setCheckpointInterval(10000)
30 | config.setCheckpointInterval(5 * 60 * 1000) // how often checkpoints are triggered
31 | config.setMinPauseBetweenCheckpoints(5 * 60 * 1000) // minimum pause between two checkpoints
32 | config.setCheckpointTimeout(10 * 60 * 1000) // checkpoint timeout
33 | config.setTolerableCheckpointFailureNumber(3) // the job only fails and restarts after 3 consecutive checkpoint failures; the default is 0
34 |
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/bigdata-hadoop/.gitignore:
--------------------------------------------------------------------------------
1 | *.bak
2 | build_info.properties
3 | .classpath
4 | dependency-reduced-pom.xml
5 | *.diff
6 | .DS_Store
7 | .idea/
8 | *.iml
9 | *.jar
10 | .project
11 | .settings/
12 | .tags*
13 | target/
14 | tmp*
15 | test-output/
16 | nohup*
17 | *.log
18 | *.swp
19 | *.pyc
20 | script/__pycache__/
--------------------------------------------------------------------------------
/bigdata-hadoop/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## MapReduce
3 | * [MapReduce API programming](src/main/java/com/libin/api/mapreduce)
4 | * MapReduce internals
5 | * MapReduce source code analysis
6 |
7 | ## HDFS
8 | * [HDFS usage](src/main/java/com/libin/api/hdfs)
9 | * [HDFS internals](src/main/java/com/libin/doc/hdfs)
10 | * [HDFS source code analysis](src/main/java/com/libin/code/hdfs)
11 |
12 | ## YARN
13 | * YARN internals
14 | * YARN source code analysis
15 |
--------------------------------------------------------------------------------
/bigdata-hadoop/src/main/java/com/libin/api/hdfs/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Using HDFS
3 | In big data development HDFS is everywhere: most compute and storage frameworks interact with it directly, so learning HDFS is important.
4 | RDs (Research and Development engineers) usually like operating HDFS from the command line, since hammering the keyboard feels great... of course, using the code API is also a must.
5 | PMs (Product Managers) generally do not touch that development work and mostly browse data through an in-house data management platform or a web UI such as HUE...
6 | ##
7 | ##### 1. HDFS command-line usage
8 | Notes from an earlier blog post: https://blog.csdn.net/baolibin528/article/details/43854291
9 | ##
10 | ##### 2. [HDFS Java API usage](HdfsUtils.java)
11 | Notes from an earlier blog post: https://blog.csdn.net/baolibin528/article/details/43868515 (a short sketch is also appended at the end of this doc)
12 |
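A minimal sketch of the FileSystem API (written in Scala rather than the repo's HdfsUtils.java; it assumes the Hadoop configuration files core-site.xml/hdfs-site.xml are on the classpath, and the paths used are placeholders):

```scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object HdfsApiSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()   // picks up core-site.xml / hdfs-site.xml from the classpath
    val fs = FileSystem.get(conf)

    val dir = new Path("/tmp/hdfs-api-demo")
    if (!fs.exists(dir)) fs.mkdirs(dir)

    // List the directory and print each entry's path and length.
    fs.listStatus(dir).foreach(s => println(s"${s.getPath} ${s.getLen}"))

    fs.delete(dir, true)             // recursive delete
    fs.close()
  }
}
```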
--------------------------------------------------------------------------------
/bigdata-hadoop/src/main/java/com/libin/api/mapreduce/GetInputSplit.java:
--------------------------------------------------------------------------------
1 | package com.libin.api.mapreduce;
2 |
3 | import org.apache.hadoop.io.IntWritable;
4 | import org.apache.hadoop.io.LongWritable;
5 | import org.apache.hadoop.io.NullWritable;
6 | import org.apache.hadoop.io.Text;
7 | import org.apache.hadoop.mapreduce.Mapper;
8 | import org.apache.hadoop.mapreduce.lib.input.FileSplit;
9 |
10 | import java.io.IOException;
11 |
12 | /**
13 | * Copyright (c) 2015/10/30. xixi Inc. All Rights Reserved.
14 | * Authors: libin <2578858653@qq.com>
15 | *
16 | * Purpose :
17 | * Scenario: with many small files, each file's path needs to be added to its content on output; with a single Mapper class, the split's path can be read while processing each line and emitted as the first output field.
18 | */
19 | public class GetInputSplit {
20 | public static class MapClass extends Mapper
17 | * Purpose :
18 | */
19 | public class FindMaxValueInputFormat extends InputFormat
14 | * Purpose : The map function's input format is customized
15 | */
16 | public class FindMaxValueMapper extends Mapper
15 | * Purpose : Ruducer比较两个Map函数输出的最大值,结果输出在HDFS上面
16 | * 这个例子就比较两个值,有几个Map比较几个
17 | */
18 | public class FindMaxValueReducer extends Reducer
17 | * Purpose :
18 | */
19 | public class MaxValueDriver {
20 | public static void main(String[] args) throws Exception {
21 | Configuration conf = new Configuration();
22 | Job job = Job.getInstance(conf, MaxValueDriver.class.getSimpleName());
23 | job.setJarByClass(MaxValueDriver.class);
24 |
25 | job.setNumReduceTasks(1);
26 |
27 | job.setMapperClass(FindMaxValueMapper.class);
28 | job.setReducerClass(FindMaxValueReducer.class);
29 |
30 | job.setMapOutputKeyClass(IntWritable.class);
31 | job.setMapOutputValueClass(FloatWritable.class);
32 |
33 | job.setOutputKeyClass(Text.class);
34 | job.setOutputValueClass(FloatWritable.class);
35 |
36 | job.setInputFormatClass(FindMaxValueInputFormat.class);
37 | job.setOutputFormatClass(TextOutputFormat.class);
38 |
39 | // FileInputFormat.setInputPaths(job, args[0]);
40 | FileOutputFormat.setOutputPath(job, new Path(args[0]));
41 |
42 | job.waitForCompletion(true);
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/bigdata-hadoop/src/main/java/com/libin/api/yarn/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hadoop/src/main/java/com/libin/api/yarn/README.md
--------------------------------------------------------------------------------
/bigdata-hadoop/src/main/java/com/libin/code/hdfs/README.md:
--------------------------------------------------------------------------------
1 | ## Reading the HDFS source code
2 | The version used is Hadoop 2.6.0, a release I am still quite fond of.
3 |
4 | ##
5 | ##### 1. The HDFS client
6 | HDFS currently provides three client interfaces:
7 | 1. DistributedFileSystem (org.apache.hadoop.hdfs): the HDFS-based application API offered to user development.
8 | 2. FsShell (org.apache.hadoop.fs): performs common file system operations through HDFS shell commands.
9 | 3. DFSAdmin (org.apache.hadoop.hdfs.tools): gives system administrators tools for managing HDFS, such as upgrades and safe mode.
10 | All three interfaces manage and operate HDFS directly or indirectly through the methods exposed by DFSClient (org.apache.hadoop.hdfs).
11 |
12 | * DFSClient is the class that actually implements the distributed file system client functionality and is the starting point for user HDFS operations.
13 | * DFSClient connects to HDFS and provides functionality for working with files/directories, reading and writing files, and managing and configuring the HDFS system.
14 | * DFSClient calls the NameNode's interfaces through ClientProtocol (org.apache.hadoop.hdfs.protocol).
15 | * DFSClient exchanges data with DataNodes through DataTransferProtocol (org.apache.hadoop.hdfs.protocol.datatransfer).
16 |
17 | ##
18 | ##### 2. RPC communication
19 |
20 |
21 |
22 | ##
23 | ##### 3.NameNode
24 |
25 |
26 | ##
27 | ##### 4.DataNode
28 |
--------------------------------------------------------------------------------
/bigdata-hadoop/src/main/java/com/libin/doc/hdfs/README.md:
--------------------------------------------------------------------------------
1 | ## HDFS internals
2 | When distributed storage comes up, HDFS is probably the first thing that comes to mind; at least that is my reaction.
3 | HDFS is already widely used, whether as the open-source release, a commercial edition, or a customized fork, so since we use it, it is worth understanding how it is implemented internally.
4 |
5 | ##### HDFS architecture diagram
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/bigdata-hadoop/src/main/java/com/libin/doc/mapreduce/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## MapReduce internals
3 |
--------------------------------------------------------------------------------
/bigdata-hadoop/src/main/java/com/libin/doc/yarn/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## YARN internals
3 | * [YARN documentation](http://hadoop.apache.org/docs/r2.6.5/hadoop-yarn/hadoop-yarn-site/YARN.html)
4 |
5 |
6 | ##### YARN architecture diagram
7 | 
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/resource_manager.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/resource_manager.jpg
--------------------------------------------------------------------------------
/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/timg.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/timg.jpg
--------------------------------------------------------------------------------
/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/yarn.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/yarn.jpg
--------------------------------------------------------------------------------
/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/yarn_architecture.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/yarn_architecture.gif
--------------------------------------------------------------------------------
/bigdata-hbase/.gitignore:
--------------------------------------------------------------------------------
1 | *.bak
2 | build_info.properties
3 | .classpath
4 | dependency-reduced-pom.xml
5 | *.diff
6 | .DS_Store
7 | .idea/
8 | *.iml
9 | *.jar
10 | .project
11 | .settings/
12 | .tags*
13 | target
14 | tmp*
15 | test-output/
16 | nohup*
17 | *.log
18 | *.swp
19 | *.pyc
20 | script/__pycache__/
21 | venv
22 |
--------------------------------------------------------------------------------
/bigdata-hbase/README.md:
--------------------------------------------------------------------------------
1 |
2 | ##### 1. HBase overview
3 | HBase is a distributed, column-oriented open-source database and a distributed storage system for structured data; on top of Hadoop it provides Bigtable-like capabilities (a minimal client sketch is appended at the end of this doc).
4 | * [HBase architecture](src/main/scala/com/libin/doc/HBase体系结构.md)
5 | * [HBase data model](src/main/scala/com/libin/doc/HBase数据模型.md)
6 |
7 |
8 | ##### 2. HBase internals
9 | * [HBase RegionServer internal structure](src/main/scala/com/libin/doc/RegionServer/RegionServer内部结构.md)
10 | * [HBase HLog](src/main/scala/com/libin/doc/RegionServer/HLog.md)
11 | * [HBase MemStore](src/main/scala/com/libin/doc/RegionServer/MemStore.md)
12 | * [HBase HFile](src/main/scala/com/libin/doc/RegionServer/HFile.md)
13 | * [HBase BlockCache](src/main/scala/com/libin/doc/RegionServer/BlockCache.md)
14 |
15 |
16 | ##### 3. HBase-related algorithms
17 | * [HBase skip list](src/main/scala/com/libin/doc/HBase算法/跳跃表.md)
18 | * [HBase LSM tree](src/main/scala/com/libin/doc/HBase算法/LSM树.md)
19 | * [HBase Bloom filter](src/main/scala/com/libin/doc/HBase算法/布隆过滤器.md)
20 |
21 |
22 | ##### 4. Services HBase depends on
23 | * [ZooKeeper](src/main/scala/com/libin/doc/依赖服务组件/ZooKeeper.md)
24 | * [Hdfs](src/main/scala/com/libin/doc/依赖服务组件/Hdfs.md)
25 |
26 |
27 | ##### 5. HBase learning resources
28 | * [HBase official site](http://hbase.apache.org/)
29 | * [HBase reference guide](http://hbase.apache.org/book.html)
30 |
31 |
32 |
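A minimal sketch of the HBase client API (an illustrative assumption, not code from this repo; the table, column family and qualifier names are placeholders, and hbase-site.xml is assumed to be on the classpath):

```scala
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Get, Put}
import org.apache.hadoop.hbase.util.Bytes

object HBaseClientSketch {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create() // reads hbase-site.xml from the classpath
    val conn = ConnectionFactory.createConnection(conf)
    val table = conn.getTable(TableName.valueOf("demo_table"))

    // Write one cell: rowkey "row1", column family "cf", qualifier "q", value "v".
    val put = new Put(Bytes.toBytes("row1"))
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"))
    table.put(put)

    // Read the same cell back and print it.
    val result = table.get(new Get(Bytes.toBytes("row1")))
    println(Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("q"))))

    table.close()
    conn.close()
  }
}
```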
--------------------------------------------------------------------------------
/bigdata-hbase/image/HBase体系结构.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/image/HBase体系结构.png
--------------------------------------------------------------------------------
/bigdata-hbase/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
7 | * Purpose : Hive-related operations
8 | */
9 | public class HiveUtils {
10 | }
11 |
--------------------------------------------------------------------------------
/bigdata-info/.gitignore:
--------------------------------------------------------------------------------
1 | *.bak
2 | build_info.properties
3 | .classpath
4 | dependency-reduced-pom.xml
5 | *.diff
6 | .DS_Store
7 | .idea/
8 | *.iml
9 | *.jar
10 | .project
11 | .settings/
12 | .tags*
13 | target/
14 | tmp*
15 | test-output/
16 | nohup*
17 | *.log
18 | *.swp
19 | *.pyc
20 | script/__pycache__/
21 |
--------------------------------------------------------------------------------
/bigdata-info/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/README.md
--------------------------------------------------------------------------------
/bigdata-info/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
7 | * Purpose :
8 | */
9 | public class Test {
10 | }
11 |
--------------------------------------------------------------------------------
/bigdata-info/src/main/java/com/libin/elasticsearch/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/elasticsearch/README.md
--------------------------------------------------------------------------------
/bigdata-info/src/main/java/com/libin/griffin/2.Griffin指标使用.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/2.Griffin指标使用.md
--------------------------------------------------------------------------------
/bigdata-info/src/main/java/com/libin/griffin/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/README.md
--------------------------------------------------------------------------------
/bigdata-info/src/main/java/com/libin/griffin/image/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/image/1.png
--------------------------------------------------------------------------------
/bigdata-info/src/main/java/com/libin/griffin/image/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/image/2.jpg
--------------------------------------------------------------------------------
/bigdata-info/src/main/java/com/libin/griffin/image/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/image/3.png
--------------------------------------------------------------------------------
/bigdata-info/src/main/java/com/libin/griffin/image/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/image/4.jpg
--------------------------------------------------------------------------------
/bigdata-info/src/main/java/com/libin/oozie/README.md:
--------------------------------------------------------------------------------
1 |
2 | ##### 1. Oozie overview
3 | Oozie is a workflow scheduling system for managing Apache Hadoop jobs.
4 | Oozie workflow jobs are directed acyclic graphs (DAGs) of actions.
5 | Oozie coordinator jobs are recurring workflow jobs triggered by time (frequency) and data availability.
6 | Oozie is integrated with the rest of the Hadoop ecosystem and supports several types of Hadoop jobs (such as Java map-reduce, streaming map-reduce, Pig, Hive, Sqoop and Distcp) as well as system-specific jobs (such as Java programs and shell scripts).
7 | Oozie is a scalable, reliable and extensible system.
8 | Oozie is one of the four big data "cooperation" frameworks, the task scheduling framework; the other three are the data transfer tool Sqoop, the log collection framework Flume, and the big data web tool Hue.
9 |
10 | ##### 2. Oozie operations
11 |
12 |
13 |
14 | ##### 3. Oozie learning resources
15 |
16 |
17 |
--------------------------------------------------------------------------------
/bigdata-info/src/main/java/com/libin/pegasus/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | * [Learning resources](./学习资料.md)
4 |
--------------------------------------------------------------------------------
/bigdata-info/src/main/java/com/libin/pegasus/学习资料.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ##### Study materials
4 | * [1. GitHub](https://github.com/XiaoMi/pegasus)
5 | * [2. Apache website](https://pegasus.apache.org/)
6 | * [3. bookstack docs](https://www.bookstack.cn/read/Pegasus/128323)
7 |
8 |
--------------------------------------------------------------------------------
/bigdata-info/src/main/java/com/libin/talos/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/talos/README.md
--------------------------------------------------------------------------------
/bigdata-kafka/.gitignore:
--------------------------------------------------------------------------------
1 | *.bak
2 | build_info.properties
3 | .classpath
4 | dependency-reduced-pom.xml
5 | *.diff
6 | .DS_Store
7 | .idea/
8 | *.iml
9 | *.jar
10 | .project
11 | .settings/
12 | .tags*
13 | target
14 | tmp*
15 | test-output/
16 | nohup*
17 | *.log
18 | *.swp
19 | *.pyc
20 | script/__pycache__/
21 | venv
22 |
--------------------------------------------------------------------------------
/bigdata-kafka/README.md:
--------------------------------------------------------------------------------
1 |
2 | ##### 1. Kafka internals
3 | * [Kafka basic concepts](src/main/docs/Kafka基本概念.md)
4 | * [Kafka replicas](src/main/docs/Kafka副本.md)
5 | * [Consumers and consumer groups](src/main/docs/消费者与消费组.md)
6 |
7 | ##### 2. Kafka operations
8 | * [Kafka client operations](src/main/scala/com/libin/code/client/KafkaClient.scala)
9 |
10 |
11 | ##### 3. Kafka interview
12 | * [Kafka interview questions](src/main/docs/Kafka面试题.md)
13 |
14 |
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/docs/Kafka副本.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ##### Kafka replication
4 | All replicas of a partition are collectively called the AR (Assigned Replicas).
5 | The replicas that stay sufficiently in sync with the leader replica (the leader included) form the ISR (In-Sync Replicas); the ISR is a subset of the AR.
6 | Replicas that lag too far behind the leader (the leader excluded) form the OSR (Out-of-Sync Replicas); hence AR = ISR + OSR.
7 | Under normal conditions every follower replica should keep up with the leader, i.e. AR = ISR and the OSR is empty.
8 | By default, when the leader replica fails, only replicas in the ISR are eligible to be elected as the new leader; replicas in the OSR get no chance at all.
9 |
10 |
11 | The ISR is also closely related to HW and LEO. HW (High Watermark) marks a specific message offset; consumers can only fetch messages up to (but not including) this offset.
12 |
13 | LEO (Log End Offset) marks the offset of the next message to be written to the current log file; it equals the offset of the last message in the partition plus one.
14 |
15 |
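For reference, a minimal sketch (an illustrative assumption, using the Java AdminClient from Scala; the broker address and topic name are placeholders) that prints each partition's leader, AR and ISR:

```scala
import java.util.{Collections, Properties}

import org.apache.kafka.clients.admin.AdminClient

import scala.collection.JavaConverters._

object DescribeIsrSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092") // placeholder broker address
    val admin = AdminClient.create(props)

    // describeTopics returns, per partition, the leader, the assigned replicas (AR) and the ISR.
    val topics = admin.describeTopics(Collections.singletonList("demo_topic")).all().get()
    topics.asScala.values.foreach { desc =>
      desc.partitions().asScala.foreach { p =>
        println(s"partition=${p.partition()} leader=${p.leader()} AR=${p.replicas()} ISR=${p.isr()}")
      }
    }
    admin.close()
  }
}
```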
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/docs/Kafka基本概念.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ##### Kafka basic concepts
4 | 1. Topic
5 | Kafka groups a set of messages under a topic (Topic); in other words, a topic is a classification of messages.
6 | Producers send messages to a specific topic, and consumers subscribe to a topic, or to some of its partitions, for consumption.
7 | 2. Message
8 | A message is Kafka's basic unit of communication, made up of a fixed-length header and a variable-length body. In older versions each message is called a Message;
9 | in the client re-implemented in Java, each message is called a Record.
10 | 3. Partitions and replicas
11 | Kafka groups messages into topics, and each topic is further divided into one or more partitions (Partition).
12 | Each partition consists of a sequence of ordered, immutable messages; it is an ordered queue.
13 |
14 |
15 |
16 |
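A minimal producer sketch (an illustrative assumption using the standard Java client from Scala; the broker address, topic and key/value are placeholders) showing how messages are sent to a topic:

```scala
import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object ProducerSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092") // placeholder broker address
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)
    // The key (if present) determines the partition; records with the same key land in the same partition.
    producer.send(new ProducerRecord[String, String]("demo_topic", "key-1", "hello kafka"))
    producer.flush()
    producer.close()
  }
}
```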
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/docs/消费者与消费组.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ##### 1. Consumers and consumer groups
4 | A consumer (Consumer) subscribes to topics (Topic) in Kafka and pulls messages from the subscribed topics.
5 | What differs from some other messaging middleware is that
6 | Kafka's consumption model adds the notion of a consumer group (Consumer Group); every consumer belongs to a consumer group.
7 | When a message is published to a topic, it is delivered to only one consumer in each consumer group that subscribes to it.
8 |
9 |
10 | ##### 2. Message delivery modes
11 | 1. Point-to-point (P2P)
12 | The point-to-point model is queue based: message producers send messages to a queue and message consumers receive messages from the queue.
13 | 2. Publish/subscribe (Pub/Sub)
14 | The publish/subscribe model defines how messages are published to and subscribed from a content node called a topic (Topic). The topic can be seen as the intermediary of message delivery:
15 | publishers publish messages to a topic and subscribers consume messages from that topic. The topic keeps subscribers and publishers independent of each other; they do not need any
16 | contact for messages to be delivered. Publish/subscribe is used for one-to-many broadcasting of messages.
17 |
18 | If all consumers belong to the same consumer group, messages are balanced across them, i.e. each message is handled by exactly one consumer; this corresponds to the point-to-point model.
19 | If every consumer belongs to a different consumer group, every message is broadcast to all consumers, i.e. each message is handled by every consumer; this corresponds to the publish/subscribe model.
20 |
21 |
22 |
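A minimal consumer sketch (an illustrative assumption using a 2.x Java client from Scala; broker address, group id and topic are placeholders). Consumers started with the same group.id split the topic's partitions among themselves; consumers with distinct group.ids each receive every message:

```scala
import java.time.Duration
import java.util.{Collections, Properties}

import org.apache.kafka.clients.consumer.KafkaConsumer

import scala.collection.JavaConverters._

object ConsumerGroupSketch {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092") // placeholder broker address
    props.put("group.id", "demo-group")              // consumers sharing this id split the partitions
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")

    val consumer = new KafkaConsumer[String, String](props)
    consumer.subscribe(Collections.singletonList("demo_topic"))
    while (true) {
      val records = consumer.poll(Duration.ofSeconds(1))
      records.asScala.foreach(r => println(s"${r.partition()} ${r.offset()} ${r.value()}"))
    }
  }
}
```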
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/docs/索引和分段.md:
--------------------------------------------------------------------------------
1 |
2 | ##### Indexes and log segments
3 |
4 |
5 |
6 | ###### 1. Which index files does Kafka have?
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/images/Kafka体系结构.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-kafka/src/main/images/Kafka体系结构.png
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/images/kafka多副本架构.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-kafka/src/main/images/kafka多副本架构.png
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/scala/com/libin/README.md:
--------------------------------------------------------------------------------
1 |
2 | ##### 1. Kafka operations
3 |
4 |
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/scala/com/libin/code/base/KafkaJobTrait.scala:
--------------------------------------------------------------------------------
1 | package com.libin.code.base
2 |
3 | /**
4 | * Copyright (c) 2020/9/26. libin Inc. All Rights Reserved.
5 | * Authors: libin
7 | * Purpose :
8 | */
9 | trait KafkaJobTrait {
10 |
11 | }
12 |
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/scala/com/libin/code/client/KafkaClient.scala:
--------------------------------------------------------------------------------
1 | package com.libin.code.client
2 |
3 | /**
4 | * Copyright (c) 2020/9/26. libin Inc. All Rights Reserved.
5 | * Authors: libin
7 | * Purpose :
8 | */
9 | object KafkaClient {
10 |
11 | }
12 |
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/scala/com/libin/code/streaming/FlinkStramingJob.scala:
--------------------------------------------------------------------------------
1 | package com.libin.code.streaming
2 |
3 | /**
4 | * Copyright (c) 2020/9/28. libin Inc. All Rights Reserved.
5 | * Authors: libin
7 | * Purpose :
8 | */
9 |
10 | object FlinkStramingJob {
11 |
12 | }
13 |
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/scala/com/libin/code/streaming/SparkStreamingKafkaJob.scala:
--------------------------------------------------------------------------------
1 | package com.libin.code.streaming
2 |
3 | /**
4 | * Copyright (c) 2020/9/26. libin Inc. All Rights Reserved.
5 | * Authors: libin
7 | * Purpose :
8 | */
9 | object SparkStreamingKafkaJob {
10 |
11 | }
12 |
--------------------------------------------------------------------------------
/bigdata-kafka/src/main/scala/com/libin/code/utils/KafkaUtils.scala:
--------------------------------------------------------------------------------
1 | package com.libin.code.utils
2 |
3 | /**
4 | * Copyright (c) 2020/9/26. libin Inc. All Rights Reserved.
5 | * Authors: libin
7 | * Purpose :
8 | */
9 | object KafkaUtils {
10 |
11 | }
12 |
--------------------------------------------------------------------------------
/bigdata-project/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
7 | * Purpose :
8 | */
9 |
10 | public class FileUtils {
11 | public static final String STU_File = "stu.json";
12 | public static final String SCHOOL_File = "school.json";
13 |
14 | public static final String PEOPLE_File = "people.txt";
15 | public static final String USERS_File = "users.parquet";
16 | }
17 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 |
2 | log4j.logger.org.apache=ERROR
3 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/resources/people.txt:
--------------------------------------------------------------------------------
1 | Michael, 29
2 | Andy, 30
3 | Justin, 19
4 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/resources/school.json:
--------------------------------------------------------------------------------
1 | {"name": "xiaoming", "school": "qinghua", "location": "bj"}
2 | {"name": "xiaoli", "school": "fudan", "location": "shanghai"}
3 | {"name": "xiaoqiang", "school": "nankai", "location": "tianjin"}
4 | {"name": "xiaohong", "school": "chuanda", "location": "sichuan"}
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/resources/stu.json:
--------------------------------------------------------------------------------
1 | {"name": "xiaoming", "age": 22, "height": 175}
2 | {"name": "xiaoli", "age": 18, "height": 161}
3 | {"name": "xiaoqiang", "age": 26, "height": 198}
4 | {"name": "xiaohong", "age": 18, "height": 158}
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/resources/users.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-spark-sql/src/main/resources/users.parquet
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/common/sparkJobBase.scala:
--------------------------------------------------------------------------------
1 | package com.libin.common
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import org.apache.spark.sql.SparkSession
5 | import org.joda.time.DateTime
6 | import org.slf4j.{Logger, LoggerFactory}
7 |
8 | /**
9 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved.
10 | * Authors: libin <2578858653@qq.com>
11 | *
12 | * Purpose :
13 | */
14 |
15 | trait SparkJobBase {
16 |
17 | /**
18 | * Get the application name
19 | */
20 | def appName: String = this.getClass.getSimpleName.stripSuffix("$")
21 |
22 | /**
23 | * Common separator used when processing data; override when needed
24 | */
25 | val separator: String = "\t"
26 |
27 | /**
28 | * Fill value used for missing data
29 | */
30 | val fillValue: String = ""
31 |
32 | /**
33 | * Number of partitions used when processing data; override when needed
34 | */
35 | val partitionNum: Int = 400
36 |
37 | /**
38 | * Get the job name; override when needed
39 | */
40 | def jobName: String = this.getClass.getSimpleName
41 |
42 | /**
43 | * Logger instance
44 | */
45 | val logger: Logger = LoggerFactory.getLogger(jobName)
46 |
47 | /**
48 | * Create a SparkSession
49 | */
50 | def createSparkSession(): SparkSession = {
51 | SparkSession.builder().appName(appName).getOrCreate()
52 | }
53 |
54 | /**
55 | * Create a local SparkSession
56 | */
57 | def createSparkSessionLocal(): SparkSession = {
58 | SparkSession.builder().appName(appName).master("local[2]").getOrCreate()
59 | }
60 |
61 | /**
62 | * Create a local SparkContext
63 | */
64 | def createSparkContextLocal(): SparkContext = {
65 | SparkContext.getOrCreate(new SparkConf().setAppName(appName).setMaster("local[2]"))
66 | }
67 |
68 | /**
69 | * Initialization
70 | *
71 | * @return
72 | */
73 | def initContext: SparkSession = createSparkSessionLocal()
74 |
75 | /**
76 | * Stop a job
77 | *
78 | */
79 | def destroyJob(): Unit = {
80 | createSparkSessionLocal().stop()
81 | logger.info(s"$jobName stopped at ${new DateTime()}")
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/etl/jobs/ConvertJobScheduler.scala:
--------------------------------------------------------------------------------
1 | package com.libin.etl.jobs
2 |
3 | import com.libin.common.SparkJobBase
4 | import com.libin.etl.loader.data.{DfBuilder, RddBuilder}
5 | import com.libin.etl.utils.LoadUtils.stu
6 | import com.libin.utils.FileUtils
7 | import org.apache.spark.rdd.RDD
8 | import org.apache.spark.sql.DataFrame
9 |
10 | /**
11 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved.
12 | * Authors: libin <2578858653@qq.com>
13 | *
14 | * Purpose :
15 | */
16 |
17 | class ConvertJobScheduler extends SparkJobBase {
18 | override def appName = "convertJobScheduler"
19 |
20 | val sc = createSparkContextLocal()
21 | val ss = createSparkSessionLocal()
22 | }
23 |
24 | object ConvertJobScheduler {
25 |
26 | def apply() = new ConvertJobScheduler
27 |
28 | def main(args: Array[String]) {
29 | val convertScheduler: ConvertJobScheduler = apply()
30 | convertScheduler.logger.info("convertJobScheduler start ...")
31 |
32 | // Create an RDD
33 | val rdd: RDD[stu] = RddBuilder.createRdd(convertScheduler.sc)
34 |
35 | import convertScheduler.ss.implicits._
36 | println("rdd.toDF().show() ...")
37 | rdd.toDF().show()
38 | println("rdd.toDS().show() ...")
39 | rdd.toDS().show()
40 |
41 | // --------------------------------------------------------------------------------
42 | // Join operation
43 | val stuDf: DataFrame = DfBuilder.readJsonToDf(convertScheduler.ss, FileUtils.STU_File)
44 | val schoolDf: DataFrame = DfBuilder.readJsonToDf(convertScheduler.ss, FileUtils.SCHOOL_File)
45 |
46 | // stuDf.createOrReplaceTempView("stu_df")
47 | // schoolDf.createOrReplaceTempView("school_df")
48 | println("stuDf.join(schoolDf,\"name\").show() ...")
49 | stuDf.join(schoolDf, "name").show()
50 |
51 | // --------------------------------------------------------------------------------
52 | // Read text and parquet formats
53 | println(s"dfBuilder.readTextToDf(convertScheduler.ss,${FileUtils.PEOPLE_File}).show() ...")
54 | DfBuilder.readTextToDf(convertScheduler.ss, FileUtils.PEOPLE_File).show()
55 | println(s"dfBuilder.readParquetToDf(convertScheduler.ss,${FileUtils.USERS_File}).show() ...")
56 | DfBuilder.readParquetToDf(convertScheduler.ss, FileUtils.USERS_File).show()
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/etl/jobs/DfJobScheduler.scala:
--------------------------------------------------------------------------------
1 | package com.libin.etl.jobs
2 |
3 | import com.libin.common.SparkJobBase
4 | import com.libin.etl.loader.data.DfBuilder
5 | import com.libin.etl.utils.LogUtils
6 | import com.libin.utils.FileUtils
7 | import org.apache.spark.sql.DataFrame
8 |
9 | /**
10 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved.
11 | * Authors: libin <2578858653@qq.com>
12 | *
13 | * Purpose :
14 | */
15 |
16 | class DfJobScheduler extends SparkJobBase {
17 | override def appName = "dfJobScheduler"
18 |
19 | val ss = createSparkSessionLocal()
20 | }
21 |
22 | object DfJobScheduler {
23 | def apply() = new DfJobScheduler
24 |
25 | def main(args: Array[String]) {
26 | val dfScheduler: DfJobScheduler = apply()
27 | LogUtils.setSparkLogLevels()
28 | dfScheduler.logger.info("dfJobScheduler start ...")
29 | // Test reading JSON data from the resources
30 | // loadUtils.readResourceFile("stu.json").foreach(println)
31 | // Read the data into a DataFrame
32 | val df: DataFrame = DfBuilder.readJsonToDf(dfScheduler.ss, FileUtils.STU_File)
33 |
34 | /**
35 | * Basic DataFrame operations
36 | */
37 | // op1. Show the data
38 | println("df.show() ...")
39 | df.show()
40 | // op2. Print the schema
41 | println("df.printSchema() ...")
42 | df.printSchema()
43 | // op3. Select a column
44 | println("df.select(\"name\").show() ...")
45 | df.select("name").show()
46 | // op4. Height above 150, sorted by age in descending order
47 | println("df.select($\"name\", $\"age\", $\"height\").where($\"height\" > 150).orderBy(df(\"age\").desc).show() ...")
48 | import dfScheduler.ss.implicits._
49 | df.select($"name", $"age", $"height").where($"height" > 150).orderBy(df("age").desc).show()
50 | // op5. Use groupBy
51 | println("df.groupBy(\"age\").max(\"height\").show()")
52 | df.groupBy("age").max("height").show()
53 |
54 | /**
55 | * Run SQL statements
56 | */
57 | df.createOrReplaceTempView("stu")
58 | println("df.sqlContext.sql(\"select * from stu\")")
59 | df.sqlContext.sql("select * from stu").show()
60 | println("df.sqlContext.sql(\"select name,age,height from stu\")")
61 | df.sqlContext.sql("select name,age,height from stu").show()
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/etl/jobs/DsJobScheduler.scala:
--------------------------------------------------------------------------------
1 | package com.libin.etl.jobs
2 |
3 | import com.libin.common.SparkJobBase
4 | import com.libin.etl.loader.data.DsBuilder
5 | import com.libin.etl.utils.LoadUtils.stu
6 | import org.apache.spark.sql.Dataset
7 |
8 | /**
9 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved.
10 | * Authors: libin <2578858653@qq.com>
11 | *
12 | * Purpose :
13 | */
14 |
15 | class DsJobScheduler extends SparkJobBase {
16 | override def appName = "dsJobScheduler"
17 |
18 | val ss = createSparkSessionLocal()
19 | }
20 |
21 | object DsJobScheduler {
22 | def apply() = new DsJobScheduler()
23 |
24 | def main(args: Array[String]) {
25 | val dsScheduler: DsJobScheduler = apply()
26 | dsScheduler.logger.info("dsJobScheduler start ...")
27 |
28 | val ds: Dataset[stu] = DsBuilder.createDsBySeq(dsScheduler.ss)
29 |
30 | ds.show()
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/etl/loader/data/DsBuilder.scala:
--------------------------------------------------------------------------------
1 | package com.libin.etl.loader.data
2 |
3 | import com.libin.etl.utils.LoadUtils.stu
4 | import org.apache.spark.sql.{Dataset, SparkSession}
5 |
6 | /**
7 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved.
8 | * Authors: libin <2578858653@qq.com>
9 | *
10 | * Purpose :
11 | */
12 |
13 | object DsBuilder {
14 |
15 | /**
16 | * Create a Dataset from data in a resource file
17 | *
18 | * @param ss SparkSession
19 | * @param fileName file name
20 | * @return the resulting Dataset
21 | */
22 | def readJsonToDs(ss: SparkSession, fileName: String): Dataset[stu] = {
23 | import ss.implicits._
24 | val url: String = this.getClass.getClassLoader.getResource(fileName).toString
25 | ss.read.json(url).as[stu]
26 | }
27 |
28 | /**
29 | * Create a Dataset from a Seq via toDS
30 | *
31 | * @param ss SparkSession
32 | * @return the resulting Dataset
33 | */
34 | def createDsBySeq(ss: SparkSession): Dataset[stu] = {
35 | import ss.implicits._
36 | Seq(stu("xiaoming", 22, 175), stu("xiaoli", 18, 161), stu("xiaoqiang", 26, 198), stu("xiaohong", 18, 158))
37 | .toDS()
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/etl/loader/data/RddBuilder.scala:
--------------------------------------------------------------------------------
1 | package com.libin.etl.loader.data
2 |
3 | import com.libin.common.SparkJobBase
4 | import com.libin.etl.utils.LoadUtils.stu
5 | import com.libin.etl.utils.PathUtils
6 | import org.apache.spark.SparkContext
7 | import org.apache.spark.rdd.RDD
8 | import org.apache.spark.sql.Row
9 |
10 | /**
11 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved.
12 | * Authors: libin <2578858653@qq.com>
13 | *
14 | * Purpose :
15 | */
16 |
17 | object RddBuilder extends SparkJobBase {
18 |
19 | /**
20 | * Create and return an RDD
21 | *
22 | * @param sc SparkContext
23 | * @return
24 | */
25 | def createRdd(sc: SparkContext): RDD[stu] = {
26 | val arr: Array[stu] = Array(stu("xiaoming", 22, 175),
27 | stu("xiaoli", 18, 161),
28 | stu("xiaoqiang", 26, 198),
29 | stu("xiaohong", 18, 158))
30 | sc.makeRDD(arr)
31 | }
32 |
33 | /**
34 | * Read the specified columns from data under the given path
35 | *
36 | * @param path data path
37 | * @param date partition date
38 | * @param isTest whether this is a test run
39 | * @param cols list of column names
40 | */
41 | def loadDwsBigDataDeviceProfileDBySql(path: String,
42 | date: String,
43 | isTest: Boolean = false,
44 | cols: List[String]): RDD[Row] = {
45 | createSparkSessionLocal()
46 | .read
47 | .parquet(PathUtils.pathAssemble(path, date))
48 | .selectExpr(cols: _*).rdd
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/etl/processor/ProcessorOp.scala:
--------------------------------------------------------------------------------
1 | package com.libin.etl.processor
2 |
3 | /**
4 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved.
5 | * Authors: libin <2578858653@qq.com>
6 | *
7 | * Purpose :
8 | */
9 |
10 | object ProcessorOp {
11 |
12 | }
13 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/etl/utils/DateUtils.scala:
--------------------------------------------------------------------------------
1 | package com.libin.etl.utils
2 |
3 | import org.joda.time.DateTime
4 | import org.joda.time.format.DateTimeFormat
5 |
6 | /**
7 | * Copyright (c) 2020/4/15. libin Inc. All Rights Reserved.
8 | * Authors: libin <2578858653@qq.com>
9 | *
10 | * Purpose :
11 | */
12 | object DateUtils {
13 | val DATE_FORMAT = "yyyyMMdd"
14 |
15 | /**
16 | * Parse a date string into a DateTime
17 | *
18 | * @param input date string
19 | * @return
20 | */
21 | def parseDate(input: String): Option[DateTime] =
22 | try {
23 | Some(DateTimeFormat.forPattern(DATE_FORMAT).parseDateTime(input))
24 | } catch {
25 | case e: Exception => None
26 | }
27 | }
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/etl/utils/LoadUtils.scala:
--------------------------------------------------------------------------------
1 | package com.libin.etl.utils
2 |
3 | import scala.io.Source
4 |
5 | /**
6 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved.
7 | * Authors: libin <2578858653@qq.com>
8 | *
9 | * Purpose :
10 | */
11 |
12 | object LoadUtils {
13 |
14 | case class stu(name: String, age: Int, height: Int)
15 |
16 | /**
17 | * Read a file from the resources
18 | *
19 | * @param fileName resource file name
20 | * @return all lines of the file as a collection
21 | */
22 | def readResourceFile(fileName: String): Array[String] = {
23 | val inputStream = this.getClass.getClassLoader.getResourceAsStream(fileName)
24 | Source.fromInputStream(inputStream).getLines().toArray
25 | }
26 |
27 |
28 | /**
29 | * Read a file from the resources and return a Map split by the specified delimiter
30 | *
31 | * @param fileName resource file name
32 | * @return a Map built from all lines of the file
33 | */
34 | def readResourceFile(fileName: String, delimit: String): Map[String, String] = {
35 | val inputStream = this.getClass.getClassLoader.getResourceAsStream(fileName)
36 | Source.fromInputStream(inputStream).getLines()
37 | .map {
38 | line =>
39 | val sp = line.split(delimit)
40 | (sp(0), sp(1))
41 | }.toMap
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/etl/utils/LogUtils.scala:
--------------------------------------------------------------------------------
1 | package com.libin.etl.utils
2 |
3 | import org.apache.spark.internal.Logging
4 | import org.apache.log4j.{Level, Logger}
5 |
6 | /**
7 | * Copyright (c) 2020/4/15. libin Inc. All Rights Reserved.
8 | * Authors: libin <2578858653@qq.com>
9 | *
10 | * Purpose : Only show WARN-level logs; the large volume of INFO logs can be suppressed
11 | */
12 | object LogUtils extends Logging {
13 | /** Set reasonable logging levels for streaming if the user has not configured log4j. */
14 | def setSparkLogLevels() {
15 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
16 | // if (!log4jInitialized) {
17 | if (log4jInitialized) {
18 | // We first log something to initialize Spark's default logging, then we override the
19 | // logging level.
20 | logInfo("Setting log level to [WARN] for streaming example." +
21 | " To override add a custom log4j.properties to the classpath.")
22 | Logger.getRootLogger.setLevel(Level.WARN)
23 | } else {
24 | Logger.getRootLogger.setLevel(Level.WARN)
25 | }
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/etl/utils/PathUtils.scala:
--------------------------------------------------------------------------------
1 | package com.libin.etl.utils
2 |
3 | import org.joda.time.DateTime
4 |
5 | /**
6 | * Copyright (c) 2020/4/15. libin Inc. All Rights Reserved.
7 | * Authors: libin <2578858653@qq.com>
8 | *
9 | * Purpose :
10 | */
11 | object PathUtils {
12 | /**
13 | * Build a data path from a DateTime
14 | *
15 | * @param root data root directory
16 | * @param date date of the data to read
17 | * @return the full data path
18 | */
19 | def pathAssemble(root: String, date: DateTime): String = s"$root/date=${date.toString(DateUtils.DATE_FORMAT)}"
20 |
21 | def pathAssembleAll(root: String, date: DateTime): String = s"$root/date=${date.toString(DateUtils.DATE_FORMAT)}/*"
22 |
23 | /**
24 | * Build a data path from a date String
25 | *
26 | * @param root data root directory
27 | * @param date date of the data to read
28 | * @return the full data path
29 | */
30 | def pathAssemble(root: String, date: String): String = pathAssemble(root, DateUtils.parseDate(date).get)
31 | }
32 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/main/scala/com/libin/source/sql.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark
19 |
20 | import org.apache.spark.annotation.{DeveloperApi, InterfaceStability}
21 | import org.apache.spark.sql.execution.SparkStrategy
22 |
23 | /**
24 | * Allows the execution of relational queries, including those expressed in SQL using Spark.
25 | *
26 | * @groupname dataType Data types
27 | * @groupdesc Spark SQL data types.
28 | * @groupprio dataType -3
29 | * @groupname field Field
30 | * @groupprio field -2
31 | * @groupname row Row
32 | * @groupprio row -1
33 | */
34 | package object sql {
35 |
36 | /**
37 | * Converts a logical plan into zero or more SparkPlans. This API is exposed for experimenting
38 | * with the query planner and is not designed to be stable across spark releases. Developers
39 | * writing libraries should instead consider using the stable APIs provided in
40 | * [[org.apache.spark.sql.sources]]
41 | */
42 | @DeveloperApi
43 | @InterfaceStability.Unstable
44 | type Strategy = SparkStrategy
45 |
46 | type DataFrame = Dataset[Row]
47 | }
48 |
--------------------------------------------------------------------------------
/bigdata-spark-sql/src/test/scala/com/libin/etl/testProcessor.scala:
--------------------------------------------------------------------------------
1 | package com.libin.etl
2 |
3 | /**
4 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved.
5 | * Authors: libin <2578858653@qq.com>
6 | *
7 | * Purpose :
8 | */
9 |
10 | class testProcessor {
11 |
12 | val SEPARATOR: String = "\t"
13 |
14 | val PARTITIONNUM: Int = 100
15 | }
16 |
17 | object testProcessor {
18 |
19 | def apply() = new testProcessor()
20 |
21 | def main(args: Array[String]) {
22 | val processor: testProcessor = apply()
23 | println(processor.PARTITIONNUM)
24 |
25 | println(this.getClass.getClassLoader.getResource("stu.json"))
26 | }
27 | }
28 |
29 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/.gitignore:
--------------------------------------------------------------------------------
1 | *.bak
2 | build_info.properties
3 | .classpath
4 | dependency-reduced-pom.xml
5 | *.diff
6 | .DS_Store
7 | .idea/
8 | *.iml
9 | *.jar
10 | .project
11 | .settings/
12 | .tags*
13 | target
14 | tmp*
15 | test-output/
16 | nohup*
17 | *.log
18 | *.swp
19 | *.pyc
20 | script/__pycache__/
21 | venv
22 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/README.md:
--------------------------------------------------------------------------------
1 |
2 | ##### spark streaming
3 |
4 | * [Official site](http://spark.apache.org/streaming/)
5 | * [Documentation](http://spark.apache.org/docs/latest/streaming-programming-guide.html)
6 | * [Spark Streaming 2.2 documentation](https://spark.apache.org/docs/2.2.0/streaming-programming-guide.html)
7 |
8 | #####
9 | The basic idea of Spark Streaming is to split the input data stream into time slices and then process each slice's data as a batch.
10 |
11 | 
12 |
13 | 
14 |
15 | Spark Streaming uses a DStream to represent a continuous stream of data.
16 | A DStream is represented as a series of consecutive RDDs, where each RDD contains the data from one time interval (a minimal word-count sketch follows below).
17 |
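A minimal socket word-count sketch (assumptions: a local master and a `nc -lk 9999` feed, mirroring the jobs in this module) in which every 5-second batch becomes one RDD:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DStreamWordCountSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("DStreamWordCountSketch")
    val ssc = new StreamingContext(conf, Seconds(5)) // every 5-second time slice becomes one RDD

    val lines = ssc.socketTextStream("localhost", 9999) // e.g. fed by `nc -lk 9999`
    lines.flatMap(_.split("\\s+"))
      .map((_, 1))
      .reduceByKey(_ + _)
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```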
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/doc/优化.md:
--------------------------------------------------------------------------------
1 |
2 | ##### Spark Streaming job tuning
3 |
4 | ##### 1. Data serialization
5 | Kryo serialization is recommended (see the configuration sketch at the end of this doc).
6 |
7 | ##### 2. Data caching
8 | Try an LRU-style policy for cached data.
9 |
10 | ##### 3. Increase parallelism
11 | Use a parallelism no lower than the number of upstream topics.
12 |
13 | ##### 4. Set a reasonable batch interval
14 | With the micro-batch model, if data backs up badly you can try increasing the batch interval, as long as it stays within the acceptable stream latency.
15 |
16 | ##### 5. Set reasonable cores and memory
17 | Size them according to the data volume per second, the computation logic, whether data is cached, whether an in-memory database is read or written, the peak data volume, and so on.
18 |
19 |
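A SparkConf sketch for tip 1 (illustrative only; the Event class registered below is a placeholder for whatever types the job actually shuffles or caches):

```scala
import org.apache.spark.SparkConf

// Placeholder class standing in for the job's real record types.
case class Event(id: Long, payload: String)

object KryoConfSketch {
  val conf: SparkConf = new SparkConf()
    .setAppName("KryoConfSketch")
    // Switch from Java serialization to Kryo (tip 1 above).
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    // Registering classes avoids writing full class names into every serialized record.
    .registerKryoClasses(Array(classOf[Event]))
}
```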
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/doc/常见问题.md:
--------------------------------------------------------------------------------
1 |
2 | ##### Why do batches pile up?
3 | The batch interval is set too small
4 | Too few resources are allocated
5 | Cached files
6 | Reading/writing an external cache is slow
7 | The upstream is unstable
8 |
9 |
10 | ##### How to relieve a backlog?
11 |
12 |
13 | ##### Consistency semantics
14 | Reading from the upstream should guarantee exactly-once consumption
15 | Writes should be atomic and idempotent
16 |
17 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/doc/检查点CheckPoint.md:
--------------------------------------------------------------------------------
1 |
2 | ##### What data gets checkpointed?
3 | 1. Metadata checkpoint
4 | Configuration: the configuration used to create the Spark Streaming program, e.g. what is in SparkConf.
5 | The DStream operations.
6 | Incomplete batches: batches whose jobs are still queued and not yet processed.
7 | 2. Data checkpoint
8 | The RDD data produced during the streaming computation is saved to reliable storage such as HDFS.
9 |
10 | ##### When should checkpointing be enabled?
11 | 1. When stateful transformations such as reduceByKeyAndWindow are used.
12 | 2. To restart a job after a failure (a recovery sketch follows below).
13 |
14 |
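A recovery sketch (illustrative assumptions: local master, a socket source on port 9999 and a placeholder checkpoint directory) combining a stateful window operation with restart via StreamingContext.getOrCreate:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CheckpointRecoverySketch {
  val checkpointDir = "hdfs:///tmp/streaming-checkpoint" // placeholder path

  def createContext(): StreamingContext = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("CheckpointRecoverySketch")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.checkpoint(checkpointDir) // metadata and generated RDDs are checkpointed here

    ssc.socketTextStream("localhost", 9999)
      .map((_, 1))
      .reduceByKeyAndWindow(_ + _, Seconds(30), Seconds(10)) // stateful op -> checkpoint required
      .print()
    ssc
  }

  def main(args: Array[String]): Unit = {
    // On restart, rebuild the context from the checkpoint instead of creating a fresh one.
    val ssc = StreamingContext.getOrCreate(checkpointDir, () => createContext())
    ssc.start()
    ssc.awaitTermination()
  }
}
```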
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/image/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-spark-streaming/src/main/image/1.png
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/image/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-spark-streaming/src/main/image/2.png
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Global logging configuration
2 | # log4j.rootLogger=WARN, stdout
3 | # Console output...
4 | # log4j.appender.stdout=org.apache.log4j.ConsoleAppender
5 | # log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
6 | # log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n
7 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/base/SparkStreamingTrait.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.streaming.base
2 |
3 | import org.slf4j.{Logger, LoggerFactory}
4 |
5 | /**
6 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved.
7 | * Authors: libin<2578858653@qq.com>
8 | *
9 | * Purpose :
10 | */
11 | trait SparkStreamingTrait {
12 |
13 | /**
14 | * Application Name
15 | */
16 | def appName: String = this.getClass.getSimpleName
17 |
18 | /**
19 | * logger
20 | */
21 | def logger: Logger = LoggerFactory.getLogger(appName)
22 |
23 |
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/base/client/SocketSparkStreamingTrait.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.streaming.base.client
2 |
3 | import com.libin.data.streaming.base.SparkStreamingTrait
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 | * Copyright (c) 2020/4/11. libin Inc. All Rights Reserved.
9 | * Authors: libin <2578858653@qq.com>
10 | *
11 | * Purpose : SparkStreaming + Socket
12 | */
13 | trait SocketSparkStreamingTrait extends SparkStreamingTrait {
14 |
15 | /**
16 | * Create a StreamingContext with the given batch interval
17 | *
18 | * @param interval batch interval in seconds
19 | * @return the StreamingContext
20 | */
21 | def createStreamContext(interval: Int): StreamingContext = {
22 | val conf = new SparkConf().setAppName(appName).setIfMissing("spark.master", "local")
23 | new StreamingContext(conf, Seconds(interval))
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/GenCodeFromCheckpoint.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.streaming.jobs
2 |
3 | import com.libin.data.streaming.utils.StreamingExamples
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 | * Copyright (c) 2020/04/04. libin Inc. All Rights Reserved.
9 | * Authors: libin <2578858653@qq.com>
10 | *
11 | * Purpose : checkpoint
12 | * Linux: nc -lk 9999
13 | * windows: nc -l -p 9999
14 | */
15 | object GenCodeFromCheckpoint {
16 | def main(args: Array[String]): Unit = {
17 | val conf = new SparkConf().setMaster("local[2]").setAppName("GenCodeFromCheckpoint")
18 | val ssc = new StreamingContext(conf, Seconds(5))
19 |
20 | StreamingExamples.setStreamingLogLevels()
21 | val lines = ssc.socketTextStream("localhost", 9999)
22 | ssc.checkpoint("E:\\2020_github\\checkout\\GenCodeFromCheckpoint")
23 |
24 | lines.print()
25 | ssc.start() // Start the computation
26 | ssc.awaitTermination() // Wait for the computation to terminate
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/GenCodeFromForeachRDD.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.streaming.jobs
2 |
3 | import com.libin.data.streaming.utils.StreamingExamples
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.rdd.RDD
6 | import org.apache.spark.streaming.{Seconds, StreamingContext}
7 |
8 | /**
9 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved.
10 | * Authors: libin<2578858653@qq.com>
11 | *
12 | * Purpose : Using Spark Streaming's foreachRDD operator
13 | *
14 | * Linux: nc -lk 9999
15 | * windows: nc -l -p 9999
16 | */
17 | object GenCodeFromForeachRDD {
18 | def main(args: Array[String]): Unit = {
19 | val conf = new SparkConf().setMaster("local[2]").setAppName("GenCodeFromForeachRDD")
20 | val ssc = new StreamingContext(conf, Seconds(5))
21 |
22 | StreamingExamples.setStreamingLogLevels()
23 | val lines = ssc.socketTextStream("localhost", 9999)
24 |
25 | lines.map((_, 1))
26 | .foreachRDD {
27 | rdd =>
28 | val saveRdd: RDD[(String, Int)] = rdd.mapPartitions { // identity pass-through; in a real job, write each partition to an external sink here
29 | iter =>
30 | val result = iter.map {
31 | line =>
32 | line
33 | }.toList
34 | result.toIterator
35 | }
36 | println(saveRdd.count())
37 | }
38 | ssc.start() // Start the computation
39 | ssc.awaitTermination() // Wait for the computation to terminate
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/GenCodeFromKafka.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.streaming.jobs
2 |
3 | import org.apache.kafka.clients.consumer.ConsumerRecord
4 | import org.apache.kafka.common.serialization.StringDeserializer
5 | import org.apache.spark.SparkConf
6 | import org.apache.spark.streaming.dstream.InputDStream
7 | import org.apache.spark.streaming.{Seconds, StreamingContext}
8 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
9 |
10 | /**
11 | * Copyright (c) 2020/04/06. libin Inc. All Rights Reserved.
12 | * Authors: libin <2578858653@qq.com>
13 | *
14 | * Purpose : kafka
15 | */
16 | object GenCodeFromKafka {
17 | def main(args: Array[String]): Unit = {
18 | /**
19 | * Kafka consumer parameters
20 | */
21 | val kafkaParams = Map[String, Object](
22 | "bootstrap.servers" -> "localhost:port",
23 | "key.deserializer" -> classOf[StringDeserializer],
24 | "value.deserializer" -> classOf[StringDeserializer],
25 | "group.id" -> "group_id",
26 | "auto.offset.reset" -> "latest",
27 | "enable.auto.commit" -> (false: java.lang.Boolean)
28 | )
29 |
30 | /**
31 | * Upstream Kafka topics
32 | */
33 | val topics = Set("topic1", "topic2")
34 |
35 | val conf = new SparkConf().setMaster("local[2]").setAppName("GenCodeFromKafka")
36 | val ssc = new StreamingContext(conf, Seconds(5))
37 |
38 | // Left commented out so the demo compiles and runs without a live broker; the block below assumes
39 | // the spark-streaming-kafka-0-10 integration, matching the kafkaParams defined above.
40 | /*val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
41 | ssc, LocationStrategies.PreferConsistent, ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
42 | stream.map(record => (record.key, record.value)).print()
43 | ssc.start()
44 | ssc.awaitTermination()*/
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/GenCodeFromParams.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.streaming.jobs
2 |
3 | import com.libin.data.streaming.base.client.SocketSparkStreamingTrait
4 | import org.apache.spark.SparkConf
5 |
6 | /**
7 | * Copyright (c) 2020/4/11. libin Inc. All Rights Reserved.
8 | * Authors: libin <2578858653@qq.com>
9 | *
10 | * Purpose : Some Spark Streaming configuration parameters
11 | */
12 | object GenCodeFromParams extends SocketSparkStreamingTrait {
13 | def main(args: Array[String]): Unit = {
14 | // val ssc = createStreamContext(5)
15 | val conf = new SparkConf().setMaster("local[2]").setAppName(appName)
16 |
17 | /**
18 | * With 16 partitions and a 5-minute batch, one batch consumes at most 5000 * 16 * 5 * 60 = 24000000 records.
19 | */
20 | // Enable backpressure: Spark then picks the optimal consumption rate based on the current load.
21 | conf.set("spark.streaming.backpressure.enabled", "true")
22 | // Cap what the very first batch may consume: on a cold start a large backlog is queued, and reading it all at once would stall the system.
23 | conf.set("spark.streaming.backpressure.initialRate", "24000000")
24 | // Cap the number of records read per second, per consumer thread, per Kafka partition.
25 | conf.set("spark.streaming.kafka.maxRatePerPartition", "5000") // typically used together with backpressure for rate limiting
26 |
27 | // On shutdown, finish processing the last batch before stopping, so a kill does not interrupt processing and drop in-flight data.
28 | conf.set("spark.streaming.stopGracefullyOnShutdown", "true")
29 |
30 |
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/GenCodeFromWindow.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.streaming.jobs
2 |
3 | import com.libin.data.streaming.utils.StreamingExamples
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved.
9 | * Authors: libin<2578858653@qq.com>
10 | *
11 | * Purpose : Using Spark Streaming's window operators
12 | * Linux: nc -lk 9999
13 | * windows: nc -l -p 9999
14 | */
15 | object GenCodeFromWindow {
16 | def main(args: Array[String]): Unit = {
17 | val conf = new SparkConf().setMaster("local[2]").setAppName("GenCodeFromWindow")
18 | val ssc = new StreamingContext(conf, Seconds(5))
19 |
20 | ssc.checkpoint("/home/baolibin/2020_github/checkout")
21 | StreamingExamples.setStreamingLogLevels()
22 |
23 | val lines = ssc.socketTextStream("localhost", 9999)
24 |
25 | // reduceByKeyAndWindow
26 | val res = lines.flatMap(_.split(" "))
27 | .map((_, 1))
28 | .reduceByKeyAndWindow((a: Int, b: Int) => (a + b), Seconds(10), Seconds(5))
29 |
30 | // countByWindow
31 | val resByCount = lines.flatMap(_.split(" "))
32 | .countByWindow(Seconds(10), Seconds(5))
33 |
34 | res.print()
35 | resByCount.print()
36 |
37 | ssc.start() // Start the computation
38 | ssc.awaitTermination() // Wait for the computation to terminate
39 | }
40 | }
41 |
42 | /**
43 | Input:
44 | a
45 | a
46 | a
47 | a
48 | a
49 |
50 | b
51 | b
52 | b
53 | b
54 | b
55 |
56 |
57 | a
58 | b
59 | c
60 | d
61 | e
62 | ...
63 | Output:
64 | Time: 1585905790000 ms
65 | -------------------------------------------
66 | (d,7)
67 | (b,42)
68 | (,21)
69 | (V,1)
70 | (e,7)
71 | (a,42)
72 | (c,7)
73 | -------------------------------------------
74 | Time: 1585905795000 ms
75 | -------------------------------------------
76 | (d,1)
77 | (b,6)
78 | (,3)
79 | (a,6)
80 | (c,1)
81 | */
82 |
83 |
84 | /**
85 | -------------------------------------------
86 | Time: 1585911530000 ms
87 | -------------------------------------------
88 | (d,1)
89 | (b,6)
90 | (,3)
91 | (e,1)
92 | (a,6)
93 | (c,1)
94 | -------------------------------------------
95 | Time: 1585911530000 ms
96 | -------------------------------------------
97 | 18
98 | */
99 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/NetworkWordCount.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.streaming.jobs
2 |
3 | import com.libin.data.streaming.utils.StreamingExamples
4 | import org.apache.spark.SparkConf
5 | import org.apache.spark.streaming.{Seconds, StreamingContext}
6 |
7 | /**
8 | * Copyright (c) 2019/02/16. libin Inc. All Rights Reserved.
9 | * Authors: libin <2578858653@qq.com>
10 | *
11 | * Purpose : WordCount with Spark Streaming
12 | * Linux: nc -lk 9999
13 | * windows: nc -l -p 9999
14 | */
15 | object NetworkWordCount {
16 | def main(args: Array[String]): Unit = {
17 | // Create a local StreamingContext with two working threads and a batch interval of 5 seconds.
18 | // The master requires 2 cores to prevent a starvation scenario.
19 | val conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
20 | val ssc = new StreamingContext(conf, Seconds(5))
21 |
22 | // Create a DStream that will connect to hostname:port, like localhost:9999
23 | val lines = ssc.socketTextStream("localhost", 9999)
24 | StreamingExamples.setStreamingLogLevels()
25 |
26 | // Split each line into words
27 | val words = lines.flatMap(_.split(" "))
28 |
29 | // Count each word in each batch
30 | val pairs = words.map(word => (word, 1))
31 | val wordCounts = pairs.reduceByKey(_ + _)
32 |
33 | // Print the first ten elements of each RDD generated in this DStream to the console
34 | wordCounts.print()
35 |
36 | ssc.start() // Start the computation
37 | ssc.awaitTermination() // Wait for the computation to terminate
38 |
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/utils/StreamingExamples.scala:
--------------------------------------------------------------------------------
1 | package com.libin.data.streaming.utils
2 |
3 | import org.apache.spark.internal.Logging
4 | import org.apache.log4j.{Level, Logger}
5 |
6 | /**
7 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved.
8 | * Authors: libin<2578858653@qq.com>
9 | *
10 | * Purpose : Only show WARN logs; the flood of INFO logs is suppressed
11 | */
12 | object StreamingExamples extends Logging {
13 | /** Set reasonable logging levels for streaming if the user has not configured log4j. */
14 | def setStreamingLogLevels() {
15 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
16 | // The root logger is set to WARN either way; the branch only controls whether a hint is logged first.
17 | if (log4jInitialized) {
18 | // log4j is already configured: log a hint about how to override the level,
19 | // then raise the root level to WARN.
20 | logInfo("Setting log level to [WARN] for streaming example." +
21 | " To override add a custom log4j.properties to the classpath.")
22 | Logger.getRootLogger.setLevel(Level.WARN)
23 | } else {
24 | Logger.getRootLogger.setLevel(Level.WARN)
25 | }
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/doc1/README-PLAN.md:
--------------------------------------------------------------------------------
1 | To be added
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/client/AccumulatorDemo.scala:
--------------------------------------------------------------------------------
1 | package com.libin.client
2 |
3 | import org.apache.spark.{AccumulatorParam, SparkConf, SparkContext}
4 |
5 | /**
6 |  * Copyright (c) xixi Inc. All Rights Reserved.
7 |  * Authors: libin <2578858653@qq.com>
8 |  *
9 |  * Purpose :
10 |  */
11 | object AccumulatorDemo {
12 | def main(args: Array[String]): Unit = {
13 | val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("AccumulatorDemo")
14 | val sc: SparkContext = new SparkContext(conf)
15 |
16 | val arrAccu = Array(0L, 0L, 0L, 0L, 0L)
17 | val accumulatorArr = sc.accumulator(arrAccu, "HADOOP")(MyAcculumatorParam)
18 |
19 | val accumulatorMl = sc.accumulator(0, "ML")
20 | val accumulatorDl = sc.accumulator(0L, "DL")
21 | val arr = Array("ML", "DL", "CNN", "RNN", "ML", "HADOOP", "SPARK", "ML")
22 | for (i <- 0 to arr.length - 1) {
23 | if (arr(i).equals("ML")) {
24 | accumulatorMl += 1
25 | } else if (arr(i).equals("DL")) {
26 | accumulatorDl += 1
27 | } else if (arr(i).equals("HADOOP")) {
28 | accumulatorArr += Array(1L, 1L, 1L, 1L, 1L)
29 | }
30 | }
31 | println("ML=" + accumulatorMl.name.get + "、" + accumulatorMl.value)
32 | println("DL=" + accumulatorDl.name.get + "、" + accumulatorDl.value)
33 | println("HADOOP=" + accumulatorArr.name.get + "、" + accumulatorArr.value.mkString(","))
34 | }
35 |
36 | object MyAcculumatorParam extends AccumulatorParam[Array[Long]] {
37 | override def addInPlace(r1: Array[Long], r2: Array[Long]): Array[Long] = {
38 | r1.zip(r2).map(x => x._1 + x._2)
39 | }
40 |
41 | def zero(initialValue: Array[Long]): Array[Long] = {
42 | new Array[Long](initialValue.length)
43 | }
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/client/AggregateByKeyDemo.scala:
--------------------------------------------------------------------------------
1 | package com.libin.client
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import scala.collection.JavaConverters._
5 |
6 | /**
7 | * Copyright (c) 2017/07/25. xixi Inc. All Rights Reserved.
8 | * Authors: libin <2578858653@qq.com>
9 | *
10 | * Purpose : Find the keys that occur exactly once. This is easy with groupByKey or reduceByKey; here it is done with aggregateByKey (a reduceByKey equivalent is sketched after the sample data below).
11 | */
12 | object AggregateByKeyDemo {
13 | def main(args: Array[String]): Unit = {
14 | val conf = new SparkConf().setAppName("aggregateByKeyDemo").setMaster("local")
15 | val sc = new SparkContext(conf)
16 |
17 | sc.textFile("D://sparkmllibData/sparkml/mllibdata/arrregation.txt")
18 | .map {
19 | line =>
20 | (line.split("\t")(0), line.split("\t")(1).toLong)
21 | }.aggregateByKey(0L)(seqOp, combOp)
22 | .filter(line => line._2 == 1L)
23 | .collect().foreach(println)
24 | }
25 |
26 | def seqOp(U: Long, V: Long): Long = { // within a partition: add 1 per record seen for the key (the value itself is ignored)
27 | U + 1L
28 | }
29 |
30 | def combOp(U: Long, V: Long): Long = { // across partitions: sum the per-partition counts
31 | U + V
32 | }
33 | }
34 |
35 | /**
36 | * asdfgh 546346
37 | * retr 4567
38 | * asdfgh 7685678
39 | * ghj 2345
40 | * asd 234
41 | * hadoop 435
42 | * ghj 23454
43 | * asdfgh 54675
44 | * asdfgh 546759878
45 | * asd 234
46 | * asdfgh 5467598782
47 | */
48 |
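/**
 * For comparison, a minimal reduceByKey version of the same job (a sketch, assuming the same
 * tab-separated "key<TAB>value" input and the same SparkContext `sc`):
 *
 *   sc.textFile("D://sparkmllibData/sparkml/mllibdata/arrregation.txt")
 *     .map(line => (line.split("\t")(0), 1L)) // one count per occurrence; the value column is ignored
 *     .reduceByKey(_ + _)
 *     .filter(_._2 == 1L)
 *     .collect().foreach(println)
 */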
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/client/BroadcastDemo.scala:
--------------------------------------------------------------------------------
1 | package com.libin.client
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import scala.collection.immutable.HashMap
5 |
6 | /**
7 | * Copyright (c) 2016/11/02. xixi Inc. All Rights Reserved.
8 | * Authors: libin <2578858653@qq.com>
9 | *
10 | * Purpose :
11 | */
12 | object BroadcastDemo {
13 | def main(args: Array[String]): Unit = {
14 | val conf: SparkConf = new SparkConf().setAppName("CacheRadius").setMaster("local[2]")
15 | val sc = new SparkContext(conf)
16 | val input = "E://sparkmllibData/cacheAndPersist.txt"
17 | val data = sc.textFile(input).map(_.split("\\|", 100)).map(line => {
18 | val Array(privateIP, account, timeFormat, timeType) = line
19 | (privateIP, (account, timeFormat.toLong, timeType.toInt))
20 | })
21 |
22 | var accountHash = new HashMap[String, Set[(String, Long, Int)]]()
23 | data.groupByKey().collect().foreach(x => {
24 | accountHash += (x._1 -> x._2.toSet)
25 | })
26 | val broacast = sc.broadcast(accountHash)
27 |
28 | println(broacast.id)
29 | val hashvalue = broacast.value
30 | for (entry <- hashvalue) {
31 | println(entry._1 + "|" + entry._2)
32 | }
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/client/MyPartitioner.scala:
--------------------------------------------------------------------------------
1 | package com.libin.client
2 |
3 | import org.apache.spark.{Partitioner, SparkConf, SparkContext}
4 |
5 | /**
6 | * Copyright (c) 2015/05/02. xixi Inc. All Rights Reserved.
7 | * Authors: libin <2578858653@qq.com>
8 | *
9 | * Purpose : A small custom-partitioner demo
10 | */
11 | object MyPartitioner {
12 | def main(args: Array[String]): Unit = {
13 | val conf = new SparkConf().setAppName("MyPartitioner").setMaster("local[10]")
14 | val sc = new SparkContext(conf)
15 | val arr = Array((2, 3), (4, 6), (4, 2), (2, 1), (22, 3), (34, 6),
16 | (74, 2), (12, 1), (62, 3), (34, 6), (114, 2), (92, 1))
17 | val rdd = sc.makeRDD(arr)
18 | rdd.partitionBy(new myPartitioner(10))
19 | .foreachPartition(x => println(x.toList.mkString(",")))
20 | }
21 | }
22 |
23 | /**
24 | * A custom partitioner, instead of the built-in HashPartitioner or RangePartitioner.
25 | *
26 | * @param partitions number of partitions
27 | */
28 | class myPartitioner(partitions: Int) extends Partitioner {
29 | override def numPartitions: Int = partitions
30 |
31 | override def getPartition(key: Any): Int = key match {
32 | case null => 0
33 | case _ =>
34 | try {
35 | val curNum = key.asInstanceOf[Int]
36 | if (curNum < 10) curNum
37 | else if (curNum < 100) curNum / numPartitions
38 | else 0
39 | } catch {
40 | case e: Exception => 0
41 | }
42 | }
43 | }
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/client/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Spark programming API examples
3 | * [Secondary sort](SecondarySort.scala)
4 | * [Custom partitioner](MyPartitioner.scala)
5 | * [Accumulators](AccumulatorDemo.scala)
6 | * [Broadcast variables](BroadcastDemo.scala)
7 | * [cache](cacheAndPersist.scala)
8 | * [aggregateByKey](AggregateByKeyDemo.scala)
9 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/client/SecondarySort.scala:
--------------------------------------------------------------------------------
1 | package com.libin.client
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Copyright (c) 2018/05/02. xixi Inc. All Rights Reserved.
7 | * Authors: libin <2578858653@qq.com>
8 | *
9 | * Purpose :
10 | */
11 | object SecondarySort {
12 | def main(args: Array[String]): Unit = {
13 | val conf = new SparkConf().setAppName("secondarySort").setMaster("local[2]")
14 | val sc = new SparkContext(conf)
15 |
16 | val arr = Array((2, 3), (4, 6), (4, 2), (2, 1))
17 | val rdd = sc.makeRDD(arr)
18 | rdd.map(x => (new secondarySortUtils(x._1, x._2), x))
19 | .sortByKey(ascending = true).map(_._2)
20 | .collect().foreach(println)
21 | }
22 | }
23 |
24 | /**
25 | * A custom sort key that extends Ordered and Serializable; sortByKey then sorts by this key.
26 | *
27 | * @param first the first column
28 | * @param second the second column
29 | */
30 | class secondarySortUtils(val first: Int, val second: Int) extends Ordered[secondarySortUtils] with Serializable {
31 | override def compare(that: secondarySortUtils): Int = {
32 | if (this.first - that.first != 0) this.first - that.first
33 | else this.second - that.second
34 | }
35 | }
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/client/cacheAndPersist.scala:
--------------------------------------------------------------------------------
1 | package com.libin.client
2 |
3 | import org.apache.spark.storage.StorageLevel
4 | import org.apache.spark.{SparkConf, SparkContext}
5 |
6 | /**
7 | * Copyright (c) 2016/11/02. xixi Inc. All Rights Reserved.
8 | * Authors: libin <2578858653@qq.com>
9 | *
10 | * Purpose :
11 | */
12 | object cacheAndPersist {
13 | def main(args: Array[String]): Unit = {
14 | /*if (args.length != 1) {
15 | System.err.println("Usage ")
16 | System.exit(1)
17 | val Array(input) = args
18 | }*/
19 |
20 | val input = "E://sparkmllibData/cache.txt"
21 | val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("cacheAndPersist")
22 | val sc: SparkContext = new SparkContext(conf)
23 |
24 | val data1 = sc.textFile(input)
25 | .map(_.split("\\|", 100))
26 | .map(line => {
27 | val Array(name, age) = line
28 | (name, age)
29 | }).cache()
30 | val data2 = sc.textFile(input)
31 | .map(line => {
32 | line.split("\\|", 100)
33 | }).map(x => {
34 | val Array(name, age) = x
35 | (name, age)
36 | }).filter(y => {y._1.equals("ML")
37 | }).persist(StorageLevel.MEMORY_AND_DISK)
38 |
39 | data1.intersection(data2).foreach(println)
40 | }
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/jobs/READMD.md:
--------------------------------------------------------------------------------
1 |
2 | ##### Some job-processing templates
3 |
4 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/loader/READMD.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | ##### Code for reading from and writing to storage
5 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/processor/READMD.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | #### Code for data processing
4 |
5 |
6 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/source/READMD.md:
--------------------------------------------------------------------------------
1 |
2 | ##### Source-code reading notes
3 |
4 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/utils/DateUtils.scala:
--------------------------------------------------------------------------------
1 | package com.libin.utils
2 |
3 | import org.joda.time.DateTime
4 | import org.joda.time.format.DateTimeFormat
5 |
6 | /**
7 | * Copyright (c) 2020/4/14. libin Inc. All Rights Reserved.
8 | * Authors: libin <2578858653@qq.com>
9 | *
10 | * Purpose :
11 | */
12 | object DateUtils {
13 | val DATE_FORMAT = "yyyyMMdd"
14 |
15 | /**
16 | * Parse a date string into a DateTime.
17 | *
18 | * @param input date string in yyyyMMdd format
19 | * @return
20 | */
21 | def parseDate(input: String): Option[DateTime] =
22 | try {
23 | Some(DateTimeFormat.forPattern(DATE_FORMAT).parseDateTime(input))
24 | } catch {
25 | case e: Exception => None
26 | }
27 |
28 | /**
29 | * Format a DateTime as a date string.
30 | *
31 | * @param input the DateTime to format
32 | */
33 | def parseDateTimeToStr(input: DateTime): Option[String] =
34 | try {
35 | Some(input.toString(DATE_FORMAT))
36 | } catch {
37 | case e: Exception => None
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/utils/MySQLUtils.scala:
--------------------------------------------------------------------------------
1 | package com.libin.utils
2 |
3 | import java.sql.{Connection, DriverManager, ResultSet, SQLException}
4 |
5 | import com.typesafe.config.ConfigFactory
6 | import org.slf4j.{Logger, LoggerFactory}
7 |
8 | import scala.collection.mutable.ArrayBuffer
9 |
10 | /**
11 | * Copyright (c) 2020/4/22 libin Inc. All Rights Reserved.
12 | * Authors: libin<2578858653@qq.com>
13 | *
14 | * Purpose :
15 | */
16 | object MySQLUtils {
17 | val logger: Logger = LoggerFactory.getLogger("MySQLUtils")
18 | val splitTable = "\t" // tab separator
19 | /**
20 | * Get a MySQL connection.
21 | *
22 | * @param config name of the database configuration resource
23 | */
24 | def getMySQLConn(config: String): Connection = {
25 | try {
26 | def dbConf = ConfigFactory.load(config).getConfig("db.default")
27 |
28 | Class.forName(dbConf.getString("driver"))
29 | DriverManager.getConnection(dbConf.getString("url"),
30 | dbConf.getString("user"),
31 | dbConf.getString("password"))
32 | } catch {
33 | case e: SQLException => e.printStackTrace(); null
34 | case _: Throwable => null
35 | }
36 | }
37 |
38 | /**
39 | * Run the given SQL query and return the selected columns, one concatenated string per row.
40 | *
41 | * @param sql the SQL statement
42 | * @param conn Connection
43 | * @param num how many columns to read; the selected columns of each row are joined into one string
44 | */
45 | def executeSql(sql: String, conn: Connection, num: Int): Array[String] = {
46 | val stmt = conn.createStatement();
47 | try {
48 | val rs: ResultSet = stmt.executeQuery(sql)
49 | var arr = new ArrayBuffer[String]
50 | val sb = new StringBuilder
51 | while (rs.next()) {
52 | sb.clear()
53 | for (i <- 1 to (num)) {
54 | sb.append(rs.getString(i))
55 | // columns within a row are separated by a tab
56 | if (i < num) sb.append(splitTable)
57 | }
58 | arr += sb.toString()
59 | }
60 | rs.close()
61 | arr.toArray
62 | } catch {
63 | case e: SQLException => e.printStackTrace(); null
64 | case _: Throwable => null
65 | } finally {
66 | stmt.close()
67 | }
68 | }
69 |
70 | /**
71 | * Close the MySQL connection.
72 | *
73 | * @param conn the Connection to close
74 | */
75 | def close(conn: Connection): Unit = {
76 | try {
77 | conn.close()
78 | } catch {
79 | case a: SQLException => a.printStackTrace(); null
80 | case _: Throwable => null
81 | }
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/utils/README.md:
--------------------------------------------------------------------------------
1 |
2 | ##### Some utility classes
3 | 1. Date utilities
4 | 2. Path utilities
5 | 3.
6 |
7 |
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/utils/ResourceUtils.scala:
--------------------------------------------------------------------------------
1 | package com.libin.utils
2 |
3 | import org.apache.commons.lang3.StringUtils
4 |
5 | import scala.io.Source
6 |
7 | /**
8 | * Copyright (c) 2020/4/20 XiaoMi Inc. All Rights Reserved.
9 |  * Authors: libin <2578858653@qq.com>
--------------------------------------------------------------------------------
/spark-core/src/main/scala/com/libin/utils/SeparatorUtils.scala:
--------------------------------------------------------------------------------
1 | package com.libin.utils
2 |
3 | /**
4 |  * Copyright (c) 2020 libin Inc. All Rights Reserved.
5 |  * Authors: libin <2578858653@qq.com>
6 |  *
7 |  * Purpose : Commonly used separators
8 | */
9 | object SeparatorUtils {
10 |
11 | // Commonly used separators
12 | val SEPARATOR_TAB = "\t"
13 | val SEPARATOR_EMPTY = "\\N"
14 | val SEPARATOR_ENTER = "\n"
15 | val SEPARATOR_EQUAL = "=" // equals sign
16 | val SEPARATOR_SPACE = " " // space
17 | val SEPARATOR_POINT = "\\." // dot (regex-escaped)
18 | val SEPARATOR_COMMA = "," // comma
19 | val SEPARATOR_SEMICOLON = ";" // semicolon
20 | val SEPARATOR_BAR = "-" // hyphen
21 | val SEPARATOR_LEFT_SLASH = "/" // forward slash
22 |
23 | val pathSuccess = "_SUCCESS" // marker file indicating the data was written successfully
24 | }
25 |
--------------------------------------------------------------------------------
/spark-graphx/.gitignore:
--------------------------------------------------------------------------------
1 | *.bak
2 | build_info.properties
3 | .classpath
4 | dependency-reduced-pom.xml
5 | *.diff
6 | .DS_Store
7 | .idea/
8 | *.iml
9 | *.jar
10 | .project
11 | .settings/
12 | .tags*
13 | target/
14 | tmp*
15 | test-output/
16 | nohup*
17 | *.log
18 | *.swp
19 | *.pyc
20 | script/__pycache__/
21 |
--------------------------------------------------------------------------------
/spark-graphx/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Graph frameworks
3 |
4 | * Notes on graph fundamentals
5 |
6 | * [Notes on graph processing frameworks](src/main/scala/com/libin/docs)
7 |
8 | * Notes on graph storage frameworks
9 |
10 | * Notes on graph visualization frameworks
11 |
--------------------------------------------------------------------------------
/spark-graphx/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 |
2 | log4j.logger.org.apache=ERROR
3 |
--------------------------------------------------------------------------------
/spark-graphx/src/main/scala/com/libin/graphX/etl/GraphXProcessor.scala:
--------------------------------------------------------------------------------
1 | package com.libin.graphX.etl
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import org.apache.spark.graphx.{Graph, Edge, VertexId}
5 | import org.apache.spark.rdd.RDD
6 | import org.slf4j.{LoggerFactory, Logger}
7 |
8 | /**
9 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved.
10 | * Authors: libin <2578858653@qq.com>
11 | *
12 | * Purpose :
13 | */
14 |
15 | object GraphXProcessor {
16 | def main(args: Array[String]) {
17 | val conf = new SparkConf().setAppName("joinVertexDemo").setMaster("local")
18 | val sc = new SparkContext(conf)
19 |
20 | val logger: Logger = LoggerFactory.getLogger("GraphProcessor")
21 | logger.info("GraphProcessor start ...")
22 |
23 | val vertexRdd: RDD[(VertexId, (String, String, Long))] =
24 | sc.parallelize(Array(
25 | (1L, ("mid_1", "mid", 1513048521000L)),
26 | (2L, ("imei_1", "phone", 1523048521003L)),
27 | (3L, ("pn_1", "pn", 1523048521005L))
28 | ))
29 |
30 | val edgeRdd: RDD[Edge[Long]] =
31 | sc.parallelize(Array(
32 | Edge(1L, 2L, 1513048521000L),
33 | Edge(2L, 3L, 1523048521003L)
34 | ))
35 |
36 | // Build the graph
37 | val graphTest = Graph(vertexRdd, edgeRdd)
38 | // Print the graph's vertices
39 | graphTest.vertices.foreach(println)
40 |
41 | val addAttrRdd = sc.makeRDD(Array((1L, 1L), (3L, 3L), (5L, 5L)))
42 |
43 | graphTest.mapVertices((_, attr) => attr._3).joinVertices(addAttrRdd)((_, _, newAttr) => newAttr)
44 | .vertices.foreach(println)
45 | /**
46 | * Output of the joinVertices operation.
47 | * (1,1)
48 | * (3,3)
49 | * (2,1523048521003)
50 | */
51 |
52 | graphTest.mapVertices((_, attr) => attr._3).outerJoinVertices(addAttrRdd)((_, _, newAttr) => newAttr)
53 | .vertices.foreach(println)
54 | /**
55 | * Output of the outerJoinVertices operation.
56 | * (1,Some(1))
57 | * (3,Some(3))
58 | * (2,None)
59 | */
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/spark-mllib/.gitignore:
--------------------------------------------------------------------------------
1 | *.bak
2 | build_info.properties
3 | .classpath
4 | dependency-reduced-pom.xml
5 | *.diff
6 | .DS_Store
7 | .idea/
8 | *.iml
9 | *.jar
10 | .project
11 | .settings/
12 | .tags*
13 | target/
14 | tmp*
15 | test-output/
16 | nohup*
17 | *.log
18 | *.swp
19 | *.pyc
20 | script/__pycache__/
21 | *.h
--------------------------------------------------------------------------------
/spark-mllib/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Spark MLlib
3 |
4 | * [Spark MLlib sample code](src/main/scala/com/libin)
5 |
6 |
7 | ## Software versions
8 | * Scala 2.11.7
9 | * JDK 1.8
10 | * Spark 2.1
11 |
12 |
13 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/AlsRecommend.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | import org.apache.spark.mllib.recommendation.ALS
6 | import org.apache.spark.mllib.recommendation.Rating
7 |
8 | /**
9 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
10 | * Authors: libin <2578858653@qq.com>
11 | *
12 | * Purpose :
13 | */
14 | object AlsRecommend {
15 | def main(args: Array[String]) {
16 | //0 Build the Spark context
17 | val conf = new SparkConf()
18 | .setAppName("ALS")
19 | .setMaster("local")
20 | val sc = new SparkContext(conf)
21 | Logger.getRootLogger.setLevel(Level.WARN)
22 |
23 | //1 Read the sample data
24 | val data = sc.textFile("D://sparkmllibData/sparkml/mllibdata/test.data")
25 | val ratings = data.map(_.split(',') match {
26 | case Array(user, item, rate) =>
27 | Rating(user.toInt, item.toInt, rate.toDouble)
28 | })
29 |
30 | //2 Train a recommendation model on the ratings with ALS
31 | val rank = 10
32 | val numIterations = 20
33 | val model = ALS.train(ratings, rank, numIterations, 0.01)
34 |
35 | //3 Extract the (user, product) pairs from the ratings
36 | val usersProducts = ratings.map {
37 | case Rating(user, product, rate) =>
38 | (user, product)
39 | }
40 | // Use the model to predict ratings for the (user, product) pairs
41 | val predictions =
42 | model.predict(usersProducts).map {
43 | case Rating(user, product, rate) =>
44 | ((user, product), rate)
45 | }
46 | // Join the actual ratings with the predictions
47 | val ratesAndPreds = ratings.map {
48 | case Rating(user, product, rate) =>
49 | ((user, product), rate)
50 | }.join(predictions)
51 |
52 | val MSE = ratesAndPreds.map {
53 | case ((user, product), (r1, r2)) =>
54 | val err = r1 - r2
55 | err * err
56 | }.mean()
57 | println("Mean Squared Error = " + MSE)
58 |
59 | //4 Save/load the model
60 | /*model.save(sc, "myModelPath")
61 | val sameModel = MatrixFactorizationModel.load(sc, "myModelPath")*/
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/DistributedMatrixRow.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.spark.mllib.linalg.Vectors
4 | import org.apache.spark.mllib.linalg.distributed._
5 | import org.apache.spark.{SparkContext, SparkConf}
6 |
7 | /**
8 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
9 | * Authors: libin <2578858653@qq.com>
10 | *
11 | * Purpose : Distributed row-oriented matrices:
12 | * * RowMatrix, IndexedRowMatrix, CoordinateMatrix and BlockMatrix
13 | */
14 | object DistributedMatrixRow {
15 | def main(args: Array[String]) {
16 | val conf = new SparkConf()
17 | .setMaster("local")
18 | .setAppName("distributedMatrixRow")
19 | val sc = new SparkContext(conf)
20 |
21 | println("First:RowMatrix ")
22 | val rdd = sc.textFile("D://sparkmllibData/sparkml/mllibdata/MatrixRow.txt") // read the file into an RDD
23 | .map(_.split(' ') // split on spaces
24 | .map(_.toDouble)) // convert to Double
25 | .map(line => Vectors.dense(line)) // convert to dense Vectors
26 | val rm = new RowMatrix(rdd) // build the row matrix
27 | println(rm.numRows()) // print the number of rows
28 | println(rm.numCols()) // print the number of columns
29 | rm.rows.foreach(println)
30 |
31 | println("Second:IndexedRow ")
32 | val rdd2 = sc.textFile("D://sparkmllibData/sparkml/mllibdata/MatrixRow.txt") // read the file into an RDD
33 | .map(_.split(' ') // split on spaces
34 | .map(_.toDouble)) // convert to Double
35 | .map(line => Vectors.dense(line)) // store each line as a dense Vector
36 | .map(vd => IndexedRow(vd.size, vd)) // wrap each vector in an IndexedRow
37 | val irm = new IndexedRowMatrix(rdd2) // build the indexed row matrix
38 | println(irm.getClass) // print the type
39 | irm.rows.foreach(println) // print the rows
40 |
41 | println("Third: CoordinateMatrix ")
42 | val rdd3 = sc.textFile("D://sparkmllibData/sparkml/mllibdata/MatrixRow.txt") // read the file into an RDD
43 | .map(_.split(' ') // split on spaces
44 | .map(_.toDouble)) // convert to Double
45 | .map(vue => (vue(0).toLong, vue(1).toLong, vue(2))) // convert to (row, col, value) coordinates
46 | .map(vue2 => MatrixEntry(vue2._1, vue2._2, vue2._3)) // wrap each coordinate in a MatrixEntry
47 | val crm = new CoordinateMatrix(rdd3) // build the coordinate matrix
48 | crm.entries.foreach(println) // print the entries
49 | println(crm.numRows())
50 | println(crm.numCols())
51 | println(crm.entries.countApproxDistinct())
52 |
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/FPGrowthDemo.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | import org.apache.spark.mllib.fpm.{FPGrowth, FPGrowthModel}
6 |
7 | /**
8 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
9 | * Authors: libin <2578858653@qq.com>
10 | *
11 | * Purpose :
12 | */
13 | object FPGrowthDemo {
14 | def main(args: Array[String]) {
15 | //0 Build the Spark context
16 | val conf = new SparkConf()
17 | .setAppName("fpg")
18 | .setMaster("local")
19 | val sc = new SparkContext(conf)
20 | Logger.getRootLogger.setLevel(Level.WARN)
21 |
22 | //1 Read the sample data
23 | val data_path = "D://sparkmllibData/sparkml/mllibdata/sample_fpgrowth.txt"
24 | val data = sc.textFile(data_path)
25 | val examples = data.map(_.split(" ")).cache()
26 |
27 | //2 Build the model
28 | val minSupport = 0.6
29 | val numPartition = 10
30 | val model = new FPGrowth()
31 | .setMinSupport(minSupport)
32 | .setNumPartitions(numPartition)
33 | .run(examples)
34 |
35 | //3 Print the results
36 | println("Number of frequent itemsets:" + model.freqItemsets.count())
37 | model.freqItemsets.collect().foreach { itemset =>
38 | println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
39 | }
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/KMeans.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.mllib.clustering._
5 | import org.apache.spark.mllib.linalg.Vectors
6 | import org.apache.spark.{SparkConf, SparkContext}
7 |
8 | /**
9 | * Copyright (c) 2018/09/04. xixi Inc. All Rights Reserved.
10 | * Authors: libin <2578858653@qq.com>
11 | *
12 | * Purpose :
13 | */
14 | object KMeans {
15 | def main(args: Array[String]) {
16 | //1 Build the Spark context
17 | val conf = new SparkConf()
18 | .setAppName("KMeans")
19 | .setMaster("local")
20 | val sc = new SparkContext(conf)
21 | Logger.getRootLogger.setLevel(Level.WARN)
22 |
23 | // Read the sample data (tab-separated point coordinates)
24 | val data = sc.textFile("E://sparkmllibData/kMeans_demo/testSet.txt")
25 | //val data = sc.textFile("D://sparkmllibData/sparkml/mllibdata/kmeans_data.txt")
26 | val parsedData = data.map(s => Vectors.dense(s.split('\t').map(_.toDouble))).cache()
27 |
28 | // Create and train a KMeans clustering model
29 | val initMode = "k-means||"
30 | //val initMode = "random"
31 | // supported initialization modes are "random" and "k-means||"
32 | val numClusters = 5
33 | val numIterations = 100
34 |
35 | val model = new KMeans()
36 | .setInitializationMode(initMode)
37 | .setK(numClusters)
38 | .setMaxIterations(numIterations)
39 | .run(parsedData)
40 | val centers = model.clusterCenters
41 | println("centers")
42 | for (i <- 0 to centers.length - 1) {
43 | println(centers(i)(0) + "\t" + centers(i)(1))
44 | }
45 | // Compute the clustering cost (within-set sum of squared errors)
46 | val WSSSE = model.computeCost(parsedData)
47 | println("Within Set Sum of Squared Errors = " + WSSSE)
48 |
49 | //Save/load the model
50 | /*val ModelPath = "D://sparkmllibData/sparkml/mllibdata/KMeans_Model"
51 | model.save(sc, ModelPath)
52 | val sameModel = KMeansModel.load(sc, ModelPath)*/
53 |
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/PCADemo.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.spark.mllib.linalg.Vectors
4 | import org.apache.spark.mllib.linalg.distributed.RowMatrix
5 | import org.apache.spark.{SparkContext, SparkConf}
6 |
7 | /**
8 | * Copyright (c) 2018/09/04. xixi Inc. All Rights Reserved.
9 | * Authors: libin <2578858653@qq.com>
10 | *
11 | * Purpose :
12 | */
13 | object PCADemo {
14 | val conf = new SparkConf()
15 | .setMaster("local")
16 | .setAppName("PCA")
17 | val sc = new SparkContext(conf)
18 |
19 | def main(args: Array[String]) {
20 | val data = sc.textFile("D://sparkmllibData/sparkml/mllibdata/svd.txt")
21 | .map(_.split(" ").map(_.toDouble))
22 | .map(line => Vectors.dense(line))
23 |
24 | val rm = new RowMatrix(data)
25 | val pc = rm.computePrincipalComponents(3)
26 | //extract the principal components; keep the top 3
27 | val mx = rm.multiply(pc) // project the rows onto the principal components
28 |
29 | mx.rows.foreach(println)
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/RFDemo.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.spark.mllib.linalg.Vectors
4 | import org.apache.spark.mllib.regression.LabeledPoint
5 | import org.apache.spark.mllib.tree.RandomForest
6 | import org.apache.spark.{ml, SparkConf, SparkContext}
7 | import org.apache.spark.mllib.util.MLUtils
8 |
9 | /**
10 | * Copyright (c) 2018/09/04. xixi Inc. All Rights Reserved.
11 | * Authors: libin <2578858653@qq.com>
12 | *
13 | * Purpose : Random forests
14 | */
15 | object RFDemo {
16 | def main(args: Array[String]) {
17 | val conf = new SparkConf()
18 | .setMaster("local")
19 | .setAppName("RF")
20 | val sc = new SparkContext(conf)
21 |
22 | val data = MLUtils.loadLibSVMFile(sc, "D://sparkmllibData/sparkml/mllibdata/sample_libsvm_data.txt")
23 |
24 | val numClasses = 2 // number of classes
25 | val categoricalFeaturesInfo = Map[Int, Int]()
26 | // empty map: all features are treated as continuous
27 | val numTrees = 3 // number of trees in the random forest
28 | val featureSubSetStrategy = "auto" // number of features considered per node; "auto" lets Spark decide. Supported: "auto", "all", "sqrt", "log2", "onethird".
29 | val impurity = "gini" // impurity measure used for splits. Supported values: "gini" (recommended) or "entropy".
30 | val maxDepth = 5 // maximum tree depth
31 | val maxBins = 3 // maximum number of bins used when discretizing continuous features
32 |
33 | /**
34 | * Build the model: classification
35 | */
36 | val model = RandomForest.trainClassifier(data, numClasses, categoricalFeaturesInfo, numTrees,
37 | featureSubSetStrategy, impurity, maxDepth, maxBins
38 | )
39 | model.trees.foreach(println) // print each tree
40 | println(model.numTrees)
41 | println(model.algo)
42 |
43 | /**
44 | * Build the model: regression
45 | */
46 | val data_path1 = "D://sparkmllibData/sparkml/mllibdata/lpsa.data"
47 | val data2 = sc.textFile(data_path1)
48 | val inputdata = data2.map { line =>
49 | val parts = line.split(',')
50 | LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(" ").map(_.toDouble)))
51 | }.cache()
52 |
53 | val impurity2 = "variance"
54 | val seed = 11
55 | val model2 = RandomForest.trainRegressor(inputdata, categoricalFeaturesInfo, numTrees,
56 | featureSubSetStrategy, impurity2, maxDepth, maxBins, seed)
57 | model2.trees.foreach(println) // print each tree
58 | println(model2.numTrees)
59 | println(model2.algo)
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/RowmatriTest01.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | import breeze.linalg._
6 | import breeze.numerics._
7 | import org.apache.spark.mllib.linalg.Vectors
8 | import org.apache.spark.mllib.linalg.distributed.RowMatrix
9 |
10 | /**
11 | * Copyright (c) 2018/09/04. xixi Inc. All Rights Reserved.
12 | * Authors: libin <2578858653@qq.com>
13 | *
14 | * Purpose :
15 | */
16 | object RowmatriTest01 {
17 | def main(args: Array[String]) {
18 | val conf = new SparkConf().setAppName("rowmatri_test01").setMaster("local")
19 | val sc = new SparkContext(conf)
20 | Logger.getRootLogger.setLevel(Level.WARN)
21 |
22 | // 3.6 Distributed matrices
23 | // 3.6.2 Row matrix (RowMatrix)
24 | val rdd1 = sc.parallelize(Array(Array(1.0, 2.0, 3.0, 4.0), Array(2.0, 3.0, 4.0, 5.0), Array(3.0, 4.0, 5.0, 6.0))).map(f => Vectors.dense(f))
25 | val RM = new RowMatrix(rdd1)
26 | val simic1 = RM.columnSimilarities(0.5)
27 | val simic2 = RM.columnSimilarities()
28 | val simic3 = RM.computeColumnSummaryStatistics()
29 | simic3.max
30 | simic3.min
31 | simic3.mean
32 | val cc1 = RM.computeCovariance
33 | val cc2 = RM.computeGramianMatrix
34 | val pc1 = RM.computePrincipalComponents(3)
35 | val svd = RM.computeSVD(4, true)
36 | val U = svd.U
37 | U.rows.foreach(println)
38 | val s = svd.s
39 | val V = svd.V
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/SVD.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.spark.mllib.linalg.Vectors
4 | import org.apache.spark.mllib.linalg.distributed.RowMatrix
5 | import org.apache.spark.{SparkContext, SparkConf}
6 |
7 | /**
8 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
9 | * Authors: libin <2578858653@qq.com>
10 | *
11 | * Purpose : Dimensionality reduction with SVD
12 | * Singular value decomposition (SVD): a matrix is factored into a product of matrices built from its singular vectors and singular values
13 | */
14 | object SVD {
15 | val conf = new SparkConf()
16 | .setMaster("local")
17 | .setAppName("SVD")
18 | val sc = new SparkContext(conf)
19 |
20 | def main(args: Array[String]) {
21 | val data = sc.textFile("D://sparkmllibData/sparkml/mllibdata/svd.txt")
22 | .map(_.split(" ").map(_.toDouble))
23 | .map(line => Vectors.dense(line))
24 |
25 | val rm = new RowMatrix(data) // build the row matrix
26 | val SVD = rm.computeSVD(2, computeU = true) // compute the SVD
27 | // the factor matrices of the SVD
28 | val u = SVD.U
29 | val s = SVD.s
30 | val v = SVD.V
31 | println("SVD.U")
32 | u.rows.foreach(println)
33 | println("SVD.s")
34 | println(s)
35 | println("SVD.V")
36 | println(v)
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/Svm.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
6 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
7 | import org.apache.spark.mllib.util.MLUtils
8 |
9 | /**
10 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
11 | * Authors: libin <2578858653@qq.com>
12 | *
13 | * Purpose :
14 | */
15 | object Svm {
16 | def main(args: Array[String]) {
17 | //1 Build the Spark context
18 | val conf = new SparkConf()
19 | .setAppName("svm")
20 | .setMaster("local")
21 | val sc = new SparkContext(conf)
22 | Logger.getRootLogger.setLevel(Level.WARN)
23 |
24 | // Read the sample data in LIBSVM format
25 | val data = MLUtils.loadLibSVMFile(sc, "D://sparkmllibData/sparkml/mllibdata/sample_libsvm_data.txt")
26 |
27 | //Split the data into training and test sets
28 | val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
29 | val training = splits(0).cache()
30 | val test = splits(1)
31 |
32 | //Create and train an SVM model
33 | val numIterations = 100
34 | val model = SVMWithSGD.train(training, numIterations)
35 |
36 | //Score the test samples
37 | val predictionAndLabel = test.map { point =>
38 | val score = model.predict(point.features)
39 | (score, point.label)
40 | }
41 | val print_predict = predictionAndLabel.take(20)
42 | println("prediction" + "\t" + "label")
43 | for (i <- 0 to print_predict.length - 1) {
44 | println(print_predict(i)._1 + "\t" + print_predict(i)._2)
45 | }
46 |
47 | // Compute the metric
48 | val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
49 | println("Accuracy = " + accuracy)
50 |
51 | //Save/load the model
52 | /*val ModelPath = "D://sparkmllibData/sparkml/mllibdata/svm_model"
53 | model.save(sc, ModelPath)
54 | val sameModel = SVMModel.load(sc, ModelPath)*/
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/Test.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | /**
6 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
7 | * Authors: libin <2578858653@qq.com>
8 | *
9 | * Purpose :
10 | */
11 | object Test {
12 | def main(args: Array[String]) {
13 | val conf: SparkConf = new SparkConf().setMaster("local").setAppName("test")
14 | val sc: SparkContext = new SparkContext(conf)
15 | val rdd1 = sc.parallelize(List(('a', 2), ('b', 4), ('c', 6), ('d', 9)))
16 | val rdd2 = sc.parallelize(List(('c', 6), ('c', 7), ('d', 8), ('e', 10)))
17 | val unionrdd = rdd1 union rdd2
18 | //rdd1.coalesce()
19 | unionrdd.foreach(println)
20 |
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/Tfidf.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.spark.mllib.feature.{IDFModel, IDF, HashingTF}
4 | import org.apache.spark.mllib.linalg.Vector
5 | import org.apache.spark.rdd.RDD
6 | import org.apache.spark.{SparkContext, SparkConf}
7 |
8 | /**
9 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
10 | * Authors: libin <2578858653@qq.com>
11 | *
12 | * Purpose :
13 | */
14 | object Tfidf {
15 | def main(args: Array[String]) {
16 | val conf: SparkConf = new SparkConf()
17 | .setMaster("local")
18 | .setAppName("tf_idf")
19 | val sc: SparkContext = new SparkContext(conf)
20 | //Read the data
21 | val document = sc.textFile("D://sparkmllibData/sparkml/mllibdata/tf_idf.txt").map(_.split("\t").toSeq)
22 | //Create the term-frequency hasher
23 | val hashingTF = new HashingTF()
24 | //Compute the TF vectors of the documents
25 | val tf: RDD[Vector] = hashingTF.transform(document).cache()
26 | tf.foreach(println)
27 | //Fit an IDF model on the TF vectors
28 | val idf: IDFModel = new IDF().fit(tf)
29 | println(idf)
30 | //Compute the TF-IDF weights
31 | val tf_idf: RDD[Vector] = idf.transform(tf)
32 | tf_idf.foreach(println)
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/TfidfWord2vec.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | /**
4 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
5 | * Authors: libin <2578858653@qq.com>
6 | *
7 | * Purpose :
8 | */
9 | object TfidfWord2vec {
10 |
11 | }
12 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/Tree.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | import org.apache.spark.mllib.tree.DecisionTree
6 | import org.apache.spark.mllib.util.MLUtils
7 |
8 | /**
9 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
10 | * Authors: libin <2578858653@qq.com>
11 | *
12 | * Purpose :
13 | */
14 | object Tree {
15 | def main(args: Array[String]) {
16 | //1 Build the Spark context
17 | val conf = new SparkConf()
18 | .setAppName("DecisionTree")
19 | .setMaster("local")
20 | val sc = new SparkContext(conf)
21 | Logger.getRootLogger.setLevel(Level.WARN)
22 |
23 | // Read the sample data in LIBSVM format
24 | val data = MLUtils.loadLibSVMFile(sc, "D://sparkmllibData/sparkml/mllibdata/sample_libsvm_data.txt")
25 | // Split the data into training and test sets (30% held out for testing)
26 | val splits = data.randomSplit(Array(0.7, 0.3))
27 | val (trainingData, testData) = (splits(0), splits(1))
28 |
29 | // Build a decision tree
30 | val numClasses = 2 // number of classes
31 | val categoricalFeaturesInfo = Map[Int, Int]() // empty map: all features treated as continuous
32 | val impurity = "gini" // impurity measure used for splits
33 | val maxDepth = 3 // maximum tree depth
34 | val maxBins = 32 // maximum number of bins used when splitting features
35 |
36 | //Train the model
37 | val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo,
38 | impurity, maxDepth, maxBins)
39 |
40 | // (actual label, predicted label)
41 | val labelAndPreds = testData.map { point =>
42 | val prediction = model.predict(point.features)
43 | (point.label, prediction)
44 | }
45 | val print_predict = labelAndPreds.take(20)
46 | println("label" + "\t" + "prediction")
47 | for (i <- print_predict.indices) {
48 | println(print_predict(i)._1 + "\t" + print_predict(i)._2)
49 | }
50 | //Compute the test error
51 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
52 | println("Test Error = " + testErr)
53 | println("Learned classification tree model:\n" + model.toDebugString)
54 |
55 | // Save/load the model
56 | /*val ModelPath = "D://sparkmllibData/sparkml/mllibdata/Decision_Tree_Model"
57 | model.save(sc, ModelPath)
58 | val sameModel = DecisionTreeModel.load(sc, ModelPath)*/
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/VectorDemo.scala:
--------------------------------------------------------------------------------
1 | package com.libin
2 |
3 | import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors}
4 | import org.apache.spark.mllib.regression.LabeledPoint
5 |
6 | /**
7 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
8 | * Authors: libin <2578858653@qq.com>
9 | *
10 | * Purpose :
11 | */
12 | object VectorDemo {
13 | def main(args: Array[String]) {
14 | //Create a dense vector
15 | val vd: Vector = Vectors.dense(9, 5, 2, 7)
16 | val pos = LabeledPoint(1, vd)
17 | println(pos.features)
18 | println(pos.label)
19 | //println(vd(2))
20 | //Create a sparse vector
21 | val vs: Vector = Vectors.sparse(4, Array(0, 1, 2, 3), Array(9, 5, 2, 7))
22 | val neg = LabeledPoint(0, vs)
23 | println(neg.features)
24 | println(neg.label)
25 | //println(vs(2))
26 |
27 | /*val conf: SparkConf = new SparkConf()
28 | .setAppName("vector")
29 | .setMaster("local")
30 | val sc: SparkContext = new SparkContext(conf)
31 | val mu = MLUtils.loadLibSVMFile(sc,"D://sparkmllibData/sparkml/mllibdata/vectors.txt")
32 | mu.foreach(println)*/
33 | //A local (dense) matrix
34 | val mx = Matrices.dense(2, 3, Array(1, 2, 3, 4, 5, 6))
35 | println(mx)
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/kaggle/kaggle_digit_recognizer_data.scala:
--------------------------------------------------------------------------------
1 | package com.libin.kaggle
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.{SparkContext, SparkConf}
5 |
6 | /**
7 | * Copyright (c) 2017/06/26. xixi Inc. All Rights Reserved.
8 | * Authors: libin <2578858653@qq.com>
9 | *
10 | * Purpose :
11 | */
12 | object kaggle_digit_recognizer_data {
13 | def main(args: Array[String]) {
14 | //Build the Spark context
15 | val conf = new SparkConf()
16 | .setAppName("kaggle_digit_recognizer_data")
17 | .setMaster("local")
18 | .set("spark.driver.memory", "2G")
19 | val sc = new SparkContext(conf)
20 | Logger.getRootLogger.setLevel(Level.WARN)
21 |
22 | //Convert the predictions into the format Kaggle expects; a header line "ImageId,Label" still has to be prepended.
23 | // zipWithIndex assigns a globally unique, ordered row number; a mutable counter captured in map would restart per partition.
24 | sc.textFile("E://_deeplearning/Digit-Recognizer-Kaggle-master/data/prediction_rf/part-00000")
25 | .zipWithIndex()
26 | .map { case (line, idx) =>
27 | (idx + 1) + "," + line
28 | }
29 | .repartition(1).saveAsTextFile("E://_deeplearning/Digit-Recognizer-Kaggle-master/data/result_rf")
30 |
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/scala/AaidTest.scala:
--------------------------------------------------------------------------------
1 | package com.libin.scala
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.{SparkConf, SparkContext}
5 |
6 | import scala.collection.mutable
7 | import scala.collection.JavaConverters._
8 |
9 | /**
10 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
11 | * Authors: libin <2578858653@qq.com>
12 | *
13 | * Purpose :
14 | */
15 | object AaidTest {
16 | def main(args: Array[String]): Unit = {
17 | val conf = new SparkConf().setAppName("AaidTest").setMaster("local")
18 | val sc = new SparkContext(conf)
19 |
20 | sc.textFile("D://sparkmllibData/sparkml/mllibdata/arrregation.txt")
21 | .map(line => {
22 | (line.split("\t")(0), line.split("\t")(1).toLong)
23 | }).aggregateByKey(0L)(seqOp, seqOp)
24 | .filter(line => line._2 != 1L)
25 | .collect().foreach(println)
26 |
27 | }
28 |
29 | def seqOp(U: Long, v: Long): Long = {
30 | println("seqOp")
31 | println("U=" + U)
32 | println("v=" + v)
33 | var count: Int = 0
34 | if (U != 0L) {
35 | count += 1
36 | }
37 | if (v != 0L) {
38 | count += 1
39 | }
40 | if (count > 1) {
41 | 1L
42 | } else {
43 | v
44 | }
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/spark-mllib/src/main/scala/com/libin/scala/AggredateTest.scala:
--------------------------------------------------------------------------------
1 | package com.libin.scala
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 | /**
6 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved.
7 | * Authors: libin <2578858653@qq.com>
8 | *
9 | * Purpose :
10 | */
11 | object AggredateTest {
12 | def main(args: Array[String]) {
13 | val conf = new SparkConf().setMaster("local").setAppName("aggredate_test")
14 | val sc = new SparkContext(conf)
15 | val data = sc.parallelize(List((1, 3), (1, 2), (1, 4), (2, 3)))
16 | data.aggregateByKey(5)(seq, comb).collect.foreach(println)
17 | }
18 |
19 | def seq(a: Int, b: Int): Int = {
20 | println("seq: " + a + "\t " + b)
21 | math.max(a, b)
22 | }
23 |
24 | def comb(a: Int, b: Int): Int = {
25 | println("comb: " + a + "\t " + b)
26 | a + b
27 | }
28 | }
29 |
--------------------------------------------------------------------------------