├── .gitignore ├── README.md ├── TODO.md ├── bigdata-demo ├── HBase │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── shujia │ │ ├── Demo01TestAPI.java │ │ ├── Demo02API.java │ │ ├── Demo03DianXin.java │ │ ├── Demo04Filter.java │ │ ├── Demo05MRReadHBase.java │ │ ├── Demo06MRReadAndWriteHBase.java │ │ └── Demo07PhoenixJDBC.java ├── Hadoop │ ├── data │ │ ├── .students.txt.crc │ │ ├── score.txt │ │ ├── stuSumScore.txt │ │ ├── students.txt │ │ └── words.txt │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── shujia │ │ ├── HDFS │ │ └── HDFSJavaAPI.java │ │ ├── MapReduce │ │ ├── Demo10Partitioner.java │ │ ├── Demo11Sort.java │ │ ├── Demo1WordCount.java │ │ ├── Demo2GenderCnt.java │ │ ├── Demo3SumScore.java │ │ ├── Demo4Join.java │ │ ├── Demo5MRFilter.java │ │ ├── Demo6WordCountCombiner.java │ │ ├── Demo7MRAppMaster.java │ │ ├── Demo8NodeManager.java │ │ └── Demo9MapJoin.java │ │ └── ZOOKEEPER │ │ └── ZKJavaAPI.java ├── Hive │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── shujia │ │ ├── Hive │ │ └── UDF │ │ │ ├── MyUDF.java │ │ │ ├── MyUDTF.java │ │ │ └── MyUDTF2.java │ │ └── HiveJDBC │ │ └── HiveJDBCOp.java ├── Java │ ├── data │ │ └── students.txt │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── shujia │ │ ├── HelloWorld.java │ │ ├── InsertTask.java │ │ ├── MySQLJDBCDemo.java │ │ └── ReadAndWriteToMySQL.java ├── Redis │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── shujia │ │ ├── RedisClusterConn.java │ │ ├── RedisConnectionPool.java │ │ ├── RedisDemo1.java │ │ ├── RedisHash.java │ │ ├── RedisHyperLogLog.java │ │ ├── RedisList.java │ │ ├── RedisSet.java │ │ ├── RedisSortedSet.java │ │ └── RedisString.java ├── SpringBootDemo14 │ ├── .gitignore │ ├── .idea │ │ ├── .gitignore │ │ ├── codeStyles │ │ │ ├── Project.xml │ │ │ └── codeStyleConfig.xml │ │ ├── compiler.xml │ │ ├── encodings.xml │ │ ├── jarRepositories.xml │ │ ├── misc.xml │ │ ├── runConfigurations.xml │ │ └── vcs.xml │ ├── data │ │ └── student.sql │ ├── pom.xml │ └── src │ │ └── main │ │ ├── java │ │ └── com │ │ │ └── shujia │ │ │ ├── Controller │ │ │ └── StudentController.java │ │ │ ├── Dao │ │ │ └── StudentRepository.java │ │ │ ├── Entity │ │ │ └── Student.java │ │ │ ├── Service │ │ │ └── StudentService.java │ │ │ ├── SpringBootDemoApplication.java │ │ │ └── common │ │ │ └── Result.java │ │ └── resources │ │ ├── application.properties │ │ └── static │ │ ├── element.css │ │ ├── element.js │ │ ├── fonts │ │ ├── element-icons.ttf │ │ └── element-icons.woff │ │ ├── index.html │ │ ├── jquery.min.js │ │ └── vue.js └── pom.xml ├── bigdata-doris ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ ├── docs │ ├── 存储.md │ ├── 数据导入.md │ ├── 数据模型.md │ └── 架构.md │ └── java │ └── README.md ├── bigdata-druid ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── libin │ └── README.md ├── bigdata-flink ├── .gitignore ├── README.md ├── pom.xml ├── scripts │ └── run_flink.sh └── src │ └── main │ ├── docs │ ├── 1_检查点checkpoint.md │ ├── 2_状态state.md │ ├── 3_时间time.md │ ├── 4_窗口windows.md │ ├── datastream.md │ ├── flink与几种流式框架对比.md │ ├── flink分布式缓存.md │ ├── flink基本组件.md │ ├── flink广播变量.md │ ├── flink架构.md │ ├── flink程序开发.md │ ├── flink累加器.md │ ├── flink面试题 │ │ ├── flink面试题1.md │ │ └── slot.png │ ├── images │ │ ├── flink_windows.png │ │ ├── flink对比.jpg │ │ ├── flink数据传输方式.jpg │ │ ├── flink架构.jpg │ │ ├── flink检查点.png │ │ └── flink模块.jpg │ └── 面试题 │ │ └── flink面试题.md │ ├── java │ └── com │ │ └── libin │ │ └── data │ │ 
└── flink │ │ ├── batch │ │ └── WordCountJava.java │ │ └── streaming │ │ └── WordCount.java │ └── scala │ └── com │ └── libin │ └── data │ └── flink │ ├── base │ ├── FlinkStreamingTrait.scala │ └── client │ │ └── KafkaFlinkStreamingTrait.scala │ ├── batch │ └── WordCount.scala │ └── streaming │ └── jobs │ ├── GenCodeFromMysql.scala │ ├── GenCodeFromState.scala │ ├── GenCodeFromWindow.scala │ ├── GenCodeFromWordCount.scala │ └── config │ ├── GenCodeFromBucketingSink.scala │ └── GenCodeFromCheckpoint.scala ├── bigdata-hadoop ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── libin │ ├── api │ ├── hdfs │ │ ├── HdfsUtils.java │ │ └── README.md │ ├── mapreduce │ │ ├── ChainMapperChainReducer.java │ │ ├── CombineTextInputFormatTest.java │ │ ├── CounterTest.java │ │ ├── DBInputFormatTest.java │ │ ├── DistributedDemo.java │ │ ├── GetIDMapReduce.java │ │ ├── GetInputSplit.java │ │ ├── GetSplitMapReduce.java │ │ ├── GetStatusMapReduce.java │ │ ├── KpiApp.java │ │ ├── MultipleInputsTest.java │ │ ├── MyGroup.java │ │ ├── NLineInputFormatTest.java │ │ ├── PartitionerDemo.java │ │ ├── README.md │ │ ├── SecondarySort.java │ │ ├── SequenceFileInputFormatTest.java │ │ ├── SortTest.java │ │ ├── TestwithMultipleOutputs.java │ │ ├── TextPathFilterDemo.java │ │ ├── Topk.java │ │ ├── WordCount.java │ │ ├── allSort │ │ │ ├── SamplerInputFormat.java │ │ │ └── SamplerSort.java │ │ └── inputformat │ │ │ ├── FindMaxValueInputFormat.java │ │ │ ├── FindMaxValueInputSplit.java │ │ │ ├── FindMaxValueMapper.java │ │ │ ├── FindMaxValueRecordReader.java │ │ │ ├── FindMaxValueReducer.java │ │ │ └── MaxValueDriver.java │ └── yarn │ │ └── README.md │ ├── code │ └── hdfs │ │ └── README.md │ └── doc │ ├── hdfs │ └── README.md │ ├── mapreduce │ └── README.md │ └── yarn │ ├── README.md │ └── images │ ├── resource_manager.jpg │ ├── timg.jpg │ ├── yarn.jpg │ └── yarn_architecture.gif ├── bigdata-hbase ├── .gitignore ├── README.md ├── image │ └── HBase体系结构.png ├── pom.xml └── src │ └── main │ └── scala │ └── com │ └── libin │ ├── doc │ ├── Compaction.md │ ├── HBasae客户端 │ │ └── 客户端实现.md │ ├── HBase体系结构.md │ ├── HBase数据模型.md │ ├── HBase算法 │ │ ├── LSM树.md │ │ ├── 布隆过滤器.md │ │ └── 跳跃表.md │ ├── HBase面试题.md │ ├── RegionServer │ │ ├── BlockCache.md │ │ ├── HFile.md │ │ ├── HLog.md │ │ ├── MemStore.md │ │ └── RegionServer内部结构.md │ └── 依赖服务组件 │ │ ├── Hdfs.md │ │ └── ZooKeeper.md │ └── utils │ └── HBaseUtils.scala ├── bigdata-hive ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ ├── doc │ ├── Hive架构.md │ ├── Hive解析流程.md │ └── Hive面试题.md │ ├── image │ ├── hive作业执行过程.png │ └── hive架构.jpg │ └── java │ └── com │ └── libin │ └── HiveUtils.java ├── bigdata-info ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ └── java │ └── com │ └── libin │ ├── Test.java │ ├── elasticsearch │ └── README.md │ ├── griffin │ ├── 1.Griffin简介.md │ ├── 2.Griffin指标使用.md │ ├── README.md │ └── image │ │ ├── 1.png │ │ ├── 2.jpg │ │ ├── 3.png │ │ └── 4.jpg │ ├── oozie │ └── README.md │ ├── pegasus │ ├── README.md │ └── 学习资料.md │ └── talos │ └── README.md ├── bigdata-kafka ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ ├── docs │ ├── Kafka副本.md │ ├── Kafka基本概念.md │ ├── Kafka面试题.md │ ├── 消费者与消费组.md │ └── 索引和分段.md │ ├── images │ ├── Kafka体系结构.png │ └── kafka多副本架构.png │ └── scala │ └── com │ └── libin │ ├── README.md │ └── code │ ├── base │ └── KafkaJobTrait.scala │ ├── client │ └── KafkaClient.scala │ ├── streaming │ ├── FlinkStramingJob.scala │ └── SparkStreamingKafkaJob.scala │ └── utils │ └── 
KafkaUtils.scala ├── bigdata-project ├── pom.xml └── src │ └── main │ └── java │ ├── dataWarehouse │ └── readme.md │ ├── featureEngineering │ └── readme.md │ ├── idmapping │ └── readme.md │ ├── realTimeWarehouse │ └── readme.md │ └── userProfile │ └── readme.md ├── bigdata-spark-sql ├── .gitignore ├── README.md ├── pom.xml └── src │ ├── main │ ├── doc │ │ └── READMD.md │ ├── java │ │ └── com │ │ │ └── libin │ │ │ └── utils │ │ │ └── FileUtils.java │ ├── resources │ │ ├── log4j.properties │ │ ├── people.txt │ │ ├── school.json │ │ ├── stu.json │ │ └── users.parquet │ └── scala │ │ └── com │ │ └── libin │ │ ├── common │ │ └── sparkJobBase.scala │ │ ├── etl │ │ ├── jobs │ │ │ ├── ConvertJobScheduler.scala │ │ │ ├── DfJobScheduler.scala │ │ │ └── DsJobScheduler.scala │ │ ├── loader │ │ │ └── data │ │ │ │ ├── DfBuilder.scala │ │ │ │ ├── DsBuilder.scala │ │ │ │ └── RddBuilder.scala │ │ ├── processor │ │ │ └── ProcessorOp.scala │ │ └── utils │ │ │ ├── DateUtils.scala │ │ │ ├── LoadUtils.scala │ │ │ ├── LogUtils.scala │ │ │ └── PathUtils.scala │ │ └── source │ │ ├── Dataset.scala │ │ ├── Row.scala │ │ ├── SparkSession.scala │ │ └── sql.scala │ └── test │ └── scala │ └── com │ └── libin │ └── etl │ └── testProcessor.scala ├── bigdata-spark-streaming ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ ├── doc │ ├── 优化.md │ ├── 常见问题.md │ └── 检查点CheckPoint.md │ ├── image │ ├── 1.png │ └── 2.png │ ├── resources │ └── log4j.properties │ └── scala │ └── com │ └── libin │ └── data │ └── streaming │ ├── base │ ├── SparkStreamingTrait.scala │ └── client │ │ ├── KafkaSparkStreamingTrait.scala │ │ └── SocketSparkStreamingTrait.scala │ ├── jobs │ ├── GenCodeFromCheckpoint.scala │ ├── GenCodeFromForeachRDD.scala │ ├── GenCodeFromKafka.scala │ ├── GenCodeFromParams.scala │ ├── GenCodeFromWindow.scala │ └── NetworkWordCount.scala │ └── utils │ └── StreamingExamples.scala ├── doc1 ├── README-PLAN.md └── README2.md ├── pom.xml ├── spark-core ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ ├── doc │ ├── Spark优化.md │ ├── Spark参数配置.md │ ├── Spark基本架构.md │ ├── Spark算子.md │ ├── Spark面试题.md │ ├── images │ │ ├── 统一内存管理_堆内.png │ │ ├── 统一内存管理_堆外.png │ │ ├── 静态内存管理_堆内.png │ │ └── 静态内存管理_堆外.png │ └── 目录.md │ ├── resources │ └── mysql.conf │ └── scala │ └── com │ └── libin │ ├── base │ ├── SparkJobBase.scala │ └── TableLoaderBase.scala │ ├── client │ ├── AccumulatorDemo.scala │ ├── AggregateByKeyDemo.scala │ ├── BroadcastDemo.scala │ ├── MyPartitioner.scala │ ├── README.md │ ├── SecondarySort.scala │ └── cacheAndPersist.scala │ ├── jobs │ └── READMD.md │ ├── loader │ └── READMD.md │ ├── processor │ └── READMD.md │ ├── source │ └── READMD.md │ └── utils │ ├── DateUtils.scala │ ├── MySQLUtils.scala │ ├── PathUtils.scala │ ├── README.md │ ├── ResourceUtils.scala │ ├── SeparatorUtils.scala │ └── SparkLogUtils.scala ├── spark-graphx ├── .gitignore ├── README.md ├── pom.xml └── src │ └── main │ ├── resources │ └── log4j.properties │ └── scala │ └── com │ └── libin │ ├── docs │ └── README.md │ └── graphX │ └── etl │ ├── GparhXShortLength.scala │ ├── GraphXProcessor.scala │ └── graphDegree1Test.scala └── spark-mllib ├── .gitignore ├── README.md ├── pom.xml └── src └── main └── scala └── com └── libin ├── AlsRecommend.scala ├── DistributedMatrixRow.scala ├── FPGrowthDemo.scala ├── GBDT.scala ├── IsotonicRegressionDemo.scala ├── KMeans.scala ├── LBFGSExample.scala ├── LdaDemo.scala ├── LinearRegression.scala ├── LogisticRegression.scala ├── PCADemo.scala ├── RFDemo.scala ├── RowmatriTest01.scala ├── SVD.scala ├── 
StatisticsDemo.scala ├── Svm.scala ├── Test.scala ├── Tfidf.scala ├── TfidfWord2vec.scala ├── Tree.scala ├── VectorDemo.scala ├── ad ├── PredictLrAd.scala └── TrainFeatureLrAd.scala ├── kaggle ├── kaggle_digit_recognizer_data.scala ├── kaggle_digit_recognizer_lr.scala └── kaggle_digit_recognizer_rf.scala └── scala ├── AaidTest.scala └── AggredateTest.scala /TODO.md: -------------------------------------------------------------------------------- 1 | 补充TODO待办清单 2 | -------------------------------------------------------------------------------- /bigdata-demo/HBase/src/main/java/com/shujia/Demo01TestAPI.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.hbase.HBaseConfiguration; 5 | import org.apache.hadoop.hbase.TableName; 6 | import org.apache.hadoop.hbase.client.Admin; 7 | import org.apache.hadoop.hbase.client.Connection; 8 | import org.apache.hadoop.hbase.client.ConnectionFactory; 9 | import org.apache.hadoop.hbase.client.Table; 10 | 11 | import java.io.IOException; 12 | 13 | public class Demo01TestAPI { 14 | public static void main(String[] args) throws IOException { 15 | // 1、创建配置文件,设置HBase的连接地址(ZK的地址) 16 | Configuration conf = HBaseConfiguration.create(); 17 | conf.set("hbase.zookeeper.quorum", "master:2181,node1:2181,node2:2181"); 18 | // 2、建立连接 19 | Connection conn = ConnectionFactory.createConnection(conf); 20 | 21 | /** 22 | * 3、执行操作: 23 | * 对表的结构进行操作 则getAdmin 24 | * 对表的数据进行操作 则getTable 25 | */ 26 | Admin admin = conn.getAdmin(); 27 | 28 | Table test = conn.getTable(TableName.valueOf("test")); 29 | 30 | // 4、关闭连接 31 | conn.close(); 32 | 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /bigdata-demo/HBase/src/main/java/com/shujia/Demo07PhoenixJDBC.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import java.sql.*; 4 | 5 | public class Demo07PhoenixJDBC { 6 | public static void main(String[] args) throws SQLException { 7 | 8 | Connection conn = DriverManager.getConnection("jdbc:phoenix:master,node1,node2:2181"); 9 | PreparedStatement ps = conn.prepareStatement("select /*+ INDEX(DIANXIN DIANXIN_INDEX) */ * from DIANXIN where end_date=?"); 10 | ps.setString(1, "20180503212649"); 11 | ResultSet rs = ps.executeQuery(); 12 | while (rs.next()) { 13 | String mdn = rs.getString("mdn"); 14 | String start_date = rs.getString("start_date"); 15 | String end_date = rs.getString("end_date"); 16 | String x = rs.getString("x"); 17 | String y = rs.getString("y"); 18 | String county = rs.getString("county"); 19 | System.out.println(mdn + "\t" + start_date + "\t" + end_date + "\t" + x + "\t" + y + "\t" + county); 20 | } 21 | ps.close(); 22 | conn.close(); 23 | 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /bigdata-demo/Hadoop/data/.students.txt.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-demo/Hadoop/data/.students.txt.crc -------------------------------------------------------------------------------- /bigdata-demo/Hadoop/data/words.txt: -------------------------------------------------------------------------------- 1 | hadoop hive hbase spark flink 2 | hadoop hive hbase spark flink 3 | hadoop hive hbase spark flink 4 | hadoop hive hbase 
spark flink 5 | hadoop hive hbase spark flink 6 | java scala python 7 | java scala python 8 | java scala python 9 | java scala python 10 | java scala python -------------------------------------------------------------------------------- /bigdata-demo/Hadoop/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | Bigdata14 7 | io.github.wujun728 8 | 1.0 9 | 10 | 4.0.0 11 | 12 | Hadoop 13 | 14 | 15 | 8 16 | 8 17 | 18 | 19 | 20 | 21 | 22 | org.apache.hadoop 23 | hadoop-client 24 | 2.7.6 25 | 26 | 27 | 28 | 29 | org.apache.zookeeper 30 | zookeeper 31 | 3.4.6 32 | 33 | 34 | 35 | 36 | org.apache.hadoop 37 | hadoop-mapreduce-client-core 38 | 2.7.6 39 | 40 | 41 | 42 | 43 | org.apache.hadoop 44 | hadoop-common 45 | 2.7.6 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /bigdata-demo/Hadoop/src/main/java/com/shujia/MapReduce/Demo5MRFilter.java: -------------------------------------------------------------------------------- 1 | package com.shujia.MapReduce; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.io.LongWritable; 7 | import org.apache.hadoop.io.NullWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Job; 10 | import org.apache.hadoop.mapreduce.Mapper; 11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 12 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 13 | 14 | import java.io.IOException; 15 | 16 | public class Demo5MRFilter { 17 | public static class MyMapper extends Mapper { 18 | @Override 19 | protected void map(LongWritable key, Text value, Mapper.Context context) throws IOException, InterruptedException { 20 | // 过滤出文科三班的学生 21 | String clazz = value.toString().split(",")[4]; 22 | if ("文科三班".equals(clazz)) { 23 | context.write(value, NullWritable.get()); 24 | } 25 | } 26 | } 27 | 28 | public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { 29 | Configuration conf = new Configuration(); 30 | // 设置MapReduce输出的K-V的分隔符 31 | conf.set("mapred.textoutputformat.separator", ","); 32 | Job job = Job.getInstance(conf); 33 | job.setJobName("Demo5MRFilter"); 34 | job.setJarByClass(Demo5MRFilter.class); 35 | 36 | job.setMapperClass(MyMapper.class); 37 | job.setOutputKeyClass(Text.class); 38 | job.setOutputValueClass(NullWritable.class); 39 | 40 | // 配置输入输出路径 41 | FileInputFormat.addInputPath(job, new Path("/student/input")); 42 | // 输出路径不需要提前创建,如果该目录已存在则会报错 43 | // 通过HDFS的JavaAPI判断输出路径是否存在 44 | Path outPath = new Path("/student/filter/output"); 45 | FileSystem fs = FileSystem.get(conf); 46 | if (fs.exists(outPath)) { 47 | fs.delete(outPath, true); 48 | } 49 | 50 | FileOutputFormat.setOutputPath(job, outPath); 51 | 52 | // 等待job运行完成 53 | job.waitForCompletion(true); 54 | 55 | /** 56 | * hdfs dfs -mkdir -p /student/filter/output 57 | * hadoop jar Hadoop-1.0.jar com.shujia.MapReduce.Demo5MRFilter 58 | */ 59 | 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /bigdata-demo/Hadoop/src/main/java/com/shujia/MapReduce/Demo7MRAppMaster.java: -------------------------------------------------------------------------------- 1 | package com.shujia.MapReduce; 2 | 3 | import java.io.IOException; 4 | import java.io.ObjectOutputStream; 5 | import java.io.OutputStream; 6 | import java.io.Serializable; 7 | import java.net.Socket; 
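// Note: this class works as a pair with Demo8NodeManager below. Demo8NodeManager opens a
// ServerSocket on port 8888 and must be started first; otherwise the Socket constructor in
// main() fails with a ConnectException. The pair is a simplified teaching simulation of an
// ApplicationMaster shipping a Task to a NodeManager; the real MRAppMaster talks to
// NodeManagers over Hadoop's RPC layer rather than by serializing objects over a raw socket.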
8 | 9 | public class Demo7MRAppMaster { 10 | // 作为MR任务运行的AM,负责任务Task的分配与调度 11 | public static void main(String[] args) throws IOException { 12 | // 创建Socket客户端 13 | Socket sk = new Socket("localhost", 8888); 14 | 15 | // 创建Task任务,即将通过Socket发送给NM 16 | Task task = new Task(); 17 | 18 | System.out.println("Task以构建,准备发送"); 19 | // 建立输出流 20 | OutputStream outputStream = sk.getOutputStream(); 21 | 22 | // 将输出流转换为Object的输出流 23 | ObjectOutputStream objectOutputStream = new ObjectOutputStream(outputStream); 24 | 25 | // 直接将任务以Object的形式发送出去 26 | objectOutputStream.writeObject(task); 27 | 28 | objectOutputStream.flush(); 29 | System.out.println("Task已发送成功"); 30 | 31 | 32 | objectOutputStream.close(); 33 | outputStream.close(); 34 | sk.close(); 35 | 36 | 37 | } 38 | 39 | } 40 | 41 | /** 42 | * 在MR中,MapTask、ReduceTask 43 | * 都是线程对象,因为需要在网络中传输,所以都实现了序列化接口 44 | * 分区、分组、排序等其他功能有MR框架提供 45 | */ 46 | class Task extends Thread implements Serializable { 47 | @Override 48 | public void run() { 49 | for (int i = 0; i < 100; i++) { 50 | System.out.println(i); 51 | try { 52 | Thread.sleep(1000); 53 | } catch (InterruptedException e) { 54 | e.printStackTrace(); 55 | } 56 | 57 | } 58 | System.out.println("Task执行完毕"); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /bigdata-demo/Hadoop/src/main/java/com/shujia/MapReduce/Demo8NodeManager.java: -------------------------------------------------------------------------------- 1 | package com.shujia.MapReduce; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.ObjectInputStream; 6 | import java.net.ServerSocket; 7 | import java.net.Socket; 8 | 9 | public class Demo8NodeManager { 10 | // 接收AM发送过来的Task并执行 11 | public static void main(String[] args) throws IOException, ClassNotFoundException { 12 | // 创建Socket服务端 13 | ServerSocket serverSocket = new ServerSocket(8888); 14 | System.out.println("NodeManager已经启动,等待接收任务"); 15 | 16 | // 建立Socket连接 17 | Socket socket = serverSocket.accept(); 18 | 19 | // 创建输入流 20 | InputStream inputStream = socket.getInputStream(); 21 | 22 | // 将输入流转换为Object输入流 23 | ObjectInputStream objectInputStream = new ObjectInputStream(inputStream); 24 | 25 | // 直接从Object输入流获取Object对象 26 | Object taskObj = objectInputStream.readObject(); 27 | 28 | System.out.println("接收到了AM发送的Task"); 29 | 30 | // 将Object对象转换为Task对象 31 | Task task = (Task) taskObj; 32 | 33 | System.out.println("正在执行Task"); 34 | // 执行Task 35 | task.start(); 36 | 37 | 38 | // 关闭流,断开连接 39 | objectInputStream.close(); 40 | inputStream.close(); 41 | socket.close(); 42 | serverSocket.close(); 43 | 44 | 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /bigdata-demo/Hive/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | Bigdata14 7 | io.github.wujun728 8 | 1.0 9 | 10 | 4.0.0 11 | 12 | Hive 13 | 14 | 15 | 8 16 | 8 17 | 18 | 19 | 20 | 21 | org.apache.hadoop 22 | hadoop-common 23 | 2.7.6 24 | 25 | 26 | 27 | org.apache.hive 28 | hive-jdbc 29 | 1.2.1 30 | 31 | 32 | org.apache.hive 33 | hive-exec 34 | 1.2.1 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /bigdata-demo/Hive/src/main/java/com/shujia/Hive/UDF/MyUDF.java: -------------------------------------------------------------------------------- 1 | package com.shujia.Hive.UDF; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDF; 4 | 5 | public class MyUDF extends UDF { 6 | // 自定义UDF 需要继承UDF类,实现evaluate方法 7 | 
public String evaluate(String clazz) { 8 | // 理科三班 9 | String resStr = ""; 10 | resStr = clazz.replace("一", "1"); 11 | resStr = resStr.replace("二", "2"); 12 | resStr = resStr.replace("三", "3"); 13 | resStr = resStr.replace("四", "4"); 14 | resStr = resStr.replace("五", "5"); 15 | resStr = resStr.replace("六", "6"); 16 | return resStr; 17 | 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /bigdata-demo/Hive/src/main/java/com/shujia/Hive/UDF/MyUDTF.java: -------------------------------------------------------------------------------- 1 | package com.shujia.Hive.UDF; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 4 | import org.apache.hadoop.hive.ql.metadata.HiveException; 5 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; 6 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 7 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 8 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 9 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 10 | 11 | import java.util.ArrayList; 12 | 13 | public class MyUDTF extends GenericUDTF { 14 | 15 | @Override 16 | // initialize方法,会在UDTF被调用的时候执行一次 17 | public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException { 18 | ArrayList fieldNames = new ArrayList(); 19 | ArrayList fieldOIs = new ArrayList(); 20 | fieldNames.add("col1"); 21 | fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); 22 | fieldNames.add("col2"); 23 | fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); 24 | return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, 25 | fieldOIs); 26 | } 27 | 28 | @Override 29 | public void process(Object[] args) throws HiveException { 30 | 31 | // "key1:value1,key2:value2,key3:value3" 32 | for (Object arg : args) { 33 | String[] kvSplit = arg.toString().split(","); 34 | for (String kv : kvSplit) { 35 | String[] splits = kv.split(":"); 36 | String key = splits[0]; 37 | String value = splits[1]; 38 | ArrayList kvList = new ArrayList<>(); 39 | kvList.add(key); 40 | kvList.add(value); 41 | forward(kvList); 42 | } 43 | } 44 | 45 | 46 | } 47 | 48 | @Override 49 | public void close() throws HiveException { 50 | 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /bigdata-demo/Hive/src/main/java/com/shujia/Hive/UDF/MyUDTF2.java: -------------------------------------------------------------------------------- 1 | package com.shujia.Hive.UDF; 2 | 3 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 4 | import org.apache.hadoop.hive.ql.metadata.HiveException; 5 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; 6 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 7 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 8 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 9 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 10 | 11 | import java.util.ArrayList; 12 | 13 | public class MyUDTF2 extends GenericUDTF { 14 | 15 | @Override 16 | public StructObjectInspector initialize(StructObjectInspector argOIs) throws UDFArgumentException { 17 | ArrayList fieldNames = new ArrayList(); 18 | ArrayList fieldOIs = new ArrayList(); 19 | fieldNames.add("hour"); 20 | 
fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); 21 | fieldNames.add("value"); 22 | fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector); 23 | return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, 24 | fieldOIs); 25 | } 26 | 27 | @Override 28 | public void process(Object[] args) throws HiveException { 29 | int hour = 0; 30 | for (Object arg : args) { 31 | String value = arg.toString(); 32 | ArrayList hourValueList = new ArrayList<>(); 33 | hourValueList.add(hour+"时"); 34 | hourValueList.add(value); 35 | forward(hourValueList); 36 | hour += 2; 37 | } 38 | 39 | } 40 | 41 | @Override 42 | public void close() throws HiveException { 43 | 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /bigdata-demo/Hive/src/main/java/com/shujia/HiveJDBC/HiveJDBCOp.java: -------------------------------------------------------------------------------- 1 | package com.shujia.HiveJDBC; 2 | 3 | import java.sql.*; 4 | 5 | public class HiveJDBCOp { 6 | public static void main(String[] args) throws ClassNotFoundException, SQLException { 7 | // 1、加载驱动 8 | Class.forName("org.apache.hive.jdbc.HiveDriver"); 9 | 10 | // 2、创建连接 11 | Connection conn = DriverManager.getConnection("jdbc:hive2://master:10000/test1","root",""); 12 | 13 | // 3、创建Statement 14 | // Statement st = conn.createStatement(); 15 | // 4、执行SQL语句 select * from students limit 10 16 | // ResultSet rs = st.executeQuery("select * from students limit 10"); 17 | // 使用prepareStatement 防止SQL注入的问题 18 | PreparedStatement pSt = conn.prepareStatement("select * from students where clazz=?"); 19 | // 设置参数 20 | pSt.setString(1, "文科一班"); 21 | 22 | // PreparedStatement pSt = conn.prepareStatement("select clazz,count(*) as cnt from students group by clazz"); 23 | 24 | ResultSet rs = pSt.executeQuery(); 25 | 26 | // 5、遍历ResultSet获取数据 27 | while (rs.next()) { 28 | int id = rs.getInt("id"); 29 | String name = rs.getString("name"); 30 | int age = rs.getInt("age"); 31 | String gender = rs.getString("gender"); 32 | String clazz = rs.getString("clazz"); 33 | 34 | System.out.println(id + "," + name + "," + age + "," + gender + "," + clazz); 35 | } 36 | 37 | // 关闭连接 38 | rs.close(); 39 | pSt.close(); 40 | conn.close(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /bigdata-demo/Java/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | Bigdata14 7 | io.github.wujun728 8 | 1.0 9 | 10 | 4.0.0 11 | 12 | Java 13 | 14 | 15 | 8 16 | 8 17 | 18 | 19 | -------------------------------------------------------------------------------- /bigdata-demo/Java/src/main/java/com/shujia/HelloWorld.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | public class HelloWorld { 4 | public static void main(String[] args) { 5 | // String s = "abc"; 6 | System.out.println("Hello World"); 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /bigdata-demo/Java/src/main/java/com/shujia/MySQLJDBCDemo.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import java.sql.*; 4 | 5 | public class MySQLJDBCDemo { 6 | public static void main(String[] args) throws Exception { 7 | // 1、加载驱动 8 | Class.forName("com.mysql.jdbc.Driver"); 9 | 10 | // 2、创建连接 11 | Connection conn = 
DriverManager.getConnection("jdbc:mysql://master:3306/db1?useSSL=false", "root", "123456"); 12 | 13 | // String clz = "文科二班 or 1=1"; // 直接使用变量拼接SQL会造成SQL注入问题 14 | String clz = "文科二班"; // 直接使用变量拼接SQL会造成SQL注入问题 15 | // String ag = "23 or 1=1"; 16 | int ag = 23; 17 | 18 | //// // 3、创建Statement 19 | // Statement st = conn.createStatement(); 20 | //// 21 | //// // 4、通过Statement执行SQL 22 | // ResultSet rs = st.executeQuery("select * from student where age>"+ag); 23 | 24 | // 3、使用prepareStatement避免SQL注入问题 25 | // 执行DQL时使用executeQuery方法 26 | PreparedStatement ps = conn.prepareStatement("select * from student where clazz=? and age >?"); 27 | // 4、通过PreparedStatement执行SQL 28 | // 先设置参数 从1开始编号 29 | ps.setString(1, clz); 30 | ps.setInt(2, ag); 31 | // 再执行SQL 32 | ResultSet rs = ps.executeQuery(); 33 | 34 | 35 | // 5、遍历ResultSet 获取返回的记录 36 | while (rs.next()) { 37 | int id = rs.getInt("id"); 38 | String name = rs.getString("name"); 39 | int age = rs.getInt("age"); 40 | String gender = rs.getString("gender"); 41 | String clazz = rs.getString("clazz"); 42 | 43 | System.out.println(id + "," + name + "," + age + "," + gender + "," + clazz); 44 | } 45 | // 执行DML(insert、update、delete)操作 可以使用executeUpdate方法 46 | int i = ps.executeUpdate("update score set score=100"); 47 | System.out.println(i); // 返回受影响的记录的条数 48 | 49 | // 其他操作使用execute方法 50 | boolean bool = ps.execute("create database if not exists db3"); 51 | System.out.println(bool); // 指示执行的SQL有无返回值 52 | boolean bool2 = ps.execute("select * from score where score<0 limit 1"); 53 | System.out.println(bool2); 54 | 55 | 56 | // 6、关闭连接 57 | // st.close(); 58 | ps.close(); 59 | conn.close(); 60 | 61 | 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /bigdata-demo/Redis/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | Bigdata14 7 | io.github.wujun728 8 | 1.0 9 | 10 | 4.0.0 11 | 12 | Redis 13 | 14 | 15 | 8 16 | 8 17 | 18 | 19 | 20 | 21 | 22 | redis.clients 23 | jedis 24 | 4.0.1 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /bigdata-demo/Redis/src/main/java/com/shujia/RedisClusterConn.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import redis.clients.jedis.ConnectionPool; 4 | import redis.clients.jedis.HostAndPort; 5 | import redis.clients.jedis.JedisCluster; 6 | 7 | import java.util.HashSet; 8 | import java.util.Map; 9 | 10 | public class RedisClusterConn { 11 | public static void main(String[] args) { 12 | // 使用JedisCluster与集群进行通信建立连接 13 | JedisCluster cluster = new JedisCluster(new HostAndPort("master", 6381)); 14 | 15 | cluster.set("cs1", "vv1"); 16 | 17 | System.out.println(cluster.get("cs1")); 18 | 19 | cluster.hset("chs1", "f1", "v1"); 20 | cluster.hset("chs1", "f2", "v1"); 21 | cluster.hset("chs1", "f3", "v1"); 22 | 23 | Map map = cluster.hgetAll("chs1"); 24 | for (Map.Entry kv : map.entrySet()) { 25 | System.out.println(kv.getKey()); 26 | System.out.println(kv.getValue()); 27 | } 28 | 29 | cluster.close(); 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /bigdata-demo/Redis/src/main/java/com/shujia/RedisConnectionPool.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import redis.clients.jedis.Jedis; 4 | import redis.clients.jedis.JedisPool; 5 | 6 | public class RedisConnectionPool { 7 | // 
Redis连接池 8 | public static void main(String[] args) { 9 | // 使用默认的配置创建Redis连接池 10 | JedisPool jedisPool = new JedisPool("master", 6379); 11 | 12 | // 从连接池中取出一个连接 13 | Jedis jedis = jedisPool.getResource(); 14 | 15 | // 使用连接进行操作 16 | System.out.println(jedis.lrange("list1", 0, -1)); 17 | 18 | // 关闭连接 19 | jedis.close(); 20 | 21 | // 关闭连接池 22 | jedisPool.close(); 23 | 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /bigdata-demo/Redis/src/main/java/com/shujia/RedisDemo1.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import redis.clients.jedis.Jedis; 4 | 5 | public class RedisDemo1 { 6 | /** 7 | * 通过Java代码操作Redis 8 | */ 9 | public static void main(String[] args) { 10 | // 1、建立连接 11 | Jedis jedis = new Jedis("master", 6379); 12 | // 2、测试连通性 13 | System.out.println(jedis.ping()); 14 | 15 | String nk3 = jedis.get("nk3"); 16 | System.out.println(nk3); 17 | 18 | // 关闭Redis连接 19 | jedis.close(); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /bigdata-demo/Redis/src/main/java/com/shujia/RedisHash.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import org.junit.After; 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | import redis.clients.jedis.Jedis; 7 | import redis.clients.jedis.JedisCluster; 8 | 9 | import java.util.List; 10 | import java.util.Map; 11 | import java.util.Set; 12 | 13 | public class RedisHash { 14 | Jedis jedis; 15 | 16 | @Before 17 | // 会在执行@Test修饰的方法之前执行 18 | public void init() { 19 | jedis = new Jedis("master", 6379); 20 | } 21 | 22 | @Test 23 | // 创建一个Hash散列 24 | public void HSET() { 25 | jedis.hset("hash1", "id", "1"); 26 | jedis.hset("hash1", "name", "张三"); 27 | jedis.hset("hash1", "age", "18"); 28 | jedis.hset("hash1", "gender", "男"); 29 | jedis.hset("hash1", "clazz", "文科四班"); 30 | } 31 | 32 | @Test 33 | // 获取Hash所有的key 34 | public void HKEYS() { 35 | Set s = jedis.hkeys("hash1"); 36 | for (String s1 : s) { 37 | System.out.println(s1); 38 | } 39 | } 40 | 41 | @Test 42 | // 获取Hash所有的Value 43 | public void HVALS() { 44 | List l = jedis.hvals("hash1"); 45 | for (String s : l) { 46 | System.out.println(s); 47 | } 48 | } 49 | 50 | @Test 51 | // 获取Hash所有的K-V 52 | public void HGETALL() { 53 | Map m = jedis.hgetAll("hash1"); 54 | for (Map.Entry kv : m.entrySet()) { 55 | System.out.println(kv.getKey()); 56 | System.out.println(kv.getValue()); 57 | } 58 | } 59 | 60 | @Test 61 | // 指定Field获取Value 62 | public void HGET() { 63 | System.out.println(jedis.hget("hash1", "name")); 64 | } 65 | 66 | @Test 67 | // 根据Field删除Value 68 | public void HDEL() { 69 | jedis.hdel("hash1", "gender"); 70 | } 71 | 72 | @Test 73 | // 删除整个Hash散列 74 | public void DEL() { 75 | jedis.del("hash1"); 76 | } 77 | 78 | @After 79 | // 表示在@Test方法执行完成之后执行 80 | public void closed() { 81 | jedis.close(); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /bigdata-demo/Redis/src/main/java/com/shujia/RedisHyperLogLog.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import org.junit.After; 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | import redis.clients.jedis.Jedis; 7 | 8 | public class RedisHyperLogLog { 9 | Jedis jedis; 10 | 11 | @Before 12 | // 会在执行@Test修饰的方法之前执行 13 | public void init() { 14 | jedis = new Jedis("master", 6379); 15 | 
} 16 | 17 | @Test 18 | public void PFADD() { 19 | jedis.pfadd("hll1", "1", "1", "2", "3", "4", "4", "5"); 20 | jedis.pfadd("hll2", "1", "3", "4", "7", "4", "8", "5"); 21 | } 22 | 23 | @Test 24 | // 求一组数据(可能重复)的基数 25 | public void PFCOUNT() { 26 | System.out.println(jedis.pfcount("hll1")); 27 | System.out.println(jedis.pfcount("hll2")); 28 | } 29 | 30 | @Test 31 | // 合并两个HyperLogLog 32 | public void PFMERGE() { 33 | jedis.pfmerge("hll3", "hll1", "hll2"); 34 | System.out.println(jedis.pfcount("hll3")); 35 | } 36 | 37 | 38 | @After 39 | // 表示在@Test方法执行完成之后执行 40 | public void closed() { 41 | jedis.close(); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /bigdata-demo/Redis/src/main/java/com/shujia/RedisList.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import org.junit.After; 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | import redis.clients.jedis.Jedis; 7 | 8 | import java.util.List; 9 | 10 | public class RedisList { 11 | Jedis jedis; 12 | 13 | @Before 14 | // 会在执行@Test修饰的方法之前执行 15 | public void init() { 16 | jedis = new Jedis("master", 6379); 17 | } 18 | 19 | @Test 20 | // 创建一个List 21 | public void PUSH() { 22 | jedis.lpush("list1", "1"); 23 | jedis.lpush("list1", "2"); 24 | jedis.rpush("list1", "3"); 25 | jedis.rpush("list1", "4"); 26 | jedis.rpush("list1", "5"); 27 | } 28 | 29 | @Test 30 | // 修改List中的元素 31 | public void LSET() { 32 | jedis.lset("list1", 4, "5.5"); 33 | } 34 | 35 | @Test 36 | // 获取List中的所有元素 37 | public void LRANGE() { 38 | List l = jedis.lrange("list1", 0, -1); 39 | for (String s : l) { 40 | System.out.println(s); 41 | } 42 | } 43 | 44 | 45 | @Test 46 | // 删除元素 47 | public void POP() { 48 | System.out.println(jedis.blpop(1000, "list1")); 49 | System.out.println(jedis.rpop("list1")); 50 | System.out.println(jedis.lpop("list1")); 51 | 52 | } 53 | 54 | 55 | @After 56 | // 表示在@Test方法执行完成之后执行 57 | public void closed() { 58 | jedis.close(); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /bigdata-demo/Redis/src/main/java/com/shujia/RedisSet.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import org.junit.After; 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | import redis.clients.jedis.Jedis; 7 | 8 | import java.util.Set; 9 | 10 | public class RedisSet { 11 | Jedis jedis; 12 | 13 | @Before 14 | // 会在执行@Test修饰的方法之前执行 15 | public void init() { 16 | jedis = new Jedis("master", 6379); 17 | } 18 | 19 | @Test 20 | // 创建Set 21 | public void SADD() { 22 | jedis.sadd("s1", "1", "2", "2", "2", "3", "4", "5", "6"); 23 | jedis.sadd("s2", "5", "6", "7", "8", "9"); 24 | } 25 | 26 | @Test 27 | // 移除元素 28 | public void SREM() { 29 | jedis.srem("s1", "1"); 30 | jedis.srem("s1", "4"); 31 | } 32 | 33 | @Test 34 | // 弹出一个元素,位置不确定 35 | public void SPOP() { 36 | String s1 = jedis.spop("s1"); 37 | System.out.println(s1); 38 | } 39 | 40 | @Test 41 | // 获取所有的元素 42 | public void SMEMBERS() { 43 | Set s1 = jedis.smembers("s1"); 44 | for (String s : s1) { 45 | System.out.println(s); 46 | } 47 | } 48 | 49 | @Test 50 | // 集合常见的操作 51 | public void SETOP() { 52 | // 交集 53 | System.out.println(jedis.sinter("s1", "s2")); 54 | // 并集 55 | System.out.println(jedis.sunion("s1", "s2")); 56 | // 差集 57 | System.out.println(jedis.sdiff("s1", "s2")); 58 | 59 | } 60 | 61 | 62 | @After 63 | // 表示在@Test方法执行完成之后执行 64 | public void closed() { 65 | 
jedis.close(); 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /bigdata-demo/Redis/src/main/java/com/shujia/RedisSortedSet.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import org.junit.After; 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | import redis.clients.jedis.Jedis; 7 | import redis.clients.jedis.params.ZParams; 8 | import redis.clients.jedis.resps.Tuple; 9 | 10 | import java.util.Set; 11 | 12 | public class RedisSortedSet { 13 | Jedis jedis; 14 | 15 | @Before 16 | // 会在执行@Test修饰的方法之前执行 17 | public void init() { 18 | jedis = new Jedis("master", 6379); 19 | } 20 | 21 | @Test 22 | // 创建一个有序集合 23 | public void ZADD() { 24 | // zs1 表示水果一月份的销量 25 | jedis.zadd("zs1", 10, "西瓜"); 26 | jedis.zadd("zs1", 10, "西瓜"); 27 | jedis.zadd("zs1", 7, "香蕉"); 28 | jedis.zadd("zs1", 7, "香蕉"); 29 | jedis.zadd("zs1", 7, "香蕉"); 30 | jedis.zadd("zs1", 5, "芒果"); 31 | jedis.zadd("zs1", 5, "芒果"); 32 | jedis.zadd("zs1", 8, "草莓"); 33 | 34 | // zs2 表示水果二月份的销量 35 | jedis.zadd("zs2", 9, "哈密瓜"); 36 | jedis.zadd("zs2", 6, "西瓜"); 37 | jedis.zadd("zs2", 8, "香蕉"); 38 | jedis.zadd("zs2", 3, "香蕉"); 39 | jedis.zadd("zs2", 5, "香蕉"); 40 | jedis.zadd("zs2", 6, "甘蔗"); 41 | jedis.zadd("zs2", 7, "芒果"); 42 | jedis.zadd("zs2", 8, "草莓"); 43 | } 44 | 45 | @Test 46 | // 查看水果的累计销量 47 | public void TwoMonthSUM() { 48 | Set s = jedis.zunionWithScores(new ZParams().aggregate(ZParams.Aggregate.SUM), "zs1", "zs2"); 49 | System.out.println(s); 50 | } 51 | 52 | 53 | @After 54 | // 表示在@Test方法执行完成之后执行 55 | public void closed() { 56 | jedis.close(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /bigdata-demo/Redis/src/main/java/com/shujia/RedisString.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import org.junit.After; 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | import redis.clients.jedis.Jedis; 7 | import redis.clients.jedis.args.BitOP; 8 | 9 | public class RedisString { 10 | Jedis jedis; 11 | 12 | @Before 13 | // 会在执行@Test修饰的方法之前执行 14 | public void init() { 15 | jedis = new Jedis("master", 6379); 16 | } 17 | 18 | 19 | @Test 20 | // 增加一个String类型的value 21 | public void Set() { 22 | jedis.set("j1", "v1"); 23 | jedis.set("j2", "v2"); 24 | jedis.set("j3", "v3"); 25 | 26 | } 27 | 28 | @Test 29 | // 删除一个K-V 30 | public void DEL() { 31 | jedis.del("j1"); 32 | } 33 | 34 | @Test 35 | // 根据K获取V 36 | public void GET() { 37 | System.out.println(jedis.get("j1")); 38 | System.out.println(jedis.get("j2")); 39 | System.out.println(jedis.get("j3")); 40 | } 41 | 42 | @Test 43 | // 创建一个位图 44 | public void SETBIT() { 45 | jedis.setbit("b1", 1, true); 46 | jedis.setbit("b2", 3, true); 47 | } 48 | 49 | @Test 50 | // 获取位图 51 | public void GETBIT() { 52 | System.out.println(jedis.get("b1")); 53 | } 54 | 55 | @Test 56 | // 位图的操作 57 | public void BITOPT() { 58 | jedis.bitop(BitOP.AND, "b3", "b1", "b2"); 59 | jedis.bitop(BitOP.OR, "b4", "b1", "b2"); 60 | jedis.bitop(BitOP.NOT, "b5", "b1"); 61 | jedis.bitop(BitOP.XOR, "b6", "b1", "b2"); 62 | System.out.println(jedis.get("b3")); 63 | System.out.println(jedis.get("b4")); 64 | System.out.println(jedis.get("b5")); 65 | System.out.println(jedis.get("b6")); 66 | } 67 | 68 | 69 | @After 70 | // 表示在@Test方法执行完成之后执行 71 | public void closed() { 72 | jedis.close(); 73 | } 74 | } 75 | 
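// A minimal supplementary sketch (the class name is illustrative, not part of the project):
// reading back the bitmaps created in RedisString above. GET on a bitmap key returns raw
// bytes that usually print as unreadable text, so GETBIT and BITCOUNT are the more direct
// way to inspect single bits and count set bits. Assumes the same Redis instance at
// master:6379 and that the SETBIT/BITOPT methods in RedisString have already been run.

import redis.clients.jedis.Jedis;

public class RedisBitmapReadSketch {
    public static void main(String[] args) {
        Jedis jedis = new Jedis("master", 6379);
        System.out.println(jedis.getbit("b1", 1)); // true: bit 1 of b1 was set by SETBIT
        System.out.println(jedis.getbit("b3", 1)); // false: b3 = b1 AND b2, and b1/b2 share no set bit
        System.out.println(jedis.bitcount("b4"));  // 2: b4 = b1 OR b2, so bits 1 and 3 are set
        jedis.close();
    }
}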
-------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/.idea/codeStyles/Project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/.idea/codeStyles/codeStyleConfig.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 17 | 18 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/.idea/jarRepositories.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 9 | 10 | 14 | 15 | 19 | 20 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/.idea/runConfigurations.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 10 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/data/student.sql: -------------------------------------------------------------------------------- 1 | create table student( 2 | id int PRIMARY KEY auto_increment, 3 | name varchar(255) not null, 4 | age int not null DEFAULT 0, 5 | gender varchar(255), 6 | clazz varchar(255), 7 | sum_score int 8 | ); 9 | 10 | -- 对name做索引 11 | create index stu_name_index on student(name); -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.wujun728 8 | SpringBootDemo14 9 | 1.0 10 | 11 | 12 | org.springframework.boot 13 | spring-boot-starter-parent 14 | 2.1.4.RELEASE 15 | 16 | 17 | 18 | UTF-8 19 | UTF-8 20 | 8 21 | 8 22 | 23 | 24 | 25 | 26 | 27 | org.springframework.boot 28 | spring-boot-starter-web 29 | 30 | 31 | 32 | 33 | org.springframework.boot 34 | spring-boot-starter-data-jpa 35 | 36 | 37 | 38 | 39 | mysql 40 | mysql-connector-java 41 | 42 | 43 | 44 | 45 | 46 | org.springframework.boot 47 | spring-boot-maven-plugin 48 | 2.1.4.RELEASE 49 | 50 | true 51 | 52 | 53 | 
54 | 55 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/src/main/java/com/shujia/Controller/StudentController.java: -------------------------------------------------------------------------------- 1 | package com.shujia.Controller; 2 | 3 | import com.shujia.Entity.Student; 4 | import com.shujia.Service.StudentService; 5 | import com.shujia.common.Result; 6 | import org.springframework.data.domain.Page; 7 | import org.springframework.web.bind.annotation.*; 8 | 9 | import javax.annotation.Resource; 10 | import java.util.List; 11 | 12 | 13 | @RestController // 将数据以JSON格式返回 14 | @RequestMapping("/stu") 15 | public class StudentController { 16 | @Resource 17 | private StudentService studentService; 18 | 19 | @GetMapping 20 | public Result> findAll() { 21 | List list = studentService.findAll(); 22 | return Result.success(list); 23 | } 24 | 25 | // /stu/page?pageSize=10&pageNum=1 26 | // @GetMapping("/page") 27 | // public Result> findPage(@RequestParam(name = "pageSize") Integer pageSize, 28 | // @RequestParam(name = "pageNum") Integer pageNum) { 29 | // System.out.println(pageSize); 30 | // System.out.println(pageNum); 31 | // Page page = studentService.findPage(pageSize, pageNum); 32 | // return Result.success(page); 33 | // 34 | // } 35 | 36 | @DeleteMapping("/{id}") 37 | public Result deleteById(@PathVariable("id") Integer id) { 38 | System.out.println(id); 39 | studentService.deleteById(id); 40 | return Result.success(); 41 | } 42 | 43 | @PostMapping 44 | public Result saveStu(@RequestBody Student stu) { 45 | studentService.save(stu); 46 | return Result.success(); 47 | } 48 | 49 | @GetMapping("/pageOrsearch") 50 | public Result> searchByClazz(@RequestParam(name = "pageSize") Integer pageSize, 51 | @RequestParam(name = "pageNum") Integer pageNum, 52 | @RequestParam(name = "clazz") String clazz) { 53 | System.out.println(clazz); 54 | Page students = studentService.searchByClazz(pageSize, pageNum, clazz); 55 | return Result.success(students); 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/src/main/java/com/shujia/Dao/StudentRepository.java: -------------------------------------------------------------------------------- 1 | package com.shujia.Dao; 2 | 3 | import com.shujia.Entity.Student; 4 | import org.springframework.data.domain.Page; 5 | import org.springframework.data.domain.PageRequest; 6 | import org.springframework.data.jpa.repository.JpaRepository; 7 | import org.springframework.data.jpa.repository.Query; 8 | import org.springframework.stereotype.Repository; 9 | 10 | @Repository 11 | public interface StudentRepository extends JpaRepository { 12 | @Query(value = "select * from student where clazz like %?%", nativeQuery = true) 13 | public Page findByClazz(String clazz, PageRequest pg); 14 | } 15 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/src/main/java/com/shujia/Entity/Student.java: -------------------------------------------------------------------------------- 1 | package com.shujia.Entity; 2 | 3 | import javax.persistence.*; 4 | 5 | @Entity 6 | @Table(name = "student") 7 | public class Student { 8 | 9 | @Id 10 | @GeneratedValue(strategy = GenerationType.IDENTITY) // 描述id自增 11 | private Integer id; 12 | private String name; 13 | private Integer age; 14 | private String gender; 15 | private String clazz; 16 | // 属性名一般同数据库中表的列名保持一致,不一致时可以使用@Column注解 17 | 
@Column(name = "sum_score") 18 | private Integer sumScore; 19 | 20 | public Integer getId() { 21 | return id; 22 | } 23 | 24 | public void setId(Integer id) { 25 | this.id = id; 26 | } 27 | 28 | public String getName() { 29 | return name; 30 | } 31 | 32 | public void setName(String name) { 33 | this.name = name; 34 | } 35 | 36 | public Integer getAge() { 37 | return age; 38 | } 39 | 40 | public void setAge(Integer age) { 41 | this.age = age; 42 | } 43 | 44 | public String getGender() { 45 | return gender; 46 | } 47 | 48 | public void setGender(String gender) { 49 | this.gender = gender; 50 | } 51 | 52 | public String getClazz() { 53 | return clazz; 54 | } 55 | 56 | public void setClazz(String clazz) { 57 | this.clazz = clazz; 58 | } 59 | 60 | public Integer getSumScore() { 61 | return sumScore; 62 | } 63 | 64 | public void setSumScore(Integer sumScore) { 65 | this.sumScore = sumScore; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/src/main/java/com/shujia/Service/StudentService.java: -------------------------------------------------------------------------------- 1 | package com.shujia.Service; 2 | 3 | import com.shujia.Dao.StudentRepository; 4 | import com.shujia.Entity.Student; 5 | import org.springframework.data.domain.Page; 6 | import org.springframework.data.domain.PageRequest; 7 | import org.springframework.data.domain.Sort; 8 | import org.springframework.stereotype.Service; 9 | 10 | import javax.annotation.Resource; 11 | import java.util.List; 12 | 13 | @Service 14 | public class StudentService { 15 | @Resource 16 | private StudentRepository studentRepository; 17 | 18 | public List findAll() { 19 | return studentRepository.findAll(); 20 | } 21 | 22 | public Page findPage(Integer pageSize, Integer pageNum) { 23 | PageRequest pg = PageRequest.of(pageNum - 1, pageSize); 24 | Page pageStu = studentRepository.findAll(pg); 25 | return pageStu; 26 | } 27 | 28 | public void deleteById(Integer id) { 29 | studentRepository.deleteById(id); 30 | } 31 | 32 | public void save(Student stu) { 33 | studentRepository.save(stu); 34 | } 35 | 36 | public Page searchByClazz(Integer pageSize, Integer pageNum, String clazz) { 37 | Sort sort = new Sort(Sort.Direction.DESC, "sum_score"); 38 | PageRequest pg = PageRequest.of(pageNum - 1, pageSize, sort); 39 | Page stuLikeClazz = studentRepository.findByClazz(clazz, pg); 40 | return stuLikeClazz; 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/src/main/java/com/shujia/SpringBootDemoApplication.java: -------------------------------------------------------------------------------- 1 | package com.shujia; 2 | 3 | import org.springframework.boot.SpringApplication; 4 | import org.springframework.boot.autoconfigure.SpringBootApplication; 5 | 6 | @SpringBootApplication 7 | public class SpringBootDemoApplication { 8 | public static void main(String[] args) { 9 | // 启动Spring应用 10 | SpringApplication.run(SpringBootDemoApplication.class,args); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/src/main/java/com/shujia/common/Result.java: -------------------------------------------------------------------------------- 1 | package com.shujia.common; 2 | 3 | public class Result { 4 | private String code; 5 | private String msg; 6 | private T data; 7 | 8 | public Result() { 9 | } 10 | 11 | public Result(T data) { 12 | this.data 
= data; 13 | } 14 | 15 | public String getCode() { 16 | return code; 17 | } 18 | 19 | public void setCode(String code) { 20 | this.code = code; 21 | } 22 | 23 | public String getMsg() { 24 | return msg; 25 | } 26 | 27 | public void setMsg(String msg) { 28 | this.msg = msg; 29 | } 30 | 31 | public T getData() { 32 | return data; 33 | } 34 | 35 | public void setData(T data) { 36 | this.data = data; 37 | } 38 | 39 | // 请求成功 不返回数据 40 | public static Result success() { 41 | Result rs = new Result<>(); 42 | rs.setCode("200"); 43 | rs.setMsg("ok"); 44 | return rs; 45 | } 46 | 47 | // 请求成功 返回数据 48 | public static Result success(T data) { 49 | Result rs = new Result(data); 50 | rs.setCode("200"); 51 | rs.setMsg("ok"); 52 | return rs; 53 | } 54 | 55 | // 请求失败 56 | public static Result error(String code, String msg) { 57 | Result rs = new Result<>(); 58 | rs.setCode(code); 59 | rs.setMsg(msg); 60 | return rs; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | # ?MySQL?????? 2 | spring.datasource.url=jdbc:mysql://master:3306/springbootdemo?useUnicode=true&characterEncoding=utf-8&allowMultiQueries=true&useSSL=false&serverTimezone=GMT%2b8 3 | spring.datasource.username=root 4 | spring.datasource.password=123456 5 | spring.datasource.driver-class-name=com.mysql.cj.jdbc.Driver -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/src/main/resources/static/fonts/element-icons.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-demo/SpringBootDemo14/src/main/resources/static/fonts/element-icons.ttf -------------------------------------------------------------------------------- /bigdata-demo/SpringBootDemo14/src/main/resources/static/fonts/element-icons.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-demo/SpringBootDemo14/src/main/resources/static/fonts/element-icons.woff -------------------------------------------------------------------------------- /bigdata-demo/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.wujun728 8 | bigdata-demo 9 | pom 10 | 1.0 11 | 12 | Java 13 | Hadoop 14 | Redis 15 | Hadoop 16 | Hive 17 | HBase 18 | 19 | 20 | 21 | 8 22 | 8 23 | 24 | UTF-8 25 | UTF-8 26 | 27 | 28 | 29 | 30 | 31 | mysql 32 | mysql-connector-java 33 | 5.1.49 34 | 35 | 36 | com.zaxxer 37 | HikariCP 38 | 4.0.3 39 | 40 | 41 | junit 42 | junit 43 | 4.8.2 44 | 45 | 46 | -------------------------------------------------------------------------------- /bigdata-doris/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | venv 22 | -------------------------------------------------------------------------------- /bigdata-doris/README.md: 
-------------------------------------------------------------------------------- 1 | 2 | ##### 1、Doris简介 3 | Doris(原百度 Palo)是一款基于大规模并行处理技术的分布式 SQL 数据库,由百度在2017年开源,2018年进入 Apache 孵化器。 4 | 5 | Doris 的主要特性 6 | 1. 兼容 MySQL 协议,支持包括多表 Join、子查询、窗口函数、CTE 在内的丰富的 SQL 语法。支持诸多常见 BI 报表系统,能极大降低用 7 | 户的学习和迁移成本。 8 | 9 | 2. 支持高并发点查询和高吞吐的多维分析查询场景。通过分区裁剪、预聚合、谓词下推、向量化执行等技术,以及高效的列式存储引擎即数 10 | 据压缩算法,满足不同业务场景下的延迟和吞吐需求。 11 | 12 | 3. 特有的数据预聚合功能。支持预聚合表和基准表同步原子更新,为报表场景提供更快速的查询响应。 13 | 14 | 4. 提供强大的扩展性和高可用特性。所有数据都采用多副本的方式保证数据的高可靠,同时提供全自动的副本选择、均衡和修复功能,为用 15 | 户提供7*24小时的高可用数据库系统。 16 | 17 | 5. 提供友好的在线表结构变更功能,能有效应对业务上的需求变化。 18 | 19 | 6. 提供两级数据划分功能以及分层存储功能。用户可以更灵活地对数据进行管理和维护。 20 | 21 | Doris 在百度内部已应用于包括百度凤巢、百度统计等200多个业务线。最大单一业务数据量超过500 TB。同时在百度公有云和 toB 业务 22 | 中也获得了高度认可。自开源以来,已有包括小米、美团、搜狐、新浪微博、瓜子、链家、上海绎维、零售魔方、量化派在内的十多家公司 23 | 将 Doris 使用在生产环境中。 24 | 25 | ##### 2、Doris学习资料 26 | * [Doris中文官网](http://doris.apache.org/master/zh-CN/) 27 | * [Doris官网中文文档](http://doris.apache.org/master/zh-CN/installing/compilation.html) 28 | * [Doris官网](http://doris.apache.org/) 29 | * [Doris文档](http://doris.apache.org/documentation/cn/index.html) 30 | * [GitHub代码-百度](https://github.com/baidu-doris/incubator-doris) 31 | * [GitHub代码-Apache](https://github.com/apache/incubator-doris) 32 | * [GitHub - wiki](https://github.com/apache/incubator-doris/wiki) 33 | 34 | 35 | ##### 2、Doris相关操作 36 | * [Doris读写操作](src/main/java/README.md) 37 | 38 | 39 | -------------------------------------------------------------------------------- /bigdata-doris/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.wujun728 8 | bigdata-doris 9 | 0.0.1-SNAPSHOT 10 | 11 | 12 | -------------------------------------------------------------------------------- /bigdata-doris/src/main/docs/存储.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-doris/src/main/docs/存储.md -------------------------------------------------------------------------------- /bigdata-doris/src/main/docs/数据导入.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-doris/src/main/docs/数据导入.md -------------------------------------------------------------------------------- /bigdata-doris/src/main/docs/数据模型.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-doris/src/main/docs/数据模型.md -------------------------------------------------------------------------------- /bigdata-doris/src/main/docs/架构.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-doris/src/main/docs/架构.md -------------------------------------------------------------------------------- /bigdata-doris/src/main/java/README.md: -------------------------------------------------------------------------------- 1 | 2 | ##### Doris读取操作 3 | 4 | 5 | -------------------------------------------------------------------------------- /bigdata-druid/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | 
.idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target/ 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | -------------------------------------------------------------------------------- /bigdata-druid/README.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、Druid简介 3 | Druid是一个用于大数据实时查询和分析的高容错、高性能开源分布式系统,旨在快速处理大规模的数据,并能够实现快速查询和分析。 4 | * [Druid官网](http://druid.apache.org/) 5 | * [Druid GitHub](https://github.com/apache/druid) 6 | 7 | ##### 2、Druid操作 8 | 9 | 10 | ##### 3、Druid学习资料 11 | 12 | 13 | -------------------------------------------------------------------------------- /bigdata-druid/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.wujun728 8 | bigdata-druid 9 | 0.0.1-SNAPSHOT 10 | 11 | 12 | -------------------------------------------------------------------------------- /bigdata-druid/src/main/java/com/libin/README.md: -------------------------------------------------------------------------------- 1 | 2 | ##### Druid相关操作 3 | 4 | -------------------------------------------------------------------------------- /bigdata-flink/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | venv 22 | -------------------------------------------------------------------------------- /bigdata-flink/README.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、Flink基本概念 3 | * [1、Flnik架构](src/main/docs/flink架构.md) 4 | * [2、Flnik基本组件](src/main/docs/flink基本组件.md) 5 | * [3、Flnik与其它框架对比](src/main/docs/flink与几种流式框架对比.md) 6 | 7 | 8 | ##### 2、Flink原理 9 | * [1、Flnik检查点checkpoint](src/main/docs/1_检查点checkpoint.md) 10 | * [2、Flnik状态state](src/main/docs/2_状态state.md) 11 | * [3、Flnik时间操作time](src/main/docs/3_时间time.md) 12 | * [4、Flnik窗口操作windows](src/main/docs/4_窗口windows.md) 13 | * [5、Flnik广播变量](src/main/docs/flink广播变量.md) 14 | * [6、Flnik累加器](src/main/docs/flink累加器.md) 15 | * [7、Flnik分布式缓存](src/main/docs/flink分布式缓存.md) 16 | 17 | 18 | ##### 3、Flink编程 19 | * [1、Flnik程序开发](src/main/docs/flink程序开发.md) 20 | * [2、单词计数](src/main/scala/com/libin/data/flink/streaming/jobs/GenCodeFromWordCount.scala) 21 | * [3、窗口操作](src/main/scala/com/libin/data/flink/streaming/jobs/GenCodeFromWindow.scala) 22 | 23 | 广播变量 24 | 分布式缓存 25 | 累加器和计数器 26 | 27 | ##### 4、Flnik学习资料 28 | * [Flink官网](https://flink.apache.org/) 29 | * [Flink Github](https://github.com/apache/flink) 30 | 31 | 32 | 33 | 相关书籍 34 | -------------------------------------------------------------------------------- /bigdata-flink/scripts/run_flink.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cluster="cluster_name" 4 | queue="cluster_queue" 5 | jar="xxx.jar" 6 | class="xxx" 7 | 8 | flink \ 9 | --cluster "${cluster}" \ 10 | -Dsecurity.kerberos.login.principal=xxx \ 11 | -Dsecurity.kerberos.login.keytab=xxx \ 12 | -Dstate.backend=filesystem \ 13 | -Djob.name=xxx \ 14 | -Dstate.checkpoints.dir=xxx/.flink/checkpoint/xxx \ 15 | run \ 16 | --jobmanager yarn-cluster \ 17 | --allowNonRestoredState \ 18 | --fromSavepoint 
xxx/.flink/checkpoint/xxx \ 19 | --parallelism 1 \ 20 | --yarnname "FLINK1.9_xxx_Streaming_Job" \ 21 | --yarnqueue ${queue} \ 22 | --yarntaskManagerMemory 2G \ 23 | --yarnjobManagerMemory 2G \ 24 | --class ${class} \ 25 | "${jar}" 26 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/1_检查点checkpoint.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、FLink Checkpoint是什么? 3 | checkpoint 是一种分布式快照,把state数据持久化存储了。 4 | 可以理解为在某一时刻一个Flink Job在一个特定时刻的一份全局状态快照,即包含了所有task/operator的状态, 5 | 这样,在任务进行故障恢复的时候,就可以还原到任务故障前最近一次检查点的状态,从而保证数据的一致性。当然, 6 | 为了保证exactly-once/at-least-once 的特性,还需要数据源支持数据回放。 7 | Flink的checkpoint机制基于chandy-lamda算法。 8 | 9 | #####2、Barriers是什么? 10 | flink 分布式快照的核心元素是 stream barriers,这些barriers被注入到流中,并作为流的一部分,随着流流动。 11 | barriers将数据流的记录分为进入当前快照的记录和进入下一个快照的记录,每个barriers都携带了快照的ID, 12 | 快照的数据在barriers的前面推送。barriers非常轻量级,不会中断流的流动。同一时间,会有多个checkpoint在并发进行。 13 | 2.1、单流的barrier 14 | ![单流的barrier](images/flink检查点.png) 15 | 16 | 17 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/2_状态state.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、State和CheckPoint 3 | State一般指一个具体的Task/Operator的状态,State数据默认保存在Java的堆内存中。 4 | 而CheckPoint(可以理解为CheckPoint是把State数据持久化存储了)则表示了一个Flink Job在一个特定时刻的一份 5 | 全局状态快照,即包含了所有Task/Operator的状态。 6 | 7 | Task是Flink中执行的基本单位,Operator是算子(Transformation)。 8 | 9 | ##### 2、Flink state类型 10 | State可以被记录,在失败的情况下数据还可以恢复。Flink中有以下两种基本类型的State。 11 | Keyed State。 12 | Operator State。 13 | Keyed State和Operator State以两种形式存在。 14 | 15 | 原始状态(Raw State):由用户自行管理状态具体的数据结构,框架在做CheckPoint的时候,使用byte[]读写状态内容,对其内部数据结构一无所知。 16 | 托管状态(Managed State):由Flink框架管理的状态。通常在DataStream上推荐使用托管状态,当实现一个用户自定义的Operator时使用到原始状态。 17 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/3_时间time.md: -------------------------------------------------------------------------------- 1 | 2 | Flink 认为 Batch 是 Streaming 的一个特例,所以 Flink 底层引擎是一个流式引擎, 3 | 在上面实现了流处理和批处理。而窗口(window)就是从 Streaming 到 Batch 的一个桥梁。 4 | 5 | #### 1、windows窗口是什么? 
6 | 在流处理应用中,数据是连续不断的,因此我们不可能等到所有数据都到了才开始处理。当然我们可以每来一个消息就处理一次, 7 | 但是有时我们需要做一些聚合类的处理,例如:在过去的1分钟内有多少用户点击了我们的网页。在这种情况下,我们必须定义一个窗口, 8 | 用来收集最近一分钟内的数据,并对这个窗口内的数据进行计算。 9 | 10 | 窗口可以是时间驱动的(Time Window,例如:每30秒钟),也可以是数据驱动的(Count Window,例如:每一百个元素)。 11 | 一种经典的窗口分类可以分成:滚动窗口(Tumbling Window,无重叠),滑动窗口(Sliding Window,有重叠), 12 | 和会话窗口(Session Window,活动间隙)。 13 | 14 | ![Flink窗口](images/flink_windows.png) 15 | 16 | 1、滚动窗口(Tumbling Windows,记录没有重叠): 17 | 滚动时间窗口的窗口时间是固定的 18 | 滚动计数窗口的窗口内数据量是固定的 19 | 20 | 2、滑动窗口(Slide Windows,记录有重叠): 21 | 每次滑动指定步数 22 | 23 | 3、会话窗口(Session Windows) 24 | 一次会话访问 25 | 26 | ##### 2、时间窗口Time Window 27 | Time Window 是根据时间对数据流进行分组的。 28 | 2.1、滚动时间窗口(Tumbling Time Window) 29 | 翻滚窗口能将数据流切分成不重叠的窗口,每一个事件只能属于一个窗口。 30 | 比如:统计每一分钟中用户购买的商品的总数,需要将用户的行为事件按每一分钟进行切分。 31 | 2.2、滑动时间窗口(Sliding Time Window) 32 | 滑动时间窗口是不间断的,需要平滑地进行窗口聚合,一个元素可以对应多个窗口。 33 | 比如:我们可以每30秒计算一次最近一分钟用户购买的商品总数。 34 | 35 | ##### 3、计数窗口Count Window 36 | Count Window 是根据元素个数对数据流进行分组的。 37 | 3.1、滚动计数窗口(Tumbling Count Window) 38 | 滚动计数窗口会对窗口进行计算。 39 | 比如:想要每100个用户购买行为事件统计购买总数,那么每当窗口中填满100个元素了,就会对窗口进行计算。 40 | 3.2、滑动计数窗口(Sliding Count Window) 41 | 每次滑动指定步数,然后做聚合统计。 42 | 例如:计算每10个元素计算一次最近100个元素的购买总数。 43 | 44 | ##### 4、会话窗口Session Window 45 | 在用户交互事件流中,将事件聚合到会话窗口中(一段用户持续活跃的周期),由非活跃的间隙分隔开。 46 | 比如:需要计算每个用户在活跃期间总共购买的商品数量,如果用户30秒没有活动则视为会话断开。 47 | 48 | 49 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/4_窗口windows.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Flink 认为 Batch 是 Streaming 的一个特例,所以 Flink 底层引擎是一个流式引擎,在上面实现了流处理和批处理。 4 | 而窗口(window)就是从 Streaming 到 Batch 的一个桥梁。 5 | 6 | ##### 1、windows窗口是什么? 7 | 在流处理应用中,数据是连续不断的,因此我们不可能等到所有数据都到了才开始处理。当然我们可以每来一个消息就处理一次, 8 | 但是有时我们需要做一些聚合类的处理, 9 | 例如:在过去的1分钟内有多少用户点击了我们的网页。 10 | 在这种情况下,我们必须定义一个窗口,用来收集最近一分钟内的数据,并对这个窗口内的数据进行计算。 11 | 12 | 窗口可以是时间驱动的(Time Window,例如:每30秒钟),也可以是数据驱动的(Count Window,例如:每一百个元素)。 13 | 一种经典的窗口分类可以分成: 14 | 15 | 翻滚窗口(Tumbling Window,无重叠),滚动窗口(Sliding Window,有重叠),和会话窗口(Session Window,活动间隙)。 16 | 17 | ![Flink窗口](images/flink_windows.png) 18 | 19 | 1、滚动窗口(Tumbling Windows,记录没有重叠): 20 | 滚动时间窗口的窗口时间是固定的 21 | 滚动计数窗口的窗口内数据量是固定的 22 | 23 | 2、滑动窗口(Slide Windows,记录有重叠): 24 | 每次滑动指定步数 25 | 26 | 3、会话窗口(Session Windows) 27 | 一次会话访问 28 | 29 | ##### 2、窗口Time Window 30 | Time Window 是根据时间对数据流进行分组的。 31 | 2.1、滚动时间窗口(Tumbling Time Window) 32 | 翻滚窗口能将数据流切分成不重叠的窗口,每一个事件只能属于一个窗口。 33 | 比如:统计每一分钟中用户购买的商品的总数,需要将用户的行为事件按每一分钟进行切分。 34 | 2.2、滑动时间窗口(Sliding Time Window) 35 | 滑动时间窗口是不间断的,需要平滑地进行窗口聚合,一个元素可以对应多个窗口。 36 | 比如:我们可以每30秒计算一次最近一分钟用户购买的商品总数。 37 | 38 | ##### 3、计数窗口Count Window 39 | Count Window 是根据元素个数对数据流进行分组的。 40 | 3.1、滚动计数窗口(Tumbling Count Window) 41 | 滚动计数窗口会对窗口进行计算。 42 | 比如:想要每100个用户购买行为事件统计购买总数,那么每当窗口中填满100个元素了,就会对窗口进行计算。 43 | 3.2、滑动计数窗口(Sliding Count Window) 44 | 每次滑动指定步数,然后做聚合统计。 45 | 例如:计算每10个元素计算一次最近100个元素的购买总数。 46 | 47 | ##### 4、会话窗口Session Window 48 | 在用户交互事件流中,将事件聚合到会话窗口中(一段用户持续活跃的周期),由非活跃的间隙分隔开。 49 | 比如:需要计算每个用户在活跃期间总共购买的商品数量,如果用户30秒没有活动则视为会话断开。 50 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/datastream.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、DataStream简介 3 | DataStream提供了实时处理的API。 4 | DataStream API主要分为3块:DataSource、Transformation、Sink。 5 | 6 | ##### 1、DataStream Transformation算子 7 | Flink针对DataStream提供了大量的已经实现的算子。 8 | Map:输入一个元素,然后返回一个元素,中间可以进行清洗转换等操作。 9 | FlatMap:输入一个元素,可以返回零个、一个或者多个元素。 10 | Filter:过滤函数,对传入的数据进行判断,符合条件的数据会被留下。 11 | 
KeyBy:根据指定的Key进行分组,Key相同的数据会进入同一个分区。KeyBy的两种典型用法如下。 12 | 13 | 14 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/flink与几种流式框架对比.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、Flink、Spark Streaming、Storm 3 | ![Flink对比](images/flink对比.jpg) 4 | 5 | 模型:Storm和Flink是真正的一条一条处理数据;而Trident(Storm的封装框架)和Spark Streaming其实都是小批处理, 6 | 一次处理一批数据(小批量)。 7 | 8 | API:Storm和Trident都使用基础API进行开发,比如实现一个简单的sum求和操作;而Spark Streaming和Flink中都提供封 9 | 装后的高阶函数,可以直接拿来使用,这样就比较方便了。 10 | 11 | 保证次数:在数据处理方面,Storm可以实现至少处理一次,但不能保证仅处理一次,这样就会导致数据重复处理问题,所以针对 12 | 计数类的需求,可能会产生一些误差;Trident通过事务可以保证对数据实现仅一次的处理,Spark Streaming和Flink也是如此。 13 | 14 | 容错机制:Storm和Trident可以通过ACK机制实现数据的容错机制,而SparkStreaming和Flink可以通过CheckPoint机制实现容错机制。 15 | 16 | 状态管理:Storm中没有实现状态管理,Spark Streaming实现了基于DStream的状态管理,而Trident和Flink实现了基于操作的状态管理。 17 | 18 | 延时:表示数据处理的延时情况,因此Storm和Flink接收到一条数据就处理一条数据,其数据处理的延时性是很低的;而Trident和 19 | Spark Streaming都是小型批处理,它们数据处理的延时性相对会偏高。 20 | 21 | 吞吐量:Storm的吞吐量其实也不低,只是相对于其他几个框架而言较低;Trident属于中等;而Spark Streaming和Flink的吞吐量是比较高的。 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/flink分布式缓存.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1.分布式缓存 3 | Flink提供了一个分布式缓存(Distributed Cache),类似于Hadoop,可以使用户在并行函数中很方便地读取本地文件。 4 | 5 | 此缓存的工作机制为程序注册一个文件或者目录(本地或者远程文件系统,如HDFS或者S3),通过ExecutionEnvironment注册缓存文件并为它起一个名称。 6 | 当程序执行时,Flink自动将文件或者目录复制到所有TaskManager节点的本地文件系统,用户可以通过这个指定的名称查找文件或者目录, 7 | 然后从TaskManager节点的本地文件系统访问它。 8 | 9 | 10 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/flink基本组件.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、Flink组件 3 | Flink中提供了3个组件,包括DataSource、Transformation和DataSink。 4 | 5 | DataSource:表示数据源组件,主要用来接收数据,目前官网提供了readTextFile、 6 | socketTextStream、fromCollection以及一些第三方的Source。 7 | 8 | Transformation:表示算子,主要用来对数据进行处理,比如Map、FlatMap、Filter、Reduce、Aggregation等。 9 | 10 | DataSink:表示输出组件,主要用来把计算的结果输出到其他存储介质中,比如writeAsText以及Kafka、Redis、 11 | Elasticsearch等第三方Sink组件。 12 | 13 | 组装一个Flink Job,至少需要这3个组件。 14 | Flink Job=DataSource+Transformation+DataSink 15 | 16 | ![Flink数据传输方式](images/flink数据传输方式.jpg) 17 | 18 | 对于一个流处理系统,其节点间数据传输的标准模型是,在处理完成一条数据后,将其序列化到缓存中,并立刻通过网络传输到下一个节点, 19 | 由下一个节点继续处理。而对于一个批处理系统,其节点间数据传输的标准模型是,在处理完成一条数据后,将其序列化到缓存中,当缓存 20 | 写满时,就持久化到本地硬盘上;在所有数据都被处理完成后,才开始将其通过网络传输到下一个节点。 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/flink广播变量.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##### 1.广播变量 4 | Broadcast可以理解为一个公共的共享变量。可以把一个DataSet(数据集)广播出去,不同的Task在节点上都能够获取到它, 5 | 这个数据集在每个节点上只会存在一份。如果不使用Broadcast,则在各节点的每个Task中都需要复制一份DataSet数据集, 6 | 比较浪费内存(也就是一个节点中可能会存在多份DataSet数据)。 7 | 8 | 广播变量只能在Flink批处理程序中才可以使用。 9 | 10 | DataStream Broadcast(分区规则) 11 | 分区规则是把元素广播给所有的分区,数据会被重复处理,类似于Storm中的allGrouping。 12 | 13 | 14 | 15 | ##### 2.广播变量和累加器区别 16 | Flink Broadcast和Accumulator的区别 17 | Broadcast允许程序员将一个只读的变量缓存在每台机器上,而不用在任务之间传递变量。广播变量可以进行共享,但是不可以进行修改。 18 | Accumulator可以在不同任务中对同一个变量进行累加操作,但是只有在任务执行结束的时候才能获得累加器的最终结果。 19 | 20 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/flink架构.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、Flink架构 3 
| Flink架构可以分为4层,包括Deploy层、Core层、API层和Library层。 4 | 5 | Deploy层:该层主要涉及Flink的部署模式,Flink支持多种部署模式——本地、集群(Standalone/YARN)和云服务器(GCE/EC2)。 6 | 7 | Core层:该层提供了支持Flink计算的全部核心实现,为API层提供基础服务。 8 | 9 | API层:该层主要实现了面向无界Stream的流处理和面向Batch的批处理API,其中流处理对应DataStream API,批处理对应DataSet API。 10 | Library层:该层也被称为Flink应用框架层,根据API层的划分,在API层之上构建的满足特定应用的实现计算框架, 11 | 12 | 也分别对应于面向流处理和面向批处理两类。面向流处理支持CEP(复杂事件处理)、基于SQL-like的操作(基于Table的关系操作); 13 | 面向批处理支持FlinkML(机器学习库)、Gelly(图处理)、Table 操作。 14 | 15 | ![Flink架构](images/flink架构.jpg) 16 | 17 | Flink主要包括DataStream API、DataSet API、Table API、SQL、Graph API和FlinkML等。 18 | 现在Flink也有自己的生态圈,涉及离线数据处理、实时数据处理、SQL操作、图计算和机器学习库等。 19 | 20 | 21 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/flink程序开发.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、Flink程序开发步骤 3 | 开发Flink程序有固定的流程。 4 | (1)获得一个执行环境。 5 | (2)加载/创建初始化数据。 6 | (3)指定操作数据的Transaction算子。 7 | (4)指定计算好的数据的存放位置。 8 | (5)调用execute()触发执行程序。 9 | Flink程序是延迟计算的,只有最后调用execute()方法的时候才会真正触发执行程序。 10 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/flink累加器.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##### 1.累加器 4 | Accumulator即累加器,与MapReduce中Counter的应用场景差不多,都能很好地观察Task在运行期间的数据变化。 5 | 可以在Flink Job的算子函数中使用累加器,但是只有在任务执行结束之后才能获得累加器的最终结果。 6 | Counter是一个具体的累加器实现,常用的Counter有IntCounter、LongCounter和DoubleCounter。 7 | 8 | 9 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/flink面试题/flink面试题1.md: -------------------------------------------------------------------------------- 1 | ##### Flink面试题 2 | 3 | ###### 1、Flink最小计算单位是什么? 4 | slot 5 | ![Flink最小计算单元slot](./slot.png) 6 | 7 | ###### 2、Flink 的 checkpoint 存在哪里? 8 | 可以是内存,文件系统,或者 RocksDB 9 | 10 | ###### 3、如果下级存储不支持事务,Flink怎么保证exactly-once? 11 | 端到端的exactly-once对sink要求比较高,具体实现主要有幂等写入和事务性写入两种方式。 12 | 幂等写入的场景依赖于业务逻辑,更常见的是用事务性写入。 13 | 而事务性写入又有预写日志(WAL)和两阶段提交(2PC)两种方式。 14 | 如果外部系统不支持事务,那么可以用预写日志的方式,把结果数据先当成状 态保存,然后在收到checkpoint完成的通知时一次性写入sink系统。 15 | 16 | ###### 4、Flink vs Spark Streaming 17 | 数据模型: 18 | spark采用RDD模型,DStream是一组组小批数据RDD的集合; 19 | flink基本数据模型是数据流,以及时间Event序列; 20 | 21 | 运行时架构: 22 | spark是批处理,将DAG划分为不同的stage,一个完成之后才可以进行下一个; 23 | flink是标准的流执行模式,一个事件在一个节点处理完之后可以直接发往下一个节点处理; 24 | 25 | ###### 5、checkpoint和spark比较 26 | Flink的checkpoint机制对比spark有什么不同和优势? 27 | spark streaming的checkpoint仅仅是针对driver的故障恢复做了数据和元数据的ck。 28 | 而flink的ck机制采用轻量级的分布式快照,实现了每个算子的快照,以及流动中数据的快照。 29 | 30 | ###### 6、Flink CEP是啥? 
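The last interview question above ("Flink CEP是啥?") is left without an answer in the source. As a hedged note: Flink CEP (Complex Event Processing) is Flink's pattern-matching library on top of DataStream — you declare a pattern (a sequence of conditions, optionally bounded by a time window) and CEP emits the event sequences that match it. A minimal Scala sketch, assuming the flink-cep-scala dependency is on the classpath; the LoginEvent type, its field names and the "two failures within 10 seconds" rule are illustrative only and not taken from this repository:

```scala
import org.apache.flink.cep.scala.CEP
import org.apache.flink.cep.scala.pattern.Pattern
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

// Illustrative event type, not part of this repository
case class LoginEvent(userId: String, status: String)

object CepSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val logins: DataStream[LoginEvent] = env.fromElements(
      LoginEvent("u1", "fail"), LoginEvent("u1", "fail"), LoginEvent("u1", "success"))

    // Pattern: two consecutive "fail" events for the same user within 10 seconds
    val twoFails = Pattern
      .begin[LoginEvent]("first").where(_.status == "fail")
      .next("second").where(_.status == "fail")
      .within(Time.seconds(10))

    val alerts: DataStream[String] =
      CEP.pattern(logins.keyBy(_.userId), twoFails)
        .select(m => s"two failed logins for user ${m("first").head.userId}")

    alerts.print()
    env.execute("flink cep sketch")
  }
}
```

Keying the stream before applying the pattern makes matching happen per user rather than across the whole stream.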
31 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/flink面试题/slot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-flink/src/main/docs/flink面试题/slot.png -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/images/flink_windows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-flink/src/main/docs/images/flink_windows.png -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/images/flink对比.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-flink/src/main/docs/images/flink对比.jpg -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/images/flink数据传输方式.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-flink/src/main/docs/images/flink数据传输方式.jpg -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/images/flink架构.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-flink/src/main/docs/images/flink架构.jpg -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/images/flink检查点.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-flink/src/main/docs/images/flink检查点.png -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/images/flink模块.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-flink/src/main/docs/images/flink模块.jpg -------------------------------------------------------------------------------- /bigdata-flink/src/main/docs/面试题/flink面试题.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-flink/src/main/docs/面试题/flink面试题.md -------------------------------------------------------------------------------- /bigdata-flink/src/main/java/com/libin/data/flink/batch/WordCountJava.java: -------------------------------------------------------------------------------- 1 | package com.libin.data.flink.batch; 2 | 3 | /** 4 | * Copyright (c) 2019/05/17. xixi Inc. All Rights Reserved. 5 | * Authors: libin <2578858653@qq.com> 6 | *

7 | * Purpose : 离线处理 8 | */ 9 | public class WordCountJava { 10 | public static void main(String[] args) { 11 | 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/scala/com/libin/data/flink/base/FlinkStreamingTrait.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.flink.base 2 | 3 | /** 4 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved. 5 | * Authors: libin<2578858653@qq.com> 6 | * 7 | * Purpose : 8 | */ 9 | trait FlinkStreamingTrait { 10 | 11 | /** 12 | * Application Name 13 | */ 14 | def appName: String = this.getClass.getSimpleName 15 | } 16 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/scala/com/libin/data/flink/base/client/KafkaFlinkStreamingTrait.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.flink.base.client 2 | 3 | /** 4 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved. 5 | * Authors: libin<2578858653@qq.com> 6 | * 7 | * Purpose : Kafka和Flink结合trait 8 | */ 9 | trait KafkaFlinkStreamingTrait { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/scala/com/libin/data/flink/batch/WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.flink.batch 2 | 3 | /** 4 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved. 5 | * Authors: libin<2578858653@qq.com> 6 | * 7 | * Purpose : 8 | */ 9 | object WordCount { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/GenCodeFromMysql.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.flink.streaming.jobs 2 | 3 | /** 4 | * Copyright (c) 2020/9/6. libin Inc. All Rights Reserved. 5 | * Authors: libin 6 | *
7 | * Purpose : 将处理的数据写入到MySQL中 8 | */ 9 | object GenCodeFromMysql { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/GenCodeFromState.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.flink.streaming.jobs 2 | 3 | /** 4 | * Copyright (c) 2020/9/3. libin Inc. All Rights Reserved. 5 | * Authors: libin 6 | *
7 | * Purpose : 8 | */ 9 | object GenCodeFromState { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/GenCodeFromWindow.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.flink.streaming.jobs 2 | 3 | import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment, createTypeInformation} 4 | import org.apache.flink.streaming.api.windowing.time.Time 5 | 6 | /** 7 | * Copyright (c) 2020/4/2 libin Inc. All Rights Reserved. 8 | * Authors: libin<2578858653@qq.com> 9 | * 10 | * Purpose : nc -lk 9999 11 | */ 12 | object GenCodeFromWindow { 13 | 14 | implicit val inTypeInfo = createTypeInformation[String] 15 | 16 | def main(args: Array[String]) { 17 | // create env 18 | val env = StreamExecutionEnvironment.getExecutionEnvironment 19 | // data source 20 | val text = env.socketTextStream("localhost", 9999) 21 | 22 | val counts: DataStream[(String, Int)] = text 23 | .flatMap { 24 | line => 25 | line.toLowerCase.split("\\W+") filter { 26 | x => x.nonEmpty 27 | } 28 | } 29 | .map { 30 | x => 31 | (x, 1) 32 | } 33 | .keyBy(0) 34 | .timeWindow(Time.seconds(5)) 35 | //.countWindowAll(5, 2) 36 | //.timeWindowAll(Time.minutes(1),Time.seconds(30)) 37 | .sum(1) 38 | 39 | counts.print() 40 | 41 | env.execute("Window Stream WordCount") 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/GenCodeFromWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.flink.streaming.jobs 2 | 3 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 4 | import org.apache.flink.streaming.api.windowing.time.Time 5 | 6 | /** 7 | * Copyright (c) 2019/05/18. xixi Inc. All Rights Reserved. 8 | * Authors: libin <2578858653@qq.com> 9 | *
10 | * Purpose : 服务器上执行 nc -l 9000 , 运行代码 11 | */ 12 | object GenCodeFromWordCount { 13 | 14 | case class wc(word: String, count: Long) 15 | 16 | def main(args: Array[String]): Unit = { 17 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment 18 | val hostname = "localhost" 19 | val port = 9000 20 | val stream = env.socketTextStream(hostname, port, '\n') 21 | 22 | import org.apache.flink.api.scala._ 23 | val wcStream = 24 | stream 25 | .flatMap(x => x.split("\t")) 26 | .map(w => wc(w, 1)) 27 | .keyBy("word") 28 | .timeWindow(Time.seconds(2), Time.seconds(1)) 29 | //.sum("count") 30 | .reduce((a, b) => wc(a.word, a.count + b.count)) 31 | 32 | wcStream.print().setParallelism(1) 33 | env.execute("socket wc") 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/config/GenCodeFromBucketingSink.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.flink.streaming.jobs.config 2 | 3 | import org.apache.flink.streaming.api.scala.DataStream 4 | import org.apache.flink.streaming.connectors.fs.StringWriter 5 | import org.apache.flink.streaming.connectors.fs.bucketing.{BucketingSink, DateTimeBucketer} 6 | 7 | /** 8 | * Copyright (c) 2020/4/2 libin Inc. All Rights Reserved. 9 | * Authors: libin<2578858653@qq.com> 10 | * 11 | * Purpose : 使用BucketingSink对存储的数据进行输出 12 | */ 13 | object GenCodeFromBucketingSink { 14 | def main(args: Array[String]): Unit = { 15 | 16 | val resultDS: DataStream[Long] = null 17 | 18 | val sink = new BucketingSink[Long]("output path") 19 | sink.setBucketer(new DateTimeBucketer[Long]("yyyy-MM-dd--HHmm")) 20 | sink.setWriter(new StringWriter[Long]()) 21 | // sink.setBatchSize(1024 * 1024 * 100) // this is 100 MB, 22 | sink.setBatchSize(1024 * 1024 * 1) // this is 1 MB, 23 | // sink.setBatchRolloverInterval( 60 * 1000); // this is 30 seconds 24 | resultDS.addSink(sink) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /bigdata-flink/src/main/scala/com/libin/data/flink/streaming/jobs/config/GenCodeFromCheckpoint.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.flink.streaming.jobs.config 2 | 3 | import org.apache.flink.runtime.state.filesystem.FsStateBackend 4 | import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup 5 | import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment 6 | import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic} 7 | 8 | /** 9 | * Copyright (c) 2020/4/2 libin Inc. All Rights Reserved. 
10 | * Authors: libin<2578858653@qq.com> 11 | * 12 | * Purpose : Env和Checkpoint 一些常见配置 13 | */ 14 | object GenCodeFromCheckpoint { 15 | def main(args: Array[String]): Unit = { 16 | // create env 17 | val env = StreamExecutionEnvironment.getExecutionEnvironment 18 | 19 | // 设置env属性值 20 | env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) 21 | env.setParallelism(16) 22 | env.enableCheckpointing(20000) 23 | env.setStateBackend(new FsStateBackend("checkpoint path")) // kafka offset,确保 exactly-once 24 | 25 | // 设置config属性值 26 | val config = env.getCheckpointConfig 27 | config.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION) 28 | config.setCheckpointingMode(CheckpointingMode.AT_LEAST_ONCE) 29 | // config.setCheckpointInterval(10000) 30 | config.setCheckpointInterval(5 * 60 * 1000); // Checkpoint的触发频率; 31 | config.setMinPauseBetweenCheckpoints(5 * 60 * 1000); // Checkpoint之间的最小间隔; 32 | config.setCheckpointTimeout(10 * 60 * 1000); // Checkpoint的超时时间; 33 | config.setTolerableCheckpointFailureNumber(3); // 连续3次checkpoint失败,才会导致作业失败重启;默认值是0 。 34 | 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /bigdata-hadoop/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target/ 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ -------------------------------------------------------------------------------- /bigdata-hadoop/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## MapReduce 3 | * [Mapreduce应用API编程](src/main/java/com/libin/api/mapreduce) 4 | * MapReduce原理 5 | * MapReduce源码分析 6 | 7 | ## Hdfs 8 | * [Hdfs应用](src/main/java/com/libin/api/hdfs) 9 | * [Hdfs原理](src/main/java/com/libin/doc/hdfs) 10 | * [Hdfs源码分析](src/main/java/com/libin/code/hdfs) 11 | 12 | ## Yarn 13 | * Yarn原理 14 | * Yarn源码分析 15 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/api/hdfs/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## HDFS使用 3 | 在大数据开发中,对HDFS的使用无处不在,大部分计算和存储框架直接与HDFS交互,因此学习HDFS很重要. 4 | RD(Research and Development engineer)们一般都喜欢通过命令行进行操作HDFS,那敲键盘的感觉很爽...当然代码API操作也是必须的. 5 | PM(Product Manager)们一般不会研发那点儿东东,更多使用自研的数据管理平台或HUE之类的进行网页查看数据... 6 | ## 7 | ##### 1.HDFS命令行使用 8 | 之前博客整理的文档:https://blog.csdn.net/baolibin528/article/details/43854291 9 | ## 10 | ##### 2.[HDFS的Java API使用](HdfsUtils.java) 11 | 之前博客整理的文档:https://blog.csdn.net/baolibin528/article/details/43868515 12 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/api/mapreduce/GetInputSplit.java: -------------------------------------------------------------------------------- 1 | package com.libin.api.mapreduce; 2 | 3 | import org.apache.hadoop.io.IntWritable; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Mapper; 8 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 9 | 10 | import java.io.IOException; 11 | 12 | /** 13 | * Copyright (c) 2015/10/30. xixi Inc. All Rights Reserved. 14 | * Authors: libin <2578858653@qq.com> 15 | *
16 | * Purpose : 17 | * 问题场景:当有很多个小文件,需要把每个小文件的目录名加进小文件内容中并转换输出,用一个map类的话可以处理每一行数据的时候读取这行数据的目录名加到第一个字段输出。 18 | */ 19 | public class GetInputSplit { 20 | public static class MapClass extends Mapper { 21 | protected void map(LongWritable key, Text value, Mapper.Context context) 22 | throws IOException, InterruptedException { 23 | FileSplit fileSplit = (FileSplit) context.getInputSplit(); 24 | //获得当前子目录名 25 | String pathName = fileSplit.getPath().getName(); 26 | //获得全路径 27 | String path = fileSplit.getPath().toString(); 28 | //获得父目录的全路径 29 | String parentPath = fileSplit.getPath().getParent().toString(); 30 | //获取父目录的目录名 31 | String parentPathName = fileSplit.getPath().getParent().getName(); 32 | //整和Mapkey 33 | String mapkey = pathName + "|" + path + "|" + parentPath + "|" + parentPathName; 34 | context.write(new Text(mapkey), NullWritable.get()); 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/api/mapreduce/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## MapReduce应用API整理 3 | 刚开始学习Hadoop时候使用的版本是hadoop0.20.2,但是Hadoop版本迭代蛮快的。 4 | 学习时候主要使用的是Hadoop1.2.1和Hadoop2.6.0两个版本,工作时候基本都是用Hadoop2.0开发的,也就是Hadoop2.6.0版本。 5 | 下面的代码基本是基于Hadoop2.6.0编写的,大部分都是工作中的实战代码。 6 | 年限基本都是在2015年使用的,那个时候Spark也是刚兴起不久,很多公司还是编写MapReduce居多。 7 | 8 | * [1、分布式缓存](DistributedDemo.java) 9 | * [2、全排序](allSort) 10 | * [3、多目录输出](TestwithMultipleOutputs.java) 11 | * [4、Hadoop的map获取当前spilt文件名](GetInputSplit.java) 12 | * [5、自定义InputFormat 类代码](inputformat) 13 | * [6、TopK](Topk.java) 14 | * [7、二次排序](SecondarySort.java) 15 | * [8、自定义Partitioner](PartitionerDemo.java) 16 | * [9、MultipleInputs用法](MultipleInputsTest.java) 17 | * [10、CombineTextInputFormat用法](CombineTextInputFormatTest.java) 18 | * [11、NLineInputFormat用法](NLineInputFormatTest.java) 19 | * [12、SequenceFileInputFormat用法](SequenceFileInputFormatTest.java) 20 | * [13、MapReduce编程自定义排序](SortTest.java) 21 | * [14、DBInputFormat用法](DBInputFormatTest.java) 22 | * [15、自定义计数器](CounterTest.java) 23 | * [16、Hadoop自定义数据类型](KpiApp.java) 24 | * [17、经典WordCount](WordCount.java) 25 | * [18、Hadoop的ChainMapper/ChainReducer](ChainMapperChainReducer.java) 26 | * [19、hadoop的Context简单使用](GetIDMapReduce.java) 27 | * [20、hadoop的FileSplit简单使用](GetSplitMapReduce.java) 28 | * [21、Hadoop的FileStatus简单使用](GetStatusMapReduce.java) 29 | * [22、Hadoop自定义分组Group](MyGroup.java) 30 | * [23、Hadoop的PathFilter使用](TextPathFilterDemo.java) 31 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/api/mapreduce/inputformat/FindMaxValueInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.libin.api.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.Random; 7 | 8 | 9 | import org.apache.hadoop.io.ArrayWritable; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.mapreduce.*; 12 | 13 | /** 14 | * Copyright (c) 2015/09/06. xixi Inc. All Rights Reserved. 15 | * Authors: libin <2578858653@qq.com> 16 | *
17 | * Purpose : 18 | */ 19 | public class FindMaxValueInputFormat extends InputFormat { 20 | public static float[] floatvalues; 21 | 22 | /** 23 | * 返回一个InputSplit 集合 24 | * 这个例子一共有两个InputSplit,两个map 25 | * 随机产生100个 0-1 的数组,放到float数组里面 26 | */ 27 | @Override 28 | public List getSplits(JobContext context) throws IOException, 29 | InterruptedException { 30 | int NumOfValues = context.getConfiguration().getInt("NumOfValues", 100); 31 | floatvalues = new float[NumOfValues]; 32 | Random rand = new Random(); 33 | 34 | for (int i = 0; i < NumOfValues; i++) { 35 | floatvalues[i] = rand.nextFloat(); 36 | } 37 | int NumSplits = context.getConfiguration().getInt("mapred.map.tasks", 2); 38 | int beg = 0; 39 | int length = (int) Math.floor(NumOfValues / NumSplits); 40 | 41 | ArrayList splits = new ArrayList(); 42 | int end = length - 1; 43 | 44 | for (int i = 0; i < NumSplits - 1; i++) { 45 | FindMaxValueInputSplit split = new FindMaxValueInputSplit(beg, end); 46 | splits.add(split); 47 | 48 | beg = end + 1; 49 | end = end + length - 1; 50 | } 51 | 52 | FindMaxValueInputSplit split = new FindMaxValueInputSplit(beg, NumOfValues - 1); 53 | splits.add(split); 54 | 55 | return splits; 56 | } 57 | 58 | /** 59 | * 自定义 RecordReader 60 | */ 61 | @Override 62 | public RecordReader createRecordReader( 63 | InputSplit split, TaskAttemptContext context) throws IOException, 64 | InterruptedException { 65 | return new FindMaxValueRecordReader(); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/api/mapreduce/inputformat/FindMaxValueMapper.java: -------------------------------------------------------------------------------- 1 | package com.libin.api.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.ArrayWritable; 6 | import org.apache.hadoop.io.FloatWritable; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | 10 | /** 11 | * Copyright (c) 2015/09/06. xixi Inc. All Rights Reserved. 12 | * Authors: libin <2578858653@qq.com> 13 | *
14 | * Purpose : Map函数的输入格式有所改变 15 | */ 16 | public class FindMaxValueMapper extends Mapper { 17 | private final static IntWritable one = new IntWritable(1); 18 | 19 | @Override 20 | protected void map( 21 | IntWritable key, 22 | ArrayWritable value, 23 | Mapper.Context context) 24 | throws IOException, InterruptedException { 25 | 26 | FloatWritable[] floatArray = (FloatWritable[]) value.toArray(); 27 | float maxfloat = floatArray[0].get(); 28 | float tmp; 29 | /** 30 | * 求一个InputSplit中的最大值 31 | */ 32 | for (int i = 0; i < floatArray.length; i++) { 33 | tmp = floatArray[i].get(); 34 | if (tmp > maxfloat) { 35 | maxfloat = tmp; 36 | } 37 | } 38 | /** 39 | * 把一个map中的最大值输出出来 40 | */ 41 | context.write(one, new FloatWritable(maxfloat)); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/api/mapreduce/inputformat/FindMaxValueReducer.java: -------------------------------------------------------------------------------- 1 | package com.libin.api.mapreduce.inputformat; 2 | 3 | import java.io.IOException; 4 | import java.util.Iterator; 5 | 6 | import org.apache.hadoop.io.FloatWritable; 7 | import org.apache.hadoop.io.IntWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Reducer; 10 | 11 | /** 12 | * Copyright (c) 2015/09/06. xixi Inc. All Rights Reserved. 13 | * Authors: libin <2578858653@qq.com> 14 | *
15 | * Purpose : Ruducer比较两个Map函数输出的最大值,结果输出在HDFS上面 16 | * 这个例子就比较两个值,有几个Map比较几个 17 | */ 18 | public class FindMaxValueReducer extends Reducer { 19 | @SuppressWarnings("rawtypes") 20 | @Override 21 | protected void reduce( 22 | IntWritable k2, 23 | Iterable v2s, 24 | Reducer.Context context) 25 | throws IOException, InterruptedException { 26 | 27 | Iterator it = v2s.iterator(); 28 | float maxfloat = 0, tmp; 29 | /** 30 | * 取第一个数 31 | */ 32 | if (it.hasNext()) { 33 | maxfloat = ((FloatWritable) (it.next())).get(); 34 | } else { 35 | //集合为空时,输出迭代失败信息 36 | context.write(new Text("Max float value : "), null); 37 | return; 38 | } 39 | /** 40 | * 求最大值 41 | */ 42 | while (it.hasNext()) { 43 | tmp = ((FloatWritable) (it.next())).get(); 44 | if (tmp > maxfloat) { 45 | maxfloat = tmp; 46 | } 47 | } 48 | //把最大的那个值输出来 49 | context.write(new Text("Max float value : "), new FloatWritable(maxfloat)); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/api/mapreduce/inputformat/MaxValueDriver.java: -------------------------------------------------------------------------------- 1 | package com.libin.api.mapreduce.inputformat; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.io.FloatWritable; 6 | import org.apache.hadoop.io.IntWritable; 7 | import org.apache.hadoop.mapreduce.Job; 8 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 9 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 10 | 11 | import com.sun.jersey.core.impl.provider.entity.XMLJAXBElementProvider.Text; 12 | 13 | /** 14 | * Copyright (c) 2015/09/06. xixi Inc. All Rights Reserved. 15 | * Authors: libin <2578858653@qq.com> 16 | *
17 | * Purpose : 18 | */ 19 | public class MaxValueDriver { 20 | public static void main(String[] args) throws Exception { 21 | Configuration conf = new Configuration(); 22 | Job job = Job.getInstance(conf, MaxValueDriver.class.getSimpleName()); 23 | job.setJarByClass(MaxValueDriver.class); 24 | 25 | job.setNumReduceTasks(1); 26 | 27 | job.setMapperClass(FindMaxValueMapper.class); 28 | job.setReducerClass(FindMaxValueReducer.class); 29 | 30 | job.setMapOutputKeyClass(IntWritable.class); 31 | job.setMapOutputValueClass(FloatWritable.class); 32 | 33 | job.setOutputKeyClass(Text.class); 34 | job.setOutputValueClass(FloatWritable.class); 35 | 36 | job.setInputFormatClass(FindMaxValueInputFormat.class); 37 | job.setOutputFormatClass(TextOutputFormat.class); 38 | 39 | // FileInputFormat.setInputPaths(job, args[0]); 40 | FileOutputFormat.setOutputPath(job, new Path(args[0])); 41 | 42 | job.waitForCompletion(true); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/api/yarn/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hadoop/src/main/java/com/libin/api/yarn/README.md -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/code/hdfs/README.md: -------------------------------------------------------------------------------- 1 | ## HDFS源码阅读 2 | 选用的版本是基于hadoop2.6.0版本,对这个版本还是蛮钟爱的。 3 | 4 | ## 5 | ##### 1.HDFS客户端 6 | HDFS目前提供3个客户端访问操作接口: 7 | 1.DistributedFileSystem:(org.apache.hadoop.hdfs)为用户开发提供基于HDFS的应用操作API。 8 | 2.FsShell:(org.apache.hadoop.fs)可以通过HDFS Shell命令执行常见的文件系统操作。 9 | 3.DFSAdmin:(org.apache.hadoop.hdfs.tools)向系统管理员提供管理HDFS的工具,如升级、管理安全模式等。 10 | 上面3个接口都是直接或间接持有DFSClient(org.apache.hadoop.hdfs)提供的接口方法对HDFS进行管理和操作的。 11 | 12 | * DFSAdmin是一个真正实现分布式文件系统客户端功能的类,使用户进行HDFS操作的起点。 13 | * DFSAdmin会连接到HDFS,对外提供关联文件/目录,读写文件以及管理与配置HDFS系统等功能。 14 | * DFSAdmin通过ClientProtocol(org.apache.hadoop.hdfs.protocol)接口调用NameNode的接口。 15 | * DFSAdmin通过DataTransferProtocol(org.apache.hadoop.hdfs.protocol.datatransfer)与DataNode交互数据。 16 | 17 | ## 18 | ##### 2.RPC通信 19 | 20 | 21 | 22 | ## 23 | ##### 3.NameNode 24 | 25 | 26 | ## 27 | ##### 4.DataNode 28 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/doc/hdfs/README.md: -------------------------------------------------------------------------------- 1 | ## Hdfs原理 2 | 说起分布式存储,可能首先想到的就是HDFS,对于我来说是这个反应的. 3 | HDFS目前已得到了广泛的应用,不管是开源版本、商业版本还是二次开发版本,所以既然使用了就又必要理解它的内部实现原理. 
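The HDFS source-reading notes above describe the client side (DistributedFileSystem, DFSClient, ClientProtocol); those classes sit behind the ordinary FileSystem API that application code normally calls. A minimal sketch of that API from Scala — assuming a Hadoop client dependency on the classpath and an HDFS reachable through fs.defaultFS; the directory and file contents are placeholders:

```scala
import java.nio.charset.StandardCharsets
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object HdfsClientSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()   // picks up core-site.xml / hdfs-site.xml from the classpath
    val fs = FileSystem.get(conf)    // returns DistributedFileSystem when fs.defaultFS is hdfs://

    val dir = new Path("/tmp/hdfs-demo")   // placeholder path
    if (!fs.exists(dir)) fs.mkdirs(dir)

    // Write a small file
    val out = fs.create(new Path(dir, "hello.txt"), true)
    out.write("hello hdfs".getBytes(StandardCharsets.UTF_8))
    out.close()

    // List the directory contents
    fs.listStatus(dir).foreach(st => println(s"${st.getPath} ${st.getLen} bytes"))

    fs.close()
  }
}
```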
4 | 5 | ##### Hdfs架构图 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/doc/mapreduce/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## MapReduce原理 3 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/doc/yarn/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Yarn原理 3 | * [YARN文档](http://hadoop.apache.org/docs/r2.6.5/hadoop-yarn/hadoop-yarn-site/YARN.html) 4 | 5 | 6 | ##### YARN架构图 7 | ![YARN架构图](images/yarn_architecture.gif) 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/resource_manager.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/resource_manager.jpg -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/timg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/timg.jpg -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/yarn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/yarn.jpg -------------------------------------------------------------------------------- /bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/yarn_architecture.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hadoop/src/main/java/com/libin/doc/yarn/images/yarn_architecture.gif -------------------------------------------------------------------------------- /bigdata-hbase/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | venv 22 | -------------------------------------------------------------------------------- /bigdata-hbase/README.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、HBase简介 3 | HBase是一个分布式的、面向列的开源数据库,一个结构化数据的分布式存储系统”,HBase在Hadoop之上提供了类似于Bigtable的能力。 4 | * [HBase体系结构](src/main/scala/com/libin/doc/HBase体系结构.md) 5 | * [HBase数据模型](src/main/scala/com/libin/doc/HBase数据模型.md) 6 | 7 | 8 | ##### 2、HBase原理 9 | * [HBase RegionServer内部结构](src/main/scala/com/libin/doc/RegionServer/RegionServer内部结构.md) 10 | * [HBase HLog](src/main/scala/com/libin/doc/RegionServer/HLog.md) 11 | * [HBase MemStore](src/main/scala/com/libin/doc/RegionServer/MemStore.md) 12 | * [HBase 
HFile](src/main/scala/com/libin/doc/RegionServer/HFile.md) 13 | * [HBase BlockCache](src/main/scala/com/libin/doc/RegionServer/BlockCache.md) 14 | 15 | 16 | ##### 3、HBase相关算法 17 | * [HBase跳跃表](src/main/scala/com/libin/doc/HBase算法/跳跃表.md) 18 | * [HBase LSM树](src/main/scala/com/libin/doc/HBase算法/LSM树.md) 19 | * [HBase布隆过滤器](src/main/scala/com/libin/doc/HBase算法/布隆过滤器.md) 20 | 21 | 22 | ##### 4、HBase依赖服务 23 | * [ZooKeeper](src/main/scala/com/libin/doc/依赖服务组件/ZooKeeper.md) 24 | * [Hdfs](src/main/scala/com/libin/doc/依赖服务组件/Hdfs.md) 25 | 26 | 27 | ##### 5、HBase相关学习资料 28 | * [HBase官网地址](http://hbase.apache.org/) 29 | * [HBase gitbook地址](http://hbase.apache.org/book.html) 30 | 31 | 32 | -------------------------------------------------------------------------------- /bigdata-hbase/image/HBase体系结构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/image/HBase体系结构.png -------------------------------------------------------------------------------- /bigdata-hbase/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.wujun728 8 | bigdata-hbase 9 | 0.0.1-SNAPSHOT 10 | 11 | 12 | -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/Compaction.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/HBasae客户端/客户端实现.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/src/main/scala/com/libin/doc/HBasae客户端/客户端实现.md -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/HBase体系结构.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ##### HBase体系结构 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/HBase数据模型.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##### 1、HBase逻辑视图基本概念 4 | 行键、列簇、列限定符、单元格、时间戳 5 | 1)、table:表,一个表包含多行数据。 6 | 2)、row:行,一行数据包含一个唯一标识rowkey、多个column以及对应的值。 7 | 在HBase中,一张表中所有row都按照rowkey的字典序由小到大排序。 8 | 3)、column:列,与关系型数据库中的列不同,HBase中的column由columnfamily(列簇) 9 | 以及qualif ier(列名)两部分组成,两者中间使用":"相连。 10 | 4)、timestamp:时间戳,每个cell在写入HBase的时候都会默认分配一个时间戳作为该cell的版本, 11 | 当然,用户也可以在写入的时候自带时间戳。 12 | 5)、cell:单元格,由五元组(row, column, timestamp, type, value)组成的结构, 13 | 其中type表示Put/Delete这样的操作类型,timestamp代表这个cell的版本。 14 | 这个结构在数据库中实际是以KV结构存储的,其中(row, column,timestamp, type)是K,value字段对应KV结构的V。 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/HBase算法/LSM树.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/src/main/scala/com/libin/doc/HBase算法/LSM树.md -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/HBase算法/布隆过滤器.md: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/src/main/scala/com/libin/doc/HBase算法/布隆过滤器.md -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/HBase算法/跳跃表.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/src/main/scala/com/libin/doc/HBase算法/跳跃表.md -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/HBase面试题.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/src/main/scala/com/libin/doc/HBase面试题.md -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/RegionServer/BlockCache.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/src/main/scala/com/libin/doc/RegionServer/BlockCache.md -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/RegionServer/HFile.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/src/main/scala/com/libin/doc/RegionServer/HFile.md -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/RegionServer/HLog.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/src/main/scala/com/libin/doc/RegionServer/HLog.md -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/RegionServer/MemStore.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/src/main/scala/com/libin/doc/RegionServer/MemStore.md -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/RegionServer/RegionServer内部结构.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/src/main/scala/com/libin/doc/RegionServer/RegionServer内部结构.md -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/依赖服务组件/Hdfs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hbase/src/main/scala/com/libin/doc/依赖服务组件/Hdfs.md -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/doc/依赖服务组件/ZooKeeper.md: 
-------------------------------------------------------------------------------- 1 | 2 | ##### 1、ZooKeeper简介 3 | ZooKeeper集群中多个节点都存储同一份数据,为保证多节点之间数据的一致性, 4 | ZooKeeper使用ZAB(ZooKeeper Atomic Broadcast)协议作为数据一致性的算法。 5 | 6 | -------------------------------------------------------------------------------- /bigdata-hbase/src/main/scala/com/libin/utils/HBaseUtils.scala: -------------------------------------------------------------------------------- 1 | package com.libin.utils 2 | 3 | /** 4 | * Copyright (c) 2020/4/21 libin Inc. All Rights Reserved. 5 | * Authors: libin<2578858653@qq.com> 6 | * 7 | * Purpose : 8 | */ 9 | object HBaseUtils { 10 | def main(args: Array[String]): Unit = { 11 | println("hello") 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /bigdata-hive/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | venv 22 | -------------------------------------------------------------------------------- /bigdata-hive/README.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、Hive简介 3 | hive是基于Hadoop的一个数据仓库工具,用来进行数据提取、转化、加载,这是一种可以存储、查询和分析存储在Hadoop中的大规模数据的机制。 4 | hive数据仓库工具能将结构化的数据文件映射为一张数据库表,并提供SQL查询功能,能将SQL语句转变成MapReduce任务来执行。 5 | Hive是构建在Hadoop大数据平台之上,Hive数据存储依赖于HDFS, HiveSQL的执行引擎依赖于MapReduce、Spark、Tez等分布式计算引擎, 6 | Hive作业的资源调度依赖于YARN、Mesos等大数据资源调度管理组件。 7 | * [Hive官网](https://hive.apache.org/) 8 | * [Hive GitHub](https://github.com/apache/hive) 9 | 10 | ##### 2、Hive相关操作 11 | * [Hive SQL操作](src/main/doc/Hive架构.md) 12 | * [Hive解析流程](src/main/doc/Hive解析流程.md) 13 | 14 | ##### 3、Hive学习资料 15 | 1、技术博客 16 | 2、书籍 17 | -------------------------------------------------------------------------------- /bigdata-hive/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.wujun728 8 | bigdata-hive 9 | 0.0.1-SNAPSHOT 10 | 11 | 12 | -------------------------------------------------------------------------------- /bigdata-hive/src/main/doc/Hive架构.md: -------------------------------------------------------------------------------- 1 | 2 | ##### Hive架构 3 | 4 | ![Hive作业工作流程](../image/hive作业执行过程.png) 5 | 6 | 客户端提交SQL作业到HiveServer2, HiveServer2会根据用户提交的SQL作业及数据库中现有的元数据信息生成一份可供计算引擎执行的计划。 7 | 每个执行计划对应若干MapReduce作业,Hive会将所有的MapReduce作业都一一提交到YARN中,由YARN去负责创建MapReduce作业对应的子任务任务,并协调它们的运行。 8 | YARN创建的子任务会与HDFS进行交互,获取计算所需的数据,计算完成后将最终的结果写入HDFS或者本地。 9 | 10 | -------------------------------------------------------------------------------- /bigdata-hive/src/main/doc/Hive解析流程.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hive/src/main/doc/Hive解析流程.md -------------------------------------------------------------------------------- /bigdata-hive/src/main/doc/Hive面试题.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hive/src/main/doc/Hive面试题.md -------------------------------------------------------------------------------- 
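The Hive 架构 notes above describe clients submitting SQL to HiveServer2, which compiles it into an execution plan of MapReduce jobs run on YARN. A minimal Hive JDBC sketch in Scala to make that submission path concrete — the host, port, database, credentials and table name are placeholders, and the hive-jdbc driver is assumed to be on the classpath:

```scala
import java.sql.DriverManager

object HiveJdbcSketch {
  def main(args: Array[String]): Unit = {
    // HiveServer2 JDBC endpoint; host, port and database are placeholders
    val url = "jdbc:hive2://localhost:10000/default"

    Class.forName("org.apache.hive.jdbc.HiveDriver")
    val conn = DriverManager.getConnection(url, "hive", "")
    try {
      val stmt = conn.createStatement()
      // This query is compiled by HiveServer2 into an execution plan and run on the cluster
      val rs = stmt.executeQuery("SELECT count(*) FROM some_table")
      while (rs.next()) println(s"rows = ${rs.getLong(1)}")
      rs.close()
      stmt.close()
    } finally {
      conn.close()
    }
  }
}
```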
/bigdata-hive/src/main/image/hive作业执行过程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hive/src/main/image/hive作业执行过程.png -------------------------------------------------------------------------------- /bigdata-hive/src/main/image/hive架构.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-hive/src/main/image/hive架构.jpg -------------------------------------------------------------------------------- /bigdata-hive/src/main/java/com/libin/HiveUtils.java: -------------------------------------------------------------------------------- 1 | package com.libin; 2 | 3 | /** 4 | * Copyright (c) 2020/9/20. libin Inc. All Rights Reserved. 5 | * Authors: libin 6 | *
7 | * Purpose : Hive相关操作 8 | */ 9 | public class HiveUtils { 10 | } 11 | -------------------------------------------------------------------------------- /bigdata-info/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target/ 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | -------------------------------------------------------------------------------- /bigdata-info/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/README.md -------------------------------------------------------------------------------- /bigdata-info/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.wujun728 8 | bigdata-info 9 | 0.0.1-SNAPSHOT 10 | 11 | 12 | -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/Test.java: -------------------------------------------------------------------------------- 1 | package com.libin; 2 | 3 | /** 4 | * Copyright (c) 2020/7/30. 小彬科技 Inc. All Rights Reserved. 5 | * Authors: libin <小彬科技> 6 | *
7 | * Purpose : 8 | */ 9 | public class Test { 10 | } 11 | -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/elasticsearch/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/elasticsearch/README.md -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/griffin/2.Griffin指标使用.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/2.Griffin指标使用.md -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/griffin/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/README.md -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/griffin/image/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/image/1.png -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/griffin/image/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/image/2.jpg -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/griffin/image/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/image/3.png -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/griffin/image/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/griffin/image/4.jpg -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/oozie/README.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、Oozie简介 3 | Oozie是一个管理 Apache Hadoop 作业的工作流调度系统。 4 | Oozie的 workflow jobs 是由 actions 组成的 有向无环图(DAG)。 5 | Oozie的 coordinator jobs 是由时间 (频率)和数据可用性触发的重复的 workflow jobs 。 6 | Oozie与Hadoop生态圈的其他部分集成在一起,支持多种类型的Hadoop作业(如Java map-reduce、流式map-reduce、Pig、Hive、Sqoop和Distcp)以及特定于系统的工作(如Java程序和shell脚本)。 7 | Oozie是一个可伸缩、可靠和可扩展的系统。 8 | Oozie是大数据四大协作框架之一——任务调度框架,另外三个分别为数据转换工具Sqoop,文件收集库框架Flume,大数据WEB工具Hue。 9 | 10 | ##### 2、Oozie操作 11 | 12 | 13 | 14 | ##### 3、Oozie学习资料 15 | 16 | 17 | -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/pegasus/README.md: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | * [学习资料](./学习资料.md) 4 | -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/pegasus/学习资料.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##### 学习文档 4 | * [1、GitHub](https://github.com/XiaoMi/pegasus) 5 | * [2、Apache官网](https://pegasus.apache.org/) 6 | * [3、bookstack文档](https://www.bookstack.cn/read/Pegasus/128323) 7 | 8 | -------------------------------------------------------------------------------- /bigdata-info/src/main/java/com/libin/talos/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-info/src/main/java/com/libin/talos/README.md -------------------------------------------------------------------------------- /bigdata-kafka/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | venv 22 | -------------------------------------------------------------------------------- /bigdata-kafka/README.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、Kafka原理 3 | * [Kafka基本概念](src/main/docs/Kafka基本概念.md) 4 | * [Kafka副本](src/main/docs/Kafka副本.md) 5 | * [消费者与消费组](src/main/docs/消费者与消费组.md) 6 | 7 | ##### 2、Kafka操作 8 | * [Kafka客户端操作](src/main/scala/com/libin/code/client/KafkaClient.scala) 9 | 10 | 11 | ##### 3、Kafka面试 12 | * [Kafka面试题](src/main/docs/Kafka面试题.md) 13 | 14 | -------------------------------------------------------------------------------- /bigdata-kafka/src/main/docs/Kafka副本.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##### Kafka复本机制 4 | 分区中的所有副本统称为AR(Assigned Replicas)。 5 | 所有与leader副本保持一定程度同步的副本(包括leader副本在内)组成ISR(In-Sync Replicas),ISR集合是AR集合中的一个子集。 6 | 与leader副本同步滞后过多的副本(不包括leader副本)组成OSR(Out-of-Sync Replicas),由此可见,AR=ISR+OSR。 7 | 在正常情况下,所有的 follower 副本都应该与 leader 副本保持一定程度的同步,即 AR=ISR,OSR集合为空。 8 | 默认情况下,当leader副本发生故障时,只有在ISR集合中的副本才有资格被选举为新的leader,而在OSR集合中的副本则没有任何机会。 9 | 10 | 11 | ISR与HW和LEO也有紧密的关系。HW是High Watermark的缩写,俗称高水位,它标识了一个特定的消息偏移量(offset),消费者只能拉取到这个offset之前的消息。 12 | 13 | LEO是Log End Offset的缩写,它标识当前日志文件中下一条待写入消息的offset,LEO的大小相当于当前日志分区中最后一条消息的offset值加1。 14 | 15 | -------------------------------------------------------------------------------- /bigdata-kafka/src/main/docs/Kafka基本概念.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##### Kafka基本概念 4 | 1.主题 5 | Kafka将一组消息抽象归纳为一个主题(Topic),也就是说,一个主题就是对消息的一个分类。 6 | 生产者将消息发送到特定主题,消费者订阅主题或主题的某些分区进行消费。 7 | 2.消息 8 | 消息是Kafka通信的基本单位,由一个固定长度的消息头和一个可变长度的消息体构成。在老版本中,每一条消息称为Message; 9 | 在由Java重新实现的客户端中,每一条消息称为Record。 10 | 3.分区和副本 11 | Kafka将一组消息归纳为一个主题,而每个主题又被分成一个或多个分区(Partition)。 12 | 每个分区由一系列有序、不可变的消息组成,是一个有序队列。 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /bigdata-kafka/src/main/docs/消费者与消费组.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##### 1.消费者与消费组 4 | 消费者(Consumer)负责订阅Kafka中的主题(Topic),并且从订阅的主题上拉取消息。 5 | 
与其他一些消息中间件不同的是: 6 | 在Kafka的消费理念中还有一层消费组(Consumer Group)的概念,每个消费者都有一个对应的消费组。 7 | 当消息发布到主题后,只会被投递给订阅它的每个消费组中的一个消费者。 8 | 9 | 10 | ##### 2.消息投递模式 11 | 1. 点对点(P2P,Point-to-Point) 12 | 点对点模式是基于队列的,消息生产者发送消息到队列,消息消费者从队列中接收消息。 13 | 2. 发布/订阅(Pub/Sub)模式 14 | 发布订阅模式定义了如何向一个内容节点发布和订阅消息,这个内容节点称为主题(Topic),主题可以认为是消息传递的中介, 15 | 消息发布者将消息发布到某个主题,而消息订阅者从主题中订阅消息。主题使得消息的订阅者和发布者互相保持独立,不需要进行 16 | 接触即可保证消息的传递,发布/订阅模式在消息的一对多广播时采用。 17 | 18 | 如果所有的消费者都隶属于同一个消费组,那么所有的消息都会被均衡地投递给每一个消费者,即每条消息只会被一个消费者处理,这就相当于点对点模式的应用。 19 | 如果所有的消费者都隶属于不同的消费组,那么所有的消息都会被广播给所有的消费者,即每条消息会被所有的消费者处理,这就相当于发布/订阅模式的应用。 20 | 21 | 22 | -------------------------------------------------------------------------------- /bigdata-kafka/src/main/docs/索引和分段.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 索引和分段 3 | 4 | 5 | 6 | ###### 1. Kafka中有那些索引文件? 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /bigdata-kafka/src/main/images/Kafka体系结构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-kafka/src/main/images/Kafka体系结构.png -------------------------------------------------------------------------------- /bigdata-kafka/src/main/images/kafka多副本架构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-kafka/src/main/images/kafka多副本架构.png -------------------------------------------------------------------------------- /bigdata-kafka/src/main/scala/com/libin/README.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 1、Kafka操作 3 | 4 | -------------------------------------------------------------------------------- /bigdata-kafka/src/main/scala/com/libin/code/base/KafkaJobTrait.scala: -------------------------------------------------------------------------------- 1 | package com.libin.code.base 2 | 3 | /** 4 | * Copyright (c) 2020/9/26. libin Inc. All Rights Reserved. 5 | * Authors: libin 6 | *
7 | * Purpose : 8 | */ 9 | trait KafkaJobTrait { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /bigdata-kafka/src/main/scala/com/libin/code/client/KafkaClient.scala: -------------------------------------------------------------------------------- 1 | package com.libin.code.client 2 | 3 | /** 4 | * Copyright (c) 2020/9/26. libin Inc. All Rights Reserved. 5 | * Authors: libin 6 | *
7 | * Purpose : 8 | */ 9 | object KafkaClient { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /bigdata-kafka/src/main/scala/com/libin/code/streaming/FlinkStramingJob.scala: -------------------------------------------------------------------------------- 1 | package com.libin.code.streaming 2 | 3 | /** 4 | * Copyright (c) 2020/9/28. libin Inc. All Rights Reserved. 5 | * Authors: libin 6 | *
7 | * Purpose : 8 | */ 9 | 10 | object FlinkStramingJob { 11 | 12 | } 13 | -------------------------------------------------------------------------------- /bigdata-kafka/src/main/scala/com/libin/code/streaming/SparkStreamingKafkaJob.scala: -------------------------------------------------------------------------------- 1 | package com.libin.code.streaming 2 | 3 | /** 4 | * Copyright (c) 2020/9/26. libin Inc. All Rights Reserved. 5 | * Authors: libin 6 | *
7 | * Purpose : 8 | */ 9 | object SparkStreamingKafkaJob { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /bigdata-kafka/src/main/scala/com/libin/code/utils/KafkaUtils.scala: -------------------------------------------------------------------------------- 1 | package com.libin.code.utils 2 | 3 | /** 4 | * Copyright (c) 2020/9/26. libin Inc. All Rights Reserved. 5 | * Authors: libin 6 | *
7 | * Purpose : 8 | */ 9 | object KafkaUtils { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /bigdata-project/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.wujun728 8 | bigdata-project 9 | 0.0.1-SNAPSHOT 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /bigdata-project/src/main/java/dataWarehouse/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ## 数据仓库 3 | 4 | -------------------------------------------------------------------------------- /bigdata-project/src/main/java/featureEngineering/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-project/src/main/java/featureEngineering/readme.md -------------------------------------------------------------------------------- /bigdata-project/src/main/java/idmapping/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ## Id-Mapping 3 | 4 | -------------------------------------------------------------------------------- /bigdata-project/src/main/java/realTimeWarehouse/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ## 实时数仓 3 | 4 | -------------------------------------------------------------------------------- /bigdata-project/src/main/java/userProfile/readme.md: -------------------------------------------------------------------------------- 1 | 2 | ## 用户画像 3 | 4 | -------------------------------------------------------------------------------- /bigdata-spark-sql/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | venv 22 | -------------------------------------------------------------------------------- /bigdata-spark-sql/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## 一、概述以及文档 3 | 1、Spark Sql 4 | Spark sql是spark用来处理结构化和半结构化数据的高级部分; 5 | Spark sql的核心数据抽象DataFrame; 6 | 2、DataFrame 7 | DataFrame和RDD类似,都是spark平台用以分布式并行计算的不可变的分布式数据集; 8 | * [介绍文档](src/main/doc) 9 | 10 | ## 二、代码操作 11 | * [ETL相关操作](src/main/scala/com/libin/etl) 12 | * [相关源码阅读](src/main/scala/com/libin/source) 13 | 14 | ## 三、学习文档 15 | 1、官网 16 | 2、博客网站 17 | 3、相关书籍 18 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/doc/READMD.md: -------------------------------------------------------------------------------- 1 | 2 | #### 相关文档 3 | 4 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/java/com/libin/utils/FileUtils.java: -------------------------------------------------------------------------------- 1 | package com.libin.utils; 2 | 3 | /** 4 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved. 5 | * Authors: libin <2578858653@qq.com> 6 | *
7 | * Purpose : 8 | */ 9 | 10 | public class FileUtils { 11 | public static final String STU_File = "stu.json"; 12 | public static final String SCHOOL_File = "school.json"; 13 | 14 | public static final String PEOPLE_File = "people.txt"; 15 | public static final String USERS_File = "users.parquet"; 16 | } 17 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | 2 | log4j.logger.org.apache=ERROR 3 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/resources/people.txt: -------------------------------------------------------------------------------- 1 | Michael, 29 2 | Andy, 30 3 | Justin, 19 4 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/resources/school.json: -------------------------------------------------------------------------------- 1 | {"name": "xiaoming", "school": "qinghua", "location": "bj"} 2 | {"name": "xiaoli", "school": "fudan", "location": "shanghai"} 3 | {"name": "xiaoqiang", "school": "nankai", "location": "tianjin"} 4 | {"name": "xiaohong", "school": "chuanda", "location": "sichuan"} -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/resources/stu.json: -------------------------------------------------------------------------------- 1 | {"name": "xiaoming", "age": 22, "height": 175} 2 | {"name": "xiaoli", "age": 18, "height": 161} 3 | {"name": "xiaoqiang", "age": 26, "height": 198} 4 | {"name": "xiaohong", "age": 18, "height": 158} -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/resources/users.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-spark-sql/src/main/resources/users.parquet -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/common/sparkJobBase.scala: -------------------------------------------------------------------------------- 1 | package com.libin.common 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SparkSession 5 | import org.joda.time.DateTime 6 | import org.slf4j.{Logger, LoggerFactory} 7 | 8 | /** 9 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved. 
10 | * Authors: libin <2578858653@qq.com> 11 | * 12 | * Purpose : 13 | */ 14 | 15 | trait SparkJobBase { 16 | 17 | /** 18 | * 得到app名称 19 | */ 20 | def appName: String = this.getClass.getSimpleName.stripSuffix("$") 21 | 22 | /** 23 | * 对数据处理的一些常用分隔符,使用时需要被重写 24 | */ 25 | val separator: String = "\t" 26 | 27 | /** 28 | * 对数据的填充值 29 | */ 30 | val fillValue: String = "" 31 | 32 | /** 33 | * 对数据处理的分区数,使用时需要被重写 34 | */ 35 | val partitionNum: Int = 400 36 | 37 | /** 38 | * 获取作业名字,使用时需要被重写 39 | */ 40 | def jobName: String = this.getClass.getSimpleName 41 | 42 | /** 43 | * 日志对象 44 | */ 45 | val logger: Logger = LoggerFactory.getLogger(jobName) 46 | 47 | /** 48 | * 创建SparkSession对象 49 | */ 50 | def createSparkSession(): SparkSession = { 51 | SparkSession.builder().appName(appName).getOrCreate() 52 | } 53 | 54 | /** 55 | * 创建本地的SparkSession对象 56 | */ 57 | def createSparkSessionLocal(): SparkSession = { 58 | SparkSession.builder().appName(appName).master("local[2]").getOrCreate() 59 | } 60 | 61 | /** 62 | * 创建本地的SparkContext对象 63 | */ 64 | def createSparkContextLocal(): SparkContext = { 65 | SparkContext.getOrCreate(new SparkConf().setAppName(appName).setMaster("local[2]")) 66 | } 67 | 68 | /** 69 | * 初始化 70 | * 71 | * @return 72 | */ 73 | def initContext: SparkSession = createSparkSessionLocal() 74 | 75 | /** 76 | * 停掉一个作业 77 | * 78 | */ 79 | def destroyJob(): Unit = { 80 | createSparkSessionLocal().stop() 81 | logger.info(s"$jobName stopped at ${new DateTime()}") 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/etl/jobs/ConvertJobScheduler.scala: -------------------------------------------------------------------------------- 1 | package com.libin.etl.jobs 2 | 3 | import com.libin.common.SparkJobBase 4 | import com.libin.etl.loader.data.{DfBuilder, RddBuilder} 5 | import com.libin.etl.utils.LoadUtils.stu 6 | import com.libin.utils.FileUtils 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.DataFrame 9 | 10 | /** 11 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved. 
12 | * Authors: libin <2578858653@qq.com> 13 | * 14 | * Purpose : 15 | */ 16 | 17 | class ConvertJobScheduler extends SparkJobBase { 18 | override def appName = "convertJobScheduler" 19 | 20 | val sc = createSparkContextLocal() 21 | val ss = createSparkSessionLocal() 22 | } 23 | 24 | object ConvertJobScheduler { 25 | 26 | def apply() = new ConvertJobScheduler 27 | 28 | def main(args: Array[String]) { 29 | val convertScheduler: ConvertJobScheduler = apply() 30 | convertScheduler.logger.info("convertJobScheduler start ...") 31 | 32 | // 创建RDD 33 | val rdd: RDD[stu] = RddBuilder.createRdd(convertScheduler.sc) 34 | 35 | import convertScheduler.ss.implicits._ 36 | println("rdd.toDF().show() ...") 37 | rdd.toDF().show() 38 | println("rdd.toDS().show() ...") 39 | rdd.toDS().show() 40 | 41 | // -------------------------------------------------------------------------------- 42 | // join操作 43 | val stuDf: DataFrame = DfBuilder.readJsonToDf(convertScheduler.ss, FileUtils.STU_File) 44 | val schoolDf: DataFrame = DfBuilder.readJsonToDf(convertScheduler.ss, FileUtils.SCHOOL_File) 45 | 46 | // stuDf.createOrReplaceTempView("stu_df") 47 | // schoolDf.createOrReplaceTempView("school_df") 48 | println("stuDf.join(schoolDf,\"name\").show() ...") 49 | stuDf.join(schoolDf, "name").show() 50 | 51 | // -------------------------------------------------------------------------------- 52 | // 读取test,parquet格式 53 | println(s"dfBuilder.readTextToDf(convertScheduler.ss,${FileUtils.PEOPLE_File}).show() ...") 54 | DfBuilder.readTextToDf(convertScheduler.ss, FileUtils.PEOPLE_File).show() 55 | println(s"dfBuilder.readParquetToDf(convertScheduler.ss,${FileUtils.USERS_File}).show() ...") 56 | DfBuilder.readParquetToDf(convertScheduler.ss, FileUtils.USERS_File).show() 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/etl/jobs/DfJobScheduler.scala: -------------------------------------------------------------------------------- 1 | package com.libin.etl.jobs 2 | 3 | import com.libin.common.SparkJobBase 4 | import com.libin.etl.loader.data.DfBuilder 5 | import com.libin.etl.utils.LogUtils 6 | import com.libin.utils.FileUtils 7 | import org.apache.spark.sql.DataFrame 8 | 9 | /** 10 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved. 
11 | * Authors: libin <2578858653@qq.com> 12 | * 13 | * Purpose : 14 | */ 15 | 16 | class DfJobScheduler extends SparkJobBase { 17 | override def appName = "dfJobScheduler" 18 | 19 | val ss = createSparkSessionLocal() 20 | } 21 | 22 | object DfJobScheduler { 23 | def apply() = new DfJobScheduler 24 | 25 | def main(args: Array[String]) { 26 | val dfScheduler: DfJobScheduler = apply() 27 | LogUtils.setSparkLogLevels() 28 | dfScheduler.logger.info("dfJobScheduler start ...") 29 | // 测试读取json配置数据 30 | // loadUtils.readResourceFile("stu.json").foreach(println) 31 | // 读取df操作 32 | val df: DataFrame = DfBuilder.readJsonToDf(dfScheduler.ss, FileUtils.STU_File) 33 | 34 | /** 35 | * DataFrame基本操作 36 | */ 37 | // op1.显示数据 38 | println("df.show() ...") 39 | df.show() 40 | // op2.输出结构信息schema 41 | println("df.printSchema() ...") 42 | df.printSchema() 43 | // op3.查询字段 44 | println("df.select(\"name\").show() ...") 45 | df.select("name").show() 46 | // op4.身高高于150,按照年龄倒叙排序输出 47 | println("df.select($\"name\", $\"age\", $\"height\").where($\"height\" > 150).orderBy(df(\"age\").desc).show() ...") 48 | import dfScheduler.ss.implicits._ 49 | df.select($"name", $"age", $"height").where($"height" > 150).orderBy(df("age").desc).show() 50 | // op5.使用groupBy 51 | println("df.groupBy(\"age\").max(\"height\").show()") 52 | df.groupBy("age").max("height").show() 53 | 54 | /** 55 | * 执行sql语句 56 | */ 57 | df.createOrReplaceTempView("stu") 58 | println("df.sqlContext.sql(\"select * from stu\")") 59 | df.sqlContext.sql("select * from stu").show() 60 | println("df.sqlContext.sql(\"select name,age,height from stu\")") 61 | df.sqlContext.sql("select name,age,height from stu").show() 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/etl/jobs/DsJobScheduler.scala: -------------------------------------------------------------------------------- 1 | package com.libin.etl.jobs 2 | 3 | import com.libin.common.SparkJobBase 4 | import com.libin.etl.loader.data.DsBuilder 5 | import com.libin.etl.utils.LoadUtils.stu 6 | import org.apache.spark.sql.Dataset 7 | 8 | /** 9 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved. 10 | * Authors: libin <2578858653@qq.com> 11 | * 12 | * Purpose : 13 | */ 14 | 15 | class DsJobScheduler extends SparkJobBase { 16 | override def appName = "dsJobScheduler" 17 | 18 | val ss = createSparkSessionLocal() 19 | } 20 | 21 | object DsJobScheduler { 22 | def apply() = new DsJobScheduler() 23 | 24 | def main(args: Array[String]) { 25 | val dsScheduler: DsJobScheduler = apply() 26 | dsScheduler.logger.info("dsJobScheduler start ...") 27 | 28 | val ds: Dataset[stu] = DsBuilder.createDsBySeq(dsScheduler.ss) 29 | 30 | ds.show() 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/etl/loader/data/DsBuilder.scala: -------------------------------------------------------------------------------- 1 | package com.libin.etl.loader.data 2 | 3 | import com.libin.etl.utils.LoadUtils.stu 4 | import org.apache.spark.sql.{Dataset, SparkSession} 5 | 6 | /** 7 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved. 
8 | * Authors: libin <2578858653@qq.com> 9 | * 10 | * Purpose : 11 | */ 12 | 13 | object DsBuilder { 14 | 15 | /** 16 | * 读取配置文件中的数据创建Dataset 17 | * 18 | * @param ss SparkSession 19 | * @param fileName 文件名字 20 | * @return 返回DataSet数据集 21 | */ 22 | def readJsonToDs(ss: SparkSession, fileName: String): Dataset[stu] = { 23 | import ss.implicits._ 24 | val url: String = this.getClass.getClassLoader.getResource(fileName).toString 25 | ss.read.json(url).as[stu] 26 | } 27 | 28 | /** 29 | * 使用Seq+toDf创建DataSet 30 | * 31 | * @param ss SparkSession 32 | * @return 返回DataSet数据集 33 | */ 34 | def createDsBySeq(ss: SparkSession): Dataset[stu] = { 35 | import ss.implicits._ 36 | Seq(stu("xiaoming", 22, 175), stu("xiaoli", 18, 161), stu("xiaoqiang", 26, 198), stu("xiaohong", 18, 158)) 37 | .toDS() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/etl/loader/data/RddBuilder.scala: -------------------------------------------------------------------------------- 1 | package com.libin.etl.loader.data 2 | 3 | import com.libin.common.SparkJobBase 4 | import com.libin.etl.utils.LoadUtils.stu 5 | import com.libin.etl.utils.PathUtils 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql.Row 9 | 10 | /** 11 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved. 12 | * Authors: libin <2578858653@qq.com> 13 | * 14 | * Purpose : 15 | */ 16 | 17 | object RddBuilder extends SparkJobBase { 18 | 19 | /** 20 | * 返回RDD类型 21 | * 22 | * @param sc SparkContext 23 | * @return 24 | */ 25 | def createRdd(sc: SparkContext): RDD[stu] = { 26 | val arr: Array[stu] = Array(stu("xiaoming", 22, 175), 27 | stu("xiaoli", 18, 161), 28 | stu("xiaoqiang", 26, 198), 29 | stu("xiaohong", 18, 158)) 30 | sc.makeRDD(arr) 31 | } 32 | 33 | /** 34 | * 读取指定路径下面,指定字段的数据 35 | * 36 | * @param path 数据路径 37 | * @param date 分区时间 38 | * @param isTest 是否测试 39 | * @param cols 列字段集合 40 | */ 41 | def loadDwsBigDataDeviceProfileDBySql(path: String, 42 | date: String, 43 | isTest: Boolean = false, 44 | cols: List[String]): RDD[Row] = { 45 | createSparkSessionLocal() 46 | .read 47 | .parquet(PathUtils.pathAssemble(path, date)) 48 | .selectExpr(cols: _*).rdd 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/etl/processor/ProcessorOp.scala: -------------------------------------------------------------------------------- 1 | package com.libin.etl.processor 2 | 3 | /** 4 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved. 5 | * Authors: libin <2578858653@qq.com> 6 | * 7 | * Purpose : 8 | */ 9 | 10 | object ProcessorOp { 11 | 12 | } 13 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/etl/utils/DateUtils.scala: -------------------------------------------------------------------------------- 1 | package com.libin.etl.utils 2 | 3 | import org.joda.time.DateTime 4 | import org.joda.time.format.DateTimeFormat 5 | 6 | /** 7 | * Copyright (c) 2020/4/15. libin Inc. All Rights Reserved. 8 | * Authors: libin <2578858653@qq.com> 9 | *
10 | * Purpose : 11 | */ 12 | object DateUtils { 13 | val DATE_FORMAT = "yyyyMMdd" 14 | 15 | /** 16 | * 日期字符串转为DateTime 17 | * 18 | * @param input 日期 19 | * @return 20 | */ 21 | def parseDate(input: String): Option[DateTime] = 22 | try { 23 | Some(DateTimeFormat.forPattern(DATE_FORMAT).parseDateTime(input)) 24 | } catch { 25 | case e: Exception => None 26 | } 27 | } -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/etl/utils/LoadUtils.scala: -------------------------------------------------------------------------------- 1 | package com.libin.etl.utils 2 | 3 | import scala.io.Source 4 | 5 | /** 6 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved. 7 | * Authors: libin <2578858653@qq.com> 8 | * 9 | * Purpose : 10 | */ 11 | 12 | object LoadUtils { 13 | 14 | case class stu(name: String, age: Int, height: Int) 15 | 16 | /** 17 | * 读取配置中的文件 18 | * 19 | * @param fileName 配置文件名 20 | * @return 返回所有行记录的集合数据 21 | */ 22 | def readResourceFile(fileName: String): Array[String] = { 23 | val inputStream = this.getClass.getClassLoader.getResourceAsStream(fileName) 24 | Source.fromInputStream(inputStream).getLines().toArray 25 | } 26 | 27 | 28 | /** 29 | * 读取配置中的文件,按照制定分隔符返回Map 30 | * 31 | * @param fileName 配置文件名 32 | * @return 返回所有行记录的集合数据 33 | */ 34 | def readResourceFile(fileName: String, delimit: String): Map[String, String] = { 35 | val inputStream = this.getClass.getClassLoader.getResourceAsStream(fileName) 36 | Source.fromInputStream(inputStream).getLines() 37 | .map { 38 | line => 39 | val sp = line.split(delimit) 40 | (sp(0), sp(1)) 41 | }.toMap 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/etl/utils/LogUtils.scala: -------------------------------------------------------------------------------- 1 | package com.libin.etl.utils 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.log4j.{Level, Logger} 5 | 6 | /** 7 | * Copyright (c) 2020/4/15. libin Inc. All Rights Reserved. 8 | * Authors: libin <2578858653@qq.com> 9 | *
10 | * Purpose : 只显示WARN日志,大量的INFO日志都可以被屏蔽掉 11 | */ 12 | object LogUtils extends Logging { 13 | /** Set reasonable logging levels for streaming if the user has not configured log4j. */ 14 | def setSparkLogLevels() { 15 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements 16 | // if (!log4jInitialized) { 17 | if (log4jInitialized) { 18 | // We first log something to initialize Spark's default logging, then we override the 19 | // logging level. 20 | logInfo("Setting log level to [WARN] for streaming example." + 21 | " To override add a custom log4j.properties to the classpath.") 22 | Logger.getRootLogger.setLevel(Level.WARN) 23 | } else { 24 | Logger.getRootLogger.setLevel(Level.WARN) 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/etl/utils/PathUtils.scala: -------------------------------------------------------------------------------- 1 | package com.libin.etl.utils 2 | 3 | import org.joda.time.DateTime 4 | 5 | /** 6 | * Copyright (c) 2020/4/15. libin Inc. All Rights Reserved. 7 | * Authors: libin <2578858653@qq.com> 8 | *
9 | * Purpose : 10 | */ 11 | object PathUtils { 12 | /** 13 | * 路径拼接DateTime 14 | * 15 | * @param root 数据根目录 16 | * @param date 读取的数据日期 17 | * @return 完整数据路径 18 | */ 19 | def pathAssemble(root: String, date: DateTime): String = s"$root/date=${date.toString(DateUtils.DATE_FORMAT)}" 20 | 21 | def pathAssembleAll(root: String, date: DateTime): String = s"$root/date=${date.toString(DateUtils.DATE_FORMAT)}/*" 22 | 23 | /** 24 | * 路径拼接String 25 | * 26 | * @param root 数据根目录 27 | * @param date 读取的数据日期 28 | * @return 完整数据路径 29 | */ 30 | def pathAssemble(root: String, date: String): String = pathAssemble(root, DateUtils.parseDate(date).get) 31 | } 32 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/main/scala/com/libin/source/sql.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark 19 | 20 | import org.apache.spark.annotation.{DeveloperApi, InterfaceStability} 21 | import org.apache.spark.sql.execution.SparkStrategy 22 | 23 | /** 24 | * Allows the execution of relational queries, including those expressed in SQL using Spark. 25 | * 26 | * @groupname dataType Data types 27 | * @groupdesc Spark SQL data types. 28 | * @groupprio dataType -3 29 | * @groupname field Field 30 | * @groupprio field -2 31 | * @groupname row Row 32 | * @groupprio row -1 33 | */ 34 | package object sql { 35 | 36 | /** 37 | * Converts a logical plan into zero or more SparkPlans. This API is exposed for experimenting 38 | * with the query planner and is not designed to be stable across spark releases. Developers 39 | * writing libraries should instead consider using the stable APIs provided in 40 | * [[org.apache.spark.sql.sources]] 41 | */ 42 | @DeveloperApi 43 | @InterfaceStability.Unstable 44 | type Strategy = SparkStrategy 45 | 46 | type DataFrame = Dataset[Row] 47 | } 48 | -------------------------------------------------------------------------------- /bigdata-spark-sql/src/test/scala/com/libin/etl/testProcessor.scala: -------------------------------------------------------------------------------- 1 | package com.libin.etl 2 | 3 | /** 4 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved. 
5 | * Authors: libin <2578858653@qq.com> 6 | * 7 | * Purpose : 8 | */ 9 | 10 | class testProcessor { 11 | 12 | val SEPARATOR: String = "\t" 13 | 14 | val PARTITIONNUM: Int = 100 15 | } 16 | 17 | object testProcessor { 18 | 19 | def apply() = new testProcessor() 20 | 21 | def main(args: Array[String]) { 22 | val processor: testProcessor = apply() 23 | println(processor.PARTITIONNUM) 24 | 25 | println(this.getClass.getClassLoader.getResource("stu.json")) 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | venv 22 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/README.md: -------------------------------------------------------------------------------- 1 | 2 | ##### spark streaming 3 | 4 | * [官网](http://spark.apache.org/streaming/) 5 | * [文档](http://spark.apache.org/docs/latest/streaming-programming-guide.html) 6 | * [Spark Streaming2.2文档](https://spark.apache.org/docs/2.2.0/streaming-programming-guide.html) 7 | 8 | ##### 9 | spark streaming基本原理为将输入数据流以时间片为单位进行拆分,然后以批处理的方式处理每个时间片的数据。 10 | 11 | ![实时数据流](./src/main/image/1.png) 12 | 13 | ![数据流处理](./src/main/image/2.png) 14 | 15 | Spark Streaming使用DSTream来表示一个连续的数据流。 16 | DSTream被表示为一系列连续的RDDs,其中每个RDD包含来自一定时间间隔的数据。 17 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/doc/优化.md: -------------------------------------------------------------------------------- 1 | 2 | ##### spark streaming作业优化 3 | 4 | ##### 1.数据序列化 5 | 推荐使用Kryo序列化。 6 | 7 | ##### 2.数据缓存 8 | 尝试使用lru算法。 9 | 10 | ##### 3.增大并发数 11 | 不少于上游topic个数。 12 | 13 | ##### 4.合理设置batch时间 14 | mini batch计算模式,数据量堆积严重,可以尝试增大batch时间,在符合数据流延迟时间之内。 15 | 16 | ##### 5、设置合理的core和memory 17 | 根据每秒数据量、计算逻辑、数据是否缓存、是否读写内存数据库、峰值数据量等合理设置。 18 | 19 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/doc/常见问题.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 作业堆积原因? 3 | batch时间设置过小 4 | 资源配置过少 5 | 缓存文件 6 | 读写外部缓存耗时 7 | 上游不稳定 8 | 9 | 10 | ##### 作业堆积优化? 11 | 12 | 13 | ##### 一致性语义 14 | 读取上游保证恰好消费一次Exactly-once 15 | 写数据保证原子性,幂等性 16 | 17 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/doc/检查点CheckPoint.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 进行CheckPoint的数据有哪些? 3 | 1、元数据checkpoint 4 | 配置信息:创建的Spark Streaming程序的配置信息,比如SparkConf中的信息。 5 | DStream的操作信息。 6 | 未处理的batch信息,有些job在排队,还没处理的batch信息。 7 | 2、数据checkpoint 8 | 将实时计算中产生的RDD的数据保存在可靠的存储系统中,比如HDFS。 9 | 10 | ##### 什么时候启用checkpoint机制? 
11 | 1、使用了有状态的转换、比如reduceByKeyAndWindow操作。 12 | 2、作业失败重启 13 | 14 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/image/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-spark-streaming/src/main/image/1.png -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/image/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/bigdata-spark-streaming/src/main/image/2.png -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Global logging configuration 2 | # log4j.rootLogger=WARN, stdout 3 | # Console output... 4 | # log4j.appender.stdout=org.apache.log4j.ConsoleAppender 5 | # log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 6 | # log4j.appender.stdout.layout.ConversionPattern=%5p [%t] - %m%n 7 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/base/SparkStreamingTrait.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.streaming.base 2 | 3 | import org.slf4j.{Logger, LoggerFactory} 4 | 5 | /** 6 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved. 7 | * Authors: libin<2578858653@qq.com> 8 | * 9 | * Purpose : 10 | */ 11 | trait SparkStreamingTrait { 12 | 13 | /** 14 | * Application Name 15 | */ 16 | def appName: String = this.getClass.getSimpleName 17 | 18 | /** 19 | * logger 20 | */ 21 | def logger: Logger = LoggerFactory.getLogger(appName) 22 | 23 | 24 | 25 | } 26 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/base/client/SocketSparkStreamingTrait.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.streaming.base.client 2 | 3 | import com.libin.data.streaming.base.SparkStreamingTrait 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * Copyright (c) 2020/4/11. libin Inc. All Rights Reserved. 9 | * Authors: libin <2578858653@qq.com> 10 | *
11 | * Purpose : SparkStreaming + Socket 12 | */ 13 | trait SocketSparkStreamingTrait extends SparkStreamingTrait { 14 | 15 | /** 16 | * 根据指定的batch间隔时间,生成StreamingContext对象 17 | * 18 | * @param interval batch间隔时间 19 | * @return StreamingContext对象 20 | */ 21 | def createStreamContext(interval: Int): StreamingContext = { 22 | val conf = new SparkConf().setAppName(appName).setIfMissing("spark.master", "local") 23 | new StreamingContext(conf, Seconds(interval)) 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/GenCodeFromCheckpoint.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.streaming.jobs 2 | 3 | import com.libin.data.streaming.utils.StreamingExamples 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * Copyright (c) 2020/04/04. libin Inc. All Rights Reserved. 9 | * Authors: libin <2578858653@qq.com> 10 | *
11 | * Purpose : checkpoint 12 | * Linux: nc -lk 9999 13 | * windows: nc -l -p 9999 14 | */ 15 | object GenCodeFromCheckpoint { 16 | def main(args: Array[String]): Unit = { 17 | val conf = new SparkConf().setMaster("local[2]").setAppName("GenCodeFromCheckpoint") 18 | val ssc = new StreamingContext(conf, Seconds(5)) 19 | 20 | StreamingExamples.setStreamingLogLevels() 21 | val lines = ssc.socketTextStream("localhost", 9999) 22 | ssc.checkpoint("E:\\2020_github\\checkout\\GenCodeFromCheckpoint") 23 | 24 | lines.print() 25 | ssc.start() // Start the computation 26 | ssc.awaitTermination() // Wait for the computation to terminate 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/GenCodeFromForeachRDD.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.streaming.jobs 2 | 3 | import com.libin.data.streaming.utils.StreamingExamples 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.streaming.{Seconds, StreamingContext} 7 | 8 | /** 9 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved. 10 | * Authors: libin<2578858653@qq.com> 11 | * 12 | * Purpose : Spark Streaming的foreachRDD算子使用 13 | * 14 | * Linux: nc -lk 9999 15 | * windows: nc -l -p 9999 16 | */ 17 | object GenCodeFromForeachRDD { 18 | def main(args: Array[String]): Unit = { 19 | val conf = new SparkConf().setMaster("local[2]").setAppName("GenCodeFromForeachRDD") 20 | val ssc = new StreamingContext(conf, Seconds(5)) 21 | 22 | StreamingExamples.setStreamingLogLevels() 23 | val lines = ssc.socketTextStream("localhost", 9999) 24 | 25 | lines.map((_, 1)) 26 | .foreachRDD { 27 | rdd => 28 | val saveRdd: RDD[(String, Int)] = rdd.mapPartitions { 29 | iter => 30 | val result = iter.map { 31 | line => 32 | line 33 | }.toList 34 | result.toIterator 35 | } 36 | println(saveRdd.count()) 37 | } 38 | ssc.start() // Start the computation 39 | ssc.awaitTermination() // Wait for the computation to terminate 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/GenCodeFromKafka.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.streaming.jobs 2 | 3 | import kafka.serializer.StringDecoder 4 | import org.apache.kafka.common.serialization.StringDeserializer 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.streaming.dstream.{DStream, InputDStream} 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | import org.apache.spark.streaming.kafka.KafkaUtils 9 | 10 | /** 11 | * Copyright (c) 2020/04/06. libin Inc. All Rights Reserved. 12 | * Authors: libin <2578858653@qq.com> 13 | *
14 | * Purpose : kafka 15 | */ 16 | object GenCodeFromKafka { 17 | def main(args: Array[String]): Unit = { 18 | /** 19 | * 配置kafka参数 20 | */ 21 | val kafkaParams = Map[String, Object]( 22 | "bootstrap.servers" -> "localhost:port", 23 | "key.deserializer" -> classOf[StringDeserializer], 24 | "value.deserializer" -> classOf[StringDeserializer], 25 | "group.id" -> "group_id", 26 | "auto.offset.reset" -> "latest", 27 | "enable.auto.commit" -> (false) 28 | ) 29 | 30 | /** 31 | * 上游kafka topic 32 | */ 33 | val topics = Set("topic1", "topic2") 34 | 35 | val conf = new SparkConf().setMaster("local[2]").setAppName("GenCodeFromKafka") 36 | val ssc = new StreamingContext(conf, Seconds(5)) 37 | 38 | /*val stream: InputDStream[(String, String)] = KafkaUtils 39 | .createDirectStream[String, String, StringDecoder, StringDecoder,(String, String)]( 40 | ssc, kafkaParams, topics 41 | )*/ 42 | 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/GenCodeFromParams.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.streaming.jobs 2 | 3 | import com.libin.data.streaming.base.client.SocketSparkStreamingTrait 4 | import org.apache.spark.SparkConf 5 | 6 | /** 7 | * Copyright (c) 2020/4/11. libin Inc. All Rights Reserved. 8 | * Authors: libin <2578858653@qq.com> 9 | *
10 | * Purpose : Spark Streaming一些配置参数 11 | */ 12 | object GenCodeFromParams extends SocketSparkStreamingTrait { 13 | def main(args: Array[String]): Unit = { 14 | // val ssc = createStreamContext(5) 15 | val conf = new SparkConf().setMaster("local[2]").setAppName(appName) 16 | 17 | /** 18 | * 16个分区,5分钟一个batch,则一个batch消费处理的数据量是 5000 * 16 * 5 * 60 = 24000000 19 | */ 20 | // 启用反压机制,开启后spark自动根据系统负载选择最优消费速率 21 | conf.set("spark.streaming.backpressure.enabled", "true") 22 | // 限制第一次批处理应该消费的数据,因为程序冷启动队列里面有大量积压,防止第一次全部读取,造成系统阻塞 23 | conf.set("spark.streaming.backpressure.initialRate", "24000000") 24 | // 限制每秒每个消费线程读取每个kafka分区最大的数据量 25 | conf.set("spark.streaming.kafka.maxRatePerPartition", "5000") // 一般用在反压,限流上 26 | 27 | // 确保在kill任务时,能够处理完最后一批数据,再关闭程序,不会发生强制kill导致数据处理中断,没处理完的数据丢失 28 | conf.set("spark.streaming.stopGracefullyOnShutdown", "true") 29 | 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/GenCodeFromWindow.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.streaming.jobs 2 | 3 | import com.libin.data.streaming.utils.StreamingExamples 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved. 9 | * Authors: libin<2578858653@qq.com> 10 | * 11 | * Purpose : Spark Streaming的Window算子使用 12 | * Linux: nc -lk 9999 13 | * windows: nc -l -p 9999 14 | */ 15 | object GenCodeFromWindow { 16 | def main(args: Array[String]): Unit = { 17 | val conf = new SparkConf().setMaster("local[2]").setAppName("GenCodeFromWindow") 18 | val ssc = new StreamingContext(conf, Seconds(5)) 19 | 20 | ssc.checkpoint("/home/baolibin/2020_github/checkout") 21 | StreamingExamples.setStreamingLogLevels() 22 | 23 | val lines = ssc.socketTextStream("localhost", 9999) 24 | 25 | // reduceByKeyAndWindow 26 | val res = lines.flatMap(_.split(" ")) 27 | .map((_, 1)) 28 | .reduceByKeyAndWindow((a: Int, b: Int) => (a + b), Seconds(10), Seconds(5)) 29 | 30 | // countByWindow 31 | val resByCount = lines.flatMap(_.split(" ")) 32 | .countByWindow(Seconds(10), Seconds(5)) 33 | 34 | res.print() 35 | resByCount.print() 36 | 37 | ssc.start() // Start the computation 38 | ssc.awaitTermination() // Wait for the computation to terminate 39 | } 40 | } 41 | 42 | /** 43 | Input: 44 | a 45 | a 46 | a 47 | a 48 | a 49 | 50 | b 51 | b 52 | b 53 | b 54 | b 55 | 56 | 57 | a 58 | b 59 | c 60 | d 61 | e 62 | ... 
63 | Output: 64 | Time: 1585905790000 ms 65 | ------------------------------------------- 66 | (d,7) 67 | (b,42) 68 | (,21) 69 | (V,1) 70 | (e,7) 71 | (a,42) 72 | (c,7) 73 | ------------------------------------------- 74 | Time: 1585905795000 ms 75 | ------------------------------------------- 76 | (d,1) 77 | (b,6) 78 | (,3) 79 | (a,6) 80 | (c,1) 81 | */ 82 | 83 | 84 | /** 85 | ------------------------------------------- 86 | Time: 1585911530000 ms 87 | ------------------------------------------- 88 | (d,1) 89 | (b,6) 90 | (,3) 91 | (e,1) 92 | (a,6) 93 | (c,1) 94 | ------------------------------------------- 95 | Time: 1585911530000 ms 96 | ------------------------------------------- 97 | 18 98 | */ 99 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/jobs/NetworkWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.streaming.jobs 2 | 3 | import com.libin.data.streaming.utils.StreamingExamples 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | 7 | /** 8 | * Copyright (c) 2019/02/16. libin Inc. All Rights Reserved. 9 | * Authors: libin <2578858653@qq.com> 10 | *
11 | * Purpose : Spark Streaming的WordCount 12 | * Linux: nc -lk 9999 13 | * windows: nc -l -p 9999 14 | */ 15 | object NetworkWordCount { 16 | def main(args: Array[String]): Unit = { 17 | // Create a local StreamingContext with two working thread and batch interval of 1 second. 18 | // The master requires 2 cores to prevent from a starvation scenario. 19 | val conf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount") 20 | val ssc = new StreamingContext(conf, Seconds(5)) 21 | 22 | // Create a DStream that will connect to hostname:port, like localhost:9999 23 | val lines = ssc.socketTextStream("localhost", 9999) 24 | StreamingExamples.setStreamingLogLevels() 25 | 26 | // Split each line into words 27 | val words = lines.flatMap(_.split(" ")) 28 | 29 | // Count each word in each batch 30 | val pairs = words.map(word => (word, 1)) 31 | val wordCounts = pairs.reduceByKey(_ + _) 32 | 33 | // Print the first ten elements of each RDD generated in this DStream to the console 34 | wordCounts.print() 35 | 36 | ssc.start() // Start the computation 37 | ssc.awaitTermination() // Wait for the computation to terminate 38 | 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /bigdata-spark-streaming/src/main/scala/com/libin/data/streaming/utils/StreamingExamples.scala: -------------------------------------------------------------------------------- 1 | package com.libin.data.streaming.utils 2 | 3 | import org.apache.spark.internal.Logging 4 | import org.apache.log4j.{Level, Logger} 5 | 6 | /** 7 | * Copyright (c) 2020/4/3 libin Inc. All Rights Reserved. 8 | * Authors: libin<2578858653@qq.com> 9 | * 10 | * Purpose : 只显示WARN日志,大量的INFO日志都可以被屏蔽掉 11 | */ 12 | object StreamingExamples extends Logging { 13 | /** Set reasonable logging levels for streaming if the user has not configured log4j. */ 14 | def setStreamingLogLevels() { 15 | val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements 16 | // if (!log4jInitialized) { 17 | if (log4jInitialized) { 18 | // We first log something to initialize Spark's default logging, then we override the 19 | // logging level. 20 | logInfo("Setting log level to [WARN] for streaming example." 
+ 21 | " To override add a custom log4j.properties to the classpath.") 22 | Logger.getRootLogger.setLevel(Level.WARN) 23 | }else{ 24 | Logger.getRootLogger.setLevel(Level.WARN) 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /doc1/README-PLAN.md: -------------------------------------------------------------------------------- 1 | 待补充 -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | io.github.wujun728 8 | bigdata 9 | 0.0.1-SNAPSHOT 10 | pom 11 | 12 | 13 | bigdata-flink 14 | bigdata-hadoop 15 | bigdata-hbase 16 | bigdata-hive 17 | bigdata-info 18 | bigdata-project 19 | bigdata-doris 20 | bigdata-druid 21 | bigdata-kafka 22 | spark-core 23 | spark-graphx 24 | spark-mllib 25 | bigdata-spark-sql 26 | bigdata-spark-streaming 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /spark-core/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | venv 22 | -------------------------------------------------------------------------------- /spark-core/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Spark并行编程框架 3 | SparkCore相关写过太多的代码了,一直没花时间整理一下,2020年会好好花时间整理一下.包括使用、原理、源码等等 4 | 5 | ##### 1、SparkCore相关操作代码 6 | * [SparkCore公共代码类](src/main/scala/com/libin/base) 7 | * [SparkCore工具类](src/main/scala/com/libin/utils) 8 | * [SparkCore常用处理数据模板](src/main/scala/com/libin/jobs) 9 | * [Spark应用API编程小例子](src/main/scala/com/libin/client) 10 | 11 | ##### 2、SparkCore相关文档 12 | * [SparkCore文档](src/main/doc) 13 | * [SparkCore常见报错]() 14 | * [SparkCore优化]() 15 | * [SparkCore原理]() 16 | * [SparkCore源码分析]() 17 | 18 | ##### 3、SparkCore相关学习资料 19 | 1、书籍 20 | 2、博客网站 21 | 3、论文 22 | -------------------------------------------------------------------------------- /spark-core/src/main/doc/Spark优化.md: -------------------------------------------------------------------------------- 1 | 2 | ## Spark调优 3 | 4 | 1数据倾斜 5 | 6 | 编写运行Spark代码时候,常常会由于数据等问题出现数据倾斜的问题。 7 | 8 | 根据个人经验解决方法: 9 | 1. 增加RDD的并行度,对读取的数据增大分区个数; 10 | 2. 对读取的数据自定义分区,让聚集的key分散开; 11 | 3. 通过加大参数配置,增大资源配置(短期见效,不建议长久); 12 | 4. 把RDD分为正常和倾斜2部分,单独对倾斜处理; 13 | 5. 检查上游是否有脏数据,对源数据过滤出脏数据(首选); 14 | 6. ... 
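To make options 2 and 4 in the list above concrete (spread the hot key, then aggregate in two steps), here is a minimal key-salting sketch in Scala. The word-count style input and the salt factor of 10 are assumptions made for illustration; this is not code from this module.

```scala
import org.apache.spark.{SparkConf, SparkContext}

import scala.util.Random

object SaltedAggregationSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SaltedAggregationSketch").setIfMissing("spark.master", "local[2]")
    val sc = SparkContext.getOrCreate(conf)

    // Hypothetical skewed input: key "a" is far hotter than the others.
    val pairs = sc.parallelize(Seq("a" -> 1, "a" -> 1, "a" -> 1, "a" -> 1, "b" -> 1, "c" -> 1))
    val saltFactor = 10 // assumed value; tune it to the observed degree of skew

    val result = pairs
      .map { case (k, v) => (s"${Random.nextInt(saltFactor)}#$k", v) } // stage 1: prefix a random salt
      .reduceByKey(_ + _)                                              // partial aggregation on salted keys
      .map { case (saltedKey, v) => (saltedKey.split("#")(1), v) }     // strip the salt again
      .reduceByKey(_ + _)                                              // stage 2: final aggregation per real key

    result.collect().foreach(println)
    sc.stop()
  }
}
```

The two `reduceByKey` passes trade one extra shuffle for much smaller per-key partitions in the first stage, which is what keeps the hot key from piling up in a single task.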
15 | 16 | 17 | 算子优化 18 | 19 | 尽量少使用groupByKey 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /spark-core/src/main/doc/Spark参数配置.md: -------------------------------------------------------------------------------- 1 | 2 | ## Spark常用配置参数 3 | 4 | 5 | spark作业常用配置参数 6 | 7 | | 参数 | 释义 | 8 | | ---- | ---- | 9 | | executor-memory | executor内存大小 | 10 | | driver-memory | driver内存大小 | 11 | | spark.yarn.executor.memoryOverhead | executor的堆外内存大小设置 | 12 | | spark.yarn.driver.memoryOverhead | driver的堆外内存大小设置 | 13 | | spark.dynamicAllocation.enabled | 是否开启动态资源分配 | 14 | | spark.dynamicAllocation.initialExecutors | 设置启动的时候初始化多少个executors | 15 | | spark.dynamicAllocation.maxExecutors | 控制动态资源分配的上线 | 16 | | spark.dynamicAllocation.minExecutors | 控制动态资源分配的下线 | 17 | | spark.memory.fraction | 存储和计算内存占比 | 18 | | spark.app.name | 应用程序的名字,将在UI和日志数据中出现 | 19 | | executor-cores | executor的core个数 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /spark-core/src/main/doc/Spark基本架构.md: -------------------------------------------------------------------------------- 1 | ##### Spark基本架构 2 | 从集群部署的角度来看,Spark集群由集群管理器(Cluster Manager)、工作节点(Worker)、执行器(Executor)、驱动器(Driver)、应用程序(Application)等部分组成 3 | 4 | (1)Cluster Manager 5 | Spark的集群管理器,主要负责对整个集群资源的分配与管理。 6 | Cluster Manager在YARN部署模式下为ResourceManager; 7 | 在Mesos部署模式下为Mesos Master; 8 | 在Standalone部署模式下为Master。 9 | Cluster Manager分配的资源属于一级分配,它将各个Worker上的内存、CPU等资源分配给Application,但是并不负责对Executor的资源分配。 10 | 11 | (2)Worker 12 | Spark的工作节点。在YARN部署模式下实际由NodeManager替代。 13 | Worker节点主要负责以下工作: 14 | 将自己的内存、CPU等资源通过注册机制告知ClusterManager; 15 | 创建Executor; 16 | 将资源和任务进一步分配给Executor; 17 | 同步资源信息、Executor状态信息给Cluster Manager等。 18 | 19 | (3)Executor 20 | 执行计算任务的一线组件。主要负责任务的执行及与Worker、Driver的信息同步。 21 | 22 | (4)Driver 23 | Application的驱动程序,Application通过Driver与Cluster Manager、Executor进行通信。 24 | 25 | (5)Application 26 | 用户使用Spark提供的API编写的应用程序,Application通过Spark API将进行RDD的转换和DAG的构建, 27 | 并通过Driver将Application注册到ClusterManager。 28 | 29 | Cluster Manager将会根据Application的资源需求, 30 | 通过一级分配将Executor、内存、CPU等资源分配给Application。 31 | Driver通过二级分配将Executor等资源分配给每一个任务,Application最后通过Driver告诉Executor运行任务。 32 | 33 | ##### Spark核心模块 34 | (1)SparkContext 35 | SparkContext隐藏了网络通信、分布式部署、消息通信、存储体系、计算引擎、度量系统、文件服务、Web UI等内容, 36 | 应用程序开发者只需要使用SparkContext提供的API完成功能开发。 37 | (2)SparkEnv 38 | Spark执行环境SparkEnv是Spark中的Task运行所必需的组件。 39 | SparkEnv内部封装了RPC环境(RpcEnv)、序列化管理器、广播管理器(BroadcastManager)、map任务输出跟踪器(MapOutputTracker)、 40 | 存储体系、度量系统(MetricsSystem)、输出提交协调器(OutputCommitCoordinator)等Task运行所需的各种组件。 41 | (3)调度系统 42 | 调度系统主要由DAGScheduler和TaskScheduler组成,它们都内置在SparkContext中。 43 | DAGScheduler负责创建Job、将DAG中的RDD划分到不同的Stage、给Stage创建对应的Task、批量提交Task等功能。 44 | TaskScheduler负责按照FIFO或者FAIR等调度算法对批量Task进行调度;为Task分配资源;将Task发送到集群管理器的当前应用的Executor上,由Executor负责执行等工作。 45 | 46 | -------------------------------------------------------------------------------- /spark-core/src/main/doc/Spark算子.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/spark-core/src/main/doc/Spark算子.md -------------------------------------------------------------------------------- /spark-core/src/main/doc/Spark面试题.md: -------------------------------------------------------------------------------- 1 | #### Spark面试题 2 | 3 | ##### 1.什么是数据倾斜?如何造成的?解决方案? 4 | 5 | 6 | ##### 2.Spark的shuffle过程? 7 | 8 | 9 | ##### 3.Spark的内存结构? 
10 | 11 | 12 | ##### 4.RDD,DAG,Stage怎么理解? 13 | 14 | 15 | ##### 5.Spark提交作业的流程? 16 | 17 | -------------------------------------------------------------------------------- /spark-core/src/main/doc/images/统一内存管理_堆内.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/spark-core/src/main/doc/images/统一内存管理_堆内.png -------------------------------------------------------------------------------- /spark-core/src/main/doc/images/统一内存管理_堆外.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/spark-core/src/main/doc/images/统一内存管理_堆外.png -------------------------------------------------------------------------------- /spark-core/src/main/doc/images/静态内存管理_堆内.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/spark-core/src/main/doc/images/静态内存管理_堆内.png -------------------------------------------------------------------------------- /spark-core/src/main/doc/images/静态内存管理_堆外.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wujun728/jun_bigdata/aa20b0f0a9b2bb8ad52b52d57c0530c936e1ed79/spark-core/src/main/doc/images/静态内存管理_堆外.png -------------------------------------------------------------------------------- /spark-core/src/main/doc/目录.md: -------------------------------------------------------------------------------- 1 | 2 | ##### SparkCore相关使用文档 3 | * [Spark作业配置参数](param) 4 | * [Spark作业调优](optimize) 5 | -------------------------------------------------------------------------------- /spark-core/src/main/resources/mysql.conf: -------------------------------------------------------------------------------- 1 | # JDBC驱动 2 | db.default.driver="com.mysql.jdbc.Driver" 3 | // 数据库URL地址 4 | db.default.url="jdbc:mysql://ip:port/databaseName" 5 | // 数据库帐号 6 | db.default.user="user" 7 | // 数据库密码 8 | db.default.password="password" -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/base/SparkJobBase.scala: -------------------------------------------------------------------------------- 1 | package com.libin.base 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SparkSession 5 | import org.joda.time.DateTime 6 | import org.slf4j.{Logger, LoggerFactory} 7 | 8 | /** 9 | * Copyright (c) 2020/4/13 libin Inc. All Rights Reserved. 
10 | * Authors: libin<2578858653@qq.com> 11 | * 12 | * Purpose : Spark操作的一些公共方法 13 | */ 14 | trait SparkJobBase { 15 | /** 16 | * 得到app名称 17 | */ 18 | def appName: String = this.getClass.getSimpleName.stripSuffix("$") 19 | 20 | /** 21 | * 对数据处理的一些常用分隔符,使用时需要被重写 22 | */ 23 | val separator: String = "\t" 24 | 25 | /** 26 | * 对数据的填充值 27 | */ 28 | val fillValue: String = "" 29 | 30 | /** 31 | * 对数据处理的分区数,使用时需要被重写 32 | */ 33 | val partitionNum: Int = 400 34 | 35 | /** 36 | * 获取作业名字,使用时需要被重写 37 | */ 38 | def jobName: String = this.getClass.getSimpleName 39 | 40 | /** 41 | * 日志对象 42 | */ 43 | val logger: Logger = LoggerFactory.getLogger(jobName) 44 | 45 | /** 46 | * 获取一个SparkContext对象,依赖被重写的 @jobName 参数 47 | */ 48 | def getSparkContext: SparkContext = { 49 | val conf = new SparkConf().setAppName(jobName) 50 | SparkContext.getOrCreate(conf) 51 | } 52 | 53 | /** 54 | * 获取一个SparkSession对象,依赖被重写的 @jobName 参数 55 | */ 56 | def getSparkSession: SparkSession = { 57 | SparkSession.builder().appName(jobName).getOrCreate() 58 | } 59 | 60 | /** 61 | * 初始化 62 | * 63 | * @return 64 | */ 65 | def initContext: SparkContext = getSparkContext 66 | 67 | /** 68 | * 停掉一个作业 69 | * 70 | */ 71 | def destroyJob(): Unit = { 72 | getSparkContext.stop() 73 | logger.info(s"$jobName stopped at ${new DateTime()}") 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/base/TableLoaderBase.scala: -------------------------------------------------------------------------------- 1 | package com.libin.base 2 | 3 | /** 4 | * Copyright (c) 2020/4/13 libin Inc. All Rights Reserved. 5 | * Authors: libin<2578858653@qq.com> 6 | * 7 | * Purpose : 8 | */ 9 | trait TableLoaderBase { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/client/AccumulatorDemo.scala: -------------------------------------------------------------------------------- 1 | package com.libin.client 2 | 3 | import org.apache.spark.{AccumulatorParam, SparkConf, SparkContext} 4 | 5 | /** 6 | * Copyright (c) 2016/11/02. xixi Inc. All Rights Reserved. 7 | * Authors: libin <2578858653@qq.com> 8 | *

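 * Counts "ML" and "DL" occurrences with the built-in numeric accumulators and accumulates an Array[Long] element-wise through a custom AccumulatorParam (the legacy accumulator API that AccumulatorV2 replaces in Spark 2.x).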
9 | * Purpose : 10 | */ 11 | object AccumulatorDemo { 12 | def main(args: Array[String]): Unit = { 13 | val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("AccumulatorDemo") 14 | val sc: SparkContext = new SparkContext(conf) 15 | 16 | val arrAccu = Array(0L, 0L, 0L, 0L, 0L) 17 | val accumulatorArr = sc.accumulator(arrAccu, "HADOOP")(MyAcculumatorParam) 18 | 19 | val accumulatorMl = sc.accumulator(0, "ML") 20 | val accumulatorDl = sc.accumulator(0L, "DL") 21 | val arr = Array("ML", "DL", "CNN", "RNN", "ML", "HADOOP", "SPARK", "ML") 22 | for (i <- 0 to arr.length - 1) { 23 | if (arr(i).equals("ML")) { 24 | accumulatorMl += 1 25 | } else if (arr(i).equals("DL")) { 26 | accumulatorDl += 1 27 | } else if (arr(i).equals("HADOOP")) { 28 | accumulatorArr += Array(1L, 1L, 1L, 1L, 1L) 29 | } 30 | } 31 | println("ML=" + accumulatorMl.name.get + "、" + accumulatorMl.value) 32 | println("DL=" + accumulatorDl.name.get + "、" + accumulatorDl.value) 33 | println("HADOOP=" + accumulatorArr.name.get + "、" + accumulatorArr.value.mkString(",")) 34 | } 35 | 36 | object MyAcculumatorParam extends AccumulatorParam[Array[Long]] { 37 | override def addInPlace(r1: Array[Long], r2: Array[Long]): Array[Long] = { 38 | r1.zip(r2).map(x => x._1 + x._2) 39 | } 40 | 41 | def zero(initialValue: Array[Long]): Array[Long] = { 42 | new Array[Long](initialValue.length) 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/client/AggregateByKeyDemo.scala: -------------------------------------------------------------------------------- 1 | package com.libin.client 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import scala.collection.JavaConverters._ 5 | 6 | /** 7 | * Copyright (c) 2017/07/25. xixi Inc. All Rights Reserved. 8 | * Authors: libin <2578858653@qq.com> 9 | *

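 * Here seqOp adds 1 for every value seen inside a partition and combOp sums the per-partition counts, so each key ends up with its occurrence count; filtering on == 1L keeps the keys that appear exactly once.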
10 | * Purpose : 求key只出现一次的数据, 如果用groupByKey或reduceByKey很容易就做出来了,现在用aggregateByKey求解一下。 11 | */ 12 | object AggregateByKeyDemo { 13 | def main(args: Array[String]): Unit = { 14 | val conf = new SparkConf().setAppName("aggregateByKeyDemo").setMaster("local") 15 | val sc = new SparkContext(conf) 16 | 17 | sc.textFile("D://sparkmllibData/sparkml/mllibdata/arrregation.txt") 18 | .map { 19 | line => 20 | (line.split("\t")(0), line.split("\t")(1).toLong) 21 | }.aggregateByKey(0L)(seqOp, combOp) 22 | .filter(line => line._2 == 1L) 23 | .collect().foreach(println) 24 | } 25 | 26 | def seqOp(U: Long, V: Long): Long = { 27 | U + 1L 28 | } 29 | 30 | def combOp(U: Long, V: Long): Long = { 31 | U + V 32 | } 33 | } 34 | 35 | /** 36 | * asdfgh 546346 37 | * retr 4567 38 | * asdfgh 7685678 39 | * ghj 2345 40 | * asd 234 41 | * hadoop 435 42 | * ghj 23454 43 | * asdfgh 54675 44 | * asdfgh 546759878 45 | * asd 234 46 | * asdfgh 5467598782 47 | */ 48 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/client/BroadcastDemo.scala: -------------------------------------------------------------------------------- 1 | package com.libin.client 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import scala.collection.immutable.HashMap 5 | 6 | /** 7 | * Copyright (c) 2016/11/02. xixi Inc. All Rights Reserved. 8 | * Authors: libin <2578858653@qq.com> 9 | *

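 * Builds a driver-side HashMap from groupByKey().collect() and distributes it to executors as a broadcast variable; the whole lookup table therefore has to fit in driver and executor memory.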
10 | * Purpose : 11 | */ 12 | object BroadcastDemo { 13 | def main(args: Array[String]): Unit = { 14 | val conf: SparkConf = new SparkConf().setAppName("CacheRadius").setMaster("local[2]") 15 | val sc = new SparkContext(conf) 16 | val input = "E://sparkmllibData/cacheAndPersist.txt" 17 | val data = sc.textFile(input).map(_.split("\\|", 100)).map(line => { 18 | val Array(privateIP, account, timeFormat, timeType) = line 19 | (privateIP, (account, timeFormat.toLong, timeType.toInt)) 20 | }) 21 | 22 | var accountHash = new HashMap[String, Set[(String, Long, Int)]]() 23 | data.groupByKey().collect().foreach(x => { 24 | accountHash += (x._1 -> x._2.toSet) 25 | }) 26 | val broacast = sc.broadcast(accountHash) 27 | 28 | println(broacast.id) 29 | val hashvalue = broacast.value 30 | for (entry <- hashvalue) { 31 | println(entry._1 + "|" + entry._2) 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/client/MyPartitioner.scala: -------------------------------------------------------------------------------- 1 | package com.libin.client 2 | 3 | import org.apache.spark.{Partitioner, SparkConf, SparkContext} 4 | 5 | /** 6 | * Copyright (c) 2015/05/02. xixi Inc. All Rights Reserved. 7 | * Authors: libin <2578858653@qq.com> 8 | *

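 * The custom Partitioner below routes keys smaller than 10 to the partition with that index, keys in [10, 100) to key / numPartitions, and all other keys (including null) to partition 0.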
9 | * Purpose : 自定义分区小Demo 10 | */ 11 | object MyPartitioner { 12 | def main(args: Array[String]): Unit = { 13 | val conf = new SparkConf().setAppName("MyPartitioner").setMaster("local[10]") 14 | val sc = new SparkContext(conf) 15 | val arr = Array((2, 3), (4, 6), (4, 2), (2, 1), (22, 3), (34, 6), 16 | (74, 2), (12, 1), (62, 3), (34, 6), (114, 2), (92, 1)) 17 | val rdd = sc.makeRDD(arr) 18 | rdd.partitionBy(new myPartitioner(10)) 19 | .foreachPartition(x => println(x.toList.mkString(","))) 20 | } 21 | } 22 | 23 | /** 24 | * 不使用已有的分区策略HashPartitioner和RangePartitioner,自定义分区 25 | * 26 | * @param partitions 分区个数 27 | */ 28 | class myPartitioner(partitions: Int) extends Partitioner { 29 | override def numPartitions: Int = partitions 30 | 31 | override def getPartition(key: Any): Int = key match { 32 | case null => 0 33 | case _ => 34 | try { 35 | val curNum = key.asInstanceOf[Int] 36 | if (curNum < 10) curNum 37 | else if (curNum < 100) curNum / numPartitions 38 | else 0 39 | } catch { 40 | case e: Exception => 0 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/client/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Spark编程API应用 3 | * [二次排序](SecondarySort.scala) 4 | * [自定义分区](MyPartitioner.scala) 5 | * [累加器](AccumulatorDemo.scala) 6 | * [广播变量](BroadcastDemo.scala) 7 | * [cache](cacheAndPersist.scala) 8 | * [aggregateByKey](AggregateByKeyDemo.scala) 9 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/client/SecondarySort.scala: -------------------------------------------------------------------------------- 1 | package com.libin.client 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Copyright (c) 2018/05/02. xixi Inc. All Rights Reserved. 7 | * Authors: libin <2578858653@qq.com> 8 | *

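 * Secondary sort: each (first, second) pair is wrapped in a custom key that extends Ordered and Serializable, so sortByKey orders by the first field and breaks ties with the second.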
9 | * Purpose : 10 | */ 11 | object SecondarySort { 12 | def main(args: Array[String]): Unit = { 13 | val conf = new SparkConf().setAppName("secondarySort").setMaster("local[2]") 14 | val sc = new SparkContext(conf) 15 | 16 | val arr = Array((2, 3), (4, 6), (4, 2), (2, 1)) 17 | val rdd = sc.makeRDD(arr) 18 | rdd.map(x => (new secondarySortUtils(x._1, x._2), x)) 19 | .sortByKey(ascending = true).map(_._2) 20 | .collect().foreach(println) 21 | } 22 | } 23 | 24 | /** 25 | * 继承Ordered和Serializable实现自定义排序key,并使用sortByKey对自定义的key进行排序. 26 | * 27 | * @param first 第一列数据 28 | * @param second 第二列数据 29 | */ 30 | class secondarySortUtils(val first: Int, val second: Int) extends Ordered[secondarySortUtils] with Serializable { 31 | override def compare(that: secondarySortUtils): Int = { 32 | if (this.first - that.first != 0) this.first - that.first 33 | else this.second - that.second 34 | } 35 | } -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/client/cacheAndPersist.scala: -------------------------------------------------------------------------------- 1 | package com.libin.client 2 | 3 | import org.apache.spark.storage.StorageLevel 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | /** 7 | * Copyright (c) 2016/11/02. xixi Inc. All Rights Reserved. 8 | * Authors: libin <2578858653@qq.com> 9 | *

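 * Reads the same file into two RDDs, keeps one with cache() (MEMORY_ONLY) and the other with persist(StorageLevel.MEMORY_AND_DISK), then prints their intersection.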
10 | * Purpose : 11 | */ 12 | object cacheAndPersist { 13 | def main(args: Array[String]): Unit = { 14 | /*if (args.length != 1) { 15 | System.err.println("Usage ") 16 | System.exit(1) 17 | val Array(input) = args 18 | }*/ 19 | 20 | val input = "E://sparkmllibData/cache.txt" 21 | val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("cacheAndPersist") 22 | val sc: SparkContext = new SparkContext(conf) 23 | 24 | val data1 = sc.textFile(input) 25 | .map(_.split("\\|", 100)) 26 | .map(line => { 27 | val Array(name, age) = line 28 | (name, age) 29 | }).cache() 30 | val data2 = sc.textFile(input) 31 | .map(line => { 32 | line.split("\\|", 100) 33 | }).map(x => { 34 | val Array(name, age) = x 35 | (name, age) 36 | }).filter(y => {y._1.equals("ML") 37 | }).persist(StorageLevel.MEMORY_AND_DISK) 38 | 39 | data1.intersection(data2).foreach(println) 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/jobs/READMD.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 相关的一些作业处理模板 3 | 4 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/loader/READMD.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ##### 读写存储相关操作代码 5 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/processor/READMD.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | #### 处理数据相关代码 4 | 5 | 6 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/source/READMD.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 源码相关阅读 3 | 4 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/utils/DateUtils.scala: -------------------------------------------------------------------------------- 1 | package com.libin.utils 2 | 3 | import org.joda.time.DateTime 4 | import org.joda.time.format.DateTimeFormat 5 | 6 | /** 7 | * Copyright (c) 2020/4/14. libin Inc. All Rights Reserved. 8 | * Authors: libin <2578858653@qq.com> 9 | *

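 * joda-time helpers that convert between "yyyyMMdd" strings and DateTime in both directions, returning Option so that parse failures show up as None instead of throwing.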
10 | * Purpose : 11 | */ 12 | object DateUtils { 13 | val DATE_FORMAT = "yyyyMMdd" 14 | 15 | /** 16 | * 日期字符串转为DateTime 17 | * 18 | * @param input 日期 19 | * @return 20 | */ 21 | def parseDate(input: String): Option[DateTime] = 22 | try { 23 | Some(DateTimeFormat.forPattern(DATE_FORMAT).parseDateTime(input)) 24 | } catch { 25 | case e: Exception => None 26 | } 27 | 28 | /** 29 | * DateTime转为日期字符串 30 | * 31 | * @param input 日期 32 | */ 33 | def parseDateTimeToStr(input: DateTime): Option[String] = 34 | try { 35 | Some(input.toString(DATE_FORMAT)) 36 | } catch { 37 | case e: Exception => None 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/utils/MySQLUtils.scala: -------------------------------------------------------------------------------- 1 | package com.libin.utils 2 | 3 | import java.sql.{Connection, DriverManager, ResultSet, SQLException} 4 | 5 | import com.typesafe.config.ConfigFactory 6 | import org.slf4j.{Logger, LoggerFactory} 7 | 8 | import scala.collection.mutable.ArrayBuffer 9 | 10 | /** 11 | * Copyright (c) 2020/4/22 libin Inc. All Rights Reserved. 12 | * Authors: libin<2578858653@qq.com> 13 | * 14 | * Purpose : 15 | */ 16 | object MySQLUtils { 17 | val logger: Logger = LoggerFactory.getLogger("MySQLUtils") 18 | val splitTable = "\t" // 制表符字符 19 | /** 20 | * 获取MySQL访问链接 21 | * 22 | * @param config 数据库配置信息 23 | */ 24 | def getMySQLConn(config: String): Connection = { 25 | try { 26 | def dbConf = ConfigFactory.load(config).getConfig("db.default") 27 | 28 | Class.forName(dbConf.getString("driver")) 29 | DriverManager.getConnection(dbConf.getString("url"), 30 | dbConf.getString("user"), 31 | dbConf.getString("password")) 32 | } catch { 33 | case e: SQLException => e.printStackTrace(); null 34 | case _: Throwable => null 35 | } 36 | } 37 | 38 | /** 39 | * 查询指定Sql语句,返回指定字段内容 40 | * 41 | * @param sql 语句 42 | * @param conn Connection 43 | * @param num 读取几个字段,把查询出来的字段每一行拼接在一起 44 | */ 45 | def executeSql(sql: String, conn: Connection, num: Int): Array[String] = { 46 | val stmt = conn.createStatement(); 47 | try { 48 | val rs: ResultSet = stmt.executeQuery(sql) 49 | var arr = new ArrayBuffer[String] 50 | val sb = new StringBuilder 51 | while (rs.next()) { 52 | sb.clear() 53 | for (i <- 1 to (num)) { 54 | sb.append(rs.getString(i)) 55 | // 查询中有多个字段之间用制表符分割开 56 | if (i < num) sb.append(splitTable) 57 | } 58 | arr += sb.toString() 59 | } 60 | rs.close() 61 | arr.toArray 62 | } catch { 63 | case e: SQLException => e.printStackTrace(); null 64 | case _: Throwable => null 65 | } finally { 66 | stmt.close() 67 | } 68 | } 69 | 70 | /** 71 | * 关闭MySQL链接 72 | * 73 | * @param conn Connection链接 74 | */ 75 | def close(conn: Connection): Unit = { 76 | try { 77 | conn.close() 78 | } catch { 79 | case a: SQLException => a.printStackTrace(); null 80 | case _: Throwable => null 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/utils/README.md: -------------------------------------------------------------------------------- 1 | 2 | ##### 相关的一些工具类 3 | 1.日期操作类 4 | 2.路径操作类 5 | 3. 
6 | 7 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/utils/ResourceUtils.scala: -------------------------------------------------------------------------------- 1 | package com.libin.utils 2 | 3 | import org.apache.commons.lang3.StringUtils 4 | 5 | import scala.io.Source 6 | 7 | /** 8 | * Copyright (c) 2020/4/20 XiaoMi Inc. All Rights Reserved. 9 | * Authors: libin 10 | * 11 | * Purpose : 读取resource下面的数据 12 | */ 13 | object ResourceUtils { 14 | /** 15 | * 1. 读取resources文件夹下的配置文件,返回Map形式,以第一个元素为key,第二个元素为value; 16 | * 2. 以#开头的行为注释行,不加入map 17 | * 18 | * @param fileName :文件名称 19 | * @param delimit :分割符 20 | */ 21 | def readFileAsMap(fileName: String, delimit: String): Map[String, String] = { 22 | val inputStream = this.getClass.getClassLoader.getResourceAsStream(fileName) 23 | Source.fromInputStream(inputStream).getLines().filter { 24 | line => 25 | !line.startsWith("#") 26 | }.map { 27 | line => 28 | //使用delimit将该行分成两部分,key与value 29 | val parts = line.split(delimit, 2) 30 | (parts(0), parts(1)) 31 | }.toMap 32 | } 33 | 34 | /** 35 | * 1. 读取resources文件夹下的配置文件,返回Set形式 36 | * 2. 以#开头的行为注释行或者空行,不加入set 37 | * 38 | * @param fileName :文件名称 39 | */ 40 | def readFileAsSet(fileName: String): Set[String] = { 41 | val inputStream = this.getClass.getClassLoader.getResourceAsStream(fileName) 42 | Source.fromInputStream(inputStream).getLines().filter { 43 | line => 44 | !line.startsWith("#") && StringUtils.isNotBlank(line) 45 | }.map(line => StringUtils.trim(line)).toSet 46 | } 47 | 48 | /** 49 | * 1. 读取resources文件夹下的配置文件,每行作为List的一个元素 50 | * 2. 以#开头的行为注释,不加入List 51 | * 52 | * @param fileName :文件名称 53 | */ 54 | def readFileAsList(fileName: String): List[String] = { 55 | val inputStream = this.getClass.getClassLoader.getResourceAsStream(fileName) 56 | Source.fromInputStream(inputStream).getLines().filter { 57 | line => 58 | !line.startsWith("#") 59 | }.toList 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /spark-core/src/main/scala/com/libin/utils/SeparatorUtils.scala: -------------------------------------------------------------------------------- 1 | package com.libin.utils 2 | 3 | /** 4 | * Copyright (c) 2020/4/14. libin Inc. All Rights Reserved. 5 | * Authors: libin <2578858653@qq.com> 6 | *

7 | * Purpose : 常用的一些分隔符 8 | */ 9 | object SeparatorUtils { 10 | 11 | // 常用分隔符 12 | val SEPARATOR_TAB = "\t" 13 | val SEPARATOR_EMPTY = "\\N" 14 | val SEPARATOR_ENTER = "\n" 15 | val SEPARATOR_EQUAL = "=" // 等号字符 16 | val SEPARATOR_SPACE = " " // 空格字符 17 | val SEPARATOR_POINT = "\\." // 点字符 18 | val SEPARATOR_COMMA = "," // 逗号分隔字符 19 | val SEPARATOR_SEMICOLON = ";" // 分号字符 20 | val SEPARATOR_BAR = "-" // 横杠分隔符 21 | val SEPARATOR_LEFT_SLASH = "/" // 左斜线 22 | 23 | val pathSuccess = "_SUCCESS" // 数据成功标识符 24 | } 25 | -------------------------------------------------------------------------------- /spark-graphx/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target/ 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | -------------------------------------------------------------------------------- /spark-graphx/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## 图框架 3 | 4 | * 图基本学习资料整理 5 | 6 | * [图处理框架学习资料整理](src/main/scala/com/libin/docs) 7 | 8 | * 图存储框架学习资料整理 9 | 10 | * 图可视化框架学习资料整理 11 | -------------------------------------------------------------------------------- /spark-graphx/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | 2 | log4j.logger.org.apache=ERROR 3 | -------------------------------------------------------------------------------- /spark-graphx/src/main/scala/com/libin/graphX/etl/GraphXProcessor.scala: -------------------------------------------------------------------------------- 1 | package com.libin.graphX.etl 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.graphx.{Graph, Edge, VertexId} 5 | import org.apache.spark.rdd.RDD 6 | import org.slf4j.{LoggerFactory, Logger} 7 | 8 | /** 9 | * Copyright (c) 2018/7/1. xixi Inc. All Rights Reserved. 10 | * Authors: libin <2578858653@qq.com> 11 | * 12 | * Purpose : 13 | */ 14 | 15 | object GraphXProcessor { 16 | def main(args: Array[String]) { 17 | val conf = new SparkConf().setAppName("joinVertexDemo").setMaster("local") 18 | val sc = new SparkContext(conf) 19 | 20 | val logger: Logger = LoggerFactory.getLogger("GraphProcessor") 21 | logger.info("GraphProcessor start ...") 22 | 23 | val vertexRdd: RDD[(VertexId, (String, String, Long))] = 24 | sc.parallelize(Array( 25 | (1L, ("mid_1", "mid", 1513048521000L)), 26 | (2L, ("imei_1", "phone", 1523048521003L)), 27 | (3L, ("pn_1", "pn", 1523048521005L)) 28 | )) 29 | 30 | val edgeRdd: RDD[Edge[Long]] = 31 | sc.parallelize(Array( 32 | Edge(1L, 2L, 1513048521000L), 33 | Edge(2L, 3L, 1523048521003L) 34 | )) 35 | 36 | // 构造图 37 | val graphTest = Graph(vertexRdd, edgeRdd) 38 | // 输出图的顶点信息 39 | graphTest.vertices.foreach(println) 40 | 41 | val addAttrRdd = sc.makeRDD(Array((1L, 1L), (3L, 3L), (5L, 5L))) 42 | 43 | graphTest.mapVertices((_, attr) => attr._3).joinVertices(addAttrRdd)((_, _, newAttr) => newAttr) 44 | .vertices.foreach(println) 45 | /** 46 | * 操作joinVertices输出结果. 47 | * (1,1) 48 | * (3,3) 49 | * (2,1523048521003) 50 | */ 51 | 52 | graphTest.mapVertices((_, attr) => attr._3).outerJoinVertices(addAttrRdd)((_, _, newAttr) => newAttr) 53 | .vertices.foreach(println) 54 | /** 55 | * 操作outerJoinVertices输出结果. 
56 | * (1,Some(1)) 57 | * (3,Some(3)) 58 | * (2,None) 59 | */ 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /spark-mllib/.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | build_info.properties 3 | .classpath 4 | dependency-reduced-pom.xml 5 | *.diff 6 | .DS_Store 7 | .idea/ 8 | *.iml 9 | *.jar 10 | .project 11 | .settings/ 12 | .tags* 13 | target/ 14 | tmp* 15 | test-output/ 16 | nohup* 17 | *.log 18 | *.swp 19 | *.pyc 20 | script/__pycache__/ 21 | *.h -------------------------------------------------------------------------------- /spark-mllib/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Spark MLlib 3 | 4 | * [Spark MLLib示例代码](src/main/scala/com/libin) 5 | 6 | 7 | ## 软件版本号 8 | * scala版本2.11.7 9 | * jdk版本1.8 10 | * spark版本2.1 11 | 12 | 13 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/AlsRecommend.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.mllib.recommendation.ALS 6 | import org.apache.spark.mllib.recommendation.Rating 7 | 8 | /** 9 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 10 | * Authors: libin <2578858653@qq.com> 11 | *

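 * Collaborative filtering with MLlib ALS (rank 10, 20 iterations, lambda 0.01) on (user, item, rating) triples; predictions for the observed pairs are joined back to the ratings to report the mean squared error.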
12 | * Purpose : 13 | */ 14 | object AlsRecommend { 15 | def main(args: Array[String]) { 16 | //0 构建Spark对象 17 | val conf = new SparkConf() 18 | .setAppName("ALS") 19 | .setMaster("local") 20 | val sc = new SparkContext(conf) 21 | Logger.getRootLogger.setLevel(Level.WARN) 22 | 23 | //1 读取样本数据 24 | val data = sc.textFile("D://sparkmllibData/sparkml/mllibdata/test.data") 25 | val ratings = data.map(_.split(',') match { 26 | case Array(user, item, rate) => 27 | Rating(user.toInt, item.toInt, rate.toDouble) 28 | }) 29 | 30 | //2 使用ALS训练数据建立推荐模型 31 | val rank = 10 32 | val numIterations = 20 33 | val model = ALS.train(ratings, rank, numIterations, 0.01) 34 | 35 | //3从rating中获取user以及product数据集 36 | val usersProducts = ratings.map { 37 | case Rating(user, product, rate) => 38 | (user, product) 39 | } 40 | // 使用推荐模型预对用户和商品进行评分,得到预测评分的数据集 41 | val predictions = 42 | model.predict(usersProducts).map { 43 | case Rating(user, product, rate) => 44 | ((user, product), rate) 45 | } 46 | // 真实数据和预测数据进行合并 47 | val ratesAndPreds = ratings.map { 48 | case Rating(user, product, rate) => 49 | ((user, product), rate) 50 | }.join(predictions) 51 | 52 | val MSE = ratesAndPreds.map { 53 | case ((user, product), (r1, r2)) => 54 | val err = r1 - r2 55 | err * err 56 | }.mean() 57 | println("Mean Squared Error = " + MSE) 58 | 59 | //4 保存/加载模型 60 | /*model.save(sc, "myModelPath") 61 | val sameModel = MatrixFactorizationModel.load(sc, "myModelPath")*/ 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/DistributedMatrixRow.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.spark.mllib.linalg.Vectors 4 | import org.apache.spark.mllib.linalg.distributed._ 5 | import org.apache.spark.{SparkContext, SparkConf} 6 | 7 | /** 8 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 9 | * Authors: libin <2578858653@qq.com> 10 | *

11 | * Purpose :分布式行矩阵有: 12 | * * 行矩阵、带索引的行矩阵、坐标行矩阵、块行矩阵 13 | */ 14 | object DistributedMatrixRow { 15 | def main(args: Array[String]) { 16 | val conf = new SparkConf() 17 | .setMaster("local") 18 | .setAppName("distributedMatrixRow") 19 | val sc = new SparkContext(conf) 20 | 21 | println("First:RowMatrix ") 22 | val rdd = sc.textFile("D://sparkmllibData/sparkml/mllibdata/MatrixRow.txt") //创建RDD文件路径 23 | .map(_.split(' ') //按“ ”分割 24 | .map(_.toDouble)) //转成Double类型 25 | .map(line => Vectors.dense(line)) //转成Vector格式 26 | val rm = new RowMatrix(rdd) //读入行矩阵 27 | println(rm.numRows()) //打印列数 28 | println(rm.numCols()) //打印行数 29 | rm.rows.foreach(println) 30 | 31 | println("Second:IndexedRow ") 32 | val rdd2 = sc.textFile("D://sparkmllibData/sparkml/mllibdata/MatrixRow.txt") //创建RDD文件路径 33 | .map(_.split(' ') //按“ ”分割 34 | .map(_.toDouble)) //转成Double类型 35 | .map(line => Vectors.dense(line)) //转化成向量存储 36 | .map(vd => IndexedRow(vd.size, vd)) //转化格式 37 | val irm = new IndexedRowMatrix(rdd2) //建立索引行矩阵实例 38 | println(irm.getClass) //打印类型 39 | irm.rows.foreach(println) //打印内容数据 40 | 41 | println("Third: CoordinateMatrix ") 42 | val rdd3 = sc.textFile("D://sparkmllibData/sparkml/mllibdata/MatrixRow.txt") //创建RDD文件路径 43 | .map(_.split(' ') //按“ ”分割 44 | .map(_.toDouble)) //转成Double类型 45 | .map(vue => (vue(0).toLong, vue(1).toLong, vue(2))) //转化成坐标格式 46 | .map(vue2 => MatrixEntry(vue2 _1, vue2 _2, vue2 _3)) //转化成坐标矩阵格式 47 | val crm = new CoordinateMatrix(rdd3) //实例化坐标矩阵 48 | crm.entries.foreach(println) //打印数据 49 | println(crm.numCols()) 50 | println(crm.numCols()) 51 | println(crm.entries.countApproxDistinct()) 52 | 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/FPGrowthDemo.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.mllib.fpm.{FPGrowth, FPGrowthModel} 6 | 7 | /** 8 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 9 | * Authors: libin <2578858653@qq.com> 10 | *

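 * Frequent itemset mining with MLlib FP-Growth (minSupport = 0.6, 10 partitions); prints the number of frequent itemsets and each itemset with its frequency.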
11 | * Purpose : 12 | */ 13 | object FPGrowthDemo { 14 | def main(args: Array[String]) { 15 | //0 构建Spark对象 16 | val conf = new SparkConf() 17 | .setAppName("fpg") 18 | .setMaster("local") 19 | val sc = new SparkContext(conf) 20 | Logger.getRootLogger.setLevel(Level.WARN) 21 | 22 | //1 读取样本数据 23 | val data_path = "D://sparkmllibData/sparkml/mllibdata/sample_fpgrowth.txt" 24 | val data = sc.textFile(data_path) 25 | val examples = data.map(_.split(" ")).cache() 26 | 27 | //2 建立模型 28 | val minSupport = 0.6 29 | val numPartition = 10 30 | val model = new FPGrowth() 31 | .setMinSupport(minSupport) 32 | .setNumPartitions(numPartition) 33 | .run(examples) 34 | 35 | //3 打印结果 36 | println("Number of frequent itemsets:" + model.freqItemsets.count()) 37 | model.freqItemsets.collect().foreach { itemset => 38 | println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/KMeans.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.mllib.clustering._ 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | 8 | /** 9 | * Copyright (c) 2018/09/04. xixi Inc. All Rights Reserved. 10 | * Authors: libin <2578858653@qq.com> 11 | *

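 * K-means clustering on tab-separated points, printing the cluster centers and the within-set sum of squared errors (computeCost).
 * Note: MLlib's setInitializationMode accepts only "random" or "k-means||", so the bare "k-means" value set below is likely to be rejected at runtime.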
12 | * Purpose : 13 | */ 14 | object KMeans { 15 | def main(args: Array[String]) { 16 | //1 构建Spark对象 17 | val conf = new SparkConf() 18 | .setAppName("KMeans") 19 | .setMaster("local") 20 | val sc = new SparkContext(conf) 21 | Logger.getRootLogger.setLevel(Level.WARN) 22 | 23 | // 读取样本数据1,格式为LIBSVM format 24 | val data = sc.textFile("E://sparkmllibData/kMeans_demo/testSet.txt") 25 | //val data = sc.textFile("D://sparkmllibData/sparkml/mllibdata/kmeans_data.txt") 26 | val parsedData = data.map(s => Vectors.dense(s.split('\t').map(_.toDouble))).cache() 27 | 28 | // 新建KMeans聚类模型,并训练 29 | val initMode = "k-means" 30 | //val initMode = "k-means++" 31 | //val initMode = "k-means||" 32 | val numClusters = 5 33 | val numIterations = 100 34 | 35 | val model = new KMeans() 36 | .setInitializationMode(initMode) 37 | .setK(numClusters) 38 | .setMaxIterations(numIterations) 39 | .run(parsedData) 40 | val centers = model.clusterCenters 41 | println("centers") 42 | for (i <- 0 to centers.length - 1) { 43 | println(centers(i)(0) + "\t" + centers(i)(1)) 44 | } 45 | // 误差计算 46 | val WSSSE = model.computeCost(parsedData) 47 | println("Within Set Sum of Squared Errors = " + WSSSE) 48 | 49 | //保存模型 50 | /*val ModelPath = "D://sparkmllibData/sparkml/mllibdata/KMeans_Model" 51 | model.save(sc, ModelPath) 52 | val sameModel = KMeansModel.load(sc, ModelPath)*/ 53 | 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/PCADemo.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.spark.mllib.linalg.Vectors 4 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 5 | import org.apache.spark.{SparkContext, SparkConf} 6 | 7 | /** 8 | * Copyright (c) 2018/09/04. xixi Inc. All Rights Reserved. 9 | * Authors: libin <2578858653@qq.com> 10 | *

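 * Principal component analysis: computePrincipalComponents(3) extracts the top three components of a RowMatrix and multiply(pc) projects the rows onto them.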
11 | * Purpose : 12 | */ 13 | object PCADemo { 14 | val conf = new SparkConf() 15 | .setMaster("local") 16 | .setAppName("PCA") 17 | val sc = new SparkContext(conf) 18 | 19 | def main(args: Array[String]) { 20 | val data = sc.textFile("D://sparkmllibData/sparkml/mllibdata/svd.txt") 21 | .map(_.split(" ").map(_.toDouble)) 22 | .map(line => Vectors.dense(line)) 23 | 24 | val rm = new RowMatrix(data) 25 | val pc = rm.computePrincipalComponents(3) 26 | //提取主成分,设置主成分个数为3 27 | val mx = rm.multiply(pc) //创建主成分矩阵 28 | 29 | mx.rows.foreach(println) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/RFDemo.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.spark.mllib.linalg.Vectors 4 | import org.apache.spark.mllib.regression.LabeledPoint 5 | import org.apache.spark.mllib.tree.RandomForest 6 | import org.apache.spark.{ml, SparkConf, SparkContext} 7 | import org.apache.spark.mllib.util.MLUtils 8 | 9 | /** 10 | * Copyright (c) 2018/09/04. xixi Inc. All Rights Reserved. 11 | * Authors: libin <2578858653@qq.com> 12 | *

13 | * Purpose : 随机森林 14 | */ 15 | object RFDemo { 16 | def main(args: Array[String]) { 17 | val conf = new SparkConf() 18 | .setMaster("local") 19 | .setAppName("RF") 20 | val sc = new SparkContext(conf) 21 | 22 | val data = MLUtils.loadLibSVMFile(sc, "D://sparkmllibData/sparkml/mllibdata/sample_libsvm_data.txt") 23 | 24 | val numClasses = 2 //分类数量 25 | val categoricalFeaturesInfo = Map[Int, Int]() 26 | //设定输入格式 27 | val numTrees = 3 // 随机森林中决策树的数目 28 | val featureSubSetStrategy = "auto" //设置属性在节点计算数,自动决定每个节点的属性数 Supported: "auto", "all", "sqrt", "log2", "onethird". 29 | val impurity = "gini" //设定信息增益计算方式 Supported values: "gini" (recommended) or "entropy". 30 | val maxDepth = 5 //最大深度 31 | val maxBins = 3 // 设定分割数据集 32 | 33 | /** 34 | * 建立模型 分类 35 | */ 36 | val model = RandomForest.trainClassifier(data, numClasses, categoricalFeaturesInfo, numTrees, 37 | featureSubSetStrategy, impurity, maxDepth, maxBins 38 | ) 39 | model.trees.foreach(println) //打印每棵树信息 40 | println(model.numTrees) 41 | println(model.algo) 42 | 43 | /** 44 | * 建立模型 回归 45 | */ 46 | val data_path1 = "D://sparkmllibData/sparkml/mllibdata/lpsa.data" 47 | val data2 = sc.textFile(data_path1) 48 | val inputdata = data2.map { line => 49 | val parts = line.split(',') 50 | LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(" ").map(_.toDouble))) 51 | }.cache() 52 | 53 | val impurity2 = "variance" 54 | val seed = 11 55 | val model2 = RandomForest.trainRegressor(inputdata, categoricalFeaturesInfo, numTrees, 56 | featureSubSetStrategy, impurity2, maxDepth, maxBins, seed) 57 | model2.trees.foreach(println) //打印每棵树信息 58 | println(model2.numTrees) 59 | println(model2.algo) 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/RowmatriTest01.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import breeze.linalg._ 6 | import breeze.numerics._ 7 | import org.apache.spark.mllib.linalg.Vectors 8 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 9 | 10 | /** 11 | * Copyright (c) 2018/09/04. xixi Inc. All Rights Reserved. 12 | * Authors: libin <2578858653@qq.com> 13 | *

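 * Walk-through of RowMatrix operations: columnSimilarities (exact and threshold-based), column summary statistics, covariance, the Gramian matrix, principal components and an SVD via computeSVD(4, true).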
14 | * Purpose : 15 | */ 16 | object RowmatriTest01 { 17 | def main(args: Array[String]) { 18 | val conf = new SparkConf().setAppName("rowmatri_test01").setMaster("local") 19 | val sc = new SparkContext(conf) 20 | Logger.getRootLogger.setLevel(Level.WARN) 21 | 22 | // 3.6 分布式矩阵 23 | // 3.6.2 行矩阵(RowMatrix) 24 | val rdd1 = sc.parallelize(Array(Array(1.0, 2.0, 3.0, 4.0), Array(2.0, 3.0, 4.0, 5.0), Array(3.0, 4.0, 5.0, 6.0))).map(f => Vectors.dense(f)) 25 | val RM = new RowMatrix(rdd1) 26 | val simic1 = RM.columnSimilarities(0.5) 27 | val simic2 = RM.columnSimilarities() 28 | val simic3 = RM.computeColumnSummaryStatistics() 29 | simic3.max 30 | simic3.min 31 | simic3.mean 32 | val cc1 = RM.computeCovariance 33 | val cc2 = RM.computeGramianMatrix 34 | val pc1 = RM.computePrincipalComponents(3) 35 | val svd = RM.computeSVD(4, true) 36 | val U = svd.U 37 | U.rows.foreach(println) 38 | val s = svd.s 39 | val V = svd.V 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/SVD.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.spark.mllib.linalg.Vectors 4 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 5 | import org.apache.spark.{SparkContext, SparkConf} 6 | 7 | /** 8 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 9 | * Authors: libin <2578858653@qq.com> 10 | *

11 | * Purpose : 降维SVD 12 | * 奇异值分解(SVD):一个矩阵分解成带有方向向量的矩阵相乘 13 | */ 14 | object SVD { 15 | val conf = new SparkConf() 16 | .setMaster("local") 17 | .setAppName("SVD") 18 | val sc = new SparkContext(conf) 19 | 20 | def main(args: Array[String]) { 21 | val data = sc.textFile("D://sparkmllibData/sparkml/mllibdata/svd.txt") 22 | .map(_.split(" ").map(_.toDouble)) 23 | .map(line => Vectors.dense(line)) 24 | 25 | val rm = new RowMatrix(data) //读入行矩阵 26 | val SVD = rm.computeSVD(2, computeU = true) //进行SVD计算 27 | //求 SVD 分解的矩阵 28 | val u = SVD.U 29 | val s = SVD.s 30 | val v = SVD.V 31 | println("SVD.U") 32 | u.rows.foreach(println) 33 | println("SVD.s") 34 | println(s) 35 | println("SVD.V") 36 | println(v) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/Svm.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} 6 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 7 | import org.apache.spark.mllib.util.MLUtils 8 | 9 | /** 10 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 11 | * Authors: libin <2578858653@qq.com> 12 | *

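 * Linear SVM trained with SVMWithSGD on LIBSVM data (60/40 train/test split). The value printed as "Area under ROC" is actually plain accuracy (the share of exact label matches), not an AUC computed with BinaryClassificationMetrics.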
13 | * Purpose : 14 | */ 15 | object Svm { 16 | def main(args: Array[String]) { 17 | //1 构建Spark对象 18 | val conf = new SparkConf() 19 | .setAppName("svm") 20 | .setMaster("local") 21 | val sc = new SparkContext(conf) 22 | Logger.getRootLogger.setLevel(Level.WARN) 23 | 24 | // 读取样本数据1,格式为LIBSVM format 25 | val data = MLUtils.loadLibSVMFile(sc, "D://sparkmllibData/sparkml/mllibdata/sample_libsvm_data.txt") 26 | 27 | //样本数据划分训练样本与测试样本 28 | val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) 29 | val training = splits(0).cache() 30 | val test = splits(1) 31 | 32 | //新建逻辑回归模型,并训练 33 | val numIterations = 100 34 | val model = SVMWithSGD.train(training, numIterations) 35 | 36 | //对测试样本进行测试 37 | val predictionAndLabel = test.map { point => 38 | val score = model.predict(point.features) 39 | (score, point.label) 40 | } 41 | val print_predict = predictionAndLabel.take(20) 42 | println("prediction" + "\t" + "label") 43 | for (i <- 0 to print_predict.length - 1) { 44 | println(print_predict(i)._1 + "\t" + print_predict(i)._2) 45 | } 46 | 47 | // 误差计算 48 | val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count() 49 | println("Area under ROC = " + accuracy) 50 | 51 | //保存模型 52 | /*val ModelPath = "D://sparkmllibData/sparkml/mllibdata/svm_model" 53 | model.save(sc, ModelPath) 54 | val sameModel = SVMModel.load(sc, ModelPath)*/ 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/Test.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | /** 6 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 7 | * Authors: libin <2578858653@qq.com> 8 | *

9 | * Purpose : 10 | */ 11 | object Test { 12 | def main(args: Array[String]) { 13 | val conf: SparkConf = new SparkConf().setMaster("local").setAppName("test") 14 | val sc: SparkContext = new SparkContext(conf) 15 | val rdd1 = sc.parallelize(List(('a', 2), ('b', 4), ('c', 6), ('d', 9))) 16 | val rdd2 = sc.parallelize(List(('c', 6), ('c', 7), ('d', 8), ('e', 10))) 17 | val unionrdd = rdd1 union rdd2 18 | //rdd1.coalesce() 19 | unionrdd.foreach(println) 20 | 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/Tfidf.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.spark.mllib.feature.{IDFModel, IDF, HashingTF} 4 | import org.apache.spark.mllib.linalg.Vector 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.{SparkContext, SparkConf} 7 | 8 | /** 9 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 10 | * Authors: libin <2578858653@qq.com> 11 | *

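 * TF-IDF pipeline: HashingTF hashes each tab-split document into a term-frequency vector, IDF is fitted on those vectors, and idf.transform rescales them into TF-IDF features.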
12 | * Purpose : 13 | */ 14 | object Tfidf { 15 | def main(args: Array[String]) { 16 | val conf: SparkConf = new SparkConf() 17 | .setMaster("local") 18 | .setAppName("tf_idf") 19 | val sc: SparkContext = new SparkContext(conf) 20 | //读取数据 21 | val document = sc.textFile("D://sparkmllibData/sparkml/mllibdata/tf_idf.txt").map(_.split("\t").toSeq) 22 | //创建TF计算实例 23 | val hashingTF = new HashingTF() 24 | //计算文档TF值 25 | val tf: RDD[Vector] = hashingTF.transform(document).cache() 26 | tf.foreach(println) 27 | //创建IDF实例并计算 28 | val idf: IDFModel = new IDF().fit(tf) 29 | println(idf) 30 | //计算TF_IDF词频 31 | val tf_idf: RDD[Vector] = idf.transform(tf) 32 | tf_idf.foreach(println) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/TfidfWord2vec.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | /** 4 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 5 | * Authors: libin <2578858653@qq.com> 6 | *

7 | * Purpose : 8 | */ 9 | object TfidfWord2vec { 10 | 11 | } 12 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/Tree.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.mllib.tree.DecisionTree 6 | import org.apache.spark.mllib.util.MLUtils 7 | 8 | /** 9 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 10 | * Authors: libin <2578858653@qq.com> 11 | *

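 * Binary DecisionTree classifier (gini impurity, maxDepth 3, maxBins 32) trained on LIBSVM data with a 70/30 split; prints sample predictions, the test error and the learned tree via toDebugString.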
12 | * Purpose : 13 | */ 14 | object Tree { 15 | def main(args: Array[String]) { 16 | //1 构建Spark对象 17 | val conf = new SparkConf() 18 | .setAppName("DecisionTree") 19 | .setMaster("local") 20 | val sc = new SparkContext(conf) 21 | Logger.getRootLogger.setLevel(Level.WARN) 22 | 23 | // 读取样本数据1,格式为LIBSVM format 24 | val data = MLUtils.loadLibSVMFile(sc, "D://sparkmllibData/sparkml/mllibdata/sample_libsvm_data.txt") 25 | // Split the data into training and test sets (30% held out for testing) 26 | val splits = data.randomSplit(Array(0.7, 0.3)) 27 | val (trainingData, testData) = (splits(0), splits(1)) 28 | 29 | // 新建决策树 30 | val numClasses = 2 //设定分类数量 31 | val categoricalFeaturesInfo = Map[Int, Int]() //设定输入格式 32 | val impurity = "gini" //设定信息增益计算方式 33 | val maxDepth = 3 //设定树高度 34 | val maxBins = 32 //设定分裂数据集 35 | 36 | //建立模型 37 | val model = DecisionTree.trainClassifier(trainingData, numClasses, categoricalFeaturesInfo, 38 | impurity, maxDepth, maxBins) 39 | 40 | // 实际值 预测值 41 | val labelAndPreds = testData.map { point => 42 | val prediction = model.predict(point.features) 43 | (point.label, prediction) 44 | } 45 | val print_predict = labelAndPreds.take(20) 46 | println("label" + "\t" + "prediction") 47 | for (i <- print_predict.indices) { 48 | println(print_predict(i)._1 + "\t" + print_predict(i)._2) 49 | } 50 | //计算误差 51 | val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count() 52 | println("Test Error = " + testErr) 53 | println("Learned classification tree model:\n" + model.toDebugString) 54 | 55 | // 保存模型 56 | /*val ModelPath = "D://sparkmllibData/sparkml/mllibdata/Decision_Tree_Model" 57 | model.save(sc, ModelPath) 58 | val sameModel = DecisionTreeModel.load(sc, ModelPath)*/ 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/VectorDemo.scala: -------------------------------------------------------------------------------- 1 | package com.libin 2 | 3 | import org.apache.spark.mllib.linalg.{Matrices, Vector, Vectors} 4 | import org.apache.spark.mllib.regression.LabeledPoint 5 | 6 | /** 7 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 8 | * Authors: libin <2578858653@qq.com> 9 | *

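 * MLlib local data types: a dense and a sparse Vector wrapped in LabeledPoint, plus a 2x3 local dense Matrix (Matrices.dense expects its values in column-major order).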
10 | * Purpose : 11 | */ 12 | object VectorDemo { 13 | def main(args: Array[String]) { 14 | //建立密集向量 15 | val vd: Vector = Vectors.dense(9, 5, 2, 7) 16 | val pos = LabeledPoint(1, vd) 17 | println(pos.features) 18 | println(pos.label) 19 | //println(vd(2)) 20 | //建立稀疏向量 21 | val vs: Vector = Vectors.sparse(4, Array(0, 1, 2, 3), Array(9, 5, 2, 7)) 22 | val neg = LabeledPoint(0, vs) 23 | println(neg.features) 24 | println(neg.label) 25 | //println(vs(2)) 26 | 27 | /*val conf: SparkConf = new SparkConf() 28 | .setAppName("vector") 29 | .setMaster("local") 30 | val sc: SparkContext = new SparkContext(conf) 31 | val mu = MLUtils.loadLibSVMFile(sc,"D://sparkmllibData/sparkml/mllibdata/vectors.txt") 32 | mu.foreach(println)*/ 33 | //本地矩阵 34 | val mx = Matrices.dense(2, 3, Array(1, 2, 3, 4, 5, 6)) 35 | println(mx) 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/kaggle/kaggle_digit_recognizer_data.scala: -------------------------------------------------------------------------------- 1 | package com.libin.kaggle 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.{SparkContext, SparkConf} 5 | 6 | /** 7 | * Copyright (c) 2017/06/26. xixi Inc. All Rights Reserved. 8 | * Authors: libin <2578858653@qq.com> 9 | *

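 * Turns a saved prediction file into the Kaggle Digit Recognizer submission format by prefixing each line with a 1-based ImageId and writing a single output file.
 * The "ImageId,Label" header still has to be added manually, and the mutable counter inside map() only yields correct sequential ids when the input is a single partition.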
10 | * Purpose : 11 | */ 12 | object kaggle_digit_recognizer_data { 13 | def main(args: Array[String]) { 14 | //构建Spark对象 15 | val conf = new SparkConf() 16 | .setAppName("kaggle_digit_recognizer_data") 17 | .setMaster("local") 18 | .set("spark.driver.memory", "2G") 19 | val sc = new SparkContext(conf) 20 | Logger.getRootLogger.setLevel(Level.WARN) 21 | 22 | //将预测结果转换成kaggle识别的格式 , 第一行还要加上 ImageId,Label 23 | var count = 0 24 | val train_data = sc.textFile("E://_deeplearning/Digit-Recognizer-Kaggle-master/data/prediction_rf/part-00000") 25 | .map(line => { 26 | count += 1 27 | count + "," + line 28 | }) 29 | .repartition(1).saveAsTextFile("E://_deeplearning/Digit-Recognizer-Kaggle-master/data/result_rf") 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/scala/AaidTest.scala: -------------------------------------------------------------------------------- 1 | package com.libin.scala 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | import scala.collection.mutable 7 | import scala.collection.JavaConverters._ 8 | 9 | /** 10 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 11 | * Authors: libin <2578858653@qq.com> 12 | *

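 * Variant of the aggregateByKey experiment that reuses seqOp as both the sequence and the combine function and keeps only the keys whose aggregated value is not 1L.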
13 | * Purpose : 14 | */ 15 | object AaidTest { 16 | def main(args: Array[String]): Unit = { 17 | val conf = new SparkConf().setAppName("AaidTest").setMaster("local") 18 | val sc = new SparkContext(conf) 19 | 20 | sc.textFile("D://sparkmllibData/sparkml/mllibdata/arrregation.txt") 21 | .map(line => { 22 | (line.split("\t")(0), line.split("\t")(1).toLong) 23 | }).aggregateByKey(0L)(seqOp, seqOp) 24 | .filter(line => line._2 != 1L) 25 | .collect().foreach(println) 26 | 27 | } 28 | 29 | def seqOp(U: Long, v: Long): Long = { 30 | println("seqOp") 31 | println("U=" + U) 32 | println("v=" + v) 33 | var count: Int = 0 34 | if (U != 0L) { 35 | count += 1 36 | } 37 | if (v != 0L) { 38 | count += 1 39 | } 40 | if (count > 1) { 41 | 1L 42 | } else { 43 | v 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /spark-mllib/src/main/scala/com/libin/scala/AggredateTest.scala: -------------------------------------------------------------------------------- 1 | package com.libin.scala 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | /** 6 | * Copyright (c) 2018/09/03. xixi Inc. All Rights Reserved. 7 | * Authors: libin <2578858653@qq.com> 8 | *

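 * aggregateByKey with zero value 5: seq keeps the max of the running value and each element within a partition, comb adds the per-partition maxima, so the result depends on how the four pairs are partitioned.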
9 | * Purpose : 10 | */ 11 | object AggredateTest { 12 | def main(args: Array[String]) { 13 | val conf = new SparkConf().setMaster("local").setAppName("aggredate_test") 14 | val sc = new SparkContext(conf) 15 | val data = sc.parallelize(List((1, 3), (1, 2), (1, 4), (2, 3))) 16 | data.aggregateByKey(5)(seq, comb).collect.foreach(println) 17 | } 18 | 19 | def seq(a: Int, b: Int): Int = { 20 | println("seq: " + a + "\t " + b) 21 | math.max(a, b) 22 | } 23 | 24 | def comb(a: Int, b: Int): Int = { 25 | println("comb: " + a + "\t " + b) 26 | a + b 27 | } 28 | } 29 | --------------------------------------------------------------------------------