├── _config.yml
├── imgs
    ├── mammut.png
    ├── spark_core
    │   ├── memory.png
    │   ├── rdd-itr.png
    │   ├── rdd-loop.png
    │   ├── shuffle.png
    │   ├── wc-trans.png
    │   ├── rdd-feature.png
    │   ├── spark-eco.png
    │   ├── wordcount.png
    │   ├── dependencies.png
    │   ├── rdd-inmemory.png
    │   ├── object-lifetime.png
    │   ├── repartition-less2more.png
    │   ├── repartition-more2less.png
    │   └── context_cleaner
    │   │   ├── jobs_tab_cached_rdd.png
    │   │   └── storage_tab_cached_rdd.jpg
    └── spark_basics
    │   ├── spark-stack.png
    │   └── sparkcontext-services.png
├── .github
    └── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
├── .travis.yml
├── .gitignore
├── example
    ├── src
    │   └── main
    │   │   ├── scala
    │   │       └── com
    │   │       │   └── netease
    │   │       │       └── bigdata
    │   │       │           └── spark
    │   │       │               ├── WordCount.scala
    │   │       │               └── rdd
    │   │       │                   └── RDDCacheTest.scala
    │   │   └── java
    │   │       └── com
    │   │           └── netease
    │   │               └── bigdata
    │   │                   └── hadoop
    │   │                       └── WordCount.java
    └── pom.xml
├── README.md
├── scalastyle-config.xml
├── pom.xml
└── slides
    ├── spark_core
        ├── context_cleaner.html
        └── rdd_basics.html
    └── spark_basics
        └── spark_basics_and_quick_start.html


/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-leap-day
2 | 


--------------------------------------------------------------------------------
/imgs/mammut.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/mammut.png


--------------------------------------------------------------------------------
/imgs/spark_core/memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/memory.png


--------------------------------------------------------------------------------
/imgs/spark_core/rdd-itr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/rdd-itr.png


--------------------------------------------------------------------------------
/imgs/spark_core/rdd-loop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/rdd-loop.png


--------------------------------------------------------------------------------
/imgs/spark_core/shuffle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/shuffle.png


--------------------------------------------------------------------------------
/imgs/spark_core/wc-trans.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/wc-trans.png


--------------------------------------------------------------------------------
/imgs/spark_core/rdd-feature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/rdd-feature.png


--------------------------------------------------------------------------------
/imgs/spark_core/spark-eco.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/spark-eco.png


--------------------------------------------------------------------------------
/imgs/spark_core/wordcount.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/wordcount.png


--------------------------------------------------------------------------------
/imgs/spark_basics/spark-stack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_basics/spark-stack.png


--------------------------------------------------------------------------------
/imgs/spark_core/dependencies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/dependencies.png


--------------------------------------------------------------------------------
/imgs/spark_core/rdd-inmemory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/rdd-inmemory.png


--------------------------------------------------------------------------------
/imgs/spark_core/object-lifetime.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/object-lifetime.png


--------------------------------------------------------------------------------
/imgs/spark_core/repartition-less2more.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/repartition-less2more.png


--------------------------------------------------------------------------------
/imgs/spark_core/repartition-more2less.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/repartition-more2less.png


--------------------------------------------------------------------------------
/imgs/spark_basics/sparkcontext-services.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_basics/sparkcontext-services.png


--------------------------------------------------------------------------------
/imgs/spark_core/context_cleaner/jobs_tab_cached_rdd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/context_cleaner/jobs_tab_cached_rdd.png


--------------------------------------------------------------------------------
/imgs/spark_core/context_cleaner/storage_tab_cached_rdd.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/netease-bigdata/ne-spark-courseware/HEAD/imgs/spark_core/context_cleaner/storage_tab_cached_rdd.jpg


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | 
 5 | ---
 6 | 
 7 | **Describe the bug**
 8 | A clear and concise description of what the bug is.
 9 | 
10 | **Expected behavior**
11 | A clear and concise description of what you expected to happen.
12 | 
13 | **Screenshots**
14 | If applicable, add screenshots to help explain your problem.
15 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | 
 5 | ---
 6 | 
 7 | ## 题名
 8 | 【在这里描述你的主题】
 9 | ## 摘要
10 | 【概要性描述你的主题内容】
11 | ## 大纲
12 | - 自我介绍
13 | - 目录
14 | - 主题一
15 |    - 主题一-1
16 |    - 主题一-2
17 |    - 主题一-3
18 | - 主题二
19 |    - 主题二-1
20 |    - 主题二-2
21 | - 主题三
22 |    - 主题三-1 
23 | - 结尾
24 | 
25 | ## 附录
26 | 【在这里描述你的主题涵盖的其他信息】
27 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: scala
 2 | scala:
 3 |   - 2.11.8
 4 | 
 5 | cache:
 6 |   directories:
 7 |     - $HOME/.m2
 8 | 
 9 | deploy:
10 |   provider: pages
11 |   skip_cleanup: true
12 |   github_token: $GITHUB_TOKEN
13 |   email: yaooqinn@hotmail.com
14 |   name: Kent Yao
15 |   on:
16 |     branch: master
17 | 
18 | script:
19 |   - mvn package -q -Dmaven.javadoc.skip=true -B -V
20 | 
21 | notifications:
22 |   email: false
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *#*#
 2 | *.#*
 3 | *.iml
 4 | *.ipr
 5 | *.iws
 6 | *.pyc
 7 | *.pyo
 8 | *.swp
 9 | *~
10 | .DS_Store
11 | .cache
12 | .classpath
13 | .ensime
14 | .ensime_cache/
15 | .ensime_lucene
16 | .generated-mima*
17 | .idea/
18 | .idea_modules/
19 | .project
20 | .pydevproject
21 | .scala_dependencies
22 | .settings
23 | target/
24 | dist/
25 | kyuubi-*-bin-*
26 | *.gz
27 | logs/
28 | pid/
29 | local/
30 | out/
31 | hs_err_pid*
32 | spark-warehouse/
33 | metastore_db
34 | derby.log
35 | 
36 | 


--------------------------------------------------------------------------------
/example/src/main/scala/com/netease/bigdata/spark/WordCount.scala:
--------------------------------------------------------------------------------
 1 | package com.netease.bigdata.spark
 2 | 
 3 | import org.apache.spark.{SparkConf, SparkContext}
 4 | 
 5 | object WordCount {
 6 | 
 7 |   def main(args: Array[String]): Unit = {
 8 |     require(args.length == 1, "Usage: WordCount <input file>")
 9 |     val conf = new SparkConf().setAppName("Word Count").setMaster("local[*]")
10 |     val sparkContext = new SparkContext(conf)
11 |     val textFile = sparkContext.textFile(args(0), 2)
12 |     val words = textFile.flatMap(_.split(" "))
13 |     val ones = words.map((_, 1))
14 |     val counts = ones.reduceByKey(_ + _)
15 |     val res = counts.collect()
16 |     for ((word, count) <- res) {
17 |       println(word + ": " + count)
18 |     }
19 | 
20 |     sparkContext.stop()
21 |   }
22 | 
23 | }
24 | 


--------------------------------------------------------------------------------
/example/src/main/scala/com/netease/bigdata/spark/rdd/RDDCacheTest.scala:
--------------------------------------------------------------------------------
 1 | package com.netease.bigdata.spark.rdd
 2 | 
 3 | import org.apache.spark.{SparkConf, SparkContext}
 4 | 
 5 | import scala.util.Random
 6 | 
 7 | object RDDCacheTest {
 8 | 
 9 |   def main(args: Array[String]): Unit = {
10 |     val conf = new SparkConf()
11 |       .setAppName(getClass.getSimpleName)
12 |       .set("spark.cleaner.periodicGC.interval", "1min") // context cleaner
13 |     val sc = new SparkContext(conf)
14 |     val data = Seq.fill(1024 * 1024 * 100)(Random.nextInt(100))
15 |     val rdd1 = sc.parallelize(data, 20)
16 |     rdd1.cache() // mark rdd 1 cache
17 |     val rdd2 = rdd1.map((_, 1)).reduceByKey(_ + _) // word count
18 |     val cachedRdd2 = rdd2.cache() // cache shuffled rdd
19 |     rdd2.collect() // action actually trigger caching
20 |     rdd1.count()  // ditto
21 |     rdd2.count() // rdd reuse
22 |     cachedRdd2.count() // ditto
23 |     rdd1.map((_, 1)).reduceByKey(_ + _).take(1) // rdd 1 reuse, not rdd 2
24 |     // no rdd reuse
25 |     val rdd3 = sc.parallelize(data, 30)
26 |     rdd3.map((_, 1)).reduceByKey(_ + _).count()
27 |     10.to(20, 2).foreach { i =>
28 |       val tmp = rdd3.groupBy(_ % i)
29 |       tmp.cache().count()
30 |       if (i % 3 == 0) tmp.take(1)
31 |     }
32 |     Thread.sleep(1000 * 60 * 10)
33 |     sc.stop()
34 |   }
35 | }
36 | 


--------------------------------------------------------------------------------
/example/src/main/java/com/netease/bigdata/hadoop/WordCount.java:
--------------------------------------------------------------------------------
 1 | package com.netease.bigdata.hadoop;
 2 | 
 3 | import org.apache.hadoop.conf.Configuration;
 4 | import org.apache.hadoop.fs.Path;
 5 | import org.apache.hadoop.io.IntWritable;
 6 | import org.apache.hadoop.io.LongWritable;
 7 | import org.apache.hadoop.io.Text;
 8 | import org.apache.hadoop.mapreduce.Job;
 9 | import org.apache.hadoop.mapreduce.Mapper;
10 | import org.apache.hadoop.mapreduce.Reducer;
11 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
12 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
13 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
14 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
15 | 
16 | import java.io.IOException;
17 | import java.util.StringTokenizer;
18 | 
19 | public class WordCount {
20 | 
21 |     public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
22 |         private final static IntWritable one = new IntWritable(1);
23 |         private Text word = new Text();
24 | 
25 |         public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
26 |             String line = value.toString();
27 |             StringTokenizer tokenizer = new StringTokenizer(line);
28 |             while (tokenizer.hasMoreTokens()) {
29 |                 word.set(tokenizer.nextToken());
30 |                 context.write(word, one);
31 |             }
32 |         }
33 |     }
34 | 
35 |     public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
36 |         public void reduce(Text key, Iterable<IntWritable> values, Context context)
37 |                 throws IOException, InterruptedException {
38 |             int sum = 0;
39 |             for (IntWritable val : values) {
40 |                 sum += val.get();
41 |             }
42 |             context.write(key, new IntWritable(sum));
43 |         }
44 |     }
45 | 
46 |     public static void main(String[] args) throws Exception {
47 |         Configuration conf = new Configuration();
48 |         Job job = new Job(conf, "wordcount");
49 |         job.setOutputKeyClass(Text.class);
50 |         job.setOutputValueClass(IntWritable.class);
51 |         job.setMapperClass(Map.class);
52 |         job.setReducerClass(Reduce.class);
53 |         job.setInputFormatClass(TextInputFormat.class);
54 |         job.setOutputFormatClass(TextOutputFormat.class);
55 |         FileInputFormat.addInputPath(job, new Path(args[0]));
56 |         FileOutputFormat.setOutputPath(job, new Path(args[1]));
57 |         job.waitForCompletion(true);
58 |     }
59 | }
60 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NetEase Spark Courses [![HitCount](http://hits.dwyl.io/netease-bigdata/ne-spark-courseware.svg)](http://hits.dwyl.io/netease-bigdata/ne-spark-courseware)
 2 | 
 3 | 本项目旨在指导相关的用户在使用[网易猛犸大数据平台](https://bigdata.163yun.com/mammut)的过程中，能够更加方便地使用Apache Spark进行日常的数据开发工作。
 4 | 
 5 | 
 6 | ## 一、基础知识
 7 | #### 1. [Spark概述及快速入门指南](https://netease-bigdata.github.io/ne-spark-courseware/slides/spark_basics/spark_basics_and_quick_start.html#1)  
 8 | #### 2. [基于Maven在IDE中开发Spark应用]()
 9 | 
10 | ## 二、 Spark Core
11 | #### 1. [Spark RDD概述](https://netease-bigdata.github.io/ne-spark-courseware/slides/spark_core/rdd_basics.html#1)
12 | #### 2. [Spark垃圾回收机制 -- ContextCleaner](https://netease-bigdata.github.io/ne-spark-courseware/slides/spark_core/context_cleaner.html#1)  
13 | #### [Spark On YARN]()
14 | 
15 | ## 三、 Spark SQL
16 | #### [DataFrame/Dataset]()
17 | #### [Spark SQL与Hive集成]()   
18 | #### [Spark SQL UDF]()
19 | #### [如何优化Spark SQL执行过程]()  
20 | #### [Spark SQL Catalyst工作原理详解]()  
21 | #### [Spark SQL Cost Based Optimization详解]()  
22 | #### [Spark SQL Thrift Server详解]()  
23 | #### [Spark SQL 操作各种数据源]()  
24 | #### [Spark SQL 参数详解及调优]()
25 | 
26 | ## 四、 Spark Streaming
27 | #### [大数据处理的类型、流计算的框架及内容概要]()  
28 | #### [SparkStreaming是什么及数据处理流程]()
29 | #### [Spark Streaming集成Kafka]()
30 | #### [Spark Streaming集成Flume]()
31 | 
32 | 
33 | ## 五、 Spark Structured Streaming
34 | #### [Spark Structured Streaming Basics](https://yaooqinn.github.io/sugar/slides/StructuedStreamingBasics.html#1) 
35 | 
36 | ## 六、 Spark Machine Learning
37 | 
38 | ## 七、 Spark GraphX
39 | 
40 | ## 八、 R on Spark
41 | 
42 | ## 九、 Mammut Spark 数据开发
43 | #### [如何使用猛犸Spark进行数据开发]() 
44 | #### [如何使用猛犸进行ETL开发]()
45 | #### [如何使用猛犸进行Spark Streaming任务开发及调优]()
46 | 
47 | ## 十、 Mammut Spark 自助分析
48 | 
49 | ## 十一、 Spark 参数详解
50 | 
51 | ## 十二、 其他
52 | - DataSourceV2
53 |     - [DataSourceV2 Overview](https://yaooqinn.github.io/sugar/docs/spark/datasourcev2/1_start_from_the_jira.html) - 范文臣大神[SPIP: DataSource API V2](https://docs.google.com/document/d/1n_vUVbF4KD3gxTmkNEon5qdQ-Z8qU5Frf6WMQZ6jJVM/edit#heading=h.mi1fbff5f8f9)读后感
54 | 
55 | ---
56 | 
57 | ## 推广链接 
58 | [Kyuubi](https://github.com/yaooqinn/kyuubi) 基于Spark实现的多租户SQL Thrift/JDBC/ODBC服务 [![codecov](https://codecov.io/gh/yaooqinn/kyuubi/branch/master/graph/badge.svg)](https://codecov.io/gh/yaooqinn/kyuubi) [![Build Status](https://travis-ci.org/yaooqinn/kyuubi.svg?branch=master)](https://travis-ci.org/yaooqinn/kyuubi)[![HitCount](http://hits.dwyl.io/yaooqinn/kyuubi.svg)](http://hits.dwyl.io/yaooqinn/kyuubi) 
59 | 
60 | [spark-authorizer](https://github.com/yaooqinn/spark-authorizer) 提供Spark SQL权限控制能力的插件 [![Build Status](https://travis-ci.org/yaooqinn/spark-authorizer.svg?branch=master)](https://travis-ci.org/yaooqinn/spark-authorizer)
61 | 


--------------------------------------------------------------------------------
/example/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |     <parent>
  6 |         <artifactId>spark-courseware</artifactId>
  7 |         <groupId>com.netease.bigdata</groupId>
  8 |         <version>1.0.0-SNAPSHOT</version>
  9 |         <relativePath>../pom.xml</relativePath>
 10 |     </parent>
 11 |     <modelVersion>4.0.0</modelVersion>
 12 | 
 13 |     <artifactId>example</artifactId>
 14 |     <name>Examples</name>
 15 |     <packaging>jar</packaging>
 16 | 
 17 |     <dependencies>
 18 |         <dependency>
 19 |             <groupId>org.scala-lang</groupId>
 20 |             <artifactId>scala-library</artifactId>
 21 |         </dependency>
 22 |         <dependency>
 23 |             <groupId>org.apache.hadoop</groupId>
 24 |             <artifactId>hadoop-mapreduce-client-core</artifactId>
 25 |         </dependency>
 26 |         <dependency>
 27 |             <groupId>org.apache.hadoop</groupId>
 28 |             <artifactId>hadoop-client</artifactId>
 29 |         </dependency>
 30 | 
 31 |         <dependency>
 32 |             <groupId>org.apache.spark</groupId>
 33 |             <artifactId>spark-core_${scala.binary.version}</artifactId>
 34 |         </dependency>
 35 |     </dependencies>
 36 | 
 37 |     <build>
 38 |         <plugins>
 39 |             <plugin>
 40 |                 <groupId>org.apache.maven.plugins</groupId>
 41 |                 <artifactId>maven-compiler-plugin</artifactId>
 42 |                 <version>3.5.1</version>
 43 |                 <configuration>
 44 |                     <source>${java.version}</source>
 45 |                     <target>${java.version}</target>
 46 |                     <encoding>UTF-8</encoding>
 47 |                     <maxmem>1024m</maxmem>
 48 |                     <fork>true</fork>
 49 |                     <compilerArgs>
 50 |                         <arg>-Xlint:all,-serial,-path</arg>
 51 |                     </compilerArgs>
 52 |                 </configuration>
 53 |             </plugin>
 54 | 
 55 |             <plugin>
 56 |                 <groupId>net.alchim31.maven</groupId>
 57 |                 <artifactId>scala-maven-plugin</artifactId>
 58 |                 <version>3.3.1</version>
 59 |                 <executions>
 60 |                     <execution>
 61 |                         <id>eclipse-add-source</id>
 62 |                         <goals>
 63 |                             <goal>add-source</goal>
 64 |                         </goals>
 65 |                     </execution>
 66 |                     <execution>
 67 |                         <id>scala-compile-first</id>
 68 |                         <goals>
 69 |                             <goal>compile</goal>
 70 |                         </goals>
 71 |                     </execution>
 72 |                     <execution>
 73 |                         <id>scala-test-compile-first</id>
 74 |                         <goals>
 75 |                             <goal>testCompile</goal>
 76 |                         </goals>
 77 |                     </execution>
 78 |                 </executions>
 79 |                 <configuration>
 80 |                     <scalaVersion>${scala.version}</scalaVersion>
 81 |                     <recompileMode>incremental</recompileMode>
 82 |                     <useZincServer>true</useZincServer>
 83 |                     <args>
 84 |                         <arg>-unchecked</arg>
 85 |                         <arg>-deprecation</arg>
 86 |                         <arg>-feature</arg>
 87 |                         <arg>-explaintypes</arg>
 88 |                         <arg>-Yno-adapted-args</arg>
 89 |                     </args>
 90 |                     <jvmArgs>
 91 |                         <jvmArg>-Xms1024m</jvmArg>
 92 |                         <jvmArg>-Xmx1024m</jvmArg>
 93 |                         <jvmArg>-XX:ReservedCodeCacheSize=512M</jvmArg>
 94 |                     </jvmArgs>
 95 |                     <javacArgs>
 96 |                         <javacArg>-source</javacArg>
 97 |                         <javacArg>${java.version}</javacArg>
 98 |                         <javacArg>-target</javacArg>
 99 |                         <javacArg>${java.version}</javacArg>
100 |                         <javacArg>-Xlint:all,-serial,-path,-try</javacArg>
101 |                     </javacArgs>
102 |                 </configuration>
103 |             </plugin>
104 |         </plugins>
105 |     </build>
106 | </project>


--------------------------------------------------------------------------------
/scalastyle-config.xml:
--------------------------------------------------------------------------------
  1 | <!--
  2 |   ~ Licensed to the Apache Software Foundation (ASF) under one or more
  3 |   ~ contributor license agreements.  See the NOTICE file distributed with
  4 |   ~ this work for additional information regarding copyright ownership.
  5 |   ~ The ASF licenses this file to You under the Apache License, Version 2.0
  6 |   ~ (the "License"); you may not use this file except in compliance with
  7 |   ~ the License.  You may obtain a copy of the License at
  8 |   ~
  9 |   ~    http://www.apache.org/licenses/LICENSE-2.0
 10 |   ~
 11 |   ~ Unless required by applicable law or agreed to in writing, software
 12 |   ~ distributed under the License is distributed on an "AS IS" BASIS,
 13 |   ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |   ~ See the License for the specific language governing permissions and
 15 |   ~ limitations under the License.
 16 |   -->
 17 | <!--
 18 | 
 19 | If you wish to turn off checking for a section of code, you can put a comment in the source
 20 | before and after the section, with the following syntax:
 21 | 
 22 |   // scalastyle:off
 23 |   ...  // stuff that breaks the styles
 24 |   // scalastyle:on
 25 | 
 26 | You can also disable only one rule, by specifying its rule id, as specified in:
 27 |   http://www.scalastyle.org/rules-0.7.0.html
 28 | 
 29 |   // scalastyle:off no.finalize
 30 |   override def finalize(): Unit = ...
 31 |   // scalastyle:on no.finalize
 32 | 
 33 | This file is divided into 3 sections:
 34 |  (1) rules that we enforce.
 35 |  (2) rules that we would like to enforce, but haven't cleaned up the codebase to turn on yet
 36 |      (or we need to make the scalastyle rule more configurable).
 37 |  (3) rules that we don't want to enforce.
 38 | -->
 39 | 
 40 | <scalastyle>
 41 |   <name>Scalastyle standard configuration</name>
 42 | 
 43 |   <!-- ================================================================================ -->
 44 |   <!--                               rules we enforce                                   -->
 45 |   <!-- ================================================================================ -->
 46 | 
 47 |   <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"/>
 48 | 
 49 |   <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"/>
 50 | 
 51 |   <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"/>
 52 | 
 53 |   <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"/>
 54 | 
 55 |   <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
 56 |     <parameters>
 57 |       <parameter name="maxLineLength"><![CDATA[100]]></parameter>
 58 |       <parameter name="tabSize"><![CDATA[2]]></parameter>
 59 |       <parameter name="ignoreImports">true</parameter>
 60 |     </parameters>
 61 |   </check>
 62 | 
 63 |   <check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true">
 64 |     <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters>
 65 |   </check>
 66 | 
 67 |   <check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true">
 68 |     <parameters><parameter name="regex"><![CDATA[(config|[A-Z][A-Za-z]*)]]></parameter></parameters>
 69 |   </check>
 70 | 
 71 |   <check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true">
 72 |     <parameters><parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter></parameters>
 73 |   </check>
 74 | 
 75 |   <check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true">
 76 |     <parameters><parameter name="maxParameters"><![CDATA[10]]></parameter></parameters>
 77 |   </check>
 78 | 
 79 |   <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"/>
 80 | 
 81 |   <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"/>
 82 | 
 83 |   <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"/>
 84 | 
 85 |   <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"/>
 86 | 
 87 |   <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true">
 88 |     <parameters>
 89 |       <parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
 90 |       <parameter name="doubleLineAllowed"><![CDATA[true]]></parameter>
 91 |     </parameters>
 92 |   </check>
 93 | 
 94 |   <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="true"/>
 95 | 
 96 |   <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"/>
 97 | 
 98 |   <check customId="nonascii" level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"/>
 99 | 
100 |   <check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"/>
101 | 
102 |   <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true">
103 |    <parameters>
104 |      <parameter name="tokens">ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
105 |    </parameters>
106 |   </check>
107 | 
108 |   <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true">
109 |     <parameters>
110 |      <parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
111 |     </parameters>
112 |   </check>
113 | 
114 |   <!-- ??? usually shouldn't be checked into the code base. -->
115 |   <check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"/>
116 | 
117 |   <check customId="runtimeaddshutdownhook" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
118 |     <parameters><parameter name="regex">Runtime\.getRuntime\.addShutdownHook</parameter></parameters>
119 |     <customMessage><![CDATA[
120 |       Are you sure that you want to use Runtime.getRuntime.addShutdownHook? In most cases, you should use
121 |       ShutdownHookManager.addShutdownHook instead.
122 |       If you must use Runtime.getRuntime.addShutdownHook, wrap the code block with
123 |       // scalastyle:off runtimeaddshutdownhook
124 |       Runtime.getRuntime.addShutdownHook(...)
125 |       // scalastyle:on runtimeaddshutdownhook
126 |     ]]></customMessage>
127 |   </check>
128 | 
129 |   <check customId="mutablesynchronizedbuffer" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
130 |     <parameters><parameter name="regex">mutable\.SynchronizedBuffer</parameter></parameters>
131 |     <customMessage><![CDATA[
132 |       Are you sure that you want to use mutable.SynchronizedBuffer? In most cases, you should use
133 |       java.util.concurrent.ConcurrentLinkedQueue instead.
134 |       If you must use mutable.SynchronizedBuffer, wrap the code block with
135 |       // scalastyle:off mutablesynchronizedbuffer
136 |       mutable.SynchronizedBuffer[...]
137 |       // scalastyle:on mutablesynchronizedbuffer
138 |     ]]></customMessage>
139 |   </check>
140 | 
141 |   <check customId="awaitready" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
142 |     <parameters><parameter name="regex">Await\.ready</parameter></parameters>
143 |     <customMessage><![CDATA[
144 |       Are you sure that you want to use Await.ready? In most cases, you should use ThreadUtils.awaitReady instead.
145 |       If you must use Await.ready, wrap the code block with
146 |       // scalastyle:off awaitready
147 |       Await.ready(...)
148 |       // scalastyle:on awaitready
149 |     ]]></customMessage>
150 |   </check>
151 | 
152 |   <check customId="javaconversions" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
153 |     <parameters><parameter name="regex">JavaConversions</parameter></parameters>
154 |     <customMessage>Instead of importing implicits in scala.collection.JavaConversions._, import
155 |     scala.collection.JavaConverters._ and use .asScala / .asJava methods</customMessage>
156 |   </check>
157 | 
158 |   <check customId="commonslang2" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
159 |     <parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters>
160 |     <customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead
161 |     of Commons Lang 2 (package org.apache.commons.lang.*)</customMessage>
162 |   </check>
163 | 
164 |   <check customId="extractopt" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
165 |     <parameters><parameter name="regex">extractOpt</parameter></parameters>
166 |     <customMessage>Use Utils.jsonOption(x).map(.extract[T]) instead of .extractOpt[T], as the latter
167 |     is slower.  </customMessage>
168 |   </check>
169 | 
170 |   <check level="error" class="org.scalastyle.scalariform.ImportOrderChecker" enabled="true">
171 |     <parameters>
172 |       <parameter name="groups">java,scala,3rdParty,yaooqinn</parameter>
173 |       <parameter name="group.java">javax?\..*</parameter>
174 |       <parameter name="group.scala">scala\..*</parameter>
175 |       <parameter name="group.3rdParty">(?!yaooqinn).*</parameter>
176 |     </parameters>
177 |   </check>
178 | 
179 |   <check level="error" class="org.scalastyle.scalariform.DisallowSpaceBeforeTokenChecker" enabled="true">
180 |     <parameters>
181 |       <parameter name="tokens">COMMA</parameter>
182 |     </parameters>
183 |   </check>
184 | 
185 |   <check customId="SingleSpaceBetweenRParenAndLCurlyBrace" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
186 |     <parameters><parameter name="regex">\)\{</parameter></parameters>
187 |     <customMessage><![CDATA[
188 |       Single Space between ')' and `{`.
189 |     ]]></customMessage>
190 |   </check>
191 | 
192 |   <check customId="NoScalaDoc" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
193 |     <parameters><parameter name="regex">(?m)^(\s*)/[*][*].*$(\r|)\n^\1  [*]</parameter></parameters>
194 |     <customMessage>Use Javadoc style indentation for multiline comments</customMessage>
195 |   </check>
196 | 
197 |   <check customId="OmitBracesInCase" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
198 |     <parameters><parameter name="regex">case[^\n>]*=>\s*\{</parameter></parameters>
199 |     <customMessage>Omit braces in case clauses.</customMessage>
200 |   </check>
201 | 
202 |   <check level="error" class="org.scalastyle.scalariform.DeprecatedJavaChecker" enabled="true"/>
203 | 
204 |   <!-- ================================================================================ -->
205 |   <!--       rules we'd like to enforce, but haven't cleaned up the codebase yet        -->
206 |   <!-- ================================================================================ -->
207 | 
208 |   <!-- We cannot turn the following two on, because it'd fail a lot of string interpolation use cases. -->
209 |   <!-- Ideally the following two rules should be configurable to rule out string interpolation. -->
210 |   <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"/>
211 |   <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="false"/>
212 | 
213 |   <!-- This breaks symbolic method names so we don't turn it on. -->
214 |   <!-- Maybe we should update it to allow basic symbolic names, and then we are good to go. -->
215 |   <check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false">
216 |     <parameters>
217 |     <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter>
218 |     </parameters>
219 |   </check>
220 | 
221 |   <!-- Already enforced (enabled="true"): a type that defines equals or hashCode must define both -->
222 |   <check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"/>
223 | 
224 |   <!-- ================================================================================ -->
225 |   <!--                               rules we don't want                                -->
226 |   <!-- ================================================================================ -->
227 | 
228 |   <check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="false">
229 |     <parameters><parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter></parameters>
230 |   </check>
231 | 
232 |   <!-- We want the opposite of this: NewLineAtEofChecker -->
233 |   <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"/>
234 | 
235 |   <!-- This one complains about all kinds of random things. Disable. -->
236 |   <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"/>
237 | 
238 |   <!-- We use return quite a bit for control flows and guards -->
239 |   <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="false"/>
240 | 
241 |   <!-- We use null a lot in low level code and to interface with 3rd party code -->
242 |   <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="false"/>
243 | 
244 |   <!-- Doesn't seem super big deal here ... -->
245 |   <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="false"/>
246 | 
247 |   <!-- Doesn't seem super big deal here ... -->
248 |   <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="false">
249 |     <parameters><parameter name="maxFileLength">800</parameter></parameters>
250 |   </check>
251 | 
252 |   <!-- Doesn't seem super big deal here ... -->
253 |   <check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="false">
254 |     <parameters><parameter name="maxTypes">30</parameter></parameters>
255 |   </check>
256 | 
257 |   <!-- Doesn't seem super big deal here ... -->
258 |   <check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="false">
259 |     <parameters><parameter name="maximum">10</parameter></parameters>
260 |   </check>
261 | 
262 |   <!-- Doesn't seem super big deal here ... -->
263 |   <check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="false">
264 |     <parameters><parameter name="maxLength">50</parameter></parameters>
265 |   </check>
266 | 
267 |   <!-- Not exactly feasible to enforce this right now. -->
268 |   <!-- It is also infrequent that somebody introduces a new class with a lot of methods. -->
269 |   <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="false">
270 |     <parameters><parameter name="maxMethods"><![CDATA[30]]></parameter></parameters>
271 |   </check>
272 | 
273 |   <!-- Doesn't seem super big deal here, and we have a lot of magic numbers ... -->
274 |   <check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false">
275 |     <parameters><parameter name="ignore">-1,0,1,2,3</parameter></parameters>
276 |   </check>
277 | 
278 | </scalastyle>
279 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <!--
  3 |   ~ Licensed to the Apache Software Foundation (ASF) under one or more
  4 |   ~ contributor license agreements.  See the NOTICE file distributed with
  5 |   ~ this work for additional information regarding copyright ownership.
  6 |   ~ The ASF licenses this file to You under the Apache License, Version 2.0
  7 |   ~ (the "License"); you may not use this file except in compliance with
  8 |   ~ the License.  You may obtain a copy of the License at
  9 |   ~
 10 |   ~    http://www.apache.org/licenses/LICENSE-2.0
 11 |   ~
 12 |   ~ Unless required by applicable law or agreed to in writing, software
 13 |   ~ distributed under the License is distributed on an "AS IS" BASIS,
 14 |   ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |   ~ See the License for the specific language governing permissions and
 16 |   ~ limitations under the License.
 17 |   -->
 18 | 
 19 | <project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0"
 20 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 21 |     <modelVersion>4.0.0</modelVersion>
 22 | 
 23 |     <groupId>com.netease.bigdata</groupId>
 24 |     <artifactId>spark-courseware</artifactId>
 25 |     <name>Spark Courseware</name>
 26 |     <version>1.0.0-SNAPSHOT</version>
 27 |     <modules>
 28 |         <module>example</module>
 29 |     </modules>
 30 |     <packaging>pom</packaging>
 31 | 
 32 |     <licenses>
 33 |         <license>
 34 |             <name>The Apache Software License, Version 2.0</name>
 35 |             <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
 36 |             <distribution>manual</distribution>
 37 |         </license>
 38 |     </licenses>
 39 | 
 40 |     <developers>
 41 |         <developer>
 42 |             <id>yaooqinn</id>
 43 |             <name>Kent Yao</name>
 44 |             <email>yaooqinn@hotmail.com</email>
 45 |             <organization>NetEase</organization>
 46 |             <url>https://github.com/yaooqinn</url>
 47 |         </developer>
 48 |     </developers>
 49 | 
 50 |     <properties>
 51 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 52 |         <java.version>1.7</java.version>
 53 |         <scala.version>2.11.8</scala.version>
 54 |         <scalatest.version>2.2.6</scalatest.version>
 55 |         <scala.binary.version>2.11</scala.binary.version>
 56 |         <maven.version>3.3.9</maven.version>
 57 |         <spark.group>org.apache.spark</spark.group>
 58 |         <spark.version>2.1.2</spark.version>
 59 |         <spark.scope>provided</spark.scope>
 60 |         <hadoop.version>2.6.5</hadoop.version>
 61 |         <hadoop.deps.scope>provided</hadoop.deps.scope>
 62 |         <hive.group>org.spark-project.hive</hive.group>
 63 |         <hive.version>1.2.1.spark2</hive.version>
 64 |         <hive.version.short>1.2.1</hive.version.short>
 65 |         <hive.deps.scope>provided</hive.deps.scope>
 66 |         <jpam.version>1.1</jpam.version>
 67 |         <apacheds.version>2.0.0-M15</apacheds.version>
 68 |     </properties>
 69 | 
 70 |     <repositories>
 71 |         <repository>
 72 |             <id>central</id>
 73 |             <!-- This should be at top, it makes maven try the central repo first and then others and hence faster dep resolution -->
 74 |             <name>Maven Repository</name>
 75 |             <url>https://repo.maven.apache.org/maven2</url>
 76 |             <releases>
 77 |                 <enabled>true</enabled>
 78 |             </releases>
 79 |             <snapshots>
 80 |                 <enabled>false</enabled>
 81 |             </snapshots>
 82 |         </repository>
 83 |         <repository>
 84 |             <id>apache</id>
 85 |             <name>Apache Repository Snapshots</name>
 86 |             <url>https://repository.apache.org/snapshots</url>
 87 |             <releases>
 88 |                 <enabled>false</enabled>
 89 |             </releases>
 90 |             <snapshots>
 91 |                 <enabled>true</enabled>
 92 |                 <updatePolicy>daily</updatePolicy>
 93 |                 <checksumPolicy>warn</checksumPolicy>
 94 |             </snapshots>
 95 |         </repository>
 96 |     </repositories>
 97 | 
 98 |     <pluginRepositories>
 99 |         <pluginRepository>
100 |             <id>central</id>
101 |             <url>https://repo.maven.apache.org/maven2</url>
102 |             <releases>
103 |                 <enabled>true</enabled>
104 |             </releases>
105 |             <snapshots>
106 |                 <enabled>false</enabled>
107 |             </snapshots>
108 |         </pluginRepository>
109 |         <pluginRepository>
110 |             <id>apache</id>
111 |             <name>Apache Repository Snapshots</name>
112 |             <url>https://repository.apache.org/snapshots</url>
113 |             <releases>
114 |                 <enabled>false</enabled>
115 |             </releases>
116 |             <snapshots>
117 |                 <enabled>true</enabled>
118 |                 <updatePolicy>daily</updatePolicy>
119 |                 <checksumPolicy>warn</checksumPolicy>
120 |             </snapshots>
121 |         </pluginRepository>
122 |     </pluginRepositories>
123 | 
124 |     <dependencyManagement>
125 |         <dependencies>
126 |             <dependency>
127 |                 <groupId>${spark.group}</groupId>
128 |                 <artifactId>spark-yarn_${scala.binary.version}</artifactId>
129 |                 <version>${spark.version}</version>
130 |                 <scope>${spark.scope}</scope>
131 |                 <exclusions>
132 |                     <exclusion>
133 |                         <groupId>org.apache.hadoop</groupId>
134 |                         <artifactId>*</artifactId>
135 |                     </exclusion>
136 |                 </exclusions>
137 |             </dependency>
138 |             <dependency>
139 |                 <groupId>${spark.group}</groupId>
140 |                 <artifactId>spark-hive_${scala.binary.version}</artifactId>
141 |                 <version>${spark.version}</version>
142 |                 <scope>${spark.scope}</scope>
143 |             </dependency>
144 | 
145 |             <dependency>
146 |                 <groupId>${spark.group}</groupId>
147 |                 <artifactId>spark-tags_${scala.binary.version}</artifactId>
148 |                 <version>${spark.version}</version>
149 |                 <scope>${spark.scope}</scope>
150 |             </dependency>
151 | 
152 |             <dependency>
153 |                 <groupId>org.scala-lang</groupId>
154 |                 <artifactId>scala-library</artifactId>
155 |                 <version>${scala.version}</version>
156 |                 <scope>provided</scope>
157 |             </dependency>
158 | 
159 |             <dependency>
160 |                 <groupId>org.apache.hadoop</groupId>
161 |                 <artifactId>hadoop-mapreduce-client-core</artifactId>
162 |                 <version>${hadoop.version}</version>
163 |                 <scope>${hadoop.deps.scope}</scope>
164 |             </dependency>
165 | 
166 |             <dependency>
167 |                 <groupId>org.apache.hadoop</groupId>
168 |                 <artifactId>hadoop-client</artifactId>
169 |                 <version>${hadoop.version}</version>
170 |                 <scope>${hadoop.deps.scope}</scope>
171 |                 <exclusions>
172 |                     <exclusion>
173 |                         <groupId>asm</groupId>
174 |                         <artifactId>asm</artifactId>
175 |                     </exclusion>
176 |                     <exclusion>
177 |                         <groupId>org.codehaus.jackson</groupId>
178 |                         <artifactId>jackson-mapper-asl</artifactId>
179 |                     </exclusion>
180 |                     <exclusion>
181 |                         <groupId>org.ow2.asm</groupId>
182 |                         <artifactId>asm</artifactId>
183 |                     </exclusion>
184 |                     <exclusion>
185 |                         <groupId>org.jboss.netty</groupId>
186 |                         <artifactId>netty</artifactId>
187 |                     </exclusion>
188 |                     <exclusion>
189 |                         <groupId>commons-logging</groupId>
190 |                         <artifactId>commons-logging</artifactId>
191 |                     </exclusion>
192 |                     <exclusion>
193 |                         <groupId>org.mockito</groupId>
194 |                         <artifactId>mockito-all</artifactId>
195 |                     </exclusion>
196 |                     <exclusion>
197 |                         <groupId>org.mortbay.jetty</groupId>
198 |                         <artifactId>servlet-api-2.5</artifactId>
199 |                     </exclusion>
200 |                     <exclusion>
201 |                         <groupId>javax.servlet</groupId>
202 |                         <artifactId>servlet-api</artifactId>
203 |                     </exclusion>
204 |                     <exclusion>
205 |                         <groupId>junit</groupId>
206 |                         <artifactId>junit</artifactId>
207 |                     </exclusion>
208 |                     <exclusion>
209 |                         <groupId>com.sun.jersey</groupId>
210 |                         <artifactId>*</artifactId>
211 |                     </exclusion>
212 |                     <exclusion>
213 |                         <groupId>com.sun.jersey.jersey-test-framework</groupId>
214 |                         <artifactId>*</artifactId>
215 |                     </exclusion>
216 |                     <exclusion>
217 |                         <groupId>com.sun.jersey.contribs</groupId>
218 |                         <artifactId>*</artifactId>
219 |                     </exclusion>
220 |                 </exclusions>
221 |             </dependency>
222 | 
223 |             <dependency>
224 |                 <groupId>org.eclipse.jetty</groupId>
225 |                 <artifactId>jetty-servlet</artifactId>
226 |                 <version>9.3.11.v20160721</version>
227 |             </dependency>
228 | 
229 |             <dependency>
230 |                 <groupId>com.google.guava</groupId>
231 |                 <artifactId>guava</artifactId>
232 |                 <version>14.0.1</version>
233 |                 <scope>provided</scope>
234 |             </dependency>
235 | 
236 |             <dependency>
237 |                 <groupId>net.sf.jpam</groupId>
238 |                 <artifactId>jpam</artifactId>
239 |                 <version>${jpam.version}</version>
240 |                 <scope>provided</scope>
241 |             </dependency>
242 | 
243 |             <dependency>
244 |                 <groupId>org.apache.hadoop</groupId>
245 |                 <artifactId>hadoop-yarn-client</artifactId>
246 |                 <version>${hadoop.version}</version>
247 |                 <scope>${hadoop.deps.scope}</scope>
248 |             </dependency>
249 | 
250 |             <dependency>
251 |                 <groupId>org.scalatest</groupId>
252 |                 <artifactId>scalatest_${scala.binary.version}</artifactId>
253 |                 <version>${scalatest.version}</version>
254 |                 <scope>test</scope>
255 |             </dependency>
256 | 
257 |             <dependency>
258 |                 <groupId>${spark.group}</groupId>
259 |                 <artifactId>spark-core_${scala.binary.version}</artifactId>
260 |                 <version>${spark.version}</version>
261 |             </dependency>
262 | 
263 |             <dependency>
264 |                 <groupId>${spark.group}</groupId>
265 |                 <artifactId>spark-catalyst_${scala.binary.version}</artifactId>
266 |                 <version>${spark.version}</version>
267 |                 <type>test-jar</type>
268 |                 <scope>test</scope>
269 |             </dependency>
270 |             <dependency>
271 |                 <groupId>${spark.group}</groupId>
272 |                 <artifactId>spark-sql_${scala.binary.version}</artifactId>
273 |                 <version>${spark.version}</version>
274 |                 <type>test-jar</type>
275 |                 <scope>test</scope>
276 |             </dependency>
277 |             <dependency>
278 |                 <groupId>${hive.group}</groupId>
279 |                 <artifactId>hive-service</artifactId>
280 |                 <version>${hive.version}</version>
281 |                 <scope>test</scope>
282 |             </dependency>
283 | 
284 |             <dependency>
285 |                 <groupId>org.apache.hadoop</groupId>
286 |                 <artifactId>hadoop-minikdc</artifactId>
287 |                 <version>${hadoop.version}</version>
288 |                 <scope>test</scope>
289 |                 <exclusions>
290 |                     <exclusion>
291 |                         <groupId>org.apache.directory.api</groupId>
292 |                         <artifactId>api-all</artifactId>
293 |                     </exclusion>
294 |                     <exclusion>
295 |                         <groupId>org.apache.directory.jdbm</groupId>
296 |                         <artifactId>apacheds-jdbm1</artifactId>
297 |                     </exclusion>
298 |             </exclusions>
299 |             </dependency>
300 |             <dependency>
301 |                 <groupId>org.apache.directory.server</groupId>
302 |                 <artifactId>apacheds-service</artifactId>
303 |                 <version>${apacheds.version}</version>
304 |                 <scope>test</scope>
305 |                 <exclusions>
306 |                     <exclusion>
307 |                         <groupId>bouncycastle</groupId>
308 |                         <artifactId>bcprov-jdk15</artifactId>
309 |                     </exclusion>
310 |                 </exclusions>
311 |             </dependency>
312 |             <dependency>
313 |                 <groupId>org.apache.curator</groupId>
314 |                 <artifactId>curator-test</artifactId>
315 |                 <version>2.6.0</version>
316 |                 <scope>test</scope>
317 |             </dependency>
318 |             <dependency>
319 |                 <groupId>org.mockito</groupId>
320 |                 <artifactId>mockito-core</artifactId>
321 |                 <version>1.10.19</version>
322 |                 <scope>test</scope>
323 |             </dependency>
324 |         </dependencies>
325 |     </dependencyManagement>
326 | 
327 |     <profiles>
328 |         <profile>
329 |             <id>spark-2.1</id>
330 |             <properties>
331 |                 <spark.version>2.1.2</spark.version>
332 |             </properties>
333 |         </profile>
334 | 
335 |         <profile>
336 |             <id>spark-2.2</id>
337 |             <properties>
338 |                 <spark.version>2.2.1</spark.version>
339 |             </properties>
340 |         </profile>
341 | 
342 |         <profile>
343 |             <id>spark-2.3</id>
344 |             <properties>
345 |                 <spark.version>2.3.0</spark.version>
346 |                 <scalatest.version>3.0.3</scalatest.version>
347 |             </properties>
348 |         </profile>
349 |     </profiles>
350 | </project>
351 | 


--------------------------------------------------------------------------------
/slides/spark_core/context_cleaner.html:
--------------------------------------------------------------------------------
  1 | <html>
  2 |   <head>
  3 |     <title>Apache Spark Garbage Collector</title>
  4 |     <meta charset="utf-8">
  5 |     <style>
  6 |       @import url(https://fonts.googleapis.com/css?family=Yanone+Kaffeesatz);
  7 |       @import url(https://fonts.googleapis.com/css?family=Droid+Serif:400,700,400italic);
  8 |       @import url(https://fonts.googleapis.com/css?family=Ubuntu+Mono:400,700,400italic);
  9 | 
 10 |       body { font-family: 'Droid Serif'; }
 11 |       h1, h2, h3 {
 12 |         font-family: 'Yanone Kaffeesatz';
 13 |         font-weight: 400;
 14 |         margin-bottom: 0;
 15 |       }
 16 |       .remark-slide-content h1 { font-size: 3em; }
 17 |       .remark-slide-content h2 { font-size: 2em; }
 18 |       .remark-slide-content h3 { font-size: 1.6em; }
 19 |       .footnote {
 20 |         position: absolute;
 21 |         bottom: 3em;
 22 |       }
 23 |       li p { line-height: 1.25em; }
 24 |       .red { color: #fa0000; }
 25 |       .large { font-size: 2em; }
 26 |       a, a > code {
 27 |         color: rgb(249, 38, 114);
 28 |         text-decoration: none;
 29 |       }
 30 |       code {
 31 |         background: #e7e8e2;
 32 |         border-radius: 5px;
 33 |       }
 34 |       .remark-code, .remark-inline-code { font-family: 'Ubuntu Mono'; }
 35 |       .remark-code-line-highlighted     { background-color: #373832; }
 36 |       .pull-left {
 37 |         float: left;
 38 |         width: 47%;
 39 |       }
 40 |       .pull-right {
 41 |         float: right;
 42 |         width: 47%;
 43 |       }
 44 |       .pull-right ~ p {
 45 |         clear: both;
 46 |       }
 47 |       #slideshow .slide .content code {
 48 |         font-size: 0.8em;
 49 |       }
 50 |       #slideshow .slide .content pre code {
 51 |         font-size: 0.9em;
 52 |         padding: 15px;
 53 |       }
 54 |       .inverse {
 55 |         background: #272822;
 56 |         color: #777872;
 57 |         text-shadow: 0 0 20px #333;
 58 |       }
 59 |       .inverse h1, .inverse h2 {
 60 |         color: #fff;
 61 |         line-height: 0.8em;
 62 |       }
 63 | 
 64 |       /* Slide-specific styling */
 65 |       #slide-inverse .footnote {
 66 |         bottom: 12px;
 67 |         left: 20px;
 68 |       }
 69 |       #slide-how .slides {
 70 |         font-size: 0.9em;
 71 |         position: absolute;
 72 |         top:  151px;
 73 |         right: 140px;
 74 |       }
 75 |       #slide-how .slides h3 {
 76 |         margin-top: 0.2em;
 77 |       }
 78 |       #slide-how .slides .first, #slide-how .slides .second {
 79 |         padding: 1px 20px;
 80 |         height: 90px;
 81 |         width: 120px;
 82 |         -moz-box-shadow: 0 0 10px #777;
 83 |         -webkit-box-shadow: 0 0 10px #777;
 84 |         box-shadow: 0 0 10px #777;
 85 |       }
 86 |       #slide-how .slides .first {
 87 |         background: #fff;
 88 |         position: absolute;
 89 |         top: 20%;
 90 |         left: 20%;
 91 |         z-index: 1;
 92 |       }
 93 |       #slide-how .slides .second {
 94 |         position: relative;
 95 |         background: #fff;
 96 |         z-index: 0;
 97 |       }
 98 | 
 99 |       /* Two-column layout */
100 |       .left-column {
101 |         color: #777;
102 |         width: 20%;
103 |         height: 92%;
104 |         float: left;
105 |       }
106 |       .left-column h2:last-of-type, .left-column h3:last-of-type, .left-column h4:last-of-type {
107 |         color: #000;
108 |       }
109 |       .right-column {
110 |         width: 75%;
111 |         float: right;
112 |         padding-top: 1em;
113 |         font: monospace;
114 |         font-size: 24;
115 |         font-style: all;
116 |       }
117 | 
118 |       /* Two-column layout inverse*/
119 |       .left-column-inverse {
120 |         color: #777;
121 |         width: 20%;
122 |         height: 92%;
123 |         float: left;
124 |       }
125 |       .left-column-inverse h2:last-of-type, .left-column-inverse h3:last-of-type, .left-column-inverse h4:last-of-type {
126 |         color: #fff;
127 |       }
128 |       .right-column-inverse {
129 |         color: #fff;
130 |         width: 75%;
131 |         float: right;
132 |         padding-top: 0em;
133 |         padding-bottom: 0em;
134 |         margin-top: 0em;
135 |         font: monospace;
136 |         font-size: 24;
137 |       }
138 |       .right-column-inverse h2, .right-column-inverse h3, .right-column-inverse h4 {
139 |         color: #fff;
140 |         padding-top: 0em;
141 |         padding-bottom: 0.2em;
142 |         margin-bottom: 0.2em;
143 |       }
144 |     </style>
145 |   </head>
146 |   <body>
147 |     <textarea id="source">
148 | 
149 | class: center, middle, inverse
150 | name: 标题页
151 | 
152 | ## [NetEase Spark Courses](https://netease-bigdata.github.io/ne-spark-courseware/)
153 | 
154 | <br>
155 | <br>
156 | <br>
157 | <br>
158 | 
159 | ## Apache Spark Garbage Collector --- ContextCleaner
160 | 
161 | <br>
162 | <br>
163 | <br>
164 | <br>
165 | 
166 | <img style="zoom: 1.0" src="../../imgs/mammut.png"  align="bottom" />
167 | 
168 | ???
169 | 备注：标题 <br>
170 | 帮助信息：在网页端按H键可进入帮助页面
171 | 
172 | ---
173 | 
174 | class: inverse, center
175 | name: agenda
176 | 
177 |   # Agenda
178 |   ## -
179 |   ** About Me **
180 |   ## -
181 | 
182 |   ** 什么是 ContextCleaner？ ** <br>
183 | 
184 |   ** ContextCleaner 工作原理 ** <br>
185 | 
186 |   ** 什么时候需要关心 ContextCleaner？ ** <br>
187 | 
188 |   ** 如何配置 ContextCleaner？ **<br>
189 | 
190 | ???
191 | 
192 | 备注: 目录<br>
193 | 
194 | 本 TOPIC 内容包括Spark程序内部垃圾回收器（ContextCleaner）的相关介绍<br>
195 | 
196 | ---
197 | 
198 | class: inverse
199 | name: aboutme
200 | 
201 | .left-column-inverse[
202 |   # About Me
203 | ]
204 | 
205 | .right-column-inverse[
206 | ## Kent Yao
207 | 
208 | 2016年11月加入网易，目前在杭州研究院-数据科学中心担任资深大数据平台开发工程师，主导 Spark 作为核心计算框架在[网易大数据平台](https://bigdata.163yun.com/mammut)的相关研发及大规模应用工作。
209 | 
210 | 前华为技术有限公司大数据技术开发部成员。
211 | 
212 | <br>
213 | 
214 | GitHub: https://github.com/yaooqinn
215 | 
216 | <br>
217 | <br>
218 | <br>
219 | 
220 | <img style="zoom: 0.618" src="../../imgs/mammut.png" />
221 | ]
222 | 
223 | ???
224 | 备注：个人简介
225 | 
226 | ---
227 | 
228 | class: inverse
229 | name: why
230 | 
231 | .left-column-inverse[
232 |   ## Why?
233 | ]
234 | 
235 | .right-column-inverse[
236 | 
237 | <br>
238 | 
239 | #### 缓存的 RDD 为啥重算了？
240 | 
241 | #### Shuffle 残留, Executor内存吃紧, 乃至磁盘被打爆？
242 | 
243 | #### 长时应用 Spark Streaming 间歇性停顿
244 | 
245 | #### 长稳服务 Thrift Server 响应缓慢
246 | 
247 | #### 一些Driver端的谜之停顿，甚至Hang住
248 | 
249 | #### 不小心 -XX:+DisableExplicitGC
250 | 
251 | ]
252 | 
253 | ???
254 | 
255 | Spark 作为一个 JVM based 分布式计算框架，我们基于 Spark 编写的应用程序，也会遇到上述这些问题。
256 | 
257 | ---
258 | 
259 | class: inverse
260 | name: context cleaner overview
261 | 
262 | .left-column-inverse[
263 |   ## What?
264 |   ### Overview
265 |   #### ContextCleaner
266 | ]
267 | 
268 | .right-column-inverse[
269 | 
270 | #### 什么是 ContextCleaner？
271 | 
272 | ##### **Driver 端**异步清理线程
273 |   - 清理缓存过，但不再引用的 RDD
274 |   - 清理该 RDD 对应的 ShuffleDependency 数据
275 |      - Driver 端 Shuffle 元数据
276 |      - Executor端 Shuffle 文件
277 |      - 伴生的Broadcast 元数据变量
278 |   - 以及 Broadcast、累加器变量、检查点数据
279 | 
280 | ##### 通过Java WeakReference机制来接受**垃圾回收器**通知进行变量的清理
281 | 
282 | #### 什么是 CleanupTask?
283 | ```scala
284 | private sealed trait CleanupTask /** 各种内部变量的清理任务 */
285 | private case class CleanRDD(rddId: Int) extends CleanupTask
286 | private case class CleanShuffle(shuffleId: Int) extends CleanupTask
287 | private case class CleanBroadcast(broadcastId: Long) extends CleanupTask
288 | private case class CleanAccum(accId: Long) extends CleanupTask
289 | private case class CleanCheckpoint(rddId: Int) extends CleanupTask
290 | ```
291 | ]
292 | 
293 | ---
294 | 
295 | class: inverse
296 | name: context cleaner overview
297 | 
298 | .left-column-inverse[
299 |   ## What?
300 |   ### Overview
301 |   #### ContextCleaner
302 |   #### CleanupTaskWeakRef...
303 | ]
304 | 
305 | .right-column-inverse[
306 | 什么是 CleanupTaskWeakReference？
307 | 
308 | ```scala
309 | /**
310 |  * 封装 CleanupTask 的 WeakReference. 当 referent 指向的 object 变为仅弱可达 (weakly reachable) 时,
311 |  * 该 WeakReference 会被自动加入 ReferenceQueue.
312 |  */
313 | private class CleanupTaskWeakReference(
314 |     val task: CleanupTask,
315 |     referent: AnyRef,
316 |     referenceQueue: ReferenceQueue[AnyRef])
317 |   extends WeakReference(referent, referenceQueue)
318 | ```
319 | 
320 | 1.注册
321 | 
322 | ```scala
323 | /** Register an object for cleanup. */
324 |   private def registerForCleanup(objectForCleanup: AnyRef, task: CleanupTask): Unit = {
325 |     referenceBuffer.add(new CleanupTaskWeakReference(task, objectForCleanup, referenceQueue))
326 |   }
327 | 
328 | ```
329 | 
330 | 2.可达性分析 referent 可达性变化
331 | 
332 | 3.垃圾回收器将已注册的引用对象添加到 referenceQueue 中
333 | 
334 | 4.我们拿到这个队列进行遍历，然后做些对应的清理工作 - CleanupTask
335 | ]
336 | 
337 | ???
338 | 
339 | ---
340 | 
341 | class: inverse
342 | name: context cleaner conponents
343 | 
344 | .left-column-inverse[
345 |   ## What?
346 |   ### Overview
347 |   ### Components
348 | ]
349 | 
350 | .right-column-inverse[
351 | 
352 | #### referenceBuffer
353 | ```
354 | /**
355 |    * A buffer to ensure that `CleanupTaskWeakReference`s are not garbage collected
356 |    * as long as they have not been handled by the reference queue.
357 |    */
358 |   private val referenceBuffer =
359 |     Collections.newSetFromMap[CleanupTaskWeakReference](new ConcurrentHashMap)
360 | ```
361 | <br>
362 | 
363 | - 确保每个 CleanupTaskWeakReference 在被 ReferenceQueue 处理之前，自身不被回收掉
364 | - 从而确保 RDD/Shuffle/Broadcast等对象被正确的清理
365 | - 一旦开始清理, 就会把对 CleanupTaskWeakReference 引用移除
366 | 
367 | ]
368 | 
369 | ---
370 | 
371 | class: inverse
372 | name:context cleaner conponents
373 | 
374 | .left-column-inverse[
375 |   ## What?
376 |   ### Overview
377 |   ### Components
378 | ]
379 | 
380 | .right-column-inverse[
381 | 
382 | #### referenceQueue
383 | ```
384 | private val referenceQueue = new ReferenceQueue[AnyRef]
385 | 
386 | ```
387 | <br>
388 | - ReferenceQueue 引用队列，在检测到适当的可到达性更改后，垃圾回收器将已注册的引用对象添加到该队列中
389 | 
390 | - 用于 CleanupTaskWeakReference 的构建, 通过这个队列我们可以在某个对象即将回收时，搞一些“事情”
391 | 
392 | - ContextCleaner 通过这个队列遍历出那些 CleanupTaskWeakReference 引用, 进而进行清理
393 | 
394 | ]
395 | ---
396 | 
397 | class: inverse
398 | name: context cleaner conponents
399 | 
400 | .left-column-inverse[
401 |   ## What?
402 |   ### Overview
403 |   ### Components
404 | ]
405 | 
406 | .right-column-inverse[
407 | 
408 | #### cleaningThread
409 | ```
410 | private val cleaningThread = new Thread() { override def run() { keepCleaning() }}
411 | 
412 | ```
413 | ```scala
414 | /** Keep cleaning RDD, shuffle, and broadcast state. */
415 |   private def keepCleaning(): Unit = Utils.tryOrStopSparkContext(sc) {
416 |     while (!stopped) {
417 |       try {
418 |         val reference = Option(referenceQueue.remove(ContextCleaner.REF_QUEUE_POLL_TIMEOUT))
419 |           .map(_.asInstanceOf[CleanupTaskWeakReference])
420 |         synchronized {
421 |           reference.foreach { ref =>
422 |             referenceBuffer.remove(ref)
423 |             ref.task match {
424 |               case CleanRDD(rddId) =>
425 |                 doCleanupRDD(rddId, blocking = blockOnCleanupTasks)
426 |               ...
427 |               case CleanBroadcast(broadcastId) =>
428 |                 doCleanupBroadcast(broadcastId, blocking = blockOnCleanupTasks)
429 |               ...
430 |             }
431 |           }
432 |         }
433 |       }
434 |       ...
435 |   }
436 | 
437 | ```
438 | 循环获取referenceQueue中的弱引用，根据 CleanupTask 进行对应清理
439 | ]
440 | 
441 | ---
442 | 
443 | class: inverse
444 | name: context cleaner conponents
445 | 
446 | .left-column-inverse[
447 |   ## What?
448 |   ### Overview
449 |   ### Components
450 | ]
451 | 
452 | .right-column-inverse[
453 | 
454 | #### periodicGCService
455 | ```
456 | private val periodicGCService: ScheduledExecutorService =
457 |     ThreadUtils.newDaemonSingleThreadScheduledExecutor("context-cleaner-periodic-gc")
458 | ```
459 | 
460 | ```scala
461 |   /** Start the cleaner. */
462 |   def start(): Unit = {
463 |     ...
464 |     periodicGCService.scheduleAtFixedRate(new Runnable {
465 |       override def run(): Unit = System.gc()
466 |     }, periodicGCInterval, periodicGCInterval, TimeUnit.SECONDS)
467 |   }
468 | 
469 | ```
470 | ReferenceQueue 与 Spark 程序 Driver 端的 GC 有关
471 | 
472 | - 常规的GC
473 | - 自己调用System.gc
474 | - Spark 周期性的 System.gc
475 | ]
476 | 
477 | ---
478 | 
479 | class: inverse
480 | name: context cleaner conponents
481 | 
482 | .left-column-inverse[
483 |   ## How?
484 |   ### 栗子
485 | ]
486 | .right-column-inverse[
487 | 
488 | ```scala
489 | object RDDCacheTest {
490 |   def main(args: Array[String]): Unit = {
491 |     val conf = new SparkConf()
492 |       .setAppName(getClass.getSimpleName)
493 |       .set("spark.cleaner.periodicGC.interval", "1min") // context cleaner
494 |     val sc = new SparkContext(conf)
495 |     val data = Seq.fill(1024 * 1024 * 100)(Random.nextInt(100))
496 |     val rdd1 = sc.parallelize(data, 20)
497 |     rdd1.cache() // mark rdd 1 cache
498 |     val rdd2 = rdd1.map((_, 1)).reduceByKey(_ + _) // word count
499 |     val cachedRdd2 = rdd2.cache() // cache shuffled rdd
500 |     rdd2.collect() // action actually trigger caching
501 |     rdd1.count()  // ditto
502 |     rdd2.count() // rdd reuse
503 |     cachedRdd2.count() // ditto
504 |     rdd1.map((_, 1)).reduceByKey(_ + _).take(1) // rdd 1 reuse, not rdd 2
505 |     val rdd3 = sc.parallelize(data, 30)
506 |     rdd3.map((_, 1)).reduceByKey(_ + _).count() // no rdd reuse
507 |     10.to(20, 2).foreach { i =>
508 |       val tmp = rdd3.groupBy(_ % i)
509 |       tmp.cache().count()
510 |       if (i % 3 == 0) tmp.take(1)
511 |     }
512 |     Thread.sleep(1000 * 60 * 10)
513 |     sc.stop()
514 |   }
515 | }
516 | ```
517 | 
518 | 上述例子中，我们简单的调用了RDD.cache, 引入Shuffle算子，再基于Spark自己对broadcast变量的利用，看下ContextCleaner的工作原理
519 | ]
520 | 
521 | ---
522 | 
523 | class: inverse
524 | name: context cleaner conponents
525 | 
526 | .left-column-inverse[
527 |   ## How?
528 |   ### 栗子
529 |   ### 初始化
530 | ]
531 | .right-column-inverse[
532 | 
533 | ```
534 | val sc = new SparkContext(conf)
535 | ```
536 | ⬇️
537 | ```scala
538 | _cleaner =
539 |   if (_conf.getBoolean("spark.cleaner.referenceTracking", true)) {
540 |     Some(new ContextCleaner(this))
541 |   } else {
542 |     None
543 |   }
544 | _cleaner.foreach(_.start())
545 | 
546 | ```
547 | 
548 | 在默认情况下（spark.cleaner.referenceTracking=true），SparkContext实例化的时候为我们自动初始化ContextCleaner
549 | ]
550 | 
551 | ---
552 | 
553 | class: inverse
554 | name: context cleaner conponents
555 | 
556 | .left-column-inverse[
557 |   ## How?
558 |   ### 栗子
559 |   ### 初始化
560 |   ### cache
561 | ]
562 | .right-column-inverse[
563 | 
564 | ```
565 | rdd1.cache()
566 | ```
567 | ⬇️
568 | ```scala
569 |  /**
570 |    * Mark this RDD for persisting using the specified level.
571 |    *
572 |    * @param newLevel the target storage level
573 |    * @param allowOverride whether to override any existing level with the new one
574 |    */
575 |   private def persist(newLevel: StorageLevel, allowOverride: Boolean): this.type = {
576 |     // TODO: Handle changes of StorageLevel
577 |     if (storageLevel != StorageLevel.NONE && newLevel != storageLevel && !allowOverride) {
578 |       throw new UnsupportedOperationException(
579 |         "Cannot change storage level of an RDD after it was already assigned a level")
580 |     }
581 |     // If this is the first time this RDD is marked for persisting, register it
582 |     // with the SparkContext for cleanups and accounting. Do this only once.
583 |     if (storageLevel == StorageLevel.NONE) {
584 |       sc.cleaner.foreach(_.registerRDDForCleanup(this))
585 |       sc.persistRDD(this)
586 |     }
587 |     storageLevel = newLevel
588 |     this
589 |   }
590 | 
591 | ```
592 | 
593 | 在默认情况下（spark.cleaner.referenceTracking=true），调用RDD.cache会将自身注册到ContextCleaner
594 | ]
595 | 
596 | ---
597 | class: inverse
598 | name: context cleaner conponents
599 | 
600 | .left-column-inverse[
601 |   ## How?
602 |   ### 栗子
603 |   ### 初始化
604 |   ### cache
605 |   ### shuffle
606 | ]
607 | .right-column-inverse[
608 | 
609 | ```
610 | val rdd2 = rdd1.map((_, 1)).reduceByKey(_ + _)
611 | ```
612 | ⬇️
613 | ```scala
614 | 
615 | class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
616 |     @transient private val _rdd: ...)
617 |   extends Dependency[Product2[K, V]] {
618 |   ...
619 | 
620 |   val shuffleId: Int = _rdd.context.newShuffleId()
621 | 
622 |   val shuffleHandle: ShuffleHandle = _rdd.context.env.shuffleManager.registerShuffle(
623 |     shuffleId, _rdd.partitions.length, this)
624 | 
625 |   _rdd.sparkContext.cleaner.foreach(_.registerShuffleForCleanup(this))
626 | }
627 | 
628 | ```
629 | 
630 | 在默认情况下（spark.cleaner.referenceTracking=true），调用Shuffle过程中，会将自身注册到ContextCleaner
631 | 
632 | PS: 之前提到了broadcast等 注册也都是大同小异的逻辑
633 | 
634 | ]
635 | 
636 | ---
637 | class: inverse
638 | name: context cleaner conponents
639 | 
640 | .left-column-inverse[
641 |   ## How?
642 |   ### 栗子
643 |   ### 初始化
644 |   ### cache
645 |   ### shuffle
646 |   ### Job
647 | ]
648 | .right-column-inverse[
649 | 
650 | <img src="../../imgs/spark_core/context_cleaner/jobs_tab_cached_rdd.png" style="zoom: 0.88">
651 | 
652 | RDD的Cache可以大大地帮我们提升程序的性能，但滥用的话也可能影响性能
653 | 
654 | 1. 过多cache，挤占计算资源
655 | 2. 清理时的压力
656 | 
657 | ]
658 | 
659 | ---
660 | class: inverse
661 | name: context cleaner conponents
662 | 
663 | .left-column-inverse[
664 |   ## How?
665 |   ### 栗子
666 |   ### 初始化
667 |   ### cache
668 |   ### shuffle
669 |   ### Job
670 |   ### Storage
671 | ]
672 | .right-column-inverse[
673 | 
674 | ```
675 | Thread.sleep(1000 * 60 * 10)
676 | ```
677 | ⬇️
678 | 
679 | <img src="../../imgs/spark_core/context_cleaner/storage_tab_cached_rdd.jpg" style="zoom: 0.95">
680 | 
681 | 主线程休眠的10分钟内：
682 | 1. for循环内缓存的RDD的可达性都会发生变化，都会被我们的ContextCleaner回收
683 | 2. 它们对应的Shuffle缓存信息、文件(如果有)都会相应得到清理
684 | 3. 循环之外的两个RDD，一直存活
685 | 4. 它们对应的Shuffle缓存信息、文件(如果有)都将一直存在
686 | ]
687 | 
688 | ---
689 | class: inverse
690 | name: context cleaner conponents
691 | 
692 | .left-column-inverse[
693 |   ## How?
694 |   ### 栗子
695 |   ### 初始化
696 |   ### cache
697 |   ### shuffle
698 |   ### Job
699 |   ### Storage
700 |   ### Configuration
701 | ]
702 | .right-column-inverse[
703 | ```scala
704 | spark.cleaner.referenceTracking
705 | 总开关
706 | 默认值：true
707 | 
708 | spark.cleaner.referenceTracking.cleanCheckpoints
709 | 是否开启检查点数据的清理
710 | 默认值：false
711 | 
712 | spark.cleaner.periodicGC.interval
713 | System.gc() 执行周期；
714 | 调节这个参数，一方面可以让 Driver 端的元数据（MapStatus 等）回收得更加顺滑
715 | 另一方面，可以及时触发 Executor 端 shuffle 数据的清理
716 | 默认值：30min
717 | 
718 | spark.cleaner.referenceTracking.blocking
719 | 标记清理线程在清理除Shuffle之外的数据时是否阻塞
720 | 默认值：true
721 | 
722 | spark.cleaner.referenceTracking.blocking.shuffle
723 | 标记清理线程在清理shuffle数据时是否阻塞
724 | 默认值：false
725 | ```
726 | 
727 | 阻塞： 清理线程需要等待所有 Executor 执行的结果
728 | 
729 | 不阻塞： Driver 先狂发消息给 Executor 端， 接着就要被回来的消息淹没
730 | ]
731 | 
732 | ---
733 | 
734 | class: inverse
735 | name: sub-agenda
736 | 
737 | .left-column-inverse[
738 |   ## 推荐
739 | ]
740 | 
741 | .right-column-inverse[
742 | 
743 | #### Case Study:
744 | 
745 | - [Debugging a long-running Apache Spark application: A War Story](https://tech.channable.com/posts/2018-04-10-debugging-a-long-running-apache-spark-application.html#footnote1)
746 | 
747 | #### Spark Issues:
748 | - [SPARK-3015](https://issues.apache.org/jira/browse/SPARK-3015) - Removing broadcast in quick successions causes Akka timeout - 关于为什么清理时要blocking
749 | - [SPARK-3139](https://issues.apache.org/jira/browse/SPARK-3139) - Akka timeouts from ContextCleaner when cleaning shuffles - 关于清理shuffle时为何不blocking
750 | - [SPARK-1855](https://issues.apache.org/jira/browse/SPARK-1855) - Provide memory-and-local-disk RDD checkpointing - 关于为何不默认注册检查点数据的清理
751 | 
752 | ]
753 | 
754 | ---
755 | 
756 | class: middle, center, inverse
757 | name: greetings
758 | # Q & A
759 | 
760 | ---
761 | 
762 | class: middle, center, inverse
763 | name: greetings
764 | # Thank You!
765 | ### [Kent Yao]
766 | 
767 | <img style="zoom: 1.0" src="../../imgs/mammut.png"  align="bottom" />
768 | 
769 | <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribution-NonCommercial 4.0 International License</a>.
770 | 
771 | 
772 |     </textarea>
773 |     <script src="https://remarkjs.com/downloads/remark-latest.min.js">
774 |     </script>
775 |     <script>
776 |       var slideshow = remark.create({
777 |         ratio: '16:9',
778 |         slideNumberFormat: 'Slide %current% of %total%',
779 |         // .. or by using a format function
780 |         slideNumberFormat: function (current, total) {
781 |           return ' ' + current + ' of ' + total;
782 |         },
783 |         highlightLanguage: 'scala',
784 |         highlightStyle: 'monokai',
785 |         highlightLines: true,
786 |         // arta, ascetic, dark, default, far, github, googlecode, idea, ir-black, magula, monokai, rainbow, solarized-dark, solarized-light, sunburst, tomorrow, tomorrow-night-blue, tomorrow-night-bright, tomorrow-night, tomorrow-night-eighties, vs, zenburn
787 |         highlightStyle: 'zenburn'
788 |       });
789 |     </script>
790 |   </body>
791 | </html>
792 | 


--------------------------------------------------------------------------------
/slides/spark_basics/spark_basics_and_quick_start.html:
--------------------------------------------------------------------------------
  1 | <html>
  2 |   <head>
  3 |     <title>Spark概述及快速入门指南</title>
  4 |     <meta charset="utf-8">
  5 |     <style>
  6 |       @import url(https://fonts.googleapis.com/css?family=Yanone+Kaffeesatz);
  7 |       @import url(https://fonts.googleapis.com/css?family=Droid+Serif:400,700,400italic);
  8 |       @import url(https://fonts.googleapis.com/css?family=Ubuntu+Mono:400,700,400italic);
  9 | 
 10 |       body { font-family: 'Droid Serif'; }
 11 |       h1, h2, h3 {
 12 |         font-family: 'Yanone Kaffeesatz';
 13 |         font-weight: 400;
 14 |         margin-bottom: 0;
 15 |       }
 16 |       .remark-slide-content h1 { font-size: 3em; }
 17 |       .remark-slide-content h2 { font-size: 2em; }
 18 |       .remark-slide-content h3 { font-size: 1.6em; }
 19 |       .footnote {
 20 |         position: absolute;
 21 |         bottom: 3em;
 22 |       }
 23 |       li p { line-height: 1.25em; }
 24 |       .red { color: #fa0000; }
 25 |       .large { font-size: 2em; }
 26 |       a, a > code {
 27 |         color: rgb(249, 38, 114);
 28 |         text-decoration: none;
 29 |       }
 30 |       code {
 31 |         background: #e7e8e2;
 32 |         border-radius: 5px;
 33 |       }
 34 |       .remark-code, .remark-inline-code { font-family: 'Ubuntu Mono'; }
 35 |       .remark-code-line-highlighted     { background-color: #373832; }
 36 |       .pull-left {
 37 |         float: left;
 38 |         width: 47%;
 39 |       }
 40 |       .pull-right {
 41 |         float: right;
 42 |         width: 47%;
 43 |       }
 44 |       .pull-right ~ p {
 45 |         clear: both;
 46 |       }
 47 |       #slideshow .slide .content code {
 48 |         font-size: 0.8em;
 49 |       }
 50 |       #slideshow .slide .content pre code {
 51 |         font-size: 0.9em;
 52 |         padding: 15px;
 53 |       }
 54 |       .inverse {
 55 |         background: #272822;
 56 |         color: #777872;
 57 |         text-shadow: 0 0 20px #333;
 58 |       }
 59 |       .inverse h1, .inverse h2 {
 60 |         color: #fff;
 61 |         line-height: 0.8em;
 62 |       }
 63 | 
 64 |       /* Slide-specific styling */
 65 |       #slide-inverse .footnote {
 66 |         bottom: 12px;
 67 |         left: 20px;
 68 |       }
 69 |       #slide-how .slides {
 70 |         font-size: 0.9em;
 71 |         position: absolute;
 72 |         top:  151px;
 73 |         right: 140px;
 74 |       }
 75 |       #slide-how .slides h3 {
 76 |         margin-top: 0.2em;
 77 |       }
 78 |       #slide-how .slides .first, #slide-how .slides .second {
 79 |         padding: 1px 20px;
 80 |         height: 90px;
 81 |         width: 120px;
 82 |         -moz-box-shadow: 0 0 10px #777;
 83 |         -webkit-box-shadow: 0 0 10px #777;
 84 |         box-shadow: 0 0 10px #777;
 85 |       }
 86 |       #slide-how .slides .first {
 87 |         background: #fff;
 88 |         position: absolute;
 89 |         top: 20%;
 90 |         left: 20%;
 91 |         z-index: 1;
 92 |       }
 93 |       #slide-how .slides .second {
 94 |         position: relative;
 95 |         background: #fff;
 96 |         z-index: 0;
 97 |       }
 98 | 
 99 |       /* Two-column layout */
100 |       .left-column {
101 |         color: #777;
102 |         width: 20%;
103 |         height: 92%;
104 |         float: left;
105 |       }
106 |       .left-column h2:last-of-type, .left-column h3:last-of-type, .left-column h4:last-of-type  {
107 |         color: #000; /* darken the last heading of each level against the #777 column text */
108 |       }
109 |       .right-column {
110 |         width: 75%;
111 |         float: right;
112 |         padding-top: 1em;
113 |         font: monospace;
114 |         font-size: 24;
115 |         font-style: all;
116 |       }
117 | 
118 |       /* Two-column layout inverse*/
119 |       .left-column-inverse {
120 |         color: #777;
121 |         width: 20%;
122 |         height: 92%;
123 |         float: left;
124 |       }
125 |       .left-column-inverse h2:last-of-type, .left-column-inverse h3:last-of-type, .left-column-inverse h4:last-of-type {
126 |         color: #fff;
127 |       }
128 |       .right-column-inverse {
129 |         color: #fff;
130 |         width: 75%;
131 |         float: right;
132 |         padding-top: 0em;
133 |         padding-bottom: 0em;
134 |         margin-top: 0em;
135 |         font: monospace;
136 |         font-size: 24;
137 |       }
138 |       .right-column-inverse h2, .right-column-inverse h3, .right-column-inverse h4 {
139 |         color: #fff;
140 |         padding-top: 0em;
141 |         padding-bottom: 0.2em;
142 |         margin-bottom: 0.2em;
143 |       }
144 |     </style>
145 |   </head>
146 |   <body>
147 |     <textarea id="source">
148 | 
149 | class: center, middle, inverse
150 | name: 标题页
151 | 
152 | ## [NetEase Spark Courses](https://netease-bigdata.github.io/ne-spark-courseware/)
153 | 
154 | <br>
155 | <br>
156 | <br>
157 | <br>
158 | 
159 | ## Apache Spark 概述及快速入门指南
160 | 
161 | <br>
162 | <br>
163 | <br>
164 | <br>
165 | 
166 | <img style="zoom: 1.0" src="../../imgs/mammut.png"  align="bottom" />
167 | 
168 | ???
169 | 备注：标题 <br>
170 | 帮助信息：在网页端按H键可进入帮助页面
171 | 
172 | ---
173 | 
174 | class: inverse, center
175 | name: agenda
176 | 
177 |   # Agenda
178 |   ## -
179 |   ** About Me **
180 |   ## -
181 | 
182 |   ** 什么是 Spark？ ** <br>
183 | 
184 |   ** 正确理解 Spark 相关概念 ** <br>
185 | 
186 |   ** 怎么用 Spark？ ** <br>
187 | 
188 |   ** 如何提升 Spark 技能？ **<br>
189 | 
190 | ???
191 | 
192 | 备注: 目录<br>
193 | 
194 | 本 TOPIC 从最基本的方面讲解 Spark 入门所需具备的入手手段<br>
195 | 
196 | 内容尽可能的涵盖：<br>
197 | 大数据处理的基本知识<br>
198 | 从数据开发工程师或者数据分析师角度需要理解的 Spark 的一些基本概念<br>
199 | 如何和底层大数据工程师（hadoop/spark 等）及运维合理的交流问题<br>
200 | 以及如何在 Spark 这条路上一条道走到黑，走的更远
201 | 
202 | ---
203 | 
204 | class: inverse
205 | name: aboutme
206 | 
207 | .left-column-inverse[
208 |   # About Me
209 | ]
210 | 
211 | .right-column-inverse[
212 | ## Kent Yao
213 | 
214 | 2016年11月加入网易，目前在杭州研究院-数据科学中心担任资深大数据平台开发工程师，主导 Spark 作为核心计算框架在[网易大数据平台](https://bigdata.163yun.com/mammut)的相关研发及大规模应用工作。
215 | 
216 | 前华为技术有限公司大数据技术开发部成员。
217 | 
218 | <br>
219 | 
220 | GitHub: https://github.com/yaooqinn
221 | 
222 | <br>
223 | <br>
224 | <br>
225 | 
226 | <img style="zoom: 0.618" src="../../imgs/mammut.png" />
227 | ]
228 | 
229 | ???
230 | 备注：个人简介
231 | 
232 | ---
233 | 
234 | class: inverse
235 | name: sub-agenda
236 | 
237 | .left-column-inverse[
238 |   # Agenda
239 |   ## 什么是 Spark?
240 | ]
241 | 
242 | .right-column-inverse[
243 | ### 什么是 Spark?
244 | ### Spark v.s. Hadoop
245 | ### Spark v.s. Hive
246 | ### Spark v.s. Impala
247 | ### Spark v.s. Flink
248 | ]
249 | 
250 | ???
251 | 备注：子目录<br>
252 | 
253 | 本章主要通过 Spark 本身与 Hadoop 生态中各类优秀组件的比较，帮助大家对 Spark 的能力、定位、场景形成较为客观的认识<br>
254 | 
255 | 由于能力、篇幅有限，只能基于当下的场景给出较为片面的个人见解
256 | 
257 | ---
258 | 
259 | class: inverse
260 | name: whatisrddoverview
261 | 
262 | .left-column-inverse[
263 |   ## 什么是 Spark?
264 |   ### 从定义上看
265 | ]
266 | 
267 | .right-column[
268 | <div style="margin-top: 20px;text-align: center; font-size: 25px; background-color: white">
269 |   <b>Apache Spark™</b> is a unified analytics engine for large-scale data processing.
270 | </div>
271 | 
272 | <div>
273 |     <img src="../../imgs/spark_basics/spark-stack.png" style="margin-top: 15px; width: 100%;  zoom: 2.50; align-self: center;" usemap="#stack-map">
274 |     <map name="stack-map">
275 |       <area shape="rect" coords="0,0,74,95" href="http://spark.apache.org/sql/" alt="Spark SQL" title="Spark SQL">
276 |       <area shape="rect" coords="74,0,150,95" href="http://spark.apache.org/streaming/" alt="Spark Streaming" title="Spark Streaming">
277 |       <area shape="rect" coords="150,0,224,95" href="http://spark.apache.org/mllib/" alt="MLlib (machine learning)" title="MLlib">
278 |       <area shape="rect" coords="225,0,300,95" href="http://spark.apache.org/graphx/" alt="GraphX" title="GraphX">
279 |     </map>
280 | </div>
281 | 
282 | <h3 style="text-align: center; font-size: 25px; color: white;"><span style="line-height: 95px; display: inline-block; vertical-align: bottom;">Apache® Spark™ Ecosystem</span></h3>
283 | 
284 | ]
285 | 
286 | ???
287 | 备注：从定义上理解什么是 Spark <br>
288 | 从官方目前给出的定义，Apache Spark 是一个用于大规模数据处理的统一分析引擎<br>
289 | 
290 | 模块涵盖了
291 | Spark SQL - 离线结构化数据的处理方案
292 | Spark Streaming - 流式计算框架
293 | Spark MLlib - 机器学习框架
294 | Graphx - 图计算框架
295 | 
296 | 同时这些框架都运行在通用的 Spark Core 底层计算框架上面
297 | 
298 | 得益于 2.x 版本后 API 层面在 SparkSession/DataFrame 上的高度统一，使得用户可以基于 “one stack to rule them all” 来大大滴爽一把。
299 | 
300 | ---
301 | class: inverse
302 | name: sparkvshadoop
303 | 
304 | .left-column-inverse[
305 |   ## 什么是 Spark?
306 |   ### 从定义上看
307 |   ### 从特点上看
308 | ]
309 | 
310 | .right-column-inverse[
311 | 
312 | #### 速度快
313 | - DAG - 使用 DAG 对 RDD 的关系进行建模，描述其依赖关系
314 | - [Catalyst](https://databricks.com/blog/2015/04/13/deep-dive-into-spark-sqls-catalyst-optimizer.html) - 优化器框架
315 | - [Project Tungsten](https://databricks.com/blog/2015/04/28/project-tungsten-bringing-spark-closer-to-bare-metal.html) - 内存管理、二进制处理、缓存友好、Code-Gen
316 | - [CBO](https://databricks.com/blog/2017/08/31/cost-based-optimizer-in-apache-spark-2-2.html) - 基于代价的优化器
317 | - [Continuous Processing](https://databricks.com/blog/2018/03/20/low-latency-continuous-processing-mode-in-structured-streaming-in-apache-spark-2-3-0.html) - Streaming实现准实时到实时
318 | - ...
319 | 
320 | #### 易用性
321 | - 丰富的高阶 API 算子
322 | - 丰富的语言支持：Java, Scala, Python, R, SQL
323 | 
324 | #### 通用性<span style="font-size: 0.9em; "> - 统一的 API 和 统一的底层模型: One Stack to Rule Them All</span>
325 | 
326 | #### 多平台<span style="font-size: 0.9em; "> - Standalone/YARN/Mesos/K8S; DataSources</span>
327 | 
328 | ]
329 | 
330 | ???
331 | 
332 | 备注：从 Spark 特点上理解<br>
333 | 
334 | 在图论中，如果一个有向图从任意顶点出发无法经过若干条边回到该点，则这个图是一个有向无环图（ DAG 图）<br>
335 | Spark SQL Paper: http://people.csail.mit.edu/matei/papers/2015/sigmod_spark_sql.pdf <br>
336 | 
337 | RDD Paper: https://www2.eecs.berkeley.edu/Pubs/TechRpts/2011/EECS-2011-82.pdf
338 | 
339 | ---
340 | 
341 | class: inverse
342 | name: sparkvshadoop
343 | 
344 | .left-column-inverse[
345 |   ## Spark v.s. Hadoop
346 | ]
347 | 
348 | .right-column-inverse[
349 | <div style="margin-top: 20px;text-align: center; font-size: 25px; background-color: white;">
350 |   <b>Apache Hadoop™</b> 大数据基础生态"事实标准"
351 | </div>
352 | #### HDFS
353 | - 大数据存储"事实标准" - 高可靠的分布式文件系统，可作为Spark等计算引擎的可靠底层存储承载
354 | 
355 | #### YARN
356 | - 大数据资源调度"事实标准" - 允许我们各种大数据应用（包括Spark）以多租户的模式共享集群
357 | 
358 | #### MapReduce
359 | - 低阶"呆板"的计算框架 - Spark等等诸多大数据计算框架所challenge的点
360 | 
361 | ]
362 | 
363 | ???
364 | 备注: spark v.s. hadoop
365 | 
366 | ---
367 | 
368 | class: inverse
369 | name: sparkvsmapreduce
370 | 
371 | .left-column-inverse[
372 |   ## Spark v.s. Hadoop
373 |   ### Spark v.s MapReduce
374 | ]
375 | 
376 | .right-column-inverse[
377 | <div style="margin-top: 20px;text-align: center; font-size: 25px; background-color: white;">
378 |   <b>Apache Spark™</b> 大数据计算框架"事实标准"
379 | </div>
380 | 
381 | #### MapReduce缺点
382 | 
383 | - 抽象层次低，难以上手
384 | - 表达能力欠佳，局限map/reduce算子，Job的lineage管理
385 | - 大量shuffle，大量落盘，不适合迭代
386 | 
387 | <div style="margin-top: 10px;text-align: left; font-size: medium; ">
388 |   事实上就像你要装修房子的时候，不小心选了“清工”，你得告诉泥工干啥，木工干啥，X工干啥，然后再告诉包工头什么时候泥工需要做完啥，木工需要做完啥…… 最后发现实际上就像你用双手装修完了房子
389 | </div>
390 | 
391 | #### Spark的优点
392 | - RDD抽象 - 弹性分布式数据集在单机上的完美表达
393 | - 丰富的算子 - 像乐高积木一样灵活，易玩
394 | - 高效的迭代算法
395 | 
396 | <div style="margin-top: 10px;text-align: left; font-size: medium; ">
397 |   终于把事情交给了一个靠谱的包工头
398 | </div>
399 | ]
400 | 
401 | ???
402 | 备注： spark v.s. mapreduce
403 | 
404 | 一个RDD就是一个分布式对象集合，本质上是一个只读的分区记录集合，
405 | 每个RDD可以分成多个分区，每个分区就是一个数据集片段，并且一个RDD的不同分区可以被保存到集群中不同的节点上，从而可以在集群中的不同节点上进行并行计算。RDD提供了一种高度受限的共享内存模型，即RDD是只读的记录分区的集合，不能直接修改，只能基于稳定的物理存储中的数据集来创建RDD，或者通过在其他RDD上执行确定的转换操作（如map、join和groupBy）而创建得到新的RDD。
406 | 
407 | ---
408 | 
409 | class:inverse
410 | name: hadoopwordcount
411 | 
412 | .left-column-inverse[
413 |   ## Spark v.s. Hadoop
414 |   ### Spark v.s MapReduce
415 |   ### Hadoop WordCount
416 | ]
417 | 
418 | .right-column-inverse[
419 | ```java
420 | public class WordCount {
421 |     public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
422 |         private final static IntWritable one = new IntWritable(1);
423 |         private Text word = new Text();
424 |         public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
425 |             String line = value.toString();
426 |             StringTokenizer tokenizer = new StringTokenizer(line);
427 |             while (tokenizer.hasMoreTokens()) {
428 |                 word.set(tokenizer.nextToken());
429 |                 context.write(word, one);
430 |             }
431 |         }
432 |     }
433 |     public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
434 |         public void reduce(Text key, Iterable<IntWritable> values, Context context)
435 |                 throws IOException, InterruptedException {
436 |             int sum = 0;
437 |             for (IntWritable val : values) {
438 |                 sum += val.get();
439 |             }
440 |             context.write(key, new IntWritable(sum));
441 |         }
442 |     }
443 |     public static void main(String[] args) throws Exception {
444 |         Configuration conf = new Configuration();
445 |         Job job = new Job(conf, "wordcount");
446 |         job.setOutputKeyClass(Text.class);
447 |         job.setOutputValueClass(IntWritable.class);
448 |         job.setMapperClass(Map.class);
449 |         job.setReducerClass(Reduce.class);
450 |         job.setInputFormatClass(TextInputFormat.class);
451 |         job.setOutputFormatClass(TextOutputFormat.class);
452 |         FileInputFormat.addInputPath(job, new Path(args[0]));
453 |         FileOutputFormat.setOutputPath(job, new Path(args[1]));
454 |         job.waitForCompletion(true);
455 |     }
456 | }
457 | ```
458 | ]
459 | 
460 | ---
461 | 
462 | class: inverse
463 | name: sparkwordcount
464 | 
465 | .left-column-inverse[
466 |   ## Spark v.s. Hadoop
467 |   ### Spark v.s MapReduce
468 |   ### Hadoop WordCount
469 |   ### Spark WordCount
470 | ]
471 | 
472 | .right-column-inverse[
473 | 
474 | ```scala
475 | object WordCount {
476 | 
477 |   def main(args: Array[String]): Unit = {
478 |     val conf = new SparkConf().setAppName("Word Count").setMaster("local[*]")
479 |     val sparkContext = new SparkContext(conf)
480 |     val textFile = sparkContext.textFile(args(0), 2)
481 |     val words = textFile.flatMap(_.split(" "))
482 |     val ones = words.map((_, 1))
483 |     val counts = ones.reduceByKey(_ + _)
484 |     val res = counts.collect()
485 |     for ((word, count) <- res) {
486 |       println(word + ": " + count)
487 |     }
488 |     sparkContext.stop()
489 |   }
490 | }
491 | ```
492 | 
493 | 或者一句话搞定：
494 | ```scala
495 | sc.textFile("README.md").flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _).collect()
496 | ```
497 | ]
498 | 
499 | ---
500 | 
501 | class: inverse
502 | name: sparkvshive
503 | 
504 | .left-column-inverse[
505 |   ## Spark v.s. Hive
506 |  ]
507 | 
508 | .right-column-inverse[
509 | <div style="margin-top: 20px;text-align: center; font-size: 25px; background-color: white;">
510 |   <b>Apache Hive MetaStore</b> 大数据元数据"事实标准"
511 | </div>
512 | 
513 | #### Hive
514 | - "原始"的SQL on Hadoop方案
515 | - 实用、使用成本低，往往在中大型的互联网企业中积累了大量的业务和用户
516 | - 查询性能通常很低
517 | 
518 | #### Spark SQL
519 | - 基于Spark Core的SQL on Hadoop方案
520 | - 查询性能很快
521 | - 和Hive的关系可以用"暧昧"来形容
522 | 
523 | #### Kyuubi
524 | - 基于Spark SQL提供企业级SQL on Hadoop解决方案
525 | - https://github.com/yaooqinn/kyuubi
526 | 
527 | ]
528 | 
529 | ---
530 | 
531 | class: inverse
532 | name: sparkvsimpala
533 | 
534 | .left-column-inverse[
535 |   ## Spark v.s. Impala
536 |  ]
537 | 
538 | .right-column-inverse[
539 | 
540 | #### Impala
541 | - MPP 架构 - 分治思想、均分 task，share nothing、Scalability问题
542 | - 查询性能最快
543 | - Workload 建议在百亿级别之下
544 | - Impala on YARN 已被放弃，与现有计算集群无法融合
545 | 
546 | #### Spark SQL
547 | - DAG 架构 - MR模型、Shared storage、 全局meta，优秀的Scalability
548 | - 查询性能也快
549 | - Workload > 百亿级别
550 | - Spark on YARN 可无缝融入现有计算资源池
551 | 
552 | ]
553 | 
554 | ---
555 | 
556 | class: inverse
557 | name: sparkvsflink
558 | 
559 | .left-column-inverse[
560 |   ## Spark v.s. Flink
561 |  ]
562 | 
563 | .right-column-inverse[
564 | 
565 | - Flink作为Spark的有力挑战者，正在以新的计算模型尝试解决Spark也在尝试解决的问题<br>
566 |   <img style="zoom: 0.65" align="center" src="https://mapr.com/developercentral/lambda-architecture/assets/otherpageimages/lambda-architecture-2-800.jpg">
567 | - 两者相互"学习"、"借鉴"
568 | - Spark Streaming (Maintained) - DStream、RDD based、微批处理、高吞吐、准实时
569 | - Spark Structured Streaming - DataFrame Based、 微批处理/Continuous Process、高吞吐、准实时/实时
570 | - Flink - 数据流及事件序列模型、流执行模式、实时
571 | 
572 | ]
573 | 
574 | ---
575 | 
576 | class: inverse
577 | name: sub-agenda
578 | 
579 | .left-column-inverse[
580 |   # Agenda
581 |   ## Spark Glossary
582 | ]
583 | 
584 | .right-column-inverse[
585 | ### SparkContext
586 | ### Application
587 | ### Configuration
588 | ### Deployment
589 | ### Monitoring
590 | ### Tuning
591 | ]
592 | 
593 | ???
594 | 
595 | 备注：理解 Spark 相关的术语及其背后包含的语义，
596 | 
597 | 有助于用户正确理解 Spark 的构成
598 | 
599 | 有助于用户正确和底层平台开发和维护人员进行有效的沟通
600 | 
601 | ---
602 | 
603 | class: inverse
604 | name:sparkcontext
605 | 
606 | .left-column-inverse[
607 |   ## Spark Glossary
608 |   ### SparkContext
609 | ]
610 | .right-column-inverse[
611 | - the entrance / the heart / the master of a Spark APP
612 | 
613 | <img style="background-color: white" src="../../imgs/spark_basics/sparkcontext-services.png">
614 | ]
615 | 
616 | ---
617 | 
618 | class: inverse
619 | name: application
620 | 
621 | .left-column-inverse[
622 |   ## Spark Glossary
623 |   ### SparkContext
624 |   ### Application
625 | ]
626 | 
627 | .right-column-inverse[
628 | 
629 | 一个Spark应用是一个由包含 SparkContext 实例主程序作为 Driver 进程，并协调一堆独立的 Executor 进程构成的一个 Master/Slave 的结构<br>
630 | 
631 |   <img src="http://spark.apache.org/docs/latest/img/cluster-overview.png">
632 | ```scala
633 | Driver Program - 包含 SparkContext 对象的用户主程序, 调度节点
634 | Executor - task处理节点、计算节点
635 | Cluster Manager - 管理集群计算资源的外部服务，常见的有 Standalone, Mesos, k8s, YARN
636 | 
637 | ```
638 | 
639 | ]
640 | 
641 | ???
642 | Spark运行架构包括集群资源管理器（Cluster Manager）、运行作业任务的工作节点（Worker Node）、每个应用的任务控制节点（Driver）和每个工作节点上负责具体任务的执行进程（Executor）。其中，集群资源管理器可以是Spark自带的资源管理器，也可以是YARN或Mesos等资源管理框架。
643 | 与Hadoop MapReduce计算框架相比，Spark所采用的Executor有两个优点：一是利用多线程来执行具体的任务（Hadoop MapReduce采用的是进程模型），减少任务的启动开销；二是Executor中有一个BlockManager存储模块，会将内存和磁盘共同作为存储设备，当需要多轮迭代计算时，可以将中间结果存储到这个存储模块里，下次需要时，就可以直接读该存储模块里的数据，而不需要读写到HDFS等文件系统里，因而有效减少了IO开销；或者在交互式查询场景下，预先将表缓存到该存储系统上，从而可以提高读写IO性能。
644 | 
645 | ---
646 | 
647 | class: inverse
648 | name: applicationcomponents
649 | 
650 | .left-column-inverse[
651 |   ## Spark Glossary
652 |   ### SparkContext
653 |   ### Application
654 | ]
655 | 
656 | .right-column-inverse[
657 | Application
658 | ```
659 |     - 用户主程序
660 |     - 包含主程序依赖、Spark jars， 可包含多个Job
661 | ```
662 | RDD
663 | ```
664 |     - 用户编程模型，只读的、分布式的数据集及包含的运算的非分布概念抽象
665 |     - transformation - 指定RDD之间的相互依赖关系
666 |     - action - 用户执行计算，指定输出形式
667 | ```
668 | Job
669 | ```
670 |      - 以 action 算子为划分
671 | ```
672 | Stage
673 | ```
674 |      - 以 shuffle 算子划分，也即下游的算子有可能需要从上游算子的全量输出中获得输入
675 | ```
676 | Task
677 | ```
678 |      - 对应 RDD 分区数，将一个 stage 划分成一堆 task，由 Driver 调度到 Executor 计算
679 | ```
680 | 
681 | ]
682 | 
683 | ???
684 | 
685 | 在Spark中，一个应用（Application）由一个任务控制节点（Driver）和若干个作业（Job）构成，一个作业由多个阶段（Stage）构成，一个阶段由多个任务（Task）组成。当执行一个应用时，任务控制节点会向集群管理器（Cluster Manager）申请资源，启动Executor，并向Executor发送应用程序代码和文件，然后在Executor上执行任务，运行结束后，执行结果会返回给任务控制节点，或者写到HDFS或者其他数据库中。
686 | 
687 | Spark通过分析各个RDD的依赖关系生成了DAG，再通过分析各个RDD中的分区之间的依赖关系来决定如何划分阶段，具体划分方法是：在DAG中进行反向解析，遇到宽依赖就断开，遇到窄依赖就把当前的RDD加入到当前的阶段中；将窄依赖尽量划分在同一个阶段中，可以实现流水线计算
688 | 
689 | 两类操作的主要区别是，transformations（比如map、filter、groupBy、join等）接受RDD并返回RDD，
690 | 而action操作（比如count、collect等）接受RDD但是返回非RDD（即输出一个值或结果）。
691 | RDD提供的转换接口都非常简单，都是类似map、filter、groupBy、join等粗粒度的数据转换操作，而不是针对某个数据项的细粒度修改。
692 | 
693 | ---
694 | 
695 | class: inverse
696 | name: configuration
697 | 
698 | .left-column-inverse[
699 |   ## Spark Glossary
700 |   ### SparkContext
701 |   ### Application
702 |   ### Configuration
703 | ]
704 | 
705 | .right-column-inverse[
706 | 
707 | Spark Properties
708 | ```scala
709 | # 配置文件
710 | conf/spark-defaults.conf
711 | 
712 | # 硬编码
713 | val conf = new SparkConf().setMaster("local[2]").setAppName("NetEase")
714 | 
715 | # 动态传参
716 | bin/spark-submit \
717 |   --name "Netease" \
718 |   --master local[4] \
719 |   --conf spark.eventLog.enabled=false myapp.jar
720 | 
721 | ```
722 | <br>
723 | <br>
724 | 
725 | 更多可参照文档: https://spark.apache.org/docs/latest/configuration.html#spark-configuration
726 | 
727 | ]
728 | 
729 | ---
730 | 
731 | class: inverse
732 | name: deployment
733 | 
734 | .left-column-inverse[
735 |   ## Spark Glossary
736 |   ### SparkContext
737 |   ### Application
738 |   ### Configuration
739 |   ### Deployment
740 | ]
741 | 
742 | .right-column-inverse[
743 | 
744 | Cluster Manager
745 | ```
746 |     - spark.master / --master
747 |     - Standalone - 在集群上启动对应的Master 和 Worker进程，作为资源的管理器
748 |     - Local - 单进程充当Driver和Executor
749 |     - Mesos - Apache下的开源分布式资源管理框架
750 |     - YARN - 生产环境首选资源管理器，支持Hive、Spark、Flink各种任务的资源调度
751 |     - k8s - 暂时还是实验特性
752 | ```
753 | Deploy Mode
754 | ```
755 |     - spark.submit.deployMode / --deploy-mode
756 |     - client - Driver 运行在本地，ApplicationMaster 只作为 Executor Launcher
757 |         - 该模式一般用于调试场景，可以方便自己查看 Driver 端日志，定位问题
758 |         - 单机的负载极限
759 |         - Driver 潜在的网络瓶颈
760 | 
761 |     - cluster - Driver 运行在 ApplicationMaster 内
762 |         - 该模式一般用于生产环境
763 |         - YARN ApplicationMaster带 failover 可防止一些潜在问题
764 |         - 同一集群内 Driver 所处网络环境较好
765 |         - 同一NodeManager节点过多 Container 而产生的本地资源竞争
766 |             - spark.driver.cores
767 |             - spark.driver.memory
768 | ```
769 | 
770 | ]
771 | 
772 | ---
773 | 
774 | class: inverse
775 | name: monitoring
776 | 
777 | .left-column-inverse[
778 |   ## Spark Glossary
779 |   ### SparkContext
780 |   ### Application
781 |   ### Deployment
782 |   ### Configuration
783 |   ### Monitoring
784 | ]
785 | 
786 | .right-column-inverse[
787 | 
788 | Live UI
789 | 
790 | ```
791 | http://<driver-node>:4040
792 | ```
793 | 
794 | History Server
795 | 
796 | ```
797 | http://<history-server-url>:18080
798 | ```
799 | Metrics
800 | 
801 | ```
802 | $SPARK_HOME/conf/metrics.properties
803 | 
804 | ```
805 | Log
806 | ```
807 | # 配置
808 | $SPARK_HOME/conf/log4j.properties
809 | ```
810 | ```
811 | # 一般定位顺序
812 | Driver -> ApplicationMaster -> Executor -> NodeManager
813 | ```
814 | ```
815 | # 命令
816 | yarn logs -applicationId [app_id] -appOwner [user_name]
817 | ```
818 | ]
819 | 
820 | ---
821 | 
822 | class: inverse
823 | name: debugstring
824 | 
825 | .left-column-inverse[
826 |   ## Spark Glossary
827 |   ### SparkContext
828 |   ### Application
829 |   ### Deployment
830 |   ### Configuration
831 |   ### Monitoring
832 | ]
833 | 
834 | .right-column-inverse[
835 | 
836 | ### toDebugString
837 | 
838 | ```scala
839 | scala> val wordCount = sc.textFile("README.md").flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_ + _)
840 | wordCount: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[21] at reduceByKey at <console>:24
841 | 
842 | scala> wordCount.toDebugString
843 | res13: String =
844 | (2) ShuffledRDD[21] at reduceByKey at <console>:24 []
845 |  +-(2) MapPartitionsRDD[20] at map at <console>:24 []
846 |     |  MapPartitionsRDD[19] at flatMap at <console>:24 []
847 |     |  README.md MapPartitionsRDD[18] at textFile at <console>:24 []
848 |     |  README.md HadoopRDD[17] at textFile at <console>:24 []
849 | ```
850 | ]
851 | 
852 | ???
853 | 
854 | Spark的这种依赖关系设计，使其具有了天生的容错性，大大加快了Spark的执行速度。因为，RDD数据集通过“血缘关系”记住了它是如何从其它RDD中演变过来的，血缘关系记录的是粗颗粒度的转换操作行为，当这个RDD的部分分区数据丢失时，它可以通过血缘关系获取足够的信息来重新运算和恢复丢失的数据分区，由此带来了性能的提升。
855 | 
856 | ---
857 | 
858 | class: inverse
859 | name: tuning
860 | 
861 | .left-column-inverse[
862 |   ## Spark Glossary
863 |   ### SparkContext
864 |   ### Application
865 |   ### Deployment
866 |   ### Configuration
867 |   ### Monitoring
868 |   ### Tuning
869 | ]
870 | 
871 | .right-column-inverse[
872 | 
873 | 如何从数据层面下手调优
874 | ```
875 | 数据类型 (primitive)             FileFormat（json / parquet / orc...)
876 | 文件分区、分桶                   小文件问题
877 | 数据倾斜问题                     单个 partition / task 的 workload
878 | ...
879 | ```
880 | 如何从资源层面下手调优
881 | ```
882 | Driver（内存的配置）
883 | Executor (内存、核数)大小的配置
884 | 所属 Yarn 队列资源的通盘考虑
885 | Hadoop集群“超售” - 网络、磁盘、NodeManager load...
886 | ...
887 | ```
888 | 
889 | 如何从代码层面下手调优
890 | 
891 | ```
892 | 避免 shuffle / 广播
893 | 预聚合
894 | kryo序列化
895 | RDD 复用 / 持久化
896 | 高性能算子
897 | RDD -> DataFrame/Dataset
898 | ```
899 | ]
900 | 
901 | ---
902 | class: inverse
903 | name: sub-agenda
904 | 
905 | .left-column-inverse[
906 |   ## Spark之路
907 | ]
908 | 
909 | .right-column-inverse[
910 | 
911 | #### 源码学习:
912 | - Spark源码 - https://github.com/apache/spark <br>
913 | - 三方库源码 - https://spark-packages.org/ <br>
914 | 
915 | #### 文档学习:
916 | - 官方文档 - http://spark.apache.org/docs/latest/ <br>
917 | - 官方博客 - https://databricks.com/blog <br>
918 | 
919 | #### 其他：
920 | - https://github.com/netease-bigdata/ne-spark-courseware
921 | - https://github.com/jaceklaskowski/mastering-apache-spark-book
922 | - 远离 <b>王家林</b>
923 | 
924 | ]
925 | 
926 | ---
927 | 
928 | class: middle, center, inverse
929 | name: qa
930 | # Q & A
931 | 
932 | ---
933 | 
934 | class: middle, center, inverse
935 | name: greetings
936 | # Thank You!
937 | ### [Kent Yao]
938 | 
939 | <img style="zoom: 1.0" src="../../imgs/mammut.png"  align="bottom" />
940 | 
941 | <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribution-NonCommercial 4.0 International License</a>.
942 | 
943 | 
944 |     </textarea>
945 |     <script src="https://remarkjs.com/downloads/remark-latest.min.js">
946 |     </script>
947 |     <script>
948 |       var slideshow = remark.create({
949 |         ratio: '16:9',
950 |         slideNumberFormat: 'Slide %current% of %total%',
951 |         // .. or by using a format function
952 |         slideNumberFormat: function (current, total) {
953 |           return ' ' + current + ' of ' + total;
954 |         },
955 |         highlightLanguage: 'scala',
956 |         highlightStyle: 'monokai',
957 |         highlightLines: true,
958 |         // arta, ascetic, dark, default, far, github, googlecode, idea, ir-black, magula, monokai, rainbow, solarized-dark, solarized-light, sunburst, tomorrow, tomorrow-night-blue, tomorrow-night-bright, tomorrow-night, tomorrow-night-eighties, vs, zenburn
959 |         highlightStyle: 'zenburn'
960 |       });
961 |     </script>
962 |   </body>
963 | </html>
964 | 


--------------------------------------------------------------------------------
/slides/spark_core/rdd_basics.html:
--------------------------------------------------------------------------------
   1 | 
   2 | <html>
   3 |   <head>
   4 |     <title>RDD Basics</title>
   5 |     <meta charset="utf-8">
   6 |     <style>
   7 |       @import url(https://fonts.googleapis.com/css?family=Yanone+Kaffeesatz);
   8 |       @import url(https://fonts.googleapis.com/css?family=Droid+Serif:400,700,400italic);
   9 |       @import url(https://fonts.googleapis.com/css?family=Ubuntu+Mono:400,700,400italic);
  10 | 
  11 |       body { font-family: 'Droid Serif'; }
  12 |       h1, h2, h3 {
  13 |         font-family: 'Yanone Kaffeesatz';
  14 |         font-weight: 400; /* normal weight for headings instead of the default bold */
  15 |         margin-bottom: 0;
  16 |       }
  17 |       .remark-slide-content h1 { font-size: 3em; }
  18 |       .remark-slide-content h2 { font-size: 2em; }
  19 |       .remark-slide-content h3 { font-size: 1.6em; }
  20 |       .footnote {
  21 |         position: absolute;
  22 |         bottom: 3em;
  23 |       }
  24 |       li p { line-height: 1.25em; }
  25 |       .red { color: #fa0000; }
  26 |       .large { font-size: 2em; }
  27 |       a, a > code {
  28 |         color: rgb(249, 38, 114);
  29 |         text-decoration: none;
  30 |       }
  31 |       code {
  32 |         background: #e7e8e2;
  33 |         border-radius: 5px;
  34 |       }
  35 |       .remark-code, .remark-inline-code { font-family: 'Ubuntu Mono'; }
  36 |       .remark-code-line-highlighted     { background-color: #373832; }
  37 |       .pull-left {
  38 |         float: left;
  39 |         width: 47%;
  40 |       }
  41 |       .pull-right {
  42 |         float: right;
  43 |         width: 47%;
  44 |       }
  45 |       .pull-right ~ p {
  46 |         clear: both;
  47 |       }
  48 |       #slideshow .slide .content code {
  49 |         font-size: 0.8em;
  50 |       }
  51 |       #slideshow .slide .content pre code {
  52 |         font-size: 0.9em;
  53 |         padding: 15px;
  54 |       }
  55 |       .inverse {
  56 |         background: #272822;
  57 |         color: #777872;
  58 |         text-shadow: 0 0 20px #333;
  59 |       }
  60 |       .inverse h1, .inverse h2 {
  61 |         color: #fff;
  62 |         line-height: 0.8em;
  63 |       }
  64 |     
  65 |       /* Slide-specific styling */
  66 |       #slide-inverse .footnote {
  67 |         bottom: 12px;
  68 |         left: 20px;
  69 |       }
  70 |       #slide-how .slides {
  71 |         font-size: 0.9em;
  72 |         position: absolute;
  73 |         top:  151px;
  74 |         right: 140px;
  75 |       }
  76 |       #slide-how .slides h3 {
  77 |         margin-top: 0.2em;
  78 |       }
  79 |       #slide-how .slides .first, #slide-how .slides .second {
  80 |         padding: 1px 20px;
  81 |         height: 90px;
  82 |         width: 120px;
  83 |         -moz-box-shadow: 0 0 10px #777;
  84 |         -webkit-box-shadow: 0 0 10px #777;
  85 |         box-shadow: 0 0 10px #777;
  86 |       }
  87 |       #slide-how .slides .first {
  88 |         background: #fff;
  89 |         position: absolute;
  90 |         top: 20%;
  91 |         left: 20%;
  92 |         z-index: 1;
  93 |       }
  94 |       #slide-how .slides .second {
  95 |         position: relative;
  96 |         background: #fff;
  97 |         z-index: 0;
  98 |       }
  99 |     
 100 |       /* Two-column layout */
 101 |       .left-column {
 102 |         color: #777;
 103 |         width: 20%;
 104 |         height: 92%;
 105 |         float: left;
 106 |       }
 107 |       .left-column h2:last-of-type, .left-column h3:last-of-type {
 108 |         color: #000;
 109 |       }
 110 |       .right-column {
 111 |         width: 75%;
 112 |         float: right;
 113 |         padding-top: 1em;
 114 |       }
 115 |     
 116 |       /* Two-column layout inverse*/
 117 |       .left-column-inverse {
 118 |         color: #777;
 119 |         width: 20%;
 120 |         height: 92%;
 121 |         float: left;
 122 |       }
 123 |       .left-column-inverse h2:last-of-type, .left-column-inverse h3:last-of-type {
 124 |         color: #fff;
 125 |       }
 126 |       .right-column-inverse {
 127 |         width: 75%;
 128 |         float: right;
 129 |         padding-top: 1em;
 130 |       }
 131 |       .right-column-inverse h2, .right-column-inverse h3, .right-column-inverse h4 {
 132 |         color: #fff;
 133 |       }
 134 |     </style>
 135 |   </head>
 136 |   <body>
 137 |     <textarea id="source">
 138 | 
 139 | class: center, middle, inverse
 140 | 
 141 | ## [NetEase Spark Courses](https://netease-bigdata.github.io/ne-spark-courseware/)
 142 | 
 143 | <br>
 144 | <br>
 145 | <br>
 146 | <br>
 147 | 
 148 | ## Spark RDD理解与性能调优
 149 | 
 150 | <br>
 151 | <br>
 152 | <br>
 153 | <br>
 154 | 
 155 | <img style="zoom: 1.0" src="../../imgs/mammut.png"  align="bottom" />
 156 | 
 157 | ???
 158 | 备注：标题 <br>
 159 | 帮助信息：在网页端按H键可进入帮助页面
 160 | 
 161 | ---
 162 | 
 163 | class:  center,inverse
 164 | name: rdd
 165 | 
 166 |   # Agenda
 167 |   ## -
 168 |   ** About Me **<br>
 169 |   ## -
 170 |   **  RDD定义及特点  **<br>
 171 |   ** RDD Operations **<br>
 172 |   ** RDD 依赖 **<br>
 173 |   ** RDD Persist **<br>
 174 |   ** 相关链接 **<br>
 175 | 
 176 | ---
 177 | class: 
 178 | name:
 179 | .left-column[
 180 | ## About Me
 181 | ]
 182 | .right-column[
 183 | ## 王斐
 184 | 
 185 | 2018年校招加入网易，硕士期间研究Spark平台内存优化，目前在杭州研究院-数据科学中心负责Spark平台开发相关工作。
 186 | ]
 187 | ---
 188 | 
 189 | 
 190 | class:
 191 | name:
 192 | 
 193 | .left-column[
 194 | ### What is Spark?
 195 | 
 196 | ]
 197 | .right-column[
 198 | Apache Spark™ is a unified analytics engine for large-scale data processing.
 199 | <figure class="half">
 200 |     <img src="../../imgs/spark_core/spark-eco.png" width="100%" height="60%">
 201 | </figure>
 202 | ]
 203 | ---
 204 | 
 205 | 
 206 | class:
 207 | name:
 208 | 
 209 | .left-column[
 210 | ### RDD定义及特点
 211 | #### RDD定义
 212 | 
 213 | ]
 214 | .right-column[
 215 | RDD(Resilient Distributed Datasets),弹性分布式数据集，是对分布式内存的抽象。
 216 | <figure class="half">
 217 |     <img src="../../imgs/spark_core/rdd-inmemory.png" width="45%" height="40%">
 218 |     &nbsp; &nbsp; &nbsp;     
 219 |     <img src="../../imgs/spark_core/rdd-itr.png" width="48%"  height="30%">
 220 | </figure>
 221 | ]
 222 | 
 223 | ???
 224 | 首先讲一下rdd的定义，rdd的英文全称是resilient distributed dataset，弹性分布式数据集，是对分布式内存的抽象。可以看下左边这张图，可以看到这个虚线下面是hdfs，上面是rdd，看起来rdd和hdfs有点像，都是跨节点，然后hdfs是分块，rdd是分区。
 225 | 我们看右边这张图，rdd的一个分区，拆开了里面是一个迭代器，我们知道迭代器并不是一个集合，而只是提供了一个访问集合的方法。它类似于Java里面的stream。是lazy的，只有在触发取数据才会返回数据。
 226 | 所以rdd也是lazy的，并不是一声明就会进行计算，而是需要一个触发，才会进行计算。
 227 | 迭代器里面是hasnext和next方法，当调用next，他返回被引用集合中的一条数据，称之为record。
 228 | 
 229 | ---
 230 | class:
 231 | name:
 232 | .left-column[
 233 | ### RDD定义及特点
 234 | #### RDD定义
 235 | #### RDD特点
 236 | 
 237 | ]
 238 | .right-column[
 239 | 
 240 | #### 弹性
 241 | - 不可变
 242 | 
 243 | - 血缘
 244 | 
 245 | #### 分布式
 246 | - 跨节点
 247 | - 分区
 248 | 
 249 | #### 延迟计算
 250 | <figure class="half">
 251 |     <img src="../../imgs/spark_core/rdd-feature.png" width="100%"  height="30%">
 252 | </figure>
 253 | 
 254 | ]
 255 | ???
 256 | 前面讲了rdd的定义和形态，这里讲下他的特点。
 257 | rdd叫做弹性分布式数据集，首先讲一下弹性相关的特点。
 258 | 不变性。不变性是函数式编程的基石。给人以安全感，就是说这个rdd不论在哪，只要输入一样，他的输出永远都一样，这就方便进行容错。也方便进行弹性调度，spark在观察到一个任务在某个节点运行特别缓慢，会为这个任务在其他资源丰富的节点创建备份任务，谁先运行完就使用谁的结果，因为他的结果是确定的。
 259 | 然后是血缘，就是说对于一个rdd，我知道他的来龙去脉，就算中间过程执行出错，也能从源头重新来过，这就提供了很好的容错。 
 260 | 分布式的特点就比较明朗，就是跨节点和分区，跨节点提供了多个节点一起执行的能力，而分区，将要处理的数据的粒度进行划分的更细，方便进行并行操作。
 261 | 延迟计算前面已经提到过，分区里面是迭代器，这是一个lazy访问，比如下面这一大串的rdd，组成了一个链，在触发图上这个runjob之前，前面的abcdef都不会发生计算。
 262 | 这样做有优点也有缺点，优点就是说，这样可以更明确的进行计算，比如我一个应用我要做五个job，这五件事情是相互独立的。如果没有延迟计算，那五件job，第一件开始了，我开始算，第二件开始了，我也算，同时五个job一起算，可能内存中就存储了大量的中间数据，结果这些资源可能死锁，五个job一件都完成不了。
 263 | 而延迟计算就比较明确，一件job触发runjob我才计算，完成一个job再去完成另外的job，这样效率更高。
 264 | 然后就是延迟计算，可以得到整个job的拓扑图，可以对这个拓扑图进行优化。
 265 | 延迟计算的缺点，就是响应会慢点，这就导致spark在流处理这块，响应速度比Flink要慢，不适合实时流计算场景。但是目前spark也在流计算方面追赶，相信未来spark在流处理方面会有更好的表现。
 266 | 
 267 | 
 268 | 
 269 | 
 270 | ---
 271 | class:
 272 | name:
 273 | .left-column[
 274 | ## RDD Operations
 275 | <!-- ### transformation & action
 276 |  -->
 277 |  ### transformation
 278 | ]
 279 | .right-column[
 280 | transformation：从数据源生成RDD或者对已存在的RDD进行转换生成新RDD。
 281 | - transformation算子提供了一个并行的语义。
 282 | 
 283 | - API隐藏了数据划分、并行、通信、容错等复杂的框架代码。
 284 | 
 285 | - 算子里面的函数为用户自定义的函数(UDF)，该函数为一个串行函数，根据算子的语义对RDD中的数据按照语义的<font color=#A52A2A size=4 >**操作粒度**</font>进行操作。
 286 |   1. rdd1.mapValues(s=> s*2)
 287 |   2. def func1( i:Int):Int={ i*2}    &nbsp; &nbsp;&nbsp;rdd1.mapValues (func1)
 288 | 
 289 | |transformation | Meaning |
 290 | | ----- | :-----: |
 291 | |  textFile/objectFile| 从数据源生成RDD|
 292 | | map(func)      | 对每条record进行函数计算 |
 293 | | mapValues(func)| 对(key,value)类型的record的value计算 |
 294 | | filter(func) | 只保留符合条件的records |
 295 | | flatMap(func) | 将records按照规则进行展开 |
 296 | | mapPartitions(func) | 以RDD的每个分区进行函数计算 |
 297 | | mapPartitionsWithIndex(func) |   对分区操作，提供partitionId参数      |
 298 | | groupByKey([numPartitions]) | 对(k,v)类型records按照key进行聚合 |
 299 | | reduceByKey(func, [numPartitions]) | 对(k,v)类型records按照key进行合并 |
 300 | | aggregateByKey(zeroValue)(seqOp, combOp,[numPartitions]) |  先按照分区聚合，然后按照key值合并    |
 301 | | coalesce(numPartitions) | 重新分区 |
 302 | | repartition(numPartitions) | 重新分区 |
 303 | 
 304 | <!-- | repartitionAndSortWithinPartitions(partitioner) | 重新分区，且在分区内进行排序 |
 305 |  -->
 306 | 
 307 | <!-- | sample(withReplacement, fraction, seed) | 返回一个子集 |
 308 | | intersection(otherDataset) | 返回两个RDD的交集        |
 309 | | distinct([numPartitions])) |   返回一个不包含重复record的数据集      |
 310 | | sortByKey([ascending], [numPartitions]) | 对(k,v)类型records按照key进行排序 |
 311 | | union(otherDataset) | 直接将两个RDD的分区进行联合 |
 312 | | join(otherDataset, [numPartitions]) | (k,v) join(k,w)=>(k,(v,w))|
 313 | | cogroup(otherDataset, [numPartitions]) |   （k,v) coGroup(k,w)=>(k,seq(v),seq(w))|
 314 | | cartesian(otherDataset) | 笛卡尔积计算 | -->
 315 | 
 316 | ]
 317 | ???
 318 | spark提供了丰富的算子，相对于需要自己指定mapper和reducer的MapReduce编程模型方便了很多。
 319 | 算子分为两种，trans和action。首先讲一下trans，字面意思是转换。是生成新的RDD，要么是从数据中生成，或者是对现有RDD进行操作生成新的。
 320 | 下面列的这些都是常见的trans算子，比如map,mapvalue,mappartitions这些。trans算子提供了并行的语义，把底层的并行进行了封装，什么数据划分都不用我们管，往往需要用户提供一个函数参数，这个函数是一个串行函数，然后加上算子的并行的语义，就可以并行的对数据进行计算。至于对哪些数据进行计算，是对一条数据，还是一个分区的数据，是需要看算子的操作粒度的。
 321 | 
 322 | <!-- ---
 323 | class:
 324 | name:
 325 | .left-column[
 326 | ## RDD Operations
 327 | ### transformation
 328 | ]
 329 | .right-column[
 330 | - transforamtion算子提供了一个并行的语义。
 331 | 
 332 | - API隐藏了数据划分、并行、通信、容错等复杂的框架代码。
 333 | 
 334 | - 算子里面如果有函数参数，该函数为用户自定义的函数(UDF)，该函数为一个串行函数，根据算子的语义对RDD中的数据按照语义的<font color=#A52A2A size=4 >**操作粒度**</font>进行操作。
 335 | 
 336 |   1. rdd1.mapValues(s=> s*2)
 337 |   2. def func1( i:Int):Int={ i*2}
 338 | 
 339 |   rdd1.mapValues (func1)
 340 | 
 341 | ] -->
 342 | ---
 343 | class:
 344 | name:
 345 | .left-column[
 346 | ## RDD Operations
 347 | <!-- ### transformation & action
 348 |  -->
 349 |  ### transformation
 350 | ]
 351 | .right-column[
 352 | 
 353 | #### 操作粒度
 354 | 
 355 | - 对每条record操作
 356 | 
 357 |   map, flatMap, filter
 358 | 
 359 | - 对(key, value)类型record中的value操作
 360 | 
 361 |   mapValues
 362 | 
 363 | - 对整个分区数据操作
 364 | 
 365 |   mapPartitions,mapPartitionsWithIndex
 366 | 
 367 | - 对分区进行混洗(shuffle)
 368 | 
 369 |   reduceByKey,aggregateByKey,groupByKey
 370 | 
 371 |   repartition,repartitionAndSortWithinPartitions
 372 | 
 373 | <!-- - 对多个RDD进行操作
 374 | 
 375 |   join, union, coGroup
 376 | 
 377 | - 重新分区
 378 | 
 379 |   repartition, coalesce  -->
 380 | 
 381 | 
 382 | 
 383 | ]
 384 | 
 385 | ---
 386 | 
 387 | class:
 388 | name:
 389 | .left-column[
 390 | ## RDD Operations
 391 | ### transformation
 392 | ### action
 393 | 
 394 | ]
 395 | .right-column[
 396 | 
 397 | Action:得到一个结果，或者将RDD存入磁盘。
 398 | 
 399 | | Action | Meaning |
 400 | | :----------: | :--------------------------------------: |
 401 | | countByKey() | 返回Map(k,count(k))，即每个key的个数 |
 402 | | reduce(func) | 将所有数据按照func进行聚合，返回一个值 |
 403 | |   take(n)    | 返回数据集的前N个数据的数组 |
 404 | |  collect()   | 将所有数据提取到driver上，转换成一个数组 |
 405 | |   count()    | 获得数据集的数据条数，一个值 |
 406 | |   first()    | 返回数据集的第一个数据，一个值|
 407 | |foreach(func) | 对每个数据进行操作 |
 408 | | takeOrdered(n, [ordering])  | 返回排序后数据集的前n个数据，一个数组 |
 409 | | takeSample(withReplacement, num, [seed]) | 根据seed进行抽样，获得num个数据，一个数组 |
 410 | |  saveAsTextFile(path)  | 保存为text文件 |
 411 | | saveAsObjectFile(path) | 保存为object文件 |
 412 | ]
 413 | 
 414 | ---
 415 | 
 416 | class:
 417 | name:
 418 | .left-column[
 419 | ## RDD Operations
 420 | ### transformation
 421 | ### action
 422 | ### 理解算子
 423 | ]
 424 | .right-column[
 425 | 
 426 | - map, mapPartitions
 427 | 
 428 |   例子： 一个RDD有10个分区，每个分区有1000条数据。对每条数据进行function操作。
 429 | 
 430 |   map算子，调用function 10000次
 431 | 
 432 |   mapPartitions算子，调用function10次，但是每次处理一个分区的数据，分区较大可能发生OOM
 433 | 
 434 | 	```
 435 |     //  rdd:RDD[(Int,Int)]
 436 |     def function1(tuple:(Int,Int)):(Int,Int)={
 437 |       (tuple._1+1,tuple._2+1)
 438 |     }
 439 |     def function2(iterator: Iterator[(Int,Int)]): Iterator[(Int,Int)] ={  // called once per partition
 440 |       var list=List[(Int,Int)]()  // materializes the whole partition in memory
 441 |       for(i<-iterator){
 442 |         list = (i._1+1,i._2+1) :: list  // List is immutable: keep the list returned by ::
 443 |       }
 444 |       list.reverse.iterator  // prepending reversed the records; restore input order
 445 |     }
 446 |     rdd.map(function1)
 447 |     rdd.mapPartitions(function2)
 448 |  ```
 449 | 
 450 | - collect
 451 | 
 452 |   collect算子会拉取rdd中所有数据到driver节点，转换成一个数组，如果数据量过大，会造成driver的OOM
 453 | 
 454 | <!-- - repartitionandsortwithinpartitions
 455 | 
 456 |   如果需要repartition之后对分区排序，那么使用这个repartitionandsortwithinpar
 457 |   titions性能会更好，因为可以将排序放在shuffle过程 -->
 458 | 
 459 | 
 460 | <!-- - 尽量使用reduceByKey代替groupByKey
 461 | 
 462 |   reduceByKey 算子会在map端对数据进行聚合(map-side combine)。
 463 |   groupByKey不会在map端进行聚合，会造成在shuffle阶段进行大规模数据传输和在shuffle read端的巨大内存压力。
 464 | 
 465 |  -->
 466 | 
 467 | ]
 468 | 
 469 | 
 470 | 
 471 | 
 472 | ---
 473 | class:
 474 | name:
 475 | .left-column[
 476 | ## RDD Operations
 477 | ### transformation
 478 | ### action
 479 | ### 理解算子
 480 | ### wordCount Demo
 481 | ]
 482 | .right-column[
 483 | 
 484 | ```
 485 | package com.netease.bigdata.spark
 486 | 
 487 | import org.apache.spark.{SparkConf, SparkContext}
 488 | 
 489 | object WordCount {
 490 | 
 491 |   def main(args: Array[String]): Unit = {
 492 |     require(args.length == 1, "Usage: WordCount <input file>")
 493 |     val conf = new SparkConf().setAppName("Word Count").setMaster("local[*]")
 494 |     val sc = new SparkContext(conf)
 495 |     val textFile = sc.textFile(args(0), 2)
 496 |     val words = textFile.flatMap(_.split("\\s+"))
 497 |     val ones = words.map((_, 1))
 498 |     val counts = ones.reduceByKey(_ + _)
 499 |     val res = counts.collect()
 500 |     for ((word, count) <- res) {
 501 |       println(word + ": " + count)
 502 |     }
 503 | 
 504 |     sc.stop()
 505 |   }
 506 | 
 507 | }
 508 | ```
 509 | 
 510 | <figure class="half">
 511 |     <img src="../../imgs/spark_core/wc-trans.png" width="100%" height="28%">
 512 | </figure>
 513 | 
 514 | 
 515 | ]
 516 | ---
 517 | class:
 518 | name:
 519 | .left-column[
 520 | ## RDD 依赖
 521 | ### 宽依赖 & 窄依赖
 522 | 
 523 | ]
 524 | .right-column[
 525 | 
 526 | <figure class="half">
 527 |     <img src="../../imgs/spark_core/dependencies.png" width="90%" height="50%">
 528 | </figure>
 529 | 
 530 | ]
 531 | ---
 532 | class:
 533 | name:
 534 | .left-column[
 535 | ## RDD 依赖
 536 | ### 宽依赖 & 窄依赖
 537 | ### Shuffle 
 538 | 
 539 | ]
 540 | .right-column[
 541 | 
 542 | 
 543 | #### Shuffle是一个怎样的过程: SortShuffle
 544 | 
 545 | - shuffle write端每个task计算结果按照key进行hash得到partitionID,然后把((pID,key),value)插入到一个数组(内存)中,如果内存不足则spill到磁盘，对数组使用TimSort排序（内存不足则spill到磁盘)
 546 | 
 547 | - 将排序结果存为partitionFile到磁盘(序列化,磁盘I/O).
 548 | 
 549 | - shuffle read端从所有partitionFile中拉取对应分区数据，进行网络传输，反序列化为对象，进行合并排序，内存不足则溢出磁盘.
 550 | 
 551 | 
 552 | 
 553 | 
 554 | 
 555 | <figure class="half">
 556 |     <img src="../../imgs/spark_core/shuffle.png" width="72%" height="40%">
 557 | </figure>
 558 | 
 559 | 
 560 | 
 561 | ]
 562 | 
 563 | ---
 564 | class:
 565 | name:
 566 | .left-column[
 567 | ## RDD 依赖
 568 | ### 宽依赖 & 窄依赖
 569 | ### Shuffle 
 570 | ### Shuffle & Stage & Task
 571 | 
 572 | ]
 573 | .right-column[
 574 | #### WordCount Lineage
 575 | 
 576 | <figure class="half">
 577 |     <img src="../../imgs/spark_core/wordcount.png" width="70%" height="35%">
 578 | </figure>
 579 | 
 580 | - Action触发Job的提交
 581 | - 按照Shuffle划分stages
 582 | - 每个stage中是独立的n个task，n等于当前stage中rdd分区的数目，每个task分别处理一个分区的数据
 583 | - 在一个executor中，并行的task数目和executor的核数有关。
 584 | 
 585 | 
 586 | 
 587 | ]
 588 | ---
 589 | class:
 590 | name:
 591 | .left-column[
 592 | ## RDD 依赖
 593 | ### 宽依赖 & 窄依赖
 594 | ### Shuffle 
 595 | ### Shuffle & Stage & Task
 596 | ### Task & Loop
 597 | 
 598 | ]
 599 | .right-column[
 600 | 
 601 | ##### Loop
 602 | - 从数据源中读取数据对象
 603 | - 对每个数据对象运用一系列计算函数
 604 | - 将计算结果写入一个新的数据集合中
 605 | 
 606 | <figure class="half">
 607 |     <img src="../../imgs/spark_core/rdd-loop.png" width="70%" height="30%">
 608 | </figure>
 609 | ]
 610 | ---
 611 | class:
 612 | name:
 613 | .left-column[
 614 | ## RDD 依赖
 615 | ### 宽依赖 & 窄依赖
 616 | ### Shuffle 
 617 | ### Shuffle & Stage & Task
 618 | ### Task & Loop
 619 | ### Object Lifetime
 620 | 
 621 | ]
 622 | .right-column[
 623 | 三种数据容器：存放数据对象的引用，数据对象的生命周期依赖于其容器的生命周期。
 624 | 
 625 | 
 626 | 
 627 | <figure class="half">
 628 | 
 629 |     <img src="../../imgs/spark_core/object-lifetime.png" width="80%" height="40%">
 630 | 
 631 | </figure>
 632 | 
 633 | 
 634 | ]
 635 | 
 636 | ---
 637 | class:
 638 | name:
 639 | .left-column[
 640 | ## RDD 依赖
 641 | ### 宽依赖 & 窄依赖
 642 | ### Shuffle 
 643 | ### Shuffle & Stage & Task
 644 | ### Task & Loop
 645 | ### Object Lifetime
 646 | ### Shuffle Tuning
 647 | 
 648 | ]
 649 | .right-column[
 650 | #### 避免不必要shuffle
 651 | 
 652 | - repartition, coalesce, repartitionAndSortWithinPartitions 对rdd重新分区
 653 | 
 654 |   repartition(numPartitions: Int)=coalesce(numPartitions, shuffle = true)
 655 | 
 656 | <!--   如果需要repartition之后对分区排序，那么使用这个repartitionandsortwithinpar
 657 |   titions性能会更好，因为可以将分区排序放在shuffle过程 -->
 658 | 
 659 |   <figure class="half">
 660 |     <img src="../../imgs/spark_core/repartition-more2less.png" width="60%" height="30%">
 661 |     &nbsp; &nbsp;  
 662 |     <img src="../../imgs/spark_core/repartition-less2more.png" width="30%"  height="30%">
 663 | </figure>
 664 | 
 665 | 
 666 | ]
 667 | ---
 668 | class:
 669 | name:
 670 | .left-column[
 671 | ## RDD 依赖
 672 | ### 宽依赖 & 窄依赖
 673 | ### Shuffle 
 674 | ### Shuffle & Stage & Task
 675 | ### Task & Loop
 676 | ### Object Lifetime
 677 | ### Shuffle Tuning
 678 | 
 679 | ]
 680 | .right-column[
 681 | - broadcast实现map join代替 reduce join，<font color=#A52A2A size=4 >**避免shuffle**</font>
 682 | 
 683 |   在对RDD使用join类操作(with inputs not co-partitioned)，而且join操作中的一个RDD或表的数据量比较小
 684 | 
 685 |   使用broadcast广播小表，然后通过对小表进行遍历完成map join，避免了shuffle的发生
 686 | 
 687 | ```
 688 | val rdd2Data = rdd2.collect()
 689 | val rdd2Bc = sc.broadcast(rdd2Data)
 690 | def function(tuple: (String,Int)): (String,(Int,String)) ={
 691 |     for(value <- rdd2Bc.value){
 692 |      if(value._1.equals(tuple._1))
 693 |         return (tuple._1,(tuple._2,value._2.toString))
 694 |          }
 695 |          (tuple._1,(tuple._2,null))
 696 |    }
 697 | val rdd3 = rdd1.map(function(_))
 698 | 
 699 | ```
 700 | 
 701 | 
 702 | 
 703 | ]
 704 | 
 705 | ---
 706 | class:
 707 | name:
 708 | .left-column[
 709 | ## RDD 依赖
 710 | ### 宽依赖 & 窄依赖
 711 | ### Shuffle 
 712 | ### Shuffle & Stage & Task
 713 | ### Task & Loop
 714 | ### Object Lifetime
 715 | ### Shuffle Tuning
 716 | 
 717 | ]
 718 | .right-column[
 719 | 
 720 | - Why broadcast?
 721 |   
 722 |   Spark存在作用域，变量声明在driver上，当task需要操作这些driver上声明的变量时会从driver拷贝副本传输到task。
 723 | 
 724 |   broadcast是保证这个由driver声明的变量值只会发送到每个worker上面一份。
 725 | 
 726 |   如果不使用broadcast,driver需要给每个task都发送一份副本，如果广播变量较大，会造成大量网络传输。
 727 | 
 728 | ```
 729 | val rdd2Data = rdd2.collect()
 730 | //val rdd2Bc = sc.broadcast(rdd2Data)
 731 | def function(tuple: (String,Int)): (String,(Int,String)) ={
 732 |     for(value <- rdd2Data){
 733 |      if(value._1.equals(tuple._1))
 734 |         return (tuple._1,(tuple._2,value._2.toString))
 735 |          }
 736 |          (tuple._1,(tuple._2,null))
 737 |    }
 738 | val rdd3 = rdd1.map(function(_))
 739 | 
 740 | ```
 741 | 
 742 | ]
 743 | 
 744 | ---
 745 | class:
 746 | name:
 747 | .left-column[
 748 | ## RDD 依赖
 749 | ### 宽依赖 & 窄依赖
 750 | ### Shuffle 
 751 | ### Shuffle & Stage & Task
 752 | ### Task & Loop
 753 | ### Object Lifetime
 754 | ### Shuffle Tuning
 755 | 
 756 | ]
 757 | .right-column[
 758 | #### map-side Combine
 759 | 
 760 | 在shuffle write端进行合并数据，可以减少shuffle阶段序列化反序列化开销以及网络传输开销，也会减小在shuffle read端的压力，提升程序性能。
 761 | 
 762 | - 尽量使用aggregateByKey和reduceByKey代替groupByKey
 763 | 
 764 | | | |
 765 | | :---:| :---: |
 766 | | groupByKey([numPartitions]) | 没有map-side combine，对(k,v)类型records按照key进行聚合 |
 767 | | reduceByKey(func, [numPartitions]) | map-side combine, 对(k,v)类型records按照key进行合并 |
 768 | | aggregateByKey(zeroValue)(seqOp, combOp, [numPartitions]) |  先对数据按照分区使用seqOp聚合，然后再按照key值使用combOp合并    |
 769 | 
 770 |  一个RDD，变量名为rdd:RDD[(Int,Int)]，对其进行按key，求value之和操作。
 771 | 
 772 |  ```
 773 |  //reduceByKey with map-side combine
 774 | val red=rdd.reduceByKey(_+_)
 775 | //aggregateByKey with map-side combine
 776 | val agg=rdd.aggregateByKey(0)(((i1,i2)=>i1+i2),((i1,i2)=>(i1+i2)))
 777 | //groupByKey without map-side combine
 778 | val gbk=rdd.groupByKey().mapValues(iter=> iter.sum)
 779 |  ```
 780 | 
 781 | ]
 782 | 
 783 | ---
 784 | class:
 785 | name:
 786 | .left-column[
 787 | ## RDD 依赖
 788 | ### 宽依赖 & 窄依赖
 789 | ### Shuffle 
 790 | ### Shuffle & Stage & Task
 791 | ### Task & Loop
 792 | ### Object Lifetime
 793 | ### Shuffle Tuning
 794 | 
 795 | ]
 796 | .right-column[
 797 | #### 数据倾斜
 798 | 数据倾斜是由于存在一些<font color=#A52A2A size=4 >**热点数据**</font>，比如某个key存在大量对应的value，或者某个分区存在大量数据(即存在大量hash之后得到同样hash值的key)。
 799 | 
 800 | ##### 数据倾斜现象
 801 | - 绝大多数task很快结束，存在几个straggler.
 802 | - 原本能够正常执行的Spark作业，执行某个数据集突然报出OOM（内存溢出）异常。
 803 | 
 804 | ##### 数据倾斜原理及影响
 805 | - shuffle Read需要将各个节点上相同的key拉取到某个节点上的一个task来进行处理，比如按照key进行聚合或join等操作。此时如果某个key对应的数据量特别大的话，就会发生数据倾斜。
 806 | 
 807 | - 某个task对应数据量大，可能会导致OOM，以及多次重试。
 808 | 
 809 | - 每个stage的运行时间由最后一个完成的task决定。
 810 | ]
 811 | ---
 812 | class:
 813 | name:
 814 | .left-column[
 815 | ## RDD 依赖
 816 | ### 宽依赖 & 窄依赖
 817 | ### Shuffle 
 818 | ### Shuffle & Stage & Task
 819 | ### Task & Loop
 820 | ### Object Lifetime
 821 | ### Shuffle Tuning
 822 | 
 823 | ]
 824 | .right-column[
 825 | #### 解决数据倾斜
 826 | 
 827 | - 提高shuffle操作的并行度-加大shuffle操作时partition数量
 828 | 
 829 |   reduceByKey(func, [numPartitions])
 830 | 
 831 |   aggregateByKey(zeroValue)(seqOp, combOp, [numPartitions])  
 832 | 
 833 |   配置spark.default.parallelism 
 834 | 
 835 |   优点： 实现简单，可以有效缓解数据倾斜。
 836 | 
 837 |   缺点： 针对热点数据比如一个key对应巨量数据的情况无法解决。
 838 | 
 839 | - 分阶段聚合
 840 | 
 841 |   前面提到的map-side combine
 842 | 
 843 |   优点： 对于聚合类的shuffle操作导致的数据倾斜，非常有效。
 844 | 
 845 |   缺点： 仅仅适用于聚合类的shuffle操作，适用范围相对较窄。无法处理join类shuffle数据倾斜。
 846 | 
 847 | ]
 848 | 
 849 | 
 850 | ---
 851 | class:
 852 | name:
 853 | .left-column[
 854 | ## RDD 依赖
 855 | ### 宽依赖 & 窄依赖
 856 | ### Shuffle 
 857 | ### Shuffle & Stage & Task
 858 | ### Task & Loop
 859 | ### Object Lifetime
 860 | ### Shuffle Tuning
 861 | 
 862 | ]
 863 | .right-column[
 864 | 
 865 |  - 给key加随机前缀，缓解热点数据
 866 | 
 867 |   通过spark提供的takeSample算子可以对RDD进行采样。通过观察看是否存在数据倾斜/热点数据.
 868 | 
 869 |   ```
 870 |   takeSample(withReplacement, num, [seed])  
 871 |  ```
 872 | 
 873 |   进行join操作，比如存在热点key "hello",对应大量的value,此时我们给每个key加上(1-10)之间的随机前缀，这些数据就会随机变成(1_hello,v1),(2_hello,v2) ... (10_hello,v10)，这样就缓解了热点的key。
 874 | 
 875 | 
 876 | 
 877 | 
 878 | - 混合使用多种调优策略
 879 | 
 880 | 
 881 | ]
 882 | 
 883 | <!-- ---
 884 | class:
 885 | name:
 886 | .left-column[
 887 | ## RDD 依赖
 888 | ### 宽依赖 & 窄依赖
 889 | ### Shuffle 
 890 | ### Stage & Task
 891 | ### Shuffle Tuning
 892 | 
 893 | ]
 894 | .right-column[
 895 | 
 896 | - Why broadcast?
 897 |   
 898 |   Spark存在作用域，变量声明在driver上，当executor需要使用driver上的变量，会由driver拷贝一个副本发送到executor。
 899 | 
 900 |   broadcast是保证这个由driver声明的变量值只会发送到executor上面一份。
 901 | 
 902 |   如果不使用broadcast,每条数据都要调用map里面的function,每调用一次就会由driver拷贝一份副本发送到executor，造成大量网络传输。
 903 | 
 904 | ] -->
 905 | 
 906 | 
 907 | <!-- ---
 908 | class: 
 909 | name:
 910 | 
 911 | 
 912 | .left-column[
 913 | ## RDD与Task
 914 | ### 分区与Task
 915 | 
 916 | ]
 917 | .right-column[
 918 | 
 919 | - 每个stage里面，有若干个 相互独立的Task。
 920 | 
 921 | - Task数目等于被操作RDD的分区数。
 922 | 
 923 | - 每个Task分别对RDD的一个分区进行一系列操作。
 924 | 
 925 | - 在一个executor中，并行的task数目和executor的核数有关。
 926 | 
 927 | ] -->
 928 | 
 929 | 
 930 | 
 931 | <!-- ---
 932 | class: 
 933 | name:
 934 | 
 935 | 
 936 | .left-column[
 937 | ## RDD与Task
 938 | ### 分区与Task
 939 | 
 940 | ]
 941 | .right-column[
 942 | 
 943 | - 每个stage里面，有若干个 相互独立的Task。
 944 | 
 945 | - Task数目等于被操作RDD的分区数。
 946 | 
 947 | - 每个Task分别对RDD的一个分区进行一系列操作。
 948 | 
 949 | - 在一个executor中，并行的task数目和executor的核数有关。
 950 | 
 951 | ] -->
 952 | 
 953 | 
 954 | <!-- ---
 955 | class: 
 956 | name:
 957 | 
 958 | 
 959 | .left-column[
 960 | ## RDD与Task
 961 | ### 分区与Task
 962 | ### 内存模型
 963 | 
 964 | ]
 965 | .right-column[
 966 | <center class="half">
 967 |     <img src="../../imgs/spark_core/memory.png" width="30%" height="50%">
 968 | </center>
 969 | 
 970 | - 当RDD分区里数据量很大时，每个task占用的执行内存比较大，容易造成内存紧张。
 971 | - 当executor内存压力大，可以增大分区数量(减少分区数据量)或者减少executor cpu 核数（减小并行处理task的数量)。
 972 | 
 973 | 
 974 | 
 975 | ] -->
 976 | 
 977 | ---
 978 | class:
 979 | name:
 980 | .left-column[
 981 | ## RDD 依赖
 982 | ### 宽依赖 & 窄依赖
 983 | ### Shuffle 
 984 | ### Shuffle & Stage & Task
 985 | ### Task & Loop
 986 | ### Object Lifetime
 987 | ### Shuffle Tuning
 988 | 
 989 | ]
 990 | .right-column[
 991 | 
 992 |  #### 参数调优
 993 | 
 994 | - spark.shuffle.file.buffer
 995 | 
 996 | 	default: 32k
 997 | 
 998 | 	shuffle write端写磁盘文件时缓冲区大小，适量增大可以减少磁盘I/O次数，进而提升性能。
 999 | 
1000 | - spark.reducer.maxSizeInFlight
1001 | 
1002 | 	default: 48M
1003 | 
1004 | 	shuffle read端拉取对应分区数据缓冲区大小，适量增大可以减少网络传输次数，进而提升性能。
1005 | 
1006 | - spark.shuffle.io.maxRetries
1007 | 
1008 | 	default: 3
1009 | 
1010 | 	shuffle read端拉取对应数据时，因为网络异常拉取失败重新尝试的最大次数。针对超大数据量的应用，可以增大重试次数，大幅度提升稳定性。
1011 | 
1012 | 
1013 | 
1014 | ]
1015 | 
1016 | 
1017 | ---
1018 | class: 
1019 | name:
1020 | 
1021 | 
1022 | .left-column[
1023 | ## RDD Persist
1024 | 
1025 | 
1026 | ]
1027 | .right-column[
1028 | - 机器学习，图计算等应用存在大量迭代计算。
1029 | 
1030 | - 适当的缓存中间数据可以避免重复计算。
1031 | 
1032 |   persist(storageLevel)
1033 | 
1034 |   cache()=persist(StorageLevel.MEMORY_ONLY)
1035 | 
1036 | - 缓存级别
1037 | 
1038 | |                     |  |
1039 | | :-----------------: | :----:|
1040 | |     MEMORY_ONLY     |    只缓存在内存中|
1041 | |   MEMORY_AND_DISK   |  缓存在内存和磁盘|
1042 | |   MEMORY_ONLY_SER   |  序列化缓存在内存中|
1043 | | MEMORY_AND_DISK_SER |   序列化缓存在内存和磁盘|
1044 | |      DISK_ONLY      |    只缓存在磁盘|
1045 | |    MEMORY_ONLY_2    |    缓存在内存中两份(一份副本)|
1046 | |  MEMORY_AND_DISK_2  |  缓存在内存和磁盘两份(一份副本)|
1047 | |      OFF_HEAP       |  缓存在堆外 |
1048 | 
1049 | 
1050 | 
1051 | ]
1052 | ---
1053 | class: 
1054 | name:
1055 | 
1056 | 
1057 | .left-column[
1058 | ## 相关链接
1059 | 
1060 | 
1061 | ]
1062 | .right-column[
1063 | 
1064 | - RDD论文
1065 | 
1066 |   https://www.usenix.org/system/files/conference/nsdi12/nsdi12-final138.pdf
1067 | 
1068 | - RDD算子介绍
1069 | 
1070 |   http://spark.apache.org/docs/latest/rdd-programming-guide.html
1071 | 
1072 | - spark 配置
1073 | 
1074 |   http://spark.apache.org/docs/latest/configuration.html
1075 | 
1076 | - 性能调优
1077 | 
1078 |   http://spark.apache.org/docs/latest/tuning.html
1079 | 
1080 | 
1081 | 
1082 | ]
1083 | ---
1084 | 
1085 | class: middle, center, inverse
1086 | name: greetings
1087 | # Q & A
1088 | ---
1089 | class: center, middle, inverse
1090 | 
1091 | # Thanks！
1092 | 
1093 | <img style="zoom: 1.0" src="../../imgs/mammut.png"  align="bottom" />
1094 | 
1095 | <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc/4.0/">Creative Commons Attribution-NonCommercial 4.0 International License</a>.
1096 | 
1097 |     </textarea>
1098 |     <script src="https://remarkjs.com/downloads/remark-latest.min.js">
1099 |     </script>
1100 |     <script>
1101 |       var slideshow = remark.create({
1102 |         ratio: '16:9',
1103 |         slideNumberFormat: 'Slide %current% of %total%',
1104 |         // .. or by using a format function
1105 |         slideNumberFormat: function (current, total) {
1106 |           return ' ' + current + ' of ' + total;
1107 |         },
1108 |         highlightLanguage: 'scala',
1109 |         highlightStyle: 'monokai',
1110 |         highlightLines: true,
1111 |         // arta, ascetic, dark, default, far, github, googlecode, idea, ir-black, magula, monokai, rainbow, solarized-dark, solarized-light, sunburst, tomorrow, tomorrow-night-blue, tomorrow-night-bright, tomorrow-night, tomorrow-night-eighties, vs, zenburn
1112 |         highlightStyle: 'zenburn'
1113 |       });
1114 |     </script>
1115 |   </body>
1116 | </html>
1117 | 


--------------------------------------------------------------------------------