├── README.md
├── pom.xml
└── src
    └── main
        └── scala
            ├── META-INF
            │   └── MANIFEST.MF
            └── info
                └── xiaohei
                    └── www
                        └── spark
                            └── examples
                                ├── decisiontree
                                │   └── RunDecisionTree.scala
                                └── kmeans
                                    └── RunKMeans.scala

/README.md:
--------------------------------------------------------------------------------
## Spark Data Analysis Demo

Data analysis examples built with Spark MLlib.

Completed so far:

> 1. [Predicting forest cover type with a decision tree](http://www.xiaohei.info/2016/05/06/spark-decisiontree-predict/)
> 2. [Detecting network intrusions with K-means](http://www.xiaohei.info/2016/05/09/spark-kmeans-network/)

Sample data downloads:

> 1. [Predicting forest cover type with a decision tree](https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/)
> 2. [Detecting network intrusions with K-means](http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html)

Reference book: [Advanced Analytics with Spark](https://book.douban.com/subject/26647951/)

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>info.xiaohei.www</groupId>
    <artifactId>bigdata-spark</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- The original tag names were stripped during extraction; spark.version and kafka.version
             are confirmed by the references below, the other two property names are assumed -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <spark.version>1.4.1</spark.version>
        <kafka.version>0.9.0.1</kafka.version>
        <hadoop.version>2.7.2</hadoop.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>${kafka.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
    </dependencies>
</project>

--------------------------------------------------------------------------------
/src/main/scala/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
Manifest-Version: 1.0
Main-Class: info.xiaohei.www.spark.examples.decisiontree.RunDecisionTree

--------------------------------------------------------------------------------
/src/main/scala/info/xiaohei/www/spark/examples/decisiontree/RunDecisionTree.scala:
--------------------------------------------------------------------------------
package info.xiaohei.www.spark.examples.decisiontree

import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.{RandomForest, DecisionTree}
import org.apache.spark.mllib.tree.model.{RandomForestModel, DecisionTreeModel}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkConf}

/**
 * Copyright © 2016 xiaohei, All Rights Reserved.
 * Email : chubbyjiang@gmail.com
 * Host : xiaohei.info
 * Created : 16/5/6 11:25
 */
object RunDecisionTree {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("DecisionTree")
    val sc = new SparkContext(conf)
    // Read the raw data
    val rawData = sc.textFile("/spark_data/covtype.data")
    // Convert each line into a LabeledPoint
    val data = rawData.map { line =>
      val values = line.split(",").map(_.toDouble)
      // init returns every element except the last one; these form the feature vector
      val feature = Vectors.dense(values.init)
      // The last element is the target. DecisionTree expects class labels starting at 0,
      // while the data starts at 1, so subtract 1
      val label = values.last - 1
      LabeledPoint(label, feature)
    }

    val Array(trainData, cvData, testData) = data.randomSplit(Array(0.8, 0.1, 0.1))
    trainData.cache()
    cvData.cache()
    testData.cache()

    // First decision tree model
    //val model = DecisionTree.trainClassifier(trainData, 7, Map[Int, Int](), "gini", 4, 100)
    // Decision tree trained with the best parameter combination
    //val model = DecisionTree.trainClassifier(trainData, 7, Map[Int, Int](), "entropy", 20, 300)
    // Build a random forest
    val model = RandomForest.trainClassifier(trainData, 7, Map(10 -> 4, 11 -> 40), 20, "auto", "entropy", 30, 300)
    val metrics = getMetrics(model, cvData)

    // Confusion matrix and overall precision of the model
    System.out.println(metrics.confusionMatrix)
    System.out.println(metrics.precision)

    // Precision and recall for each class
    (0 until 7).map(target => (metrics.precision(target), metrics.recall(target))).foreach(println)
  }

  /**
   * Compute evaluation metrics
   *
   * @param model the decision tree model
   * @param data  the cross-validation data set
   **/
  def getMetrics(model: DecisionTreeModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    // Predict each sample's feature vector with the model and pair the prediction with the true label
    val predictionsAndLabels = data.map { d =>
      (model.predict(d.features), d.label)
    }
    // MulticlassMetrics measures the quality of the classifier's predictions in several ways
    new MulticlassMetrics(predictionsAndLabels)
  }

  /**
   * @param model the random forest model
   * @param data  the cross-validation data set
   **/
  def getMetrics(model: RandomForestModel, data: RDD[LabeledPoint]): MulticlassMetrics = {
    // Predict each sample's feature vector with the model and pair the prediction with the true label
    val predictionsAndLabels = data.map { d =>
      (model.predict(d.features), d.label)
    }
    // MulticlassMetrics measures the quality of the classifier's predictions in several ways
    new MulticlassMetrics(predictionsAndLabels)
  }

  /**
   * Find the best parameter combination on the training data set
   *
   * @param trainData the training data set
   * @param cvData    the cross-validation data set
   **/
  def getBestParam(trainData: RDD[LabeledPoint], cvData: RDD[LabeledPoint]): Unit = {
    val evaluations = for (impurity <- Array("gini", "entropy");
                           depth <- Array(1, 20);
                           bins <- Array(10, 300)) yield {
      val model = DecisionTree.trainClassifier(trainData, 7, Map[Int, Int](), impurity, depth, bins)
      val metrics = getMetrics(model, cvData)
      ((impurity, depth, bins), metrics.precision)
    }
    evaluations.sortBy(_._2).reverse.foreach(println)
  }
}
--------------------------------------------------------------------------------
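Note: `main` above caches `testData` but never reads it again. As a hedged sketch (not part of the original source; the helper name `evaluateOnTest` is illustrative), a method like the following could be added to `RunDecisionTree` to report held-out precision with the existing `getMetrics` overload:

  /**
   * Hypothetical helper: report precision of the final random forest on the
   * held-out test split, i.e. data never used for training or parameter tuning.
   **/
  def evaluateOnTest(model: RandomForestModel, testData: RDD[LabeledPoint]): Unit = {
    val metrics = getMetrics(model, testData)
    // Overall precision on unseen data
    println(metrics.precision)
    // Per-class precision and recall, one line per cover type
    (0 until 7).map(t => (metrics.precision(t), metrics.recall(t))).foreach(println)
  }
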
/src/main/scala/info/xiaohei/www/spark/examples/kmeans/RunKMeans.scala:
--------------------------------------------------------------------------------
package info.xiaohei.www.spark.examples.kmeans

import org.apache.spark.mllib.clustering.{KMeansModel, KMeans}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkConf}

/**
 * Copyright © 2016 xiaohei, All Rights Reserved.
 * Email : chubbyjiang@gmail.com
 * Host : xiaohei.info
 * Created : 16/5/9 09:52
 */
object RunKMeans {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("KMeans")
    val sc = new SparkContext(conf)
    // Read the raw data
    val rawData = sc.textFile("/spark_data/ch05/kddcup.data")
    // Print per-category statistics: how many records each category contains
    //catStatsData(rawData)

    val labelsAndData = rawData.map { line =>
      // buffer is a mutable list
      val buffer = line.split(",").toBuffer
      // Drop the three categorical columns at indices 1 to 3
      buffer.remove(1, 3)
      // The last element is the label
      val label = buffer.remove(buffer.length - 1)
      // Convert the remaining values to a Vector
      val vector = Vectors.dense(buffer.map(_.toDouble).toArray)
      (label, vector)
    }
    // Only the values part is used for clustering
    val data = labelsAndData.values.cache()
    // First training run of the model
    //firstKMeans(data, labelsAndData)

    // Try different values of k to compare model quality
    /*(50 to 130 by 10).map { k =>
      (k, clusteringScore(data, k))
    }.foreach(println)*/

    // Train the model with the best k value
    val kmeans = new KMeans()
    kmeans.setK(130)
    kmeans.setEpsilon(1.0e-6)
    kmeans.setRuns(10)
    val model = kmeans.run(data)
    catLabelCount(labelsAndData, model)
  }

  /**
   * Print statistics about the sample data
   **/
  def catStatsData(rawData: RDD[String]): Unit = {
    // Split on "," and keep only the last field, the category
    val catStatsData = rawData.map(_.split(",").last)
      // Count each category and sort by count in ascending order
      .countByValue().toSeq.sortBy(_._2)
      // Reverse to get descending order
      .reverse
    catStatsData.foreach(println)
  }

  /**
   * Train the model for the first time
   **/
  def firstKMeans(data: RDD[Vector]
                  , labelsAndData: RDD[(String, Vector)]): Unit = {
    // Train the model
    val kmeans = new KMeans()
    val model = kmeans.run(data)
    // Print each cluster centre
    model.clusterCenters.foreach(println)
    catLabelCount(labelsAndData, model)
  }

  /**
   * Print the categories contained in each cluster and their counts
   *
   * @param labelsAndData the data together with its category labels
   * @param model         the KMeans model
   **/
  def catLabelCount(labelsAndData: RDD[(String, Vector)], model: KMeansModel): Unit = {
    // For each cluster, count how many records of each category it contains
    val clusterLabelCount = labelsAndData.map { case (label, datum) =>
      // Assign the sample to a cluster
      val cluster = model.predict(datum)
      // Return a (cluster, label) pair
      (cluster, label)
    }.countByValue()
    // Sort, then print in a formatted way
    clusterLabelCount.toSeq.sorted.foreach { case ((cluster, label), count) =>
      println(f"$cluster\t$label\t$count")
    }
  }

  /**
   * Compute the distance between two vectors
   *
   * @param a vector 1
   * @param b vector 2
   * Euclidean distance: the distance between two points in space, i.e. the square root
   * of the sum of the squared differences of the corresponding elements
   **/
  def distance(a: Vector, b: Vector) = {
    // Take the square root
    math.sqrt(
      // Zip the two vectors together
      a.toArray.zip(b.toArray)
        // Subtract the corresponding elements
        .map(d => d._1 - d._2)
        // Square each difference
        .map(d => d * d)
        // Sum the squares
        .sum)
  }

  /**
   * Compute the distance from a data point to the centroid of its cluster
   *
   * @param datum the data point
   * @param model the KMeans model
   **/
  def distToCentroid(datum: Vector, model: KMeansModel) = {
    // Find the cluster the data point belongs to
    val cluster = model.predict(datum)
    // Get the centroid of that cluster
    val centroid = model.clusterCenters(cluster)
    // Compute the distance
    distance(centroid, datum)
  }

  /**
   * Judge model quality by the average distance of the data points to the centroids of their clusters
   *
   * @param data the sample data
   * @param k    the value of k
   **/
  def clusteringScore(data: RDD[Vector], k: Int) = {
    val kmeans = new KMeans()
    // Set k
    kmeans.setK(k)
    // Number of runs for this value of k
    kmeans.setRuns(10)
    // Minimum centroid movement between iterations; the default is 1.0e-4
    kmeans.setEpsilon(1.0e-6)
    val model = kmeans.run(data)
    // Average distance from each sample to its own centroid
    data.map { datum =>
      distToCentroid(datum, model)
    }.mean()
  }
}
--------------------------------------------------------------------------------
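The K-means driver stops at printing the per-cluster label counts. As a hedged sketch of the anomaly-detection step the README alludes to (not part of the original source; the name `flagAnomalies` and the cutoff at the 100 largest distances are illustrative choices), the existing `distToCentroid` helper could be reused to flag the connections that sit farthest from their assigned centroid:

  /**
   * Hypothetical extension of RunKMeans: treat the records farthest from their
   * assigned centroid as candidate anomalies.
   **/
  def flagAnomalies(data: RDD[Vector], model: KMeansModel): RDD[Vector] = {
    // Illustrative cutoff: the 100th-largest distance to a centroid
    val threshold = data.map(datum => distToCentroid(datum, model)).top(100).last
    // Keep only the records at or beyond the cutoff
    data.filter(datum => distToCentroid(datum, model) >= threshold)
  }

In the reference book, this distance-to-centroid score is what turns the clustering into an anomaly detector: points far from every centroid are the ones worth inspecting.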