├── .gitignore ├── sparkML ├── src │ └── main │ │ ├── factory │ │ ├── Algorithm.scala │ │ ├── AlgTrait.scala │ │ └── AlgorithmFactory.scala │ │ ├── optimizer │ │ ├── optimizer.scala │ │ └── FTRLProximal.scala │ │ ├── recommender │ │ ├── Recommender.scala │ │ ├── ALSRec.scala │ │ └── SlopOneRec.scala │ │ ├── app.scala │ │ ├── input │ │ ├── DataHolder.scala │ │ ├── recommend │ │ │ ├── RecDataHolder.scala │ │ │ ├── YahooDataHolder.scala │ │ │ └── NetflixDataHolder.scala │ │ ├── LRDataHolder.scala │ │ └── DataFactory.scala │ │ ├── Classifier │ │ ├── RegressionModel.scala │ │ └── LRWithFTRL.scala │ │ ├── util │ │ ├── SparkEnv.scala │ │ ├── Conf.scala │ │ └── MainHolder.scala │ │ └── linalg │ │ └── algUtil.scala └── sparkML.iml ├── README.md ├── LICENSE └── bash └── splitDataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | */lib/ 2 | */out/ 3 | */META-INF/ 4 | */scala-train/ 5 | */mlTrains/ 6 | */.idea/ 7 | -------------------------------------------------------------------------------- /sparkML/src/main/factory/Algorithm.scala: -------------------------------------------------------------------------------- 1 | package main.factory 2 | 3 | /** 4 | * Created by zhy on 2015/8/2 0002. 5 | */ 6 | trait Algorithm extends RMSE with Serializable -------------------------------------------------------------------------------- /sparkML/src/main/optimizer/optimizer.scala: -------------------------------------------------------------------------------- 1 | package main.optimizer 2 | 3 | /** 4 | * Created by zhy on 2015/8/1 0001. 5 | */ 6 | 7 | trait Optimizer extends Serializable 8 | -------------------------------------------------------------------------------- /sparkML/src/main/recommender/Recommender.scala: -------------------------------------------------------------------------------- 1 | package main.recommender 2 | 3 | import main.factory.Algorithm 4 | 5 | /** 6 | * Created by zhy on 2015/7/19 0019. 7 | */ 8 | 9 | class Recommender extends Algorithm -------------------------------------------------------------------------------- /sparkML/src/main/app.scala: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import main.util.{Conf, MainHolder} 4 | 5 | /** 6 | * Created by zhy on 2015/7/19 0019. 7 | */ 8 | object app extends App { 9 | 10 | override def main(args: Array[String]) { 11 | val opt = new Conf(args) 12 | 13 | MainHolder.setUp(opt) 14 | 15 | MainHolder.calculateRMSE 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /sparkML/src/main/input/DataHolder.scala: -------------------------------------------------------------------------------- 1 | package main.input 2 | 3 | import org.apache.spark.mllib.recommendation.Rating 4 | import org.apache.spark.mllib.regression.LabeledPoint 5 | import org.apache.spark.rdd.RDD 6 | 7 | /** 8 | * Created by zhy on 2015/8/3 0003. 
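 * Common contract for every data source: recommender holders return an RDD[Rating] from getData,
 * classification holders return an RDD[LabeledPoint] from getLRData, and getDataDesc prints a short
 * summary of the loaded data; the method a concrete holder does not support is left unimplemented (???).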
9 | */ 10 | trait DataHolder extends Serializable { 11 | def getLRData(): RDD[LabeledPoint] 12 | 13 | def getData(): RDD[Rating] 14 | 15 | def getDataDesc: Unit 16 | } 17 | -------------------------------------------------------------------------------- /sparkML/src/main/Classifier/RegressionModel.scala: -------------------------------------------------------------------------------- 1 | package main.classifier 2 | 3 | import main.factory.Algorithm 4 | import main.optimizer.Optimizer 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | import org.apache.spark.rdd.RDD 7 | 8 | /** 9 | * Created by zhy on 2015/8/2 0002. 10 | */ 11 | 12 | /** 13 | * Regression model 14 | */ 15 | trait RegressionModel extends Algorithm with Serializable { 16 | 17 | //Optimization algorithm 18 | def optimizer: Optimizer 19 | 20 | //Training and prediction 21 | def train(trainData: RDD[LabeledPoint]): Unit 22 | } 23 | -------------------------------------------------------------------------------- /sparkML/src/main/util/SparkEnv.scala: -------------------------------------------------------------------------------- 1 | package main.util 2 | 3 | /** 4 | * Created by zhy on 2015/7/18 0018. 5 | */ 6 | 7 | import org.apache.log4j.{Level, Logger} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | 10 | /** 11 | * Initializes the shared SparkContext 12 | */ 13 | object SparkEnv { 14 | 15 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN) 16 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF) 17 | 18 | val conf = new SparkConf().setAppName("MachineLearningInSpark").setMaster("local[2]") 19 | val sc = new SparkContext(conf) 20 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Machine Learning In Spark 2 | 3 | A scalable machine learning system written in [Scala](http://www.scala-lang.org/) on top of the [Apache Spark framework](https://spark.apache.org/). The framework design is adapted from [OndraFiedler/spark-recommender](https://github.com/OndraFiedler/spark-recommender). 4 | 5 | ## Features 6 | 7 | An ML framework for implementing and developing ML algorithms that run on Spark. 8 | 9 | ### Implemented Algorithms 10 | 11 | - 1.ALS from Spark MLlib 12 | - 2.Slop-One 13 | - 3.Logistic Regression with FTRL-Proximal 14 | 15 | ### Datasets 16 | 17 | - 1.NetFlix 18 | - 2.Yahoo 19 | - 3.Logistic Regression Dataset -------------------------------------------------------------------------------- /sparkML/src/main/linalg/algUtil.scala: -------------------------------------------------------------------------------- 1 | package main.linalg 2 | 3 | import breeze.linalg.{SparseVector => BSV, Vector => BV} 4 | import org.apache.spark.mllib.linalg.{SparseVector, Vector} 5 | 6 | /** 7 | * Created by zhy on 2015/8/2 0002.
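 * Conversion helpers from MLlib vectors to Breeze sparse vectors, so that weights kept as
 * breeze.linalg.SparseVector (e.g. in LRWithFTRL) can be combined with MLlib feature vectors.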
8 | */ 9 | object AlgUtil { 10 | /** 11 | * 向量->Breeze向量 12 | * @param v Vector 13 | * @return Breeze Vector 14 | */ 15 | def VtoB(v: Vector): BV[Double] = 16 | new BSV[Double](v.toSparse.indices, v.toSparse.values, v.toSparse.size) 17 | 18 | /** 19 | * 稀疏向量->Breeze向量 20 | * @param v SparseVector 21 | * @return Breeze Vector 22 | */ 23 | def StoB(v: SparseVector): BV[Double] = new BSV[Double](v.indices, v.values, v.size) 24 | } 25 | -------------------------------------------------------------------------------- /sparkML/sparkML.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /sparkML/src/main/input/recommend/RecDataHolder.scala: -------------------------------------------------------------------------------- 1 | package main.input.recommend 2 | 3 | import main.input.DataHolder 4 | import org.apache.spark.mllib.recommendation.Rating 5 | import org.apache.spark.rdd.RDD 6 | 7 | /** 8 | * Created by zhy on 2015/7/18 0018. 9 | */ 10 | 11 | /** 12 | * 推荐算法数据接口,可获取相应的Rating和ID2Name映射 13 | */ 14 | 15 | trait RecDataHolder extends DataHolder with Serializable { 16 | protected val ratings: RDD[Rating] 17 | protected val productsIDsToNameMap: Map[Int, String] 18 | 19 | override def getLRData = ??? 20 | 21 | override def getData = getRatings 22 | 23 | override def getDataDesc = printRatingDesc 24 | 25 | def getRatings(): RDD[Rating] = ratings 26 | 27 | def printRatingDesc = println("数据集包含 " + ratings.count + " 条数据,来自 " 28 | + ratings.map(_.user).distinct.count + " 个用户和 " + ratings.map(_.product).distinct.count + "件产品") 29 | 30 | def getIDToProductnameMap(): Map[Int, String] = productsIDsToNameMap 31 | 32 | def getNumOfProducts(): Int = productsIDsToNameMap.keys.max + 1 33 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Ondra Fiedler 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /sparkML/src/main/util/Conf.scala: -------------------------------------------------------------------------------- 1 | package main.util 2 | 3 | import main.factory.AlgorithmFactory 4 | import main.input.DataFactory 5 | import org.rogach.scallop.ScallopConf 6 | 7 | /** 8 | * Created by zhy on 2015/7/19 0019. 9 | */ 10 | 11 | /** 12 | * 命令行参数解析类 13 | * @param arguments 命令行参数 14 | */ 15 | class Conf(arguments: Seq[String]) extends ScallopConf(arguments) { 16 | 17 | val datasetTypes = DataFactory.dataHolderList 18 | val algorithms = AlgorithmFactory.AlgList 19 | 20 | banner( """ 21 | Spark机器学习算法 22 | ---------------- 23 | 基于Spark的机器学习算法库 24 | 25 | 示例: 26 | spark-submit [Jar] --data Yahoo --dir /zhy/data/Yahoo/ --method ALS 27 | 28 | 参数: 29 | """) 30 | 31 | version("version 1.5.0") 32 | 33 | val data = opt[String](required = true, validate = { str => datasetTypes.map(_.getName).contains(str) }, descr = { 34 | "数据集类型。可选类型: " + datasetTypes.map(_.getName).reduce(_ + ", " + _) 35 | }) 36 | 37 | val dir = opt[String](required = true, descr = "数据集根目录") 38 | 39 | val method = opt[String](required = true, validate = { str => algorithms.map(_.getName).contains(str) }, descr = { 40 | "推荐算法。可选类型: " + algorithms.map(_.getName).reduce(_ + ", " + _) 41 | }) 42 | 43 | } 44 | -------------------------------------------------------------------------------- /sparkML/src/main/factory/AlgTrait.scala: -------------------------------------------------------------------------------- 1 | package main.factory 2 | 3 | import main.util.{MainHolder, SparkEnv} 4 | 5 | /** 6 | * Created by zhy on 2015/8/2 0002. 7 | */ 8 | 9 | /** 10 | * 推荐算法输入数据 11 | */ 12 | trait InputRecData extends Serializable { 13 | protected val sc = SparkEnv.sc 14 | protected val ratings = MainHolder.getDataHolder().getData 15 | MainHolder.getDataHolder().getDataDesc 16 | 17 | //分割数据集为训练集、验证集、测试集 18 | protected val RDD = ratings.randomSplit(Array(0.7, 0.2, 0.1)) 19 | protected val trainData = RDD(0).persist 20 | protected val validateData = RDD(1).persist 21 | protected val testData = RDD(2).persist 22 | protected val numValidation = validateData.count 23 | protected val numTest = testData.count 24 | } 25 | 26 | /** 27 | * LR输入数据 28 | */ 29 | trait InputLRData extends Serializable { 30 | protected val sc = SparkEnv.sc 31 | protected val data = MainHolder.getDataHolder().getLRData 32 | MainHolder.getDataHolder().getDataDesc 33 | 34 | protected val RDD = data.randomSplit(Array(0.8, 0.2)) 35 | protected val trainData = RDD(0).persist 36 | protected val testData = RDD(1).persist 37 | 38 | } 39 | 40 | /** 41 | * 算法度量方式 42 | */ 43 | trait RMSE extends Serializable { 44 | protected var RMSE: Double = Double.MaxValue 45 | 46 | /** 47 | * @return 算法对于指定数据集推荐结果的均方根误差(RMSE) 48 | */ 49 | def getRMSE = println("测试集的RMSE为 " + RMSE + "\n----------测试完毕----------") 50 | } 51 | -------------------------------------------------------------------------------- /sparkML/src/main/input/recommend/YahooDataHolder.scala: -------------------------------------------------------------------------------- 1 | package main.input.recommend 2 | 3 | import main.util.SparkEnv 4 | import org.apache.spark.mllib.recommendation.Rating 5 | import org.apache.spark.rdd.RDD 6 | 7 | /** 8 | * Created by zhy on 2015/7/18 0018. 
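 * Reads ratings from <dataDirectoryPath>/data.txt, where each line is "userID itemID rating"
 * separated by (possibly repeated) whitespace; no itemID-to-name mapping is provided for this dataset.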
9 | */ 10 | 11 | /** 12 | * @param dataDirectoryPath Yahoo数据集根目录 13 | */ 14 | class YahooDataHolder(dataDirectoryPath: String) extends RecDataHolder with Serializable { 15 | override protected val ratings: RDD[Rating] = loadRatingsFromAFile() 16 | override protected val productsIDsToNameMap: Map[Int, String] = loadIDsToProductnameMapFromADirectory(dataDirectoryPath) 17 | 18 | /** 19 | * 从文件中读取Yahoo数据集评分 20 | * @return RDD[Rating] 21 | */ 22 | protected def loadRatingsFromAFile(): RDD[Rating] = { 23 | val ratings = SparkEnv.sc.textFile(dataDirectoryPath + "data.txt") 24 | .filter(line => formatSpace(line).split(" ").length >= 3) 25 | .map { line => 26 | val lineFormat = formatSpace(line) 27 | val fields = lineFormat.split(" ") 28 | (Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble)) 29 | } 30 | ratings 31 | } 32 | 33 | /** 34 | * 去除字符串中多于一个连续的空格 35 | * @param line 输入字符串 36 | * @return 去除多余空格后的字符串 37 | */ 38 | protected def formatSpace(line: String): String = { 39 | line.replaceAll("\\s+", " ") 40 | } 41 | 42 | /** 43 | * 44 | * @param dataDirectoryPath Yahoo数据集根目录 45 | * @return Map:musicID -> musicName 46 | */ 47 | protected def loadIDsToProductnameMapFromADirectory(dataDirectoryPath: String): Map[Int, String] = { 48 | null 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /sparkML/src/main/input/LRDataHolder.scala: -------------------------------------------------------------------------------- 1 | package main.input 2 | 3 | import main.util.SparkEnv 4 | import org.apache.spark.mllib.linalg.SparseVector 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | import org.apache.spark.rdd.RDD 7 | 8 | import scala.collection.mutable.ArrayBuffer 9 | 10 | /** 11 | * Created by zhy on 2015/8/3 0003. 12 | */ 13 | 14 | /** 15 | * 逻辑回归数据集 16 | * @param dataDirectoryPath 数据集根目录 17 | */ 18 | class LRDataHolder(dataDirectoryPath: String) extends DataHolder with Serializable { 19 | private val data: RDD[LabeledPoint] = loadDataFromFile 20 | private val dimensions = 1000 21 | 22 | def loadDataFromFile: RDD[LabeledPoint] = { 23 | val feature1 = SparkEnv.sc.textFile(dataDirectoryPath + "Features.txt") 24 | val feature2 = SparkEnv.sc.textFile(dataDirectoryPath + "Info.txt") 25 | val data = SparkEnv.sc.textFile(dataDirectoryPath + "data.txt") 26 | .map { line => 27 | var indices = ArrayBuffer[Int]() 28 | var values = ArrayBuffer[Double]() 29 | val fields = line.split(" ") 30 | val label = fields(0).toDouble 31 | fields.foreach { field => 32 | val featureI = field.split(":") 33 | if (featureI.length == 2) { 34 | indices += featureI(0).toInt 35 | values += featureI(1).toDouble 36 | } 37 | } 38 | new LabeledPoint(label, new SparseVector(dimensions, indices.toArray, values.toArray)) 39 | } 40 | data 41 | } 42 | 43 | override def getLRData = data 44 | 45 | override def getData = ??? 46 | 47 | override def getDataDesc = println("数据集包含" + data.count + "条数据") 48 | } 49 | -------------------------------------------------------------------------------- /sparkML/src/main/util/MainHolder.scala: -------------------------------------------------------------------------------- 1 | package main.util 2 | 3 | import main.factory.{Algorithm, AlgorithmFactory} 4 | import main.input.{DataFactory, DataHolder} 5 | 6 | /** 7 | * Created by zhy on 2015/7/19 0019. 
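 * Resolves the --data and --method command-line options against DataFactory.dataHolderList and
 * AlgorithmFactory.AlgList, then keeps the chosen DataHolder and Algorithm for the rest of the run.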
8 | */ 9 | 10 | /** 11 | * 初始化并获取DataHolder和Recommender 12 | */ 13 | object MainHolder { 14 | private var recommender: Option[Algorithm] = None 15 | private var dataHolder: Option[DataHolder] = None 16 | 17 | /** 18 | * 初始化DataHolder数据源和rcommender算法 19 | * @param conf 配置管理类 20 | */ 21 | def setUp(conf: Conf): Unit = { 22 | val dataHolderNameToFactoryMap = DataFactory.dataHolderList.map(holder => holder.getName -> holder).toMap 23 | val dataHolderStr: String = conf.data() 24 | dataHolder = Some(dataHolderNameToFactoryMap.get(dataHolderStr).get.getInstance(conf)) 25 | 26 | val recommenderNameToFactoryMap = AlgorithmFactory.AlgList.map(rec => rec.getName -> rec).toMap 27 | val recommenderStr: String = conf.method() 28 | recommender = Some(recommenderNameToFactoryMap.get(recommenderStr).get.getAlg(conf)) 29 | } 30 | 31 | /** 32 | * 计算该推荐算法对于测试集的均方根误差RMSE 33 | * @return Unit 34 | */ 35 | def calculateRMSE() = getAlgInstance.getRMSE 36 | 37 | /** 38 | * 39 | * @return 机器学习算法实例 40 | */ 41 | def getAlgInstance(): Algorithm = { 42 | recommender match { 43 | case Some(rec) => rec 44 | case None => throw new MainHolderNotInitializedException 45 | } 46 | } 47 | 48 | /** 49 | * 50 | * @return 数据源实例 51 | */ 52 | def getDataHolder(): DataHolder = { 53 | dataHolder match { 54 | case Some(holder) => holder 55 | case None => throw new MainHolderNotInitializedException 56 | } 57 | } 58 | 59 | class MainHolderNotInitializedException extends Exception 60 | 61 | } 62 | -------------------------------------------------------------------------------- /sparkML/src/main/input/DataFactory.scala: -------------------------------------------------------------------------------- 1 | package main.input 2 | 3 | import main.input.recommend.{NetflixDataHolder4Directory, NetflixDataHolder4OneFile, RecDataHolder, YahooDataHolder} 4 | import main.util.Conf 5 | 6 | /** 7 | * Created by zhy on 2015/7/19 0019. 8 | */ 9 | 10 | /** 11 | * 数据集工厂 12 | */ 13 | trait DataFactory { 14 | def getName: String 15 | 16 | def getDesc: String 17 | 18 | def getInstance(conf: Conf): DataHolder 19 | } 20 | 21 | object DataFactory { 22 | val dataHolderList: List[DataFactory] = List(YahooFac, NetFlix2Fac, NetFlix1Fac, LR) 23 | } 24 | 25 | object YahooFac extends DataFactory { 26 | override def getName: String = "Yahoo" 27 | 28 | override def getDesc: String = "数据源:Yahoo数据集,单个文件\n" + 29 | "数据格式:userID itemID(musicID) rating(0-100)" 30 | 31 | override def getInstance(conf: Conf): RecDataHolder = { 32 | println(getDesc) 33 | new YahooDataHolder(conf.dir()) 34 | } 35 | } 36 | 37 | object NetFlix1Fac extends DataFactory { 38 | override def getName: String = "NetFlixInFile" 39 | 40 | override def getDesc: String = "数据源:NetFlix数据集,单个文件\n数据格式:???" 
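  // Single-file layout as parsed by NetflixDataHolder4OneFile: each line is movieID,userID,rating,date (default file name ratings.txt).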
41 | 42 | override def getInstance(conf: Conf): RecDataHolder = { 43 | println(getDesc) 44 | new NetflixDataHolder4OneFile(conf.dir()) 45 | } 46 | } 47 | 48 | object NetFlix2Fac extends DataFactory { 49 | override def getName: String = "NetFlixInDirectory" 50 | 51 | override def getDesc: String = "数据源:NetFlix数据集,目录\n" + 52 | "数据格式:每个文件第一行为UserID,其余每行:movieID,rating(0-5),time" 53 | 54 | override def getInstance(conf: Conf): RecDataHolder = { 55 | println(getDesc) 56 | new NetflixDataHolder4Directory(conf.dir()) 57 | } 58 | } 59 | 60 | object LR extends DataFactory { 61 | override def getName: String = "LR" 62 | 63 | override def getInstance(conf: Conf): DataHolder = { 64 | println(getDesc) 65 | new LRDataHolder(conf.dir()) 66 | } 67 | 68 | override def getDesc: String = "数据源:逻辑回归数据集,单个文件\n" + 69 | "数据格式:每行 label 特征维度1:特征数据1 ...... 特征维度n:特征数据n" 70 | } -------------------------------------------------------------------------------- /sparkML/src/main/optimizer/FTRLProximal.scala: -------------------------------------------------------------------------------- 1 | package main.optimizer 2 | 3 | import breeze.linalg.SparseVector 4 | import breeze.numerics.abs 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | 7 | /** 8 | * Created by zhy on 2015/8/1 0001. 9 | */ 10 | 11 | /** 12 | * 13 | * @param beta 添加到梯度的协方差矩阵中避免学习速率过高 14 | * @param alpha 初始学习速率 15 | * @param L1 L1正则项权重 16 | * @param L2 L2正则项权重 17 | * @param D 特征向量维度 18 | */ 19 | final class FTRLProximal(val beta: Double = 0.1, val alpha: Double = 0.1, val L1: Double = 0.0, val L2: Double = 0.0, val D: Int = 1000) 20 | extends Optimizer { 21 | 22 | private val N: SparseVector[Double] = SparseVector.zeros(D) 23 | private val Z: SparseVector[Double] = SparseVector.zeros(D) 24 | private var W: SparseVector[Double] = SparseVector.zeros(D) 25 | 26 | def printV = { 27 | println("W向量" + W.toString()) 28 | println("N向量" + N.toString()) 29 | println("Z向量" + Z.toString()) 30 | } 31 | 32 | //迭代函数 33 | def optimize(data: LabeledPoint, initialWeights: SparseVector[Double]): 34 | SparseVector[Double] = { 35 | W = initialWeights 36 | println("optimize函数") 37 | step(data.features.toArray, data.label.toInt) 38 | } 39 | 40 | //迭代过程 41 | //TODO 迭代过程需要优化 42 | def step(feature: Array[Double], label: Int): SparseVector[Double] = { 43 | println("step函数") 44 | var p: Double = 0.0 45 | for (i_double <- feature) { 46 | val i = i_double.toInt 47 | var sign: Int = 0 48 | if (Z(i) < 0) 49 | sign = -1 50 | else 51 | sign = 1 52 | if (abs(Z(i)) <= L1) { 53 | W(i) = 0.0 54 | } else { 55 | W(i) = (sign * L1 - Z(i)) / ((beta + Math.sqrt(N(i))) / alpha + L2) 56 | } 57 | p += W(i) 58 | } 59 | 60 | // predict 61 | p = 1 / (1 + Math.exp(-p)) 62 | 63 | // update 64 | val g: Double = p - label 65 | for (i_double <- feature) { 66 | val i = i_double.toInt 67 | val sigma: Double = (Math.sqrt(N(i) + g * g) - Math.sqrt(N(i))) / alpha 68 | Z(i) += g - sigma * W(i) 69 | N(i) += g * g 70 | } 71 | W 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /sparkML/src/main/factory/AlgorithmFactory.scala: -------------------------------------------------------------------------------- 1 | package main.factory 2 | 3 | import main.classifier.LRWithFTRL 4 | import main.recommender.{ALSRec, Recommender, SlopOneRec} 5 | import main.util.Conf 6 | 7 | /** 8 | * Created by zhy on 2015/7/19 0019. 
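 * Maps the --method option to a concrete Algorithm: each factory object below supplies the
 * command-line name, a description of the algorithm and its parameters, and a configured instance.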
9 | */ 10 | 11 | /** 12 | * 机器学习算法工厂 13 | */ 14 | trait AlgorithmFactory { 15 | def getName: String 16 | 17 | def getAlg(conf: Conf): Algorithm 18 | 19 | def getAlgDes(): String 20 | 21 | def getParamDes(): String 22 | 23 | def getDescription(): String = { 24 | getAlgDes() + "\n参数:\n" + getParamDes() 25 | } 26 | } 27 | 28 | object AlgorithmFactory { 29 | val AlgList: List[AlgorithmFactory] = List(ALS, SlopOne, LRWithFTRL) 30 | } 31 | 32 | object ALS extends AlgorithmFactory { 33 | override def getName: String = "ALS" 34 | 35 | override def getAlgDes(): String = "MLLib中ALS算法" 36 | 37 | protected val rankStr = "rank" 38 | protected val lambdaStr = "λ" 39 | protected val iterStr = "numberOfIterations" 40 | 41 | override def getParamDes(): String = rankStr + " = ,特征向量维度\n" + lambdaStr + " = ,正则化参数\n" + iterStr + " = ,迭代次数" 42 | 43 | override def getAlg(conf: Conf): Recommender = { 44 | val ranks = 12 to 15 45 | val lambdas = List(0.01, 0.05) 46 | val iters = 10 to 20 47 | 48 | println(getDescription) 49 | new ALSRec(ranks, lambdas, iters) 50 | } 51 | } 52 | 53 | object SlopOne extends AlgorithmFactory { 54 | override def getName: String = "Slop-One" 55 | 56 | override def getAlg(conf: Conf): Recommender = { 57 | println(getDescription) 58 | new SlopOneRec 59 | } 60 | 61 | override def getParamDes(): String = "无参数" 62 | 63 | override def getAlgDes(): String = "Slop-One算法" 64 | } 65 | 66 | object LRWithFTRL extends AlgorithmFactory { 67 | override def getName: String = "LR-FTRL" 68 | 69 | protected val numFea = "numFeatures" 70 | 71 | override def getParamDes(): String = numFea + "= ,特征向量维度\n" 72 | 73 | override def getAlgDes(): String = "采用FTRL-Proximal优化的Logistic Regression算法" 74 | 75 | override def getAlg(conf: Conf): Algorithm = { 76 | //TODO 根据数据集特征提供维度 77 | val numFeatures = 1000 78 | 79 | println(getDescription) 80 | new LRWithFTRL(numFeatures) 81 | } 82 | } -------------------------------------------------------------------------------- /sparkML/src/main/Classifier/LRWithFTRL.scala: -------------------------------------------------------------------------------- 1 | package main.classifier 2 | 3 | import breeze.linalg.SparseVector 4 | import breeze.numerics.exp 5 | import main.factory.InputLRData 6 | import main.linalg.AlgUtil 7 | import main.optimizer.FTRLProximal 8 | import org.apache.spark.mllib.linalg.Vector 9 | import org.apache.spark.mllib.regression.LabeledPoint 10 | import org.apache.spark.rdd.RDD 11 | 12 | import scala.collection.mutable.ArrayBuffer 13 | 14 | /** 15 | * Created by zhy on 2015/8/2 0002. 
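 * Online logistic regression: the weight vector is updated one example at a time by FTRLProximal,
 * and the test split is scored progressively (each example is predicted first, then trained on);
 * the inherited RMSE field is reused here to store the resulting classification accuracy.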
16 | */ 17 | 18 | /** 19 | * Logistic Regression逻辑回归模型 20 | */ 21 | final class LRWithFTRL(val numFeatures: Int) 22 | extends RegressionModel with InputLRData with Serializable { 23 | 24 | //初始化特征向量 25 | private var weights: SparseVector[Double] = SparseVector.zeros(numFeatures) 26 | 27 | //设定优化算法 28 | override val optimizer = new FTRLProximal(D = numFeatures) 29 | 30 | train(trainData) 31 | predictAccuracy(testData) 32 | 33 | def train(data: LabeledPoint): Unit = { 34 | weights = optimizer.optimize(data, weights) 35 | optimizer.printV 36 | } 37 | 38 | //训练参数 39 | //TODO 训练和测试过程并行化 40 | override def train(trainData: RDD[LabeledPoint]): Unit = { 41 | val localTrainData = trainData.toLocalIterator 42 | localTrainData.foreach(data => train(data)) 43 | } 44 | 45 | /** 46 | * 分类预测准确率 47 | * @param testData 测试数据集合 48 | * @return 准确率 49 | */ 50 | def predictAccuracy(testData: RDD[LabeledPoint]): Unit = { 51 | var predictions = new ArrayBuffer[Tuple2[Double,Double]]() 52 | testData.toLocalIterator.foreach{ data => 53 | val prediction = (data.label, predict(data.features)) 54 | train(data) 55 | predictions += prediction 56 | } 57 | val numData:Int = predictions.toArray.length 58 | val numCorrect:Int = predictions.toArray.filter{data=> 59 | data._1 == data._2 60 | }.length 61 | println("正确预测的数量: " + numCorrect + 62 | "\n所有数量: " + numData ) 63 | RMSE = numCorrect * 1.0 / numData 64 | } 65 | 66 | /** 67 | * 根据假设函数 预测单个样本 68 | * @param testData 测试样本数据 69 | * @return 分类数据: 1 or 0 70 | */ 71 | def predict(testData: Vector): Double = { 72 | val x: Double = weights.dot(AlgUtil.VtoB(testData)) 73 | val prob: Double = sigmod(x) 74 | if (prob > 0.5) return 1.0 75 | else return 0.0 76 | } 77 | 78 | override def getRMSE = 79 | println("使用FTRL-Proximal的逻辑回归在测试集上的预测准确率为" + RMSE + "\n----------测试完毕----------") 80 | 81 | //sigmod函数 82 | private def sigmod(x: Double): Double = 1.0 / (1 + exp(-x)) 83 | } 84 | -------------------------------------------------------------------------------- /sparkML/src/main/recommender/ALSRec.scala: -------------------------------------------------------------------------------- 1 | package main.recommender 2 | 3 | import main.factory.InputRecData 4 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating} 5 | import org.apache.spark.rdd.RDD 6 | 7 | /** 8 | * Created by zhy on 2015/7/19 0019. 
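 * Matrix-factorization recommender built on MLlib ALS. getBestModel currently trains a single model
 * with rank = 12, 20 iterations and lambda = 0.05; the private train(numValidation) grid search over
 * ranks, lambdas and numIters is kept below but is not called.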
9 | */ 10 | 11 | final class ALSRec(ranks: Range, lambdas: List[Double], numIters: Range) 12 | extends Recommender with InputRecData with Serializable { 13 | //训练模型并测试 14 | val model = getBestModel() 15 | test 16 | 17 | 18 | /** 19 | * 20 | * @return 获取参数最佳的模型 21 | */ 22 | private def getBestModel(): Option[MatrixFactorizationModel] = { 23 | Some(ALS.train(trainData,12,20,0.05)) 24 | } 25 | 26 | /** 27 | * 使用测试集进行测试 28 | */ 29 | private def test = { 30 | RMSE = calculateRmse(model.get, testData, numTest) 31 | } 32 | 33 | /** 34 | * 计算rmse均方根误差 35 | * @param model 算法模型 36 | * @param dataset 数据集 37 | * @param n 数据集大小 38 | * @return 该算法模型在该验证数据集上的RMSE 39 | */ 40 | private def calculateRmse(model: MatrixFactorizationModel, dataset: RDD[Rating], n: Long): Double = { 41 | val predictions: RDD[Rating] = model.predict(dataset.map(x => (x.user, x.product))) 42 | val predictionsAndRatings = predictions.map { x => 43 | ((x.user, x.product), x.rating) 44 | }.join(dataset.map { x => 45 | ((x.user, x.product), x.rating) 46 | } 47 | ).values 48 | val tmp_RMSE = math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n) 49 | println("计算得到的RMSE为: " + tmp_RMSE) 50 | tmp_RMSE 51 | } 52 | 53 | /** 54 | * 训练模型 55 | * @param numValidation 验证集大小 56 | * @return 训练完成的模型 57 | */ 58 | private def train(numValidation: Long): Option[MatrixFactorizationModel] = { 59 | RMSE = Double.MaxValue 60 | var bestModel: Option[MatrixFactorizationModel] = None 61 | var bestRank = 0 62 | var bestLambda = -1.0 63 | var bestNumIter = -1 64 | 65 | for (rank <- ranks; lambda <- lambdas; numIter <- numIters) { 66 | val model = ALS.train(trainData, rank, numIter, lambda) 67 | val validataionRmse = calculateRmse(model, validateData, numValidation) 68 | if (validataionRmse < RMSE) { 69 | bestModel = Some(model) 70 | RMSE = validataionRmse 71 | bestRank = rank 72 | bestLambda = lambda 73 | bestNumIter = numIter 74 | } 75 | } 76 | println("模型训练完毕。最优参数为: (rank = " + bestRank + "; numIter = " + 77 | bestNumIter + "; lambda = " + bestLambda + ")") 78 | 79 | bestModel 80 | } 81 | } -------------------------------------------------------------------------------- /bash/splitDataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3.3 2 | # coding: UTF-8 3 | #Author :zhy 4 | 5 | import os,sys,shutil 6 | import random 7 | from subprocess import call 8 | 9 | def chooseFile(): 10 | path = sys.argv[1] 11 | percentage = (float)(sys.argv[2]) 12 | fileList = os.listdir(path) 13 | fileNum = (int)(len(fileList) * percentage) 14 | fileNum = [1,fileNum][fileNum >= 1] 15 | chosenFile = random.sample(fileList,fileNum) 16 | print(chosenFile) 17 | return chosenFile 18 | 19 | def splitDatasetToDirectory(chosenFile): 20 | """将NetFlix数据集目录下的小文件随机选择10%复制到另外一个文件夹""" 21 | path = sys.argv[1] 22 | for file in chosenFile: 23 | if sys.platform.__eq__("win32"): 24 | desPath = "c:/Users/zhy/Documents/study/AD.SE/courseDesign/DatasetInHDFS/NetFlix/little/" 25 | shutil.copy(path + file, desPath) 26 | else: 27 | call(["cp", path + file, path + "../little/training_set/"]) 28 | 29 | def splitDatasetToHDFS(chosenFile): 30 | """将NetFlix数据集目录下的小文件随机选择10%上传至HDFS中以测试使用""" 31 | deleteOldData() 32 | path = sys.argv[1] 33 | for file in chosenFile: 34 | call(["hdfs","dfs","-put",path + file,"/zhy/data/NetFlix/little/training_set/"]) 35 | 36 | def deleteOldData(): 37 | """删除HDFS中原有的数据集""" 38 | call(["hdfs","dfs","-rm","-R","/zhy/data/NetFlix/little/training_set/"]) 39 | 
call(["hdfs","dfs","-mkdir","/zhy/data/NetFlix/little/training_set/"]) 40 | 41 | def inputParm(): 42 | if len(sys.argv) < 3: 43 | print("命令格式:./splitDataset.py [本地数据集路径] [数据集选择比例]") 44 | sys.exit(1) 45 | trigger = True 46 | while trigger: 47 | try: 48 | trigger = False 49 | print("1 -> 抽取 “" + sys.argv[1] + 50 | "” 目录下10%的文件并复制到“../little/training_set/”目录下\n") 51 | print("2 -> 抽取 “" + sys.argv[1] + 52 | "” 目录下10%的文件并上传到HDFS中“zhy/data/NetFlix/little/training_set/”目录下\n") 53 | print("3 -> 运行Spark程序\n") 54 | print("4 -> 同时执行'2'+'3'的操作\n") 55 | param = int(input('请输入选择的操作 ... \n')) 56 | return param 57 | 58 | except ValueError: 59 | trigger = True 60 | print("输入不合法,请输入一个数字 ... ") 61 | 62 | def execute(): 63 | DataSet = input('请输入数据集选项:\n') 64 | Dir = input('请输入数据集位置:\n') 65 | Alg = input('请输入算法类型:\n') 66 | call(["spark-submit","/home/zhy/spark-app/zhy/sparkML.jar","--data",DataSet,"--dir",Dir,"--method",Alg]) 67 | 68 | if __name__ == '__main__': 69 | param = inputParm() 70 | if param == 1: 71 | splitDatasetToDirectory(chooseFile()) 72 | elif param == 2: 73 | splitDatasetToHDFS(chooseFile()) 74 | elif param == 3: 75 | execute() 76 | elif param == 4: 77 | splitDatasetToHDFS(chooseFile()) 78 | execute() 79 | else: 80 | print("未知选项,退出程序") 81 | sys.exit(1) 82 | 83 | -------------------------------------------------------------------------------- /sparkML/src/main/input/recommend/NetflixDataHolder.scala: -------------------------------------------------------------------------------- 1 | package main.input.recommend 2 | 3 | /** 4 | * Created by zhy on 2015/7/18 0018. 5 | */ 6 | 7 | import main.util.SparkEnv 8 | import org.apache.hadoop.conf.Configuration 9 | import org.apache.hadoop.fs.{FileSystem, Path} 10 | import org.apache.spark.mllib.recommendation.Rating 11 | import org.apache.spark.rdd.RDD 12 | 13 | import scala.collection.mutable.ArrayBuffer 14 | ; 15 | 16 | /** 17 | * @param dataDirectoryPath NetFlix数据集根目录 18 | */ 19 | abstract class NetflixDataHolder(dataDirectoryPath: String) extends RecDataHolder { 20 | protected val productsIDsToNameMap = loadIDsToProductnameMapFromADirectory() 21 | 22 | /** 23 | * 从 "movie_titles.txt" 中获取电影名和ID的映射 24 | * @return Map: movieID -> title 25 | */ 26 | protected def loadIDsToProductnameMapFromADirectory(): Map[Int, String] = { 27 | val sc = SparkEnv.sc 28 | val movies = sc.textFile(dataDirectoryPath + "movie_titles.txt").map { line => 29 | val fields = line.split(",") 30 | // format: (movieID, movieName) 31 | (fields(0).toInt, fields(2) + " (" + fields(1) + ")") 32 | }.collect.toMap 33 | movies 34 | } 35 | } 36 | 37 | /** 38 | * 从一个文件读取NetFilx数据 文件格式: movieID>,userID,rating,date. 39 | * @param dataDirectoryPath NetFlix数据集目录 40 | * @param filename 文件名 41 | */ 42 | class NetflixDataHolder4OneFile(dataDirectoryPath: String, filename: String = "ratings.txt") extends NetflixDataHolder(dataDirectoryPath) with Serializable { 43 | protected val ratings = { 44 | val sc = SparkEnv.sc 45 | val ratingsRDD = sc.textFile(dataDirectoryPath + filename).map { 46 | line => val fields = line.split(",") 47 | (Rating(fields(1).toInt, fields(0).toInt, fields(2).toDouble)) 48 | } 49 | ratingsRDD 50 | } 51 | } 52 | 53 | /** 54 | * 从一个目录下所有文件读取NetFilx数据 文件格式: movieID>,userID,rating,date. 
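 * Each file under training_set/ begins with a "movieID:" header line, followed by lines of the form
 * userID,rating,date (see loadRatingsFromOneFile below).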
55 | * @param dataDirectoryPath NetFlix数据集目录 56 | */ 57 | class NetflixDataHolder4Directory(dataDirectoryPath: String) extends NetflixDataHolder(dataDirectoryPath) with Serializable { 58 | protected val ratings = loadRatingsFromADirectory() 59 | 60 | protected def loadRatingsFromADirectory(): RDD[Rating] = { 61 | val conf = new Configuration() 62 | val hdfs = FileSystem.get(conf) 63 | val dataPath = new Path(dataDirectoryPath + "training_set") 64 | val stats = hdfs.listStatus(dataPath) 65 | var fileList = new ArrayBuffer[String] 66 | 67 | for (stat <- stats) fileList += stat.getPath.toString 68 | val ratingsRDDsArray = fileList.map(filePath => loadRatingsFromOneFile(filePath)) 69 | val ratings = SparkEnv.sc.union(ratingsRDDsArray) 70 | ratings.persist.coalesce(77) 71 | } 72 | 73 | protected def loadRatingsFromOneFile(absoluteFilePath: String): RDD[Rating] = { 74 | val ratingsTxtRDD = SparkEnv.sc.textFile(absoluteFilePath) 75 | val movieIDLine = ratingsTxtRDD.first() 76 | val movieID = movieIDLine.split(":")(0).toInt 77 | 78 | val ratingsRDD = ratingsTxtRDD.map(line => if (line == movieIDLine) { 79 | Rating(-1, -1, -1) 80 | } else { 81 | val fields = line.split(",") 82 | (Rating(fields(0).toInt, movieID, fields(1).toDouble)) 83 | }) 84 | ratingsRDD.filter(rat => rat.user >= 0) 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /sparkML/src/main/recommender/SlopOneRec.scala: -------------------------------------------------------------------------------- 1 | package main.recommender 2 | 3 | import main.factory.InputRecData 4 | import org.apache.spark.mllib.recommendation.Rating 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.collection.mutable.ArrayBuffer 8 | 9 | /** 10 | * Created by zhy on 2015/7/26 0026. 
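 * Slope One: the rating of item i for user u is predicted as the average of (deviation(i, j) + r_uj)
 * over the other items j rated by u, where deviation(i, j) is the mean rating difference between
 * items i and j across the users who rated both.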
11 | */ 12 | final class SlopOneRec extends Recommender with InputRecData with Serializable { 13 | val trainDataGroupByUser = trainData.map(rating => (rating.user, (rating.product, rating.rating))) 14 | .groupByKey.persist 15 | test 16 | 17 | 18 | /** 19 | * Evaluates the recommender on the test set 20 | */ 21 | private def test = { 22 | RMSE = calculateRmse(testData, numTest) 23 | } 24 | 25 | def numUserConsumer_ij(product_i: Int, product_j: Int): Long = { 26 | trainDataGroupByUser.filter { trainData4one => 27 | trainData4one._2.exists(a => a._1 == product_i) && trainData4one._2.exists(b => b._1 == product_j) 28 | }.count() 29 | } 30 | 31 | /** 32 | * 33 | * @param u user ID 34 | * @param i item ID 35 | * @return predicted rating triple (user, item, score) 36 | */ 37 | def predict(u: Int, i: Int): Rating = { 38 | //(projectID,Ratings) 39 | val S_u = { 40 | val ratings = trainDataGroupByUser.lookup(u) 41 | if (ratings.length <= 0) throw new UserNotFoundException 42 | ratings(0).toIterator 43 | } 44 | var prediction: Double = 0 45 | var S_u_minus_i: Double = 0 46 | S_u.foreach { S_uj => 47 | if (S_uj._1 == i) return new Rating(u, i, S_uj._2) 48 | val deviation_ij = calcuDeviation_ij(i, S_uj._1) 49 | val r_uj = S_uj._2 50 | S_u_minus_i += 1 51 | prediction += (deviation_ij + r_uj) 52 | } 53 | new Rating(u, i, prediction / S_u_minus_i) //average over the S_u_minus_i items actually used 54 | } 55 | 56 | 57 | private def calculateRmse(dataset: RDD[Rating], n: Long): Double = { 58 | println("开始计算RMSE") 59 | var predictions = ArrayBuffer[Rating]() 60 | val train = dataset.toLocalIterator 61 | train.foreach { x => 62 | println("预测一个样本的评分") 63 | predictions += predict(x.user, x.product) 64 | } 65 | val predictionsRDD: RDD[Rating] = sc.parallelize(predictions.toSeq) 66 | val predictionsAndRatings = predictionsRDD.map { x => 67 | ((x.user, x.product), x.rating) 68 | }.join(dataset.map { x => 69 | ((x.user, x.product), x.rating) 70 | } 71 | ).values 72 | val tmp_RMSE = math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n) 73 | println("计算得到的RMSE为: " + tmp_RMSE) 74 | tmp_RMSE 75 | } 76 | 77 | /** 78 | * 79 | * @param product_i item i 80 | * @param product_j item j 81 | * @return average rating deviation between items i and j 82 | */ 83 | private def calcuDeviation_ij(product_i: Int, product_j: Int): Double = { 84 | //ratings for item i and item j, keyed by user and joined over the users who rated both 85 | val ratings4i = trainData.filter(rating => rating.product == product_i).map(rating => (rating.user, rating.rating)) 86 | val ratings4j = trainData.filter(rating => rating.product == product_j).map(rating => (rating.user, rating.rating)) 87 | val ratingPairs4ij = ratings4i.join(ratings4j) 88 | val numUser4ij = ratingPairs4ij.count() 89 | if (numUser4ij == 0) return 0 90 | val deviation_ij = ratingPairs4ij.values.map { case (rating_ui, rating_uj) => rating_ui - rating_uj }.reduce(_ + _) 91 | deviation_ij / numUser4ij 92 | } 93 | 94 | class UserNotFoundException extends Exception 95 | 96 | } 97 | --------------------------------------------------------------------------------