├── .gitignore
├── sparkML
│   ├── src
│   │   └── main
│   │       ├── factory
│   │       │   ├── Algorithm.scala
│   │       │   ├── AlgTrait.scala
│   │       │   └── AlgorithmFactory.scala
│   │       ├── optimizer
│   │       │   ├── optimizer.scala
│   │       │   └── FTRLProximal.scala
│   │       ├── recommender
│   │       │   ├── Recommender.scala
│   │       │   ├── ALSRec.scala
│   │       │   └── SlopOneRec.scala
│   │       ├── app.scala
│   │       ├── input
│   │       │   ├── DataHolder.scala
│   │       │   ├── recommend
│   │       │   │   ├── RecDataHolder.scala
│   │       │   │   ├── YahooDataHolder.scala
│   │       │   │   └── NetflixDataHolder.scala
│   │       │   ├── LRDataHolder.scala
│   │       │   └── DataFactory.scala
│   │       ├── Classifier
│   │       │   ├── RegressionModel.scala
│   │       │   └── LRWithFTRL.scala
│   │       ├── util
│   │       │   ├── SparkEnv.scala
│   │       │   ├── Conf.scala
│   │       │   └── MainHolder.scala
│   │       └── linalg
│   │           └── algUtil.scala
│   └── sparkML.iml
├── README.md
├── LICENSE
└── bash
    └── splitDataset.py
/.gitignore:
--------------------------------------------------------------------------------
1 | */lib/
2 | */out/
3 | */META-INF/
4 | */scala-train/
5 | */mlTrains/
6 | */.idea/
7 |
--------------------------------------------------------------------------------
/sparkML/src/main/factory/Algorithm.scala:
--------------------------------------------------------------------------------
1 | package main.factory
2 |
3 | /**
4 | * Created by zhy on 2015/8/2 0002.
5 | */
6 | trait Algorithm extends RMSE with Serializable
--------------------------------------------------------------------------------
/sparkML/src/main/optimizer/optimizer.scala:
--------------------------------------------------------------------------------
1 | package main.optimizer
2 |
3 | /**
4 | * Created by zhy on 2015/8/1 0001.
5 | */
6 |
7 | trait Optimizer extends Serializable
8 |
--------------------------------------------------------------------------------
/sparkML/src/main/recommender/Recommender.scala:
--------------------------------------------------------------------------------
1 | package main.recommender
2 |
3 | import main.factory.Algorithm
4 |
5 | /**
6 | * Created by zhy on 2015/7/19 0019.
7 | */
8 |
9 | class Recommender extends Algorithm
--------------------------------------------------------------------------------
/sparkML/src/main/app.scala:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import main.util.{Conf, MainHolder}
4 |
5 | /**
6 | * Created by zhy on 2015/7/19 0019.
7 | */
8 | object app extends App {
9 |
10 | override def main(args: Array[String]) {
11 | val opt = new Conf(args)
12 |
13 | MainHolder.setUp(opt)
14 |
15 | MainHolder.calculateRMSE
16 | }
17 |
18 | }
19 |
--------------------------------------------------------------------------------
/sparkML/src/main/input/DataHolder.scala:
--------------------------------------------------------------------------------
1 | package main.input
2 |
3 | import org.apache.spark.mllib.recommendation.Rating
4 | import org.apache.spark.mllib.regression.LabeledPoint
5 | import org.apache.spark.rdd.RDD
6 |
7 | /**
8 | * Created by zhy on 2015/8/3 0003.
9 | */
10 | trait DataHolder extends Serializable {
11 | def getLRData(): RDD[LabeledPoint]
12 |
13 | def getData(): RDD[Rating]
14 |
15 | def getDataDesc: Unit
16 | }
17 |
--------------------------------------------------------------------------------
/sparkML/src/main/Classifier/RegressionModel.scala:
--------------------------------------------------------------------------------
1 | package main.classifier
2 |
3 | import main.factory.Algorithm
4 | import main.optimizer.Optimizer
5 | import org.apache.spark.mllib.regression.LabeledPoint
6 | import org.apache.spark.rdd.RDD
7 |
8 | /**
9 | * Created by zhy on 2015/8/2 0002.
10 | */
11 |
12 | /**
13 |  * Regression model
14 | */
15 | trait RegressionModel extends Algorithm with Serializable {
16 |
17 |  // the optimisation algorithm used to fit the model
18 | def optimizer: Optimizer
19 |
20 |  // train the model on the given data
21 | def train(trainData: RDD[LabeledPoint]): Unit
22 | }
23 |
--------------------------------------------------------------------------------
/sparkML/src/main/util/SparkEnv.scala:
--------------------------------------------------------------------------------
1 | package main.util
2 |
3 | /**
4 | * Created by zhy on 2015/7/18 0018.
5 | */
6 |
7 | import org.apache.log4j.{Level, Logger}
8 | import org.apache.spark.{SparkConf, SparkContext}
9 |
10 | /**
11 |  * Initialise the SparkContext
12 | */
13 | object SparkEnv {
14 |
15 | Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
16 | Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
17 |
18 | val conf = new SparkConf().setAppName("MachineLearningInSpark").setMaster("local[2]")
19 | val sc = new SparkContext(conf)
20 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Machine Learning In Spark
2 |
3 | A scalable machine-learning system written in [Scala](http://www.scala-lang.org/) on the [Apache Spark framework](https://spark.apache.org/). The framework design is adapted from [OndraFiedler/spark-recommender](https://github.com/OndraFiedler/spark-recommender).
4 |
5 | ## Features
6 |
7 | An ML framework for implementing and running machine learning algorithms on Spark.
8 |
9 | ### Implemented Algorithms
10 |
11 | 1. ALS from Spark MLlib
12 | 2. Slope One (registered as `Slop-One`)
13 | 3. Logistic Regression with FTRL-Proximal
14 |
15 | ### Dataset
16 |
17 | 1. Netflix
18 | 2. Yahoo
19 | 3. Logistic Regression dataset
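20 | 
21 | ### Usage
22 | 
23 | Package the project into a jar and submit it to Spark, passing the dataset type, the dataset root directory and the algorithm. This mirrors the help banner printed by `Conf.scala` (replace `[Jar]` with the path to the assembled jar):
24 | 
25 | `spark-submit [Jar] --data Yahoo --dir /zhy/data/Yahoo/ --method ALS`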
--------------------------------------------------------------------------------
/sparkML/src/main/linalg/algUtil.scala:
--------------------------------------------------------------------------------
1 | package main.linalg
2 |
3 | import breeze.linalg.{SparseVector => BSV, Vector => BV}
4 | import org.apache.spark.mllib.linalg.{SparseVector, Vector}
5 |
6 | /**
7 | * Created by zhy on 2015/8/2 0002.
8 | */
9 | object AlgUtil {
10 | /**
11 |  * Spark MLlib Vector -> Breeze vector
12 | * @param v Vector
13 | * @return Breeze Vector
14 | */
15 | def VtoB(v: Vector): BV[Double] =
16 | new BSV[Double](v.toSparse.indices, v.toSparse.values, v.toSparse.size)
17 |
18 | /**
19 |  * Spark MLlib SparseVector -> Breeze vector
20 | * @param v SparseVector
21 | * @return Breeze Vector
22 | */
23 | def StoB(v: SparseVector): BV[Double] = new BSV[Double](v.indices, v.values, v.size)
24 | }
25 |
--------------------------------------------------------------------------------
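A quick sketch of how these helpers are used: `LRWithFTRL.predict` converts an MLlib feature vector to a Breeze vector with `AlgUtil.VtoB` so it can be dotted with the Breeze weight vector. The `AlgUtilExample` object and its toy values below are illustrative only, not part of the repository:

import breeze.linalg.SparseVector
import main.linalg.AlgUtil
import org.apache.spark.mllib.linalg.Vectors

object AlgUtilExample extends App {
  // toy 5-dimensional weight vector (Breeze) and sparse feature vector (MLlib)
  val weights = SparseVector.zeros[Double](5)
  weights(1) = 0.5
  weights(3) = -0.25
  val features = Vectors.sparse(5, Array(1, 3), Array(1.0, 2.0))
  // convert the MLlib vector to Breeze, then take the dot product as LRWithFTRL.predict does
  println(weights.dot(AlgUtil.VtoB(features))) // 0.5 * 1.0 + (-0.25) * 2.0 = 0.0
}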
/sparkML/sparkML.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/sparkML/src/main/input/recommend/RecDataHolder.scala:
--------------------------------------------------------------------------------
1 | package main.input.recommend
2 |
3 | import main.input.DataHolder
4 | import org.apache.spark.mllib.recommendation.Rating
5 | import org.apache.spark.rdd.RDD
6 |
7 | /**
8 | * Created by zhy on 2015/7/18 0018.
9 | */
10 |
11 | /**
12 |  * Data interface for recommendation algorithms; exposes the Ratings and the product-ID-to-name mapping
13 | */
14 |
15 | trait RecDataHolder extends DataHolder with Serializable {
16 | protected val ratings: RDD[Rating]
17 | protected val productsIDsToNameMap: Map[Int, String]
18 |
19 | override def getLRData = ???
20 |
21 | override def getData = getRatings
22 |
23 | override def getDataDesc = printRatingDesc
24 |
25 | def getRatings(): RDD[Rating] = ratings
26 |
27 | def printRatingDesc = println("The dataset contains " + ratings.count + " ratings from "
28 | + ratings.map(_.user).distinct.count + " users and " + ratings.map(_.product).distinct.count + " products")
29 |
30 | def getIDToProductnameMap(): Map[Int, String] = productsIDsToNameMap
31 |
32 | def getNumOfProducts(): Int = productsIDsToNameMap.keys.max + 1
33 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014 Ondra Fiedler
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/sparkML/src/main/util/Conf.scala:
--------------------------------------------------------------------------------
1 | package main.util
2 |
3 | import main.factory.AlgorithmFactory
4 | import main.input.DataFactory
5 | import org.rogach.scallop.ScallopConf
6 |
7 | /**
8 | * Created by zhy on 2015/7/19 0019.
9 | */
10 |
11 | /**
12 |  * Command-line argument parser
13 |  * @param arguments the command-line arguments
14 | */
15 | class Conf(arguments: Seq[String]) extends ScallopConf(arguments) {
16 |
17 | val datasetTypes = DataFactory.dataHolderList
18 | val algorithms = AlgorithmFactory.AlgList
19 |
20 | banner( """
21 | Machine Learning Algorithms on Spark
22 | -------------------------------------
23 | A library of machine learning algorithms built on Spark
24 | 
25 | Example:
26 | spark-submit [Jar] --data Yahoo --dir /zhy/data/Yahoo/ --method ALS
27 | 
28 | Options:
29 | """)
30 |
31 | version("version 1.5.0")
32 |
33 | val data = opt[String](required = true, validate = { str => datasetTypes.map(_.getName).contains(str) }, descr = {
34 | "Dataset type. Available types: " + datasetTypes.map(_.getName).reduce(_ + ", " + _)
35 | })
36 |
37 | val dir = opt[String](required = true, descr = "Root directory of the dataset")
38 |
39 | val method = opt[String](required = true, validate = { str => algorithms.map(_.getName).contains(str) }, descr = {
40 | "Algorithm to run. Available types: " + algorithms.map(_.getName).reduce(_ + ", " + _)
41 | })
42 |
43 | }
44 |
--------------------------------------------------------------------------------
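For reference, a minimal sketch of how the parser is driven, mirroring `app.scala` (the `ConfExample` object and the argument values are illustrative, not part of the repository):

import main.util.Conf

object ConfExample extends App {
  // the same flags shown in the banner above
  val conf = new Conf(Seq("--data", "Yahoo", "--dir", "/zhy/data/Yahoo/", "--method", "ALS"))
  println(conf.data())   // Yahoo
  println(conf.dir())    // /zhy/data/Yahoo/
  println(conf.method()) // ALS
}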
/sparkML/src/main/factory/AlgTrait.scala:
--------------------------------------------------------------------------------
1 | package main.factory
2 |
3 | import main.util.{MainHolder, SparkEnv}
4 |
5 | /**
6 | * Created by zhy on 2015/8/2 0002.
7 | */
8 |
9 | /**
10 |  * Input data for the recommendation algorithms
11 | */
12 | trait InputRecData extends Serializable {
13 | protected val sc = SparkEnv.sc
14 | protected val ratings = MainHolder.getDataHolder().getData
15 | MainHolder.getDataHolder().getDataDesc
16 |
17 | // split the dataset into training, validation and test sets
18 | protected val RDD = ratings.randomSplit(Array(0.7, 0.2, 0.1))
19 | protected val trainData = RDD(0).persist
20 | protected val validateData = RDD(1).persist
21 | protected val testData = RDD(2).persist
22 | protected val numValidation = validateData.count
23 | protected val numTest = testData.count
24 | }
25 |
26 | /**
27 |  * Input data for logistic regression
28 | */
29 | trait InputLRData extends Serializable {
30 | protected val sc = SparkEnv.sc
31 | protected val data = MainHolder.getDataHolder().getLRData
32 | MainHolder.getDataHolder().getDataDesc
33 |
34 | protected val RDD = data.randomSplit(Array(0.8, 0.2))
35 | protected val trainData = RDD(0).persist
36 | protected val testData = RDD(1).persist
37 |
38 | }
39 |
40 | /**
41 |  * Evaluation metric shared by the algorithms
42 | */
43 | trait RMSE extends Serializable {
44 | protected var RMSE: Double = Double.MaxValue
45 |
46 | /**
47 |  * Prints the root-mean-square error (RMSE) of the algorithm on the test set
48 |  */
49 | def getRMSE = println("RMSE on the test set: " + RMSE + "\n---------- evaluation finished ----------")
50 | }
51 |
--------------------------------------------------------------------------------
/sparkML/src/main/input/recommend/YahooDataHolder.scala:
--------------------------------------------------------------------------------
1 | package main.input.recommend
2 |
3 | import main.util.SparkEnv
4 | import org.apache.spark.mllib.recommendation.Rating
5 | import org.apache.spark.rdd.RDD
6 |
7 | /**
8 | * Created by zhy on 2015/7/18 0018.
9 | */
10 |
11 | /**
12 | * @param dataDirectoryPath Yahoo数据集根目录
13 | */
14 | class YahooDataHolder(dataDirectoryPath: String) extends RecDataHolder with Serializable {
15 | override protected val ratings: RDD[Rating] = loadRatingsFromAFile()
16 | override protected val productsIDsToNameMap: Map[Int, String] = loadIDsToProductnameMapFromADirectory(dataDirectoryPath)
17 |
18 | /**
19 |  * Load the Yahoo ratings from a file
20 | * @return RDD[Rating]
21 | */
22 | protected def loadRatingsFromAFile(): RDD[Rating] = {
23 | val ratings = SparkEnv.sc.textFile(dataDirectoryPath + "data.txt")
24 | .filter(line => formatSpace(line).split(" ").length >= 3)
25 | .map { line =>
26 | val lineFormat = formatSpace(line)
27 | val fields = lineFormat.split(" ")
28 | (Rating(fields(0).toInt, fields(1).toInt, fields(2).toDouble))
29 | }
30 | ratings
31 | }
32 |
33 | /**
34 |  * Collapse consecutive whitespace into a single space
35 |  * @param line input string
36 |  * @return the string with redundant whitespace removed
37 | */
38 | protected def formatSpace(line: String): String = {
39 | line.replaceAll("\\s+", " ")
40 | }
41 |
42 | /**
43 | *
44 |  * @param dataDirectoryPath root directory of the Yahoo dataset
45 | * @return Map:musicID -> musicName
46 | */
47 | protected def loadIDsToProductnameMapFromADirectory(dataDirectoryPath: String): Map[Int, String] = {
48 | null
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/sparkML/src/main/input/LRDataHolder.scala:
--------------------------------------------------------------------------------
1 | package main.input
2 |
3 | import main.util.SparkEnv
4 | import org.apache.spark.mllib.linalg.SparseVector
5 | import org.apache.spark.mllib.regression.LabeledPoint
6 | import org.apache.spark.rdd.RDD
7 |
8 | import scala.collection.mutable.ArrayBuffer
9 |
10 | /**
11 | * Created by zhy on 2015/8/3 0003.
12 | */
13 |
14 | /**
15 |  * Logistic regression dataset
16 |  * @param dataDirectoryPath root directory of the dataset
17 | */
18 | class LRDataHolder(dataDirectoryPath: String) extends DataHolder with Serializable {
19 | private val dimensions = 1000 // declared before `data` so it is already set when loadDataFromFile builds the vectors
20 | private val data: RDD[LabeledPoint] = loadDataFromFile
21 |
22 | def loadDataFromFile: RDD[LabeledPoint] = {
23 | val feature1 = SparkEnv.sc.textFile(dataDirectoryPath + "Features.txt")
24 | val feature2 = SparkEnv.sc.textFile(dataDirectoryPath + "Info.txt")
25 | val data = SparkEnv.sc.textFile(dataDirectoryPath + "data.txt")
26 | .map { line =>
27 | var indices = ArrayBuffer[Int]()
28 | var values = ArrayBuffer[Double]()
29 | val fields = line.split(" ")
30 | val label = fields(0).toDouble
31 | fields.foreach { field =>
32 | val featureI = field.split(":")
33 | if (featureI.length == 2) {
34 | indices += featureI(0).toInt
35 | values += featureI(1).toDouble
36 | }
37 | }
38 | new LabeledPoint(label, new SparseVector(dimensions, indices.toArray, values.toArray))
39 | }
40 | data
41 | }
42 |
43 | override def getLRData = data
44 |
45 | override def getData = ???
46 |
47 | override def getDataDesc = println("The dataset contains " + data.count + " examples")
48 | }
49 |
--------------------------------------------------------------------------------
/sparkML/src/main/util/MainHolder.scala:
--------------------------------------------------------------------------------
1 | package main.util
2 |
3 | import main.factory.{Algorithm, AlgorithmFactory}
4 | import main.input.{DataFactory, DataHolder}
5 |
6 | /**
7 | * Created by zhy on 2015/7/19 0019.
8 | */
9 |
10 | /**
11 |  * Initialises and provides access to the DataHolder and the algorithm
12 | */
13 | object MainHolder {
14 | private var recommender: Option[Algorithm] = None
15 | private var dataHolder: Option[DataHolder] = None
16 |
17 | /**
18 |  * Initialise the DataHolder (data source) and the algorithm
19 |  * @param conf the configuration class
20 | */
21 | def setUp(conf: Conf): Unit = {
22 | val dataHolderNameToFactoryMap = DataFactory.dataHolderList.map(holder => holder.getName -> holder).toMap
23 | val dataHolderStr: String = conf.data()
24 | dataHolder = Some(dataHolderNameToFactoryMap.get(dataHolderStr).get.getInstance(conf))
25 |
26 | val recommenderNameToFactoryMap = AlgorithmFactory.AlgList.map(rec => rec.getName -> rec).toMap
27 | val recommenderStr: String = conf.method()
28 | recommender = Some(recommenderNameToFactoryMap.get(recommenderStr).get.getAlg(conf))
29 | }
30 |
31 | /**
32 |  * Print the algorithm's root-mean-square error (RMSE) on the test set
33 |  * @return Unit
34 | */
35 | def calculateRMSE() = getAlgInstance.getRMSE
36 |
37 | /**
38 | *
39 |  * @return the machine learning algorithm instance
40 | */
41 | def getAlgInstance(): Algorithm = {
42 | recommender match {
43 | case Some(rec) => rec
44 | case None => throw new MainHolderNotInitializedException
45 | }
46 | }
47 |
48 | /**
49 | *
50 |  * @return the data source instance
51 | */
52 | def getDataHolder(): DataHolder = {
53 | dataHolder match {
54 | case Some(holder) => holder
55 | case None => throw new MainHolderNotInitializedException
56 | }
57 | }
58 |
59 | class MainHolderNotInitializedException extends Exception
60 |
61 | }
62 |
--------------------------------------------------------------------------------
/sparkML/src/main/input/DataFactory.scala:
--------------------------------------------------------------------------------
1 | package main.input
2 |
3 | import main.input.recommend.{NetflixDataHolder4Directory, NetflixDataHolder4OneFile, RecDataHolder, YahooDataHolder}
4 | import main.util.Conf
5 |
6 | /**
7 | * Created by zhy on 2015/7/19 0019.
8 | */
9 |
10 | /**
11 |  * Dataset factory
12 | */
13 | trait DataFactory {
14 | def getName: String
15 |
16 | def getDesc: String
17 |
18 | def getInstance(conf: Conf): DataHolder
19 | }
20 |
21 | object DataFactory {
22 | val dataHolderList: List[DataFactory] = List(YahooFac, NetFlix2Fac, NetFlix1Fac, LR)
23 | }
24 |
25 | object YahooFac extends DataFactory {
26 | override def getName: String = "Yahoo"
27 |
28 | override def getDesc: String = "Data source: Yahoo dataset, single file\n" +
29 | "Format: userID itemID(musicID) rating(0-100)"
30 |
31 | override def getInstance(conf: Conf): RecDataHolder = {
32 | println(getDesc)
33 | new YahooDataHolder(conf.dir())
34 | }
35 | }
36 |
37 | object NetFlix1Fac extends DataFactory {
38 | override def getName: String = "NetFlixInFile"
39 |
40 | override def getDesc: String = "Data source: Netflix dataset, single file\nFormat: ???"
41 |
42 | override def getInstance(conf: Conf): RecDataHolder = {
43 | println(getDesc)
44 | new NetflixDataHolder4OneFile(conf.dir())
45 | }
46 | }
47 |
48 | object NetFlix2Fac extends DataFactory {
49 | override def getName: String = "NetFlixInDirectory"
50 |
51 | override def getDesc: String = "Data source: Netflix dataset, directory\n" +
52 | "Format: the first line of each file is the movieID; every other line: userID,rating(0-5),date"
53 |
54 | override def getInstance(conf: Conf): RecDataHolder = {
55 | println(getDesc)
56 | new NetflixDataHolder4Directory(conf.dir())
57 | }
58 | }
59 |
60 | object LR extends DataFactory {
61 | override def getName: String = "LR"
62 |
63 | override def getInstance(conf: Conf): DataHolder = {
64 | println(getDesc)
65 | new LRDataHolder(conf.dir())
66 | }
67 |
68 | override def getDesc: String = "Data source: logistic regression dataset, single file\n" +
69 | "Format: one example per line: label featureIndex1:featureValue1 ...... featureIndexN:featureValueN"
70 | }
--------------------------------------------------------------------------------
/sparkML/src/main/optimizer/FTRLProximal.scala:
--------------------------------------------------------------------------------
1 | package main.optimizer
2 |
3 | import breeze.linalg.SparseVector
4 | import breeze.numerics.abs
5 | import org.apache.spark.mllib.regression.LabeledPoint
6 |
7 | /**
8 | * Created by zhy on 2015/8/1 0001.
9 | */
10 |
11 | /**
12 |  * FTRL-Proximal online optimizer
13 |  * @param beta smoothing term added to the accumulated squared gradients to keep the learning rate from becoming too large
14 |  * @param alpha initial learning rate
15 |  * @param L1 weight of the L1 regularisation term
16 |  * @param L2 weight of the L2 regularisation term
17 |  * @param D dimensionality of the feature vector
18 | */
19 | final class FTRLProximal(val beta: Double = 0.1, val alpha: Double = 0.1, val L1: Double = 0.0, val L2: Double = 0.0, val D: Int = 1000)
20 | extends Optimizer {
21 |
22 | private val N: SparseVector[Double] = SparseVector.zeros(D)
23 | private val Z: SparseVector[Double] = SparseVector.zeros(D)
24 | private var W: SparseVector[Double] = SparseVector.zeros(D)
25 |
26 | def printV = {
27 | println("W vector: " + W.toString())
28 | println("N vector: " + N.toString())
29 | println("Z vector: " + Z.toString())
30 | }
31 |
32 | // perform one online update on a single labeled example
33 | def optimize(data: LabeledPoint, initialWeights: SparseVector[Double]):
34 | SparseVector[Double] = {
35 | W = initialWeights
36 | println("FTRLProximal.optimize")
37 | step(data.features.toSparse.indices.map(_.toDouble), data.label.toInt) // step() treats each entry as the index of a non-zero feature, so pass the indices rather than the raw values
38 | }
39 | 
40 | // the per-coordinate FTRL-Proximal update
41 | // TODO the update loop needs to be optimised
42 | def step(feature: Array[Double], label: Int): SparseVector[Double] = {
43 | println("FTRLProximal.step")
44 | var p: Double = 0.0
45 | for (i_double <- feature) {
46 | val i = i_double.toInt
47 | var sign: Int = 0
48 | if (Z(i) < 0)
49 | sign = -1
50 | else
51 | sign = 1
52 | if (abs(Z(i)) <= L1) {
53 | W(i) = 0.0
54 | } else {
55 | W(i) = (sign * L1 - Z(i)) / ((beta + Math.sqrt(N(i))) / alpha + L2)
56 | }
57 | p += W(i)
58 | }
59 |
60 | // predict
61 | p = 1 / (1 + Math.exp(-p))
62 |
63 | // update
64 | val g: Double = p - label
65 | for (i_double <- feature) {
66 | val i = i_double.toInt
67 | val sigma: Double = (Math.sqrt(N(i) + g * g) - Math.sqrt(N(i))) / alpha
68 | Z(i) += g - sigma * W(i)
69 | N(i) += g * g
70 | }
71 | W
72 | }
73 |
74 | }
75 |
--------------------------------------------------------------------------------
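A minimal, self-contained sketch of driving the optimizer on a single example, the way LRWithFTRL does internally (the `FTRLProximalExample` object, the toy dimension and the feature indices are illustrative only):

import breeze.linalg.SparseVector
import main.optimizer.FTRLProximal
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object FTRLProximalExample extends App {
  val D = 10
  val ftrl = new FTRLProximal(beta = 0.1, alpha = 0.1, L1 = 0.0, L2 = 0.0, D = D)
  var weights: SparseVector[Double] = SparseVector.zeros[Double](D)
  // one positive example whose (binary) features 2 and 7 are set
  val example = LabeledPoint(1.0, Vectors.sparse(D, Array(2, 7), Array(1.0, 1.0)))
  weights = ftrl.optimize(example, weights)
  ftrl.printV
}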
/sparkML/src/main/factory/AlgorithmFactory.scala:
--------------------------------------------------------------------------------
1 | package main.factory
2 |
3 | import main.classifier.LRWithFTRL
4 | import main.recommender.{ALSRec, Recommender, SlopOneRec}
5 | import main.util.Conf
6 |
7 | /**
8 | * Created by zhy on 2015/7/19 0019.
9 | */
10 |
11 | /**
12 |  * Machine learning algorithm factory
13 | */
14 | trait AlgorithmFactory {
15 | def getName: String
16 |
17 | def getAlg(conf: Conf): Algorithm
18 |
19 | def getAlgDes(): String
20 |
21 | def getParamDes(): String
22 |
23 | def getDescription(): String = {
24 | getAlgDes() + "\nParameters:\n" + getParamDes()
25 | }
26 | }
27 |
28 | object AlgorithmFactory {
29 | val AlgList: List[AlgorithmFactory] = List(ALS, SlopOne, LRWithFTRL)
30 | }
31 |
32 | object ALS extends AlgorithmFactory {
33 | override def getName: String = "ALS"
34 |
35 | override def getAlgDes(): String = "ALS algorithm from Spark MLlib"
36 |
37 | protected val rankStr = "rank"
38 | protected val lambdaStr = "λ"
39 | protected val iterStr = "numberOfIterations"
40 |
41 | override def getParamDes(): String = rankStr + " = , dimensionality of the latent feature vectors\n" + lambdaStr + " = , regularisation parameter\n" + iterStr + " = , number of iterations"
42 |
43 | override def getAlg(conf: Conf): Recommender = {
44 | val ranks = 12 to 15
45 | val lambdas = List(0.01, 0.05)
46 | val iters = 10 to 20
47 |
48 | println(getDescription)
49 | new ALSRec(ranks, lambdas, iters)
50 | }
51 | }
52 |
53 | object SlopOne extends AlgorithmFactory {
54 | override def getName: String = "Slop-One"
55 |
56 | override def getAlg(conf: Conf): Recommender = {
57 | println(getDescription)
58 | new SlopOneRec
59 | }
60 |
61 | override def getParamDes(): String = "no parameters"
62 | 
63 | override def getAlgDes(): String = "Slope One algorithm"
64 | }
65 |
66 | object LRWithFTRL extends AlgorithmFactory {
67 | override def getName: String = "LR-FTRL"
68 |
69 | protected val numFea = "numFeatures"
70 |
71 | override def getParamDes(): String = numFea + " = , dimensionality of the feature vector\n"
72 | 
73 | override def getAlgDes(): String = "Logistic Regression optimised with FTRL-Proximal"
74 |
75 | override def getAlg(conf: Conf): Algorithm = {
76 | // TODO derive the dimensionality from the dataset's features
77 | val numFeatures = 1000
78 |
79 | println(getDescription)
80 | new LRWithFTRL(numFeatures)
81 | }
82 | }
--------------------------------------------------------------------------------
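Extending the factory follows the same pattern as the three objects above: implement `AlgorithmFactory` and append the new object to `AlgorithmFactory.AlgList` so its name becomes selectable via `--method`. A hypothetical sketch (`MyAlgFac` and its strings are illustrative, not part of the repository):

package main.factory

import main.util.Conf

object MyAlgFac extends AlgorithmFactory {
  override def getName: String = "MyAlg"

  override def getAlgDes(): String = "A placeholder algorithm illustrating the factory pattern"

  override def getParamDes(): String = "no parameters"

  override def getAlg(conf: Conf): Algorithm = {
    println(getDescription)
    // a real entry would construct a concrete class such as ALSRec or LRWithFTRL here
    new Algorithm {}
  }
}

// registration: AlgList would become List(ALS, SlopOne, LRWithFTRL, MyAlgFac)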
/sparkML/src/main/Classifier/LRWithFTRL.scala:
--------------------------------------------------------------------------------
1 | package main.classifier
2 |
3 | import breeze.linalg.SparseVector
4 | import breeze.numerics.exp
5 | import main.factory.InputLRData
6 | import main.linalg.AlgUtil
7 | import main.optimizer.FTRLProximal
8 | import org.apache.spark.mllib.linalg.Vector
9 | import org.apache.spark.mllib.regression.LabeledPoint
10 | import org.apache.spark.rdd.RDD
11 |
12 | import scala.collection.mutable.ArrayBuffer
13 |
14 | /**
15 | * Created by zhy on 2015/8/2 0002.
16 | */
17 |
18 | /**
19 |  * Logistic Regression model
20 | */
21 | final class LRWithFTRL(val numFeatures: Int)
22 | extends RegressionModel with InputLRData with Serializable {
23 |
24 | // initialise the weight vector
25 | private var weights: SparseVector[Double] = SparseVector.zeros(numFeatures)
26 |
27 | // set the optimisation algorithm
28 | override val optimizer = new FTRLProximal(D = numFeatures)
29 |
30 | train(trainData)
31 | predictAccuracy(testData)
32 |
33 | def train(data: LabeledPoint): Unit = {
34 | weights = optimizer.optimize(data, weights)
35 | optimizer.printV
36 | }
37 |
38 | // train the model parameters
39 | // TODO parallelise the training and testing loops
40 | override def train(trainData: RDD[LabeledPoint]): Unit = {
41 | val localTrainData = trainData.toLocalIterator
42 | localTrainData.foreach(data => train(data))
43 | }
44 |
45 | /**
46 |  * Classification accuracy on the test set
47 |  * @param testData the test examples
48 |  * @return Unit; the accuracy is written to the RMSE field
49 | */
50 | def predictAccuracy(testData: RDD[LabeledPoint]): Unit = {
51 | var predictions = new ArrayBuffer[Tuple2[Double,Double]]()
52 | testData.toLocalIterator.foreach{ data =>
53 | val prediction = (data.label, predict(data.features))
54 | train(data)
55 | predictions += prediction
56 | }
57 | val numData:Int = predictions.toArray.length
58 | val numCorrect:Int = predictions.toArray.filter{data=>
59 | data._1 == data._2
60 | }.length
61 | println("Number of correct predictions: " + numCorrect +
62 | "\nTotal number of predictions: " + numData )
63 | RMSE = numCorrect * 1.0 / numData
64 | }
65 |
66 | /**
67 |  * Predict a single example with the hypothesis function
68 |  * @param testData a single test example's features
69 |  * @return predicted class: 1 or 0
70 | */
71 | def predict(testData: Vector): Double = {
72 | val x: Double = weights.dot(AlgUtil.VtoB(testData))
73 | val prob: Double = sigmoid(x)
74 | if (prob > 0.5) return 1.0
75 | else return 0.0
76 | }
77 |
78 | override def getRMSE =
79 | println("Prediction accuracy of logistic regression with FTRL-Proximal on the test set: " + RMSE + "\n---------- evaluation finished ----------")
80 |
81 | // sigmoid function
82 | private def sigmoid(x: Double): Double = 1.0 / (1 + exp(-x))
83 | }
84 |
--------------------------------------------------------------------------------
/sparkML/src/main/recommender/ALSRec.scala:
--------------------------------------------------------------------------------
1 | package main.recommender
2 |
3 | import main.factory.InputRecData
4 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
5 | import org.apache.spark.rdd.RDD
6 |
7 | /**
8 | * Created by zhy on 2015/7/19 0019.
9 | */
10 |
11 | final class ALSRec(ranks: Range, lambdas: List[Double], numIters: Range)
12 | extends Recommender with InputRecData with Serializable {
13 | // train the model and evaluate it on the test set
14 | val model = getBestModel()
15 | test
16 |
17 |
18 | /**
19 | *
20 |  * @return the model with the best parameters
21 | */
22 | private def getBestModel(): Option[MatrixFactorizationModel] = {
23 | Some(ALS.train(trainData, 12, 20, 0.05)) // fixed rank/iterations/lambda; the grid search in train() below is currently unused
24 | }
25 |
26 | /**
27 |  * Evaluate the model on the test set
28 | */
29 | private def test = {
30 | RMSE = calculateRmse(model.get, testData, numTest)
31 | }
32 |
33 | /**
34 |  * Compute the root-mean-square error (RMSE)
35 |  * @param model the trained model
36 |  * @param dataset the dataset to evaluate on
37 |  * @param n size of the dataset
38 |  * @return the RMSE of the model on this dataset
39 | */
40 | private def calculateRmse(model: MatrixFactorizationModel, dataset: RDD[Rating], n: Long): Double = {
41 | val predictions: RDD[Rating] = model.predict(dataset.map(x => (x.user, x.product)))
42 | val predictionsAndRatings = predictions.map { x =>
43 | ((x.user, x.product), x.rating)
44 | }.join(dataset.map { x =>
45 | ((x.user, x.product), x.rating)
46 | }
47 | ).values
48 | val tmp_RMSE = math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n)
49 | println("Computed RMSE: " + tmp_RMSE)
50 | tmp_RMSE
51 | }
52 |
53 | /**
54 |  * Train the model, selecting the best parameters on the validation set
55 |  * @param numValidation size of the validation set
56 |  * @return the trained model
57 | */
58 | private def train(numValidation: Long): Option[MatrixFactorizationModel] = {
59 | RMSE = Double.MaxValue
60 | var bestModel: Option[MatrixFactorizationModel] = None
61 | var bestRank = 0
62 | var bestLambda = -1.0
63 | var bestNumIter = -1
64 |
65 | for (rank <- ranks; lambda <- lambdas; numIter <- numIters) {
66 | val model = ALS.train(trainData, rank, numIter, lambda)
67 | val validationRmse = calculateRmse(model, validateData, numValidation)
68 | if (validationRmse < RMSE) {
69 | bestModel = Some(model)
70 | RMSE = validationRmse
71 | bestRank = rank
72 | bestLambda = lambda
73 | bestNumIter = numIter
74 | }
75 | }
76 | println("Model training finished. Best parameters: (rank = " + bestRank + "; numIter = " +
77 | bestNumIter + "; lambda = " + bestLambda + ")")
78 |
79 | bestModel
80 | }
81 | }
--------------------------------------------------------------------------------
/bash/splitDataset.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3.3
2 | # coding: UTF-8
3 | #Author :zhy
4 |
5 | import os,sys,shutil
6 | import random
7 | from subprocess import call
8 |
9 | def chooseFile():
10 | path = sys.argv[1]
11 | percentage = (float)(sys.argv[2])
12 | fileList = os.listdir(path)
13 | fileNum = (int)(len(fileList) * percentage)
14 | fileNum = [1,fileNum][fileNum >= 1]
15 | chosenFile = random.sample(fileList,fileNum)
16 | print(chosenFile)
17 | return chosenFile
18 |
19 | def splitDatasetToDirectory(chosenFile):
20 | """Copy a randomly chosen fraction of the small files in the Netflix dataset directory to another directory"""
21 | path = sys.argv[1]
22 | for file in chosenFile:
23 | if sys.platform.__eq__("win32"):
24 | desPath = "c:/Users/zhy/Documents/study/AD.SE/courseDesign/DatasetInHDFS/NetFlix/little/"
25 | shutil.copy(path + file, desPath)
26 | else:
27 | call(["cp", path + file, path + "../little/training_set/"])
28 |
29 | def splitDatasetToHDFS(chosenFile):
30 | """Upload a randomly chosen fraction of the small files in the Netflix dataset directory to HDFS for testing"""
31 | deleteOldData()
32 | path = sys.argv[1]
33 | for file in chosenFile:
34 | call(["hdfs","dfs","-put",path + file,"/zhy/data/NetFlix/little/training_set/"])
35 |
36 | def deleteOldData():
37 | """Delete the old dataset from HDFS"""
38 | call(["hdfs","dfs","-rm","-R","/zhy/data/NetFlix/little/training_set/"])
39 | call(["hdfs","dfs","-mkdir","/zhy/data/NetFlix/little/training_set/"])
40 |
41 | def inputParm():
42 | if len(sys.argv) < 3:
43 | print("Usage: ./splitDataset.py [local dataset path] [fraction of files to select]")
44 | sys.exit(1)
45 | trigger = True
46 | while trigger:
47 | try:
48 | trigger = False
49 | print("1 -> sample files from '" + sys.argv[1] +
50 | "' and copy them to '../little/training_set/'\n")
51 | print("2 -> sample files from '" + sys.argv[1] +
52 | "' and upload them to 'zhy/data/NetFlix/little/training_set/' in HDFS\n")
53 | print("3 -> run the Spark application\n")
54 | print("4 -> do both '2' and '3'\n")
55 | param = int(input('Please choose an option ... \n'))
56 | return param
57 |
58 | except ValueError:
59 | trigger = True
60 | print("Invalid input, please enter a number ... ")
61 |
62 | def execute():
63 | DataSet = input('Enter the dataset type:\n')
64 | Dir = input('Enter the dataset directory:\n')
65 | Alg = input('Enter the algorithm:\n')
66 | call(["spark-submit","/home/zhy/spark-app/zhy/sparkML.jar","--data",DataSet,"--dir",Dir,"--method",Alg])
67 |
68 | if __name__ == '__main__':
69 | param = inputParm()
70 | if param == 1:
71 | splitDatasetToDirectory(chooseFile())
72 | elif param == 2:
73 | splitDatasetToHDFS(chooseFile())
74 | elif param == 3:
75 | execute()
76 | elif param == 4:
77 | splitDatasetToHDFS(chooseFile())
78 | execute()
79 | else:
80 | print("Unknown option, exiting")
81 | sys.exit(1)
82 |
83 |
--------------------------------------------------------------------------------
/sparkML/src/main/input/recommend/NetflixDataHolder.scala:
--------------------------------------------------------------------------------
1 | package main.input.recommend
2 |
3 | /**
4 | * Created by zhy on 2015/7/18 0018.
5 | */
6 |
7 | import main.util.SparkEnv
8 | import org.apache.hadoop.conf.Configuration
9 | import org.apache.hadoop.fs.{FileSystem, Path}
10 | import org.apache.spark.mllib.recommendation.Rating
11 | import org.apache.spark.rdd.RDD
12 |
13 | import scala.collection.mutable.ArrayBuffer
14 | 
15 |
16 | /**
17 |  * @param dataDirectoryPath root directory of the Netflix dataset
18 | */
19 | abstract class NetflixDataHolder(dataDirectoryPath: String) extends RecDataHolder {
20 | protected val productsIDsToNameMap = loadIDsToProductnameMapFromADirectory()
21 |
22 | /**
23 |  * Build the movie-ID-to-title mapping from "movie_titles.txt"
24 | * @return Map: movieID -> title
25 | */
26 | protected def loadIDsToProductnameMapFromADirectory(): Map[Int, String] = {
27 | val sc = SparkEnv.sc
28 | val movies = sc.textFile(dataDirectoryPath + "movie_titles.txt").map { line =>
29 | val fields = line.split(",")
30 | // format: (movieID, movieName)
31 | (fields(0).toInt, fields(2) + " (" + fields(1) + ")")
32 | }.collect.toMap
33 | movies
34 | }
35 | }
36 |
37 | /**
38 |  * Read the Netflix data from a single file. Line format: movieID,userID,rating,date.
39 |  * @param dataDirectoryPath directory of the Netflix dataset
40 |  * @param filename name of the ratings file
41 | */
42 | class NetflixDataHolder4OneFile(dataDirectoryPath: String, filename: String = "ratings.txt") extends NetflixDataHolder(dataDirectoryPath) with Serializable {
43 | protected val ratings = {
44 | val sc = SparkEnv.sc
45 | val ratingsRDD = sc.textFile(dataDirectoryPath + filename).map {
46 | line => val fields = line.split(",")
47 | (Rating(fields(1).toInt, fields(0).toInt, fields(2).toDouble))
48 | }
49 | ratingsRDD
50 | }
51 | }
52 |
53 | /**
54 |  * Read the Netflix data from every file in a directory. File format: the first line is "movieID:", every other line is "userID,rating,date".
55 |  * @param dataDirectoryPath directory of the Netflix dataset
56 | */
57 | class NetflixDataHolder4Directory(dataDirectoryPath: String) extends NetflixDataHolder(dataDirectoryPath) with Serializable {
58 | protected val ratings = loadRatingsFromADirectory()
59 |
60 | protected def loadRatingsFromADirectory(): RDD[Rating] = {
61 | val conf = new Configuration()
62 | val hdfs = FileSystem.get(conf)
63 | val dataPath = new Path(dataDirectoryPath + "training_set")
64 | val stats = hdfs.listStatus(dataPath)
65 | var fileList = new ArrayBuffer[String]
66 |
67 | for (stat <- stats) fileList += stat.getPath.toString
68 | val ratingsRDDsArray = fileList.map(filePath => loadRatingsFromOneFile(filePath))
69 | val ratings = SparkEnv.sc.union(ratingsRDDsArray)
70 | ratings.persist.coalesce(77)
71 | }
72 |
73 | protected def loadRatingsFromOneFile(absoluteFilePath: String): RDD[Rating] = {
74 | val ratingsTxtRDD = SparkEnv.sc.textFile(absoluteFilePath)
75 | val movieIDLine = ratingsTxtRDD.first()
76 | val movieID = movieIDLine.split(":")(0).toInt
77 |
78 | val ratingsRDD = ratingsTxtRDD.map(line => if (line == movieIDLine) {
79 | Rating(-1, -1, -1)
80 | } else {
81 | val fields = line.split(",")
82 | (Rating(fields(0).toInt, movieID, fields(1).toDouble))
83 | })
84 | ratingsRDD.filter(rat => rat.user >= 0)
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/sparkML/src/main/recommender/SlopOneRec.scala:
--------------------------------------------------------------------------------
1 | package main.recommender
2 |
3 | import main.factory.InputRecData
4 | import org.apache.spark.mllib.recommendation.Rating
5 | import org.apache.spark.rdd.RDD
6 |
7 | import scala.collection.mutable.ArrayBuffer
8 |
9 | /**
10 | * Created by zhy on 2015/7/26 0026.
11 | */
12 | final class SlopOneRec extends Recommender with InputRecData with Serializable {
13 | val trainDataGroupByUser = trainData.map(rating => (rating.user, (rating.product, rating.rating)))
14 | .groupByKey.persist
15 | test
16 |
17 |
18 | /**
19 |  * Evaluate the recommender on the test set
20 | */
21 | private def test = {
22 | RMSE = calculateRmse(testData, numTest)
23 | }
24 |
25 | def numUserConsumer_ij(product_i: Int, product_j: Int): Long = {
26 | trainDataGroupByUser.filter { trainData4one =>
27 | trainData4one._2.exists(a => a._1 == product_i) && trainData4one._2.exists(b => b._1 == product_j)
28 | }.count()
29 | }
30 |
31 | /**
32 | *
33 |  * @param u user ID
34 |  * @param i item ID
35 |  * @return the predicted Rating triple (user, item, rating)
36 | */
37 | def predict(u: Int, i: Int): Rating = {
38 | // (productID, rating) pairs rated by user u
39 | val S_u = {
40 | val ratings = trainDataGroupByUser.lookup(u)
41 | if (ratings.length <= 0) throw new UserNotFoundException
42 | ratings(0).toIterator
43 | }
44 | var prediction: Double = 0
45 | var S_u_minus_i: Double = 0
46 | S_u.foreach { S_uj =>
47 | if (S_uj._1 == i) return new Rating(u, i, S_uj._2)
48 | val deviation_ij = calcuDeviation_ij(i, S_uj._1)
49 | val r_uj = S_uj._2
50 | S_u_minus_i += 1
51 | prediction += (deviation_ij + r_uj)
52 | }
53 | new Rating(u, i, prediction / S_u_minus_i) // average over the items j != i rated by the user
54 | }
55 |
56 |
57 | private def calculateRmse(dataset: RDD[Rating], n: Long): Double = {
58 | println("Start computing the RMSE")
59 | var predictions = ArrayBuffer[Rating]()
60 | val train = dataset.toLocalIterator
61 | train.foreach { x =>
62 | println("Predicting the rating for one test example")
63 | predictions += predict(x.user, x.product)
64 | }
65 | val predictionsRDD: RDD[Rating] = sc.parallelize(predictions.toSeq)
66 | val predictionsAndRatings = predictionsRDD.map { x =>
67 | ((x.user, x.product), x.rating)
68 | }.join(dataset.map { x =>
69 | ((x.user, x.product), x.rating)
70 | }
71 | ).values
72 | val tmp_RMSE = math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n)
73 | println("Computed RMSE: " + tmp_RMSE)
74 | tmp_RMSE
75 | }
76 |
77 | /**
78 | *
79 |  * @param product_i item i
80 |  * @param product_j item j
81 |  * @return the average rating deviation between items i and j
82 | */
83 | private def calcuDeviation_ij(product_i: Int, product_j: Int): Double = {
84 | // (user, rating) pairs for each of the two items
85 | val userList4i = trainData.filter(rating => rating.product == product_i)
86 | .map(rating => (rating.user, rating.rating))
87 | val userList4j = trainData.filter(rating => rating.product == product_j)
88 | .map(rating => (rating.user, rating.rating))
89 | // join on the user ID: only users who rated both items contribute to the deviation
90 | val userList4ij = userList4i.join(userList4j)
91 | val numUser4ij = userList4ij.count()
92 | if (numUser4ij == 0) return 0
93 | // average difference between the user's ratings of item i and item j
94 | val deviation_ij = userList4ij.values
95 | .map { case (rating_ui, rating_uj) => rating_ui - rating_uj }
96 | .sum
97 | deviation_ij / numUser4ij
98 | }
99 | 
100 |
101 | class UserNotFoundException extends Exception
102 |
103 | }
104 |
--------------------------------------------------------------------------------