├── .idea └── vcs.xml ├── src ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── ml │ │ ├── util │ │ ├── Utils.scala │ │ ├── XORShiftRandom.scala │ │ ├── DBHPartitioner.scala │ │ ├── SparkUtils.scala │ │ └── LoaderUtils.scala │ │ ├── dbscan │ │ ├── DBSCANPoint.scala │ │ ├── DBSCANLabeledPoint.scala │ │ ├── DBSCANRectangle.scala │ │ ├── DBSCANGraph.scala │ │ ├── LocalDBSCANArchery.scala │ │ ├── LocalDBSCANNaive.scala │ │ ├── DBSCAN2.scala │ │ ├── EvenSplitPartitioner.scala │ │ └── DBSCAN.scala │ │ ├── tsne │ │ ├── TSNEParam.scala │ │ ├── TSNEHelper.scala │ │ ├── impl │ │ │ ├── SimpleTSNE.scala │ │ │ ├── BHTSNE.scala │ │ │ └── LBFGSTSNE.scala │ │ ├── tree │ │ │ └── SPTree.scala │ │ ├── X2P.scala │ │ └── TSNEGradient.scala │ │ ├── timeseries │ │ ├── params │ │ │ └── TimeSeriesParams.scala │ │ ├── MatrixUtil.scala │ │ ├── Lag.scala │ │ ├── models │ │ │ ├── Autoregression.scala │ │ │ ├── ARGARCH.scala │ │ │ ├── EWMA.scala │ │ │ ├── GARCH.scala │ │ │ └── HoltWinters.scala │ │ └── UnivariateTimeSeries.scala │ │ ├── knn │ │ ├── Distance.scala │ │ └── KNNClassifier.scala │ │ ├── sampling │ │ ├── UnderSampling.scala │ │ └── OverSampling.scala │ │ ├── fm │ │ ├── FMModel.scala │ │ └── BSFMModel.scala │ │ └── mvm │ │ └── MVMModel.scala └── test │ └── scala │ └── org │ └── apache │ └── spark │ └── ml │ ├── timeseries │ ├── MatrixUtilSuite.scala │ ├── models │ │ ├── GARCHSuite.scala │ │ ├── EWMASuite.scala │ │ ├── AutoregressionSuite.scala │ │ ├── ARGARCHSuite.scala │ │ ├── HoltWintersSuite.scala │ │ └── ARIMASuite.scala │ └── UnivariateTimeSeriesSuite.scala │ └── knn_is │ └── KNN_ISSuite.scala ├── README.md └── pom.xml /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/util/Utils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.util 2 | 3 | import java.util.Random 4 | 5 | object Utils { 6 | val random = new Random() 7 | def log1pExp(x: Double): Double = { 8 | if (x > 0) { 9 | x + math.log1p(math.exp(-x)) 10 | } else { 11 | math.log1p(math.exp(x)) 12 | } 13 | } 14 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/DBSCANPoint.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import org.apache.spark.ml.linalg.Vector 4 | 5 | case class DBSCANPoint(val vector: Vector) { 6 | 7 | def x: Double = vector(0) 8 | def y: Double = vector(1) 9 | 10 | def distanceSquared(other: DBSCANPoint): Double = { 11 | val dx = other.x - x 12 | val dy = other.y - y 13 | (dx * dx) + (dy * dy) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/TSNEParam.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne 2 | 3 | case class TSNEParam( 4 | early_exaggeration: Int = 100, 5 | exaggeration_factor: Double = 4.0, 6 | t_momentum: Int = 25, 7 | initial_momentum: Double = 0.5, 8 | final_momentum: Double = 0.8, 9 | eta: Double = 500.0, 10 | min_gain: Double = 0.01 11 | ) -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/params/TimeSeriesParams.scala: 
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.params
2 | 
3 | import org.apache.spark.ml.param.{Param, Params}
4 | 
5 | /**
6 |   * Created by endy on 16-12-22.
7 |   */
8 | trait TimeSeriesParams extends Params {
9 |   final val timeCol = new Param[String](this, "timeCol",
10 |     "The column that stored time value")
11 |   def setTimeCol(value: String): this.type = set(timeCol, value)
12 | 
13 |   final val timeSeriesCol = new Param[String](this, "timeSeriesCol",
14 |     "The column that stored time series value")
15 |   def setTimeSeriesCol(value: String): this.type = set(timeSeriesCol, value)
16 | }
17 | 
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/dbscan/DBSCANLabeledPoint.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.dbscan
2 | 
3 | import org.apache.spark.ml.linalg.Vector
4 | 
5 | /**
6 |   * Companion constants for labeled points
7 |   */
8 | object DBSCANLabeledPoint {
9 | 
10 |   val Unknown = 0
11 | 
12 |   object Flag extends Enumeration {
13 |     type Flag = Value
14 |     val Border, Core, Noise, NotFlagged = Value
15 |   }
16 | 
17 | }
18 | 
19 | class DBSCANLabeledPoint(vector: Vector) extends DBSCANPoint(vector) {
20 | 
21 |   def this(point: DBSCANPoint) = this(point.vector)
22 | 
23 |   var flag = DBSCANLabeledPoint.Flag.NotFlagged
24 |   var cluster = DBSCANLabeledPoint.Unknown
25 |   var visited = false
26 | 
27 |   override def toString(): String = {
28 |     s"$vector,$cluster,$flag"
29 |   }
30 | 
31 | }
32 | 
33 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Distributed Algorithms On Spark
2 | 
3 | This project implements some popular algorithms on Spark. You can read the corresponding papers for the details of each algorithm.
4 | 
5 | Currently it supports the following algorithms; more will be added in the future.
6 | 
7 | - Distributed KNN
8 | - Down Sampling
9 | - Over Sampling
10 | - Affinity Propagation
11 | - Distributed t-SNE
12 | - Factorization Machines
13 | - Multi-view Machines
14 | - Block Structures Factorization Machines
15 | - Time series models
16 | - DBSCAN
17 | 
18 | 
19 | This project supports Spark 2.x.
20 | 
21 | ## References
22 | 
23 | - https://github.com/viirya/SparkAffinityPropagation
24 | - https://github.com/saurfang/spark-tsne
25 | - https://github.com/cloudml/zen
26 | - https://github.com/sryza/spark-timeseries
27 | - https://github.com/irvingc/dbscan-on-spark
28 | - http://mlwiki.org/index.php/Metric_Trees
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/ml/timeseries/MatrixUtilSuite.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries
2 | 
3 | import org.apache.spark.SparkFunSuite
4 | import org.apache.spark.ml.linalg.{Matrices, Vectors}
5 | import org.apache.spark.ml.util.DefaultReadWriteTest
6 | import org.apache.spark.mllib.util.MLlibTestSparkContext
7 | 
8 | /**
9 |   * Created by endy on 16-12-21.
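  * Verifies that MatrixUtil.toBreeze returns Breeze views backed by the same arrays as the
  * original Spark vector/matrix, so in-place mutation of the Breeze view is visible through
  * the Spark object.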
10 | */ 11 | class MatrixUtilSuite extends SparkFunSuite with MLlibTestSparkContext 12 | with DefaultReadWriteTest { 13 | test("modifying toBreeze version modifies original tensor") { 14 | val vec = Vectors.dense(1.0, 2.0, 3.0) 15 | val breezeVec = MatrixUtil.toBreeze(vec) 16 | breezeVec(1) = 4.0 17 | assert(vec(1) == 4.0) 18 | 19 | val mat = Matrices.zeros(3, 4) 20 | val breezeMat = MatrixUtil.toBreeze(mat) 21 | breezeMat(0, 1) = 2.0 22 | assert(mat(0, 1) == 2.0) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/DBSCANRectangle.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | /** 4 | * A rectangle with a left corner of (x, y) and a right upper corner of (x2, y2) 5 | */ 6 | case class DBSCANRectangle(x: Double, y: Double, x2: Double, y2: Double) { 7 | 8 | /** 9 | * Returns whether other is contained by this box 10 | */ 11 | def contains(other: DBSCANRectangle): Boolean = { 12 | x <= other.x && other.x2 <= x2 && y <= other.y && other.y2 <= y2 13 | } 14 | 15 | /** 16 | * Returns whether point is contained by this box 17 | */ 18 | def contains(point: DBSCANPoint): Boolean = { 19 | x <= point.x && point.x <= x2 && y <= point.y && point.y <= y2 20 | } 21 | 22 | /** 23 | * Returns a new box from shrinking this box by the given amount 24 | */ 25 | def shrink(amount: Double): DBSCANRectangle = { 26 | DBSCANRectangle(x + amount, y + amount, x2 - amount, y2 - amount) 27 | } 28 | 29 | /** 30 | * Returns a whether the rectangle contains the point, and the point 31 | * is not in the rectangle's border 32 | */ 33 | def almostContains(point: DBSCANPoint): Boolean = { 34 | x < point.x && point.x < x2 && y < point.y && point.y < y2 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/knn/Distance.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.classification 2 | 3 | import org.apache.spark.ml.linalg.Vector 4 | 5 | object Distance extends Enumeration { 6 | 7 | val Euclidean, Manhattan = Value 8 | 9 | /** 10 | * Computes the (Manhattan or Euclidean) distance between instance x and instance y. 11 | * The type of the distance used is determined by the value of distanceType. 12 | * 13 | * @param x instance x 14 | * @param y instance y 15 | * @param distanceType type of the distance used (Distance.Euclidean or Distance.Manhattan) 16 | * @return Distance 17 | */ 18 | def apply(x: Vector, y: Vector, distanceType: Distance.Value): Double = { 19 | distanceType match { 20 | case Euclidean => euclidean(x, y) 21 | case Manhattan => manhattan(x, y) 22 | case _ => euclidean(x, y) 23 | } 24 | } 25 | 26 | /** 27 | * Computes the Euclidean distance between instance x and instance y. 28 | * The type of the distance used is determined by the value of distanceType. 29 | * 30 | * @param x instance x 31 | * @param y instance y 32 | * @return Euclidean distance 33 | */ 34 | private def euclidean(x: Vector, y: Vector): Double = { 35 | var sum = 0.0 36 | val size = x.size 37 | 38 | for (i <- 0 until size) sum += (x(i) - y(i)) * (x(i) - y(i)) 39 | 40 | Math.sqrt(sum) 41 | } 42 | 43 | /** 44 | * Computes the Manhattan distance between instance x and instance y. 45 | * The type of the distance used is determined by the value of distanceType. 
46 | * 47 | * @param x instance x 48 | * @param y instance y 49 | * @return Manhattan distance 50 | */ 51 | private def manhattan(x: Vector, y: Vector): Double = { 52 | var sum = 0.0 53 | val size = x.size 54 | 55 | for (i <- 0 until size) sum += Math.abs(x(i) - y(i)) 56 | 57 | sum 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/util/XORShiftRandom.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.util 2 | 3 | import java.nio.ByteBuffer 4 | import java.util.{Random => JavaRandom} 5 | 6 | import scala.util.hashing.MurmurHash3 7 | 8 | /** 9 | * This class implements a XORShift random number generator algorithm 10 | * Source: 11 | * Marsaglia, G. (2003). Xorshift RNGs. Journal of Statistical Software, Vol. 8, Issue 14. 12 | * @see Paper 13 | * This implementation is approximately 3.5 times faster than 14 | * { @link java.util.Random java.util.Random}, partly because of the algorithm, but also due 15 | * to renouncing thread safety. JDK's implementation uses an AtomicLong seed, this class 16 | * uses a regular Long. We can forgo thread safety since we use a new instance of the RNG 17 | * for each thread. 18 | */ 19 | class XORShiftRandom(init: Long) extends JavaRandom(init) { 20 | 21 | def this() = this(System.nanoTime) 22 | 23 | private var seed = XORShiftRandom.hashSeed(init) 24 | 25 | // we need to just override next - this will be called by nextInt, nextDouble, 26 | // nextGaussian, nextLong, etc. 27 | override protected def next(bits: Int): Int = { 28 | var nextSeed = seed ^ (seed << 21) 29 | nextSeed ^= (nextSeed >>> 35) 30 | nextSeed ^= (nextSeed << 4) 31 | seed = nextSeed 32 | (nextSeed & ((1L << bits) - 1)).asInstanceOf[Int] 33 | } 34 | 35 | override def setSeed(s: Long) { 36 | seed = XORShiftRandom.hashSeed(s) 37 | } 38 | } 39 | 40 | /** Contains benchmark method and main method to run benchmark of the RNG */ 41 | object XORShiftRandom { 42 | 43 | /** Hash seeds to have 0/1 bits throughout. */ 44 | private def hashSeed(seed: Long): Long = { 45 | val bytes = ByteBuffer.allocate(java.lang.Long.SIZE).putLong(seed).array() 46 | MurmurHash3.bytesHash(bytes) 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/GARCHSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.random.MersenneTwister 4 | import org.apache.spark.SparkFunSuite 5 | import org.apache.spark.ml.linalg.DenseVector 6 | import org.apache.spark.ml.util.DefaultReadWriteTest 7 | import org.apache.spark.mllib.util.MLlibTestSparkContext 8 | 9 | /** 10 | * Created by endy on 16-12-22. 
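  * Checks that, for data sampled from a GARCH(1, 1) model, the log likelihood is highest at the
  * generating parameters and that the gradient at perturbed parameters points back toward them.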
11 | */ 12 | class GARCHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest{ 13 | 14 | test("GARCH log likelihood") { 15 | val model = new GARCHModel(.2, .3, .4) 16 | val rand = new MersenneTwister(5L) 17 | val n = 10000 18 | 19 | val ts = new DenseVector(model.sample(n, rand)) 20 | val logLikelihoodWithRightModel = model.logLikelihood(ts) 21 | 22 | val logLikelihoodWithWrongModel1 = new GARCHModel(.3, .4, .5).logLikelihood(ts) 23 | val logLikelihoodWithWrongModel2 = new GARCHModel(.25, .35, .45).logLikelihood(ts) 24 | val logLikelihoodWithWrongModel3 = new GARCHModel(.1, .2, .3).logLikelihood(ts) 25 | 26 | assert(logLikelihoodWithRightModel > logLikelihoodWithWrongModel1) 27 | assert(logLikelihoodWithRightModel > logLikelihoodWithWrongModel2) 28 | assert(logLikelihoodWithRightModel > logLikelihoodWithWrongModel3) 29 | assert(logLikelihoodWithWrongModel2 > logLikelihoodWithWrongModel1) 30 | } 31 | 32 | test("gradient") { 33 | val alpha = 0.3 34 | val beta = 0.4 35 | val omega = 0.2 36 | val genModel = new GARCHModel(omega, alpha, beta) 37 | val rand = new MersenneTwister(5L) 38 | val n = 10000 39 | 40 | val ts = new DenseVector(genModel.sample(n, rand)) 41 | 42 | val gradient1 = new GARCHModel(omega + .1, alpha + .05, beta + .1).gradient(ts) 43 | assert(gradient1.forall(_ < 0.0)) 44 | val gradient2 = new GARCHModel(omega - .1, alpha - .05, beta - .1).gradient(ts) 45 | assert(gradient2.forall(_ > 0.0)) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/TSNEHelper.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne 2 | 3 | import breeze.linalg._ 4 | import breeze.stats._ 5 | import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix 6 | import org.apache.spark.rdd.RDD 7 | 8 | object TSNEHelper { 9 | // p_ij = (p_{i|j} + p_{j|i}) / 2n 10 | def computeP(p_ji: CoordinateMatrix, n: Int): RDD[(Int, Iterable[(Int, Double)])] = { 11 | p_ji.entries 12 | .flatMap(e => Seq( 13 | ((e.i.toInt, e.j.toInt), e.value), 14 | ((e.j.toInt, e.i.toInt), e.value) 15 | )) 16 | .reduceByKey(_ + _) // p + p' 17 | .map{case ((i, j), v) => (i, (j, math.max(v / 2 / n, 1e-12))) } // p / 2n 18 | .groupByKey() 19 | } 20 | 21 | /** 22 | * Update Y via gradient dY 23 | * @param Y current Y 24 | * @param dY gradient dY 25 | * @param iY stored y_i - y_{i-1} 26 | * @param gains adaptive learning rates 27 | * @param iteration n 28 | * @param param [[TSNEParam]] 29 | * @return 30 | */ 31 | def update(Y: DenseMatrix[Double], 32 | dY: DenseMatrix[Double], 33 | iY: DenseMatrix[Double], 34 | gains: DenseMatrix[Double], 35 | iteration: Int, 36 | param: TSNEParam): DenseMatrix[Double] = { 37 | import param._ 38 | val momentum = if (iteration <= t_momentum) initial_momentum else final_momentum 39 | gains.foreachPair { 40 | case ((i, j), old_gain) => 41 | val new_gain = math.max(min_gain, 42 | if ((dY.unsafeValueAt(i, j) > 0.0) != (iY.unsafeValueAt(i, j) > 0.0)) 43 | old_gain + 0.2 44 | else 45 | old_gain * 0.8 46 | ) 47 | gains.unsafeUpdate(i, j, new_gain) 48 | 49 | val new_iY = momentum * iY.unsafeValueAt(i, j) - eta * new_gain * dY.unsafeValueAt(i, j) 50 | iY.unsafeUpdate(i, j, new_iY) 51 | 52 | Y.unsafeUpdate(i, j, Y.unsafeValueAt(i, j) + new_iY) // Y += iY 53 | } 54 | Y := Y(*, ::) - (mean(Y(::, *)): DenseMatrix[Double]).toDenseVector 55 | } 56 | } -------------------------------------------------------------------------------- 
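For reference, the element-wise rule applied by TSNEHelper.update above is the usual t-SNE
momentum update with adaptive gains. The scalar sketch below is illustrative only; the object
and parameter names are not part of this repository:

object TSNEUpdateSketch {
  /** One coordinate of the momentum-with-adaptive-gains step performed in TSNEHelper.update. */
  def step(y: Double, dy: Double, iy: Double, gain: Double,
           momentum: Double, eta: Double, minGain: Double): (Double, Double, Double) = {
    // Grow the gain when the gradient and the current velocity disagree in sign,
    // otherwise decay it, never letting it fall below minGain (same 0.2 / 0.8 constants as above).
    val newGain = math.max(minGain, if ((dy > 0.0) != (iy > 0.0)) gain + 0.2 else gain * 0.8)
    val newIY = momentum * iy - eta * newGain * dy // velocity update
    (y + newIY, newIY, newGain)                    // new position, velocity and gain
  }
}

The full method then re-centres Y by subtracting the per-dimension mean, as in its last line.
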
/src/main/scala/org/apache/spark/ml/dbscan/DBSCANGraph.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import scala.annotation.tailrec 4 | 5 | /** 6 | * Top level method for creating a DBSCANGraph 7 | */ 8 | object DBSCANGraph { 9 | 10 | /** 11 | * Create an empty graph 12 | */ 13 | def apply[T](): DBSCANGraph[T] = new DBSCANGraph(Map[T, Set[T]]()) 14 | 15 | } 16 | 17 | /** 18 | * An immutable unweighted graph with vertexes and edges 19 | */ 20 | class DBSCANGraph[T] private (nodes: Map[T, Set[T]]) extends Serializable { 21 | 22 | /** 23 | * Add the given vertex `v` to the graph 24 | * 25 | */ 26 | def addVertex(v: T): DBSCANGraph[T] = { 27 | nodes.get(v) match { 28 | case None => new DBSCANGraph(nodes + (v -> Set())) 29 | case Some(_) => this 30 | } 31 | } 32 | 33 | /** 34 | * Insert an edge from `from` to `to` 35 | */ 36 | def insertEdge(from: T, to: T): DBSCANGraph[T] = { 37 | nodes.get(from) match { 38 | case None => new DBSCANGraph(nodes + (from -> Set(to))) 39 | case Some(edge) => new DBSCANGraph(nodes + (from -> (edge + to))) 40 | } 41 | } 42 | 43 | /** 44 | * Insert a vertex from `one` to `another`, and from `another` to `one` 45 | * 46 | */ 47 | def connect(one: T, another: T): DBSCANGraph[T] = { 48 | insertEdge(one, another).insertEdge(another, one) 49 | } 50 | 51 | /** 52 | * Find all vertexes that are reachable from `from` 53 | */ 54 | def getConnected(from: T): Set[T] = { 55 | getAdjacent(Set(from), Set[T](), Set[T]()) - from 56 | } 57 | 58 | @tailrec 59 | private def getAdjacent(tovisit: Set[T], visited: Set[T], adjacent: Set[T]): Set[T] = { 60 | 61 | tovisit.headOption match { 62 | case Some(current) => 63 | nodes.get(current) match { 64 | case Some(edges) => 65 | getAdjacent(edges.diff(visited) ++ tovisit.tail, visited + current, adjacent ++ edges) 66 | case None => getAdjacent(tovisit.tail, visited, adjacent) 67 | } 68 | case None => adjacent 69 | } 70 | 71 | } 72 | 73 | } 74 | 75 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/util/DBHPartitioner.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.util 2 | 3 | import scala.reflect.ClassTag 4 | 5 | import org.apache.spark.HashPartitioner 6 | import org.apache.spark.graphx._ 7 | import org.apache.spark.graphx.impl.GraphImpl 8 | import org.apache.spark.storage.StorageLevel 9 | 10 | /** 11 | * Degree-Based Hashing, the paper: 12 | * Distributed Power-law Graph Computing: Theoretical and Empirical Analysis 13 | */ 14 | class DBHPartitioner(val partitions: Int, val threshold: Int = 0) 15 | extends HashPartitioner(partitions) { 16 | /** 17 | * Default DBH doesn't consider the situation where both the degree of src and 18 | * dst vertices are both small than a given threshold value 19 | */ 20 | def getKey(et: EdgeTriplet[Int, _]): Long = { 21 | val srcId = et.srcId 22 | val dstId = et.dstId 23 | val srcDeg = et.srcAttr 24 | val dstDeg = et.dstAttr 25 | val maxDeg = math.max(srcDeg, dstDeg) 26 | val minDegId = if (maxDeg == srcDeg) dstId else srcId 27 | val maxDegId = if (maxDeg == srcDeg) srcId else dstId 28 | if (maxDeg < threshold) { 29 | maxDegId 30 | } else { 31 | minDegId 32 | } 33 | } 34 | 35 | override def equals(other: Any): Boolean = other match { 36 | case dbh: DBHPartitioner => 37 | dbh.numPartitions == numPartitions 38 | case _ => 39 | false 40 | } 41 | } 42 | 43 | object DBHPartitioner { 44 | 
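  /**
   * Repartitions the edges of `input` with DBH: each edge is keyed by the id of its
   * lower-degree endpoint (or by the higher-degree endpoint when both degrees are below
   * the threshold) and hash-partitioned on that key, which balances load on power-law graphs.
   */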
def partitionByDBH[VD: ClassTag, ED: ClassTag](input: Graph[VD, ED], 45 | storageLevel: StorageLevel): Graph[VD, ED] = { 46 | val edges = input.edges 47 | val conf = edges.context.getConf 48 | val numPartitions = conf.getInt("", edges.partitions.length) 49 | val dbh = new DBHPartitioner(numPartitions, 0) 50 | val degGraph = GraphImpl(input.degrees, edges) 51 | val newEdges = degGraph.triplets.mapPartitions(_.map(et => 52 | (dbh.getKey(et), Edge(et.srcId, et.dstId, et.attr)) 53 | )).partitionBy(dbh).map(_._2) 54 | GraphImpl(input.vertices, newEdges, null.asInstanceOf[VD], storageLevel, storageLevel) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/EWMASuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.spark.SparkFunSuite 4 | import org.apache.spark.ml.util.DefaultReadWriteTest 5 | import org.apache.spark.mllib.util.MLlibTestSparkContext 6 | import org.apache.spark.mllib.util.TestingUtils._ 7 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 8 | import org.apache.spark.sql.{Dataset, Row} 9 | 10 | class EWMASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest{ 11 | @transient var dataSet: Dataset[_] = _ 12 | @transient var dataSet1: Dataset[_] = _ 13 | 14 | override def beforeAll(): Unit = { 15 | super.beforeAll() 16 | 17 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 18 | DoubleType))) 19 | 20 | val smoothed = Array( 21 | Array("201512", 7.0), Array("201601", 8.0), Array("201602", 9.0), 22 | Array("201509", 4.0), Array("201510", 5.0), Array("201511", 6.0), 23 | Array("201506", 1.0), Array("201507", 2.0), Array("201508", 3.0), 24 | Array("201603", 10.0)) 25 | 26 | val orig1 = sc.parallelize(smoothed.map(x => Row(x: _*))) 27 | dataSet = spark.createDataFrame(orig1, schema) 28 | 29 | val oil = Array( 30 | Array("201506", 446.7), Array("201507", 454.5), Array("201508", 455.7), 31 | Array("201512", 425.3), Array("201601", 485.1), Array("201602", 506.0), 32 | Array("201509", 423.6), Array("201510", 456.3), Array("201511", 440.6), 33 | Array("201603", 526.8), Array("201604", 514.3), Array("201605", 494.2)) 34 | 35 | val orig2 = sc.parallelize(oil.map(x => Row(x: _*))) 36 | dataSet1 = spark.createDataFrame(orig2, schema) 37 | } 38 | 39 | 40 | test("add time dependent effects") { 41 | 42 | val m1 = new EWMAModel(0.2).setTimeCol("time").setTimeSeriesCol("timeseries") 43 | val res = m1.transform(dataSet).collect().map{case Row(x: Double) => x} 44 | 45 | assert(res(0) == 1.0) 46 | assert(res(1) ~== 1.2 absTol 10E-5) 47 | } 48 | 49 | test("fitting EWMA model") { 50 | val model = new EWMA() 51 | .setTimeCol("time") 52 | .setTimeSeriesCol("timeseries") 53 | .setMaxIter(10000) 54 | .setMaxEval(10000) 55 | .setInitPoint(.94) 56 | .fit(dataSet1) 57 | 58 | assert(model.smoothing ~== 0.89 absTol 0.01) // approximately 0.89 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/AutoregressionSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.random.{MersenneTwister, RandomGenerator} 4 | import org.apache.spark.SparkFunSuite 5 | import 
org.apache.spark.ml.linalg.DenseVector 6 | import org.apache.spark.ml.util.DefaultReadWriteTest 7 | import org.apache.spark.mllib.util.MLlibTestSparkContext 8 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 9 | import org.apache.spark.sql.{Dataset, Row} 10 | 11 | /** 12 | * Created by endy on 16-12-19. 13 | */ 14 | class AutoregressionSuite extends SparkFunSuite with MLlibTestSparkContext 15 | with DefaultReadWriteTest { 16 | 17 | @transient var dataSet: Dataset[_] = _ 18 | 19 | override def beforeAll(): Unit = { 20 | super.beforeAll() 21 | } 22 | 23 | test("fit AR(1) model") { 24 | val ts = sample(5000, new MersenneTwister(10L), 1.5, Array(.2)) 25 | 26 | val fittedModel = new Autoregression() 27 | .setTimeCol("time") 28 | .setTimeSeriesCol("timeseries") 29 | .setMaxLag(1) 30 | .setNoIntercept(false) 31 | .fit(ts) 32 | 33 | assert(fittedModel.coefficients.length == 1) 34 | assert(math.abs(fittedModel.c - 1.5) < .07) 35 | assert(math.abs(fittedModel.coefficients(0) - .2) < .03) 36 | } 37 | 38 | test("fit AR(2) model") { 39 | 40 | val ts = sample(5000, new MersenneTwister(10L), 1.5, Array(.2, .3)) 41 | val fittedModel = new Autoregression() 42 | .setTimeCol("time") 43 | .setTimeSeriesCol("timeseries") 44 | .setMaxLag(2) 45 | .setNoIntercept(false) 46 | .fit(ts) 47 | 48 | assert(fittedModel.coefficients.length == 2) 49 | assert(math.abs(fittedModel.c - 1.5) < .15) 50 | assert(math.abs(fittedModel.coefficients(0) - .2) < .03) 51 | assert(math.abs(fittedModel.coefficients(1) - .3) < .03) 52 | } 53 | 54 | def sample(n: Int, rand: RandomGenerator, c: Double, coefficients: Array[Double]): Dataset[_] = { 55 | val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian())) 56 | val res = new ARModel(c, coefficients).addTimeDependentEffects(vec).toArray 57 | .zipWithIndex 58 | 59 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 60 | DoubleType))) 61 | 62 | val rdd = sc.parallelize(res.map(x => Row(x._2.formatted("%05d"), x._1))) 63 | 64 | spark.createDataFrame(rdd, schema) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/impl/SimpleTSNE.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne.impl 2 | 3 | import breeze.linalg._ 4 | import breeze.stats.distributions.Rand 5 | import org.apache.spark.ml.tsne.{TSNEGradient, TSNEHelper, TSNEParam, X2P} 6 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 7 | import org.apache.spark.storage.StorageLevel 8 | import org.slf4j.LoggerFactory 9 | 10 | import scala.util.Random 11 | 12 | object SimpleTSNE { 13 | private def logger = LoggerFactory.getLogger(SimpleTSNE.getClass) 14 | 15 | def tsne( 16 | input: RowMatrix, 17 | noDims: Int = 2, 18 | maxIterations: Int = 1000, 19 | perplexity: Double = 30, 20 | callback: (Int, DenseMatrix[Double], Option[Double]) => Unit = {case _ => }, 21 | seed: Long = Random.nextLong()): DenseMatrix[Double] = { 22 | if(input.rows.getStorageLevel == StorageLevel.NONE) { 23 | logger.warn("Input is not persisted and performance could be bad") 24 | } 25 | 26 | Rand.generator.setSeed(seed) 27 | 28 | val tsneParam = TSNEParam() 29 | import tsneParam._ 30 | 31 | val n = input.numRows().toInt 32 | val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian(0, 1)) 33 | val iY = DenseMatrix.zeros[Double](n, noDims) 34 | val gains = DenseMatrix.ones[Double](n, noDims) 35 | 36 | // 
approximate p_{j|i} 37 | val p_ji = X2P(input, 1e-5, perplexity) 38 | val P = TSNEHelper.computeP(p_ji, n).glom().cache() 39 | 40 | var iteration = 1 41 | while(iteration <= maxIterations) { 42 | val bcY = P.context.broadcast(Y) 43 | 44 | val numerator = P.map{ arr => TSNEGradient.computeNumerator(bcY.value, arr.map(_._1): _*) }.cache() 45 | val bcNumerator = P.context.broadcast({ 46 | numerator.treeAggregate(0.0)(seqOp = (x, v) => x + sum(v), combOp = _ + _) 47 | }) 48 | 49 | val (dY, loss) = P.zip(numerator).treeAggregate((DenseMatrix.zeros[Double](n, noDims), 0.0))( 50 | seqOp = (c, v) => { 51 | // c: (grad, loss), v: (Array[(i, Iterable(j, Distance))], numerator) 52 | val l = TSNEGradient.compute(v._1, bcY.value, v._2, bcNumerator.value, c._1, iteration <= early_exaggeration) 53 | (c._1, c._2 + l) 54 | }, 55 | combOp = (c1, c2) => { 56 | // c: (grad, loss) 57 | (c1._1 + c2._1, c1._2 + c2._2) 58 | }) 59 | 60 | bcY.destroy() 61 | bcNumerator.destroy() 62 | numerator.unpersist() 63 | 64 | TSNEHelper.update(Y, dY, iY, gains, iteration, tsneParam) 65 | 66 | logger.debug(s"Iteration $iteration finished with $loss") 67 | callback(iteration, Y.copy, Some(loss)) 68 | iteration += 1 69 | } 70 | Y 71 | } 72 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/tree/SPTree.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne.tree 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | 6 | import scala.annotation.tailrec 7 | 8 | 9 | class SPTree private[tree](val dimension: Int, 10 | val corner: DenseVector[Double], 11 | val width: DenseVector[Double]) extends Serializable { 12 | private[this] val childWidth: DenseVector[Double] = width :/ 2.0 13 | lazy val radiusSq: Double = sum(pow(width, 2)) 14 | private[tree] val totalMass: DenseVector[Double] = DenseVector.zeros(dimension) 15 | private var count: Int = 0 16 | private var leaf: Boolean = true 17 | val center: DenseVector[Double] = DenseVector.zeros(dimension) 18 | 19 | lazy val children: Array[SPTree] = { 20 | (0 until pow(2, dimension)).toArray.map { 21 | i => 22 | val bits = DenseVector(s"%0${dimension}d".format(i.toBinaryString.toInt).toArray.map(_.toDouble - '0'.toDouble)) 23 | val childCorner: DenseVector[Double] = corner + (bits :* childWidth) 24 | new SPTree(dimension, childCorner, childWidth) 25 | } 26 | } 27 | 28 | final def insert(vector: DenseVector[Double], finalize: Boolean = false): SPTree = { 29 | totalMass += vector 30 | count += 1 31 | 32 | if(leaf) { 33 | if(count == 1) { // first to leaf 34 | center := vector 35 | } else if(!vector.equals(center)) { 36 | (1 until count).foreach(_ => getCell(center).insert(center, finalize)) //subdivide 37 | leaf = false 38 | } 39 | } 40 | 41 | if(finalize) computeCenter(false) 42 | 43 | if(leaf) this else getCell(vector).insert(vector, finalize) 44 | } 45 | 46 | def computeCenter(recursive: Boolean = true): Unit = { 47 | if(count > 0) { 48 | center := totalMass / count.toDouble 49 | if(recursive) children.foreach(_.computeCenter()) 50 | } 51 | } 52 | 53 | def getCell(vector: DenseVector[Double]): SPTree = { 54 | val idx = ((vector - corner) :/ childWidth).data 55 | children(idx.foldLeft(0)((acc, i) => acc * 2 + min(max(i.ceil.toInt - 1, 0), 1))) 56 | } 57 | 58 | def getCount: Int = count 59 | 60 | def isLeaf: Boolean = leaf 61 | } 62 | 63 | object SPTree { 64 | def apply(Y: DenseMatrix[Double]): SPTree = { 65 | val d = Y.cols 66 | val minMaxs 
= minMax(Y(::, *)).toDenseVector 67 | val mins = minMaxs.mapValues(_._1) 68 | val maxs = minMaxs.mapValues(_._2) 69 | 70 | val tree = new SPTree(Y.cols, mins, maxs - mins) 71 | 72 | // insert points but wait till end to compute all centers 73 | //Y(*, ::).foreach(tree.insert(_, finalize = false)) 74 | (0 until Y.rows).foreach(i => tree.insert(Y(i, ::).t, finalize = false)) 75 | // compute all center of mass 76 | tree.computeCenter() 77 | 78 | tree 79 | } 80 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/X2P.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.mllib.X2PHelper._ 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry, RowMatrix} 7 | import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ 8 | import org.slf4j.LoggerFactory 9 | 10 | object X2P { 11 | 12 | private def logger = LoggerFactory.getLogger(X2P.getClass) 13 | 14 | def apply(x: RowMatrix, tol: Double = 1e-5, perplexity: Double = 30.0): CoordinateMatrix = { 15 | require(tol >= 0, "Tolerance must be non-negative") 16 | require(perplexity > 0, "Perplexity must be positive") 17 | 18 | val mu = (3 * perplexity).toInt //TODO: Expose this as parameter 19 | val logU = Math.log(perplexity) 20 | val norms = x.rows.map(Vectors.norm(_, 2.0)) 21 | norms.persist() 22 | val rowsWithNorm = x.rows.zip(norms).map{ case (v, norm) => VectorWithNorm(v, norm) } 23 | val neighbors = rowsWithNorm.zipWithIndex() 24 | .cartesian(rowsWithNorm.zipWithIndex()) 25 | .flatMap { 26 | case ((u, i), (v, j)) => 27 | if(i < j) { 28 | val dist = fastSquaredDistance(u, v) 29 | Seq((i, (j, dist)), (j, (i, dist))) 30 | } else Seq.empty 31 | } 32 | .topByKey(mu)(Ordering.by(e => -e._2)) 33 | 34 | val p_betas = 35 | neighbors.map { 36 | case (i, arr) => 37 | var betamin = Double.NegativeInfinity 38 | var betamax = Double.PositiveInfinity 39 | var beta = 1.0 40 | 41 | val d = DenseVector(arr.map(_._2)) 42 | var (h, p) = Hbeta(d, beta) 43 | 44 | //logInfo("data was " + d.toArray.toList) 45 | //logInfo("array P was " + p.toList) 46 | 47 | // Evaluate whether the perplexity is within tolerance 48 | def Hdiff = h - logU 49 | var tries = 0 50 | while (Math.abs(Hdiff) > tol && tries < 50) { 51 | //If not, increase or decrease precision 52 | if (Hdiff > 0) { 53 | betamin = beta 54 | beta = if (betamax.isInfinite) beta * 2 else (beta + betamax) / 2 55 | } else { 56 | betamax = beta 57 | beta = if (betamin.isInfinite) beta / 2 else (beta + betamin) / 2 58 | } 59 | 60 | // Recompute the values 61 | val HP = Hbeta(d, beta) 62 | h = HP._1 63 | p = HP._2 64 | tries = tries + 1 65 | } 66 | 67 | //logInfo("array P is " + p.toList) 68 | 69 | (arr.map(_._1).zip(p.toArray).map { case (j, v) => MatrixEntry(i, j, v) }, beta) 70 | } 71 | 72 | logger.info("Mean value of sigma: " + p_betas.map(x => math.sqrt(1 / x._2)).mean) 73 | new CoordinateMatrix(p_betas.flatMap(_._1)) 74 | } 75 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/LocalDBSCANArchery.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import scala.collection.mutable.Queue 4 | import org.apache.spark.internal.Logging 5 | import archery.Box 6 | import archery.Entry 7 | 
import archery.Point 8 | import archery.RTree 9 | import org.apache.spark.ml.dbscan.DBSCANLabeledPoint.Flag 10 | 11 | /** 12 | * An implementation of DBSCAN using an R-Tree to improve its running time 13 | */ 14 | class LocalDBSCANArchery(eps: Double, minPoints: Int) extends Logging { 15 | 16 | val minDistanceSquared = eps * eps 17 | 18 | def fit(points: Iterable[DBSCANPoint]): Iterable[DBSCANLabeledPoint] = { 19 | 20 | val tree = points.foldLeft(RTree[DBSCANLabeledPoint]())( 21 | (tempTree, p) => 22 | tempTree.insert( 23 | Entry(Point(p.x.toFloat, p.y.toFloat), new DBSCANLabeledPoint(p)))) 24 | 25 | var cluster = DBSCANLabeledPoint.Unknown 26 | 27 | tree.entries.foreach(entry => { 28 | 29 | val point = entry.value 30 | 31 | if (!point.visited) { 32 | point.visited = true 33 | 34 | val neighbors = tree.search(toBoundingBox(point), inRange(point)) 35 | 36 | if (neighbors.size < minPoints) { 37 | point.flag = Flag.Noise 38 | } else { 39 | cluster += 1 40 | expandCluster(point, neighbors, tree, cluster) 41 | } 42 | 43 | } 44 | 45 | }) 46 | 47 | logDebug(s"total: $cluster") 48 | 49 | tree.entries.map(_.value).toIterable 50 | 51 | } 52 | 53 | private def expandCluster( 54 | point: DBSCANLabeledPoint, 55 | neighbors: Seq[Entry[DBSCANLabeledPoint]], 56 | tree: RTree[DBSCANLabeledPoint], 57 | cluster: Int): Unit = { 58 | 59 | point.flag = Flag.Core 60 | point.cluster = cluster 61 | 62 | val left = Queue(neighbors) 63 | 64 | while (left.nonEmpty) { 65 | 66 | left.dequeue().foreach(neighborEntry => { 67 | 68 | val neighbor = neighborEntry.value 69 | 70 | if (!neighbor.visited) { 71 | 72 | neighbor.visited = true 73 | neighbor.cluster = cluster 74 | 75 | val neighborNeighbors = tree.search(toBoundingBox(neighbor), inRange(neighbor)) 76 | 77 | if (neighborNeighbors.size >= minPoints) { 78 | neighbor.flag = Flag.Core 79 | left.enqueue(neighborNeighbors) 80 | } else { 81 | neighbor.flag = Flag.Border 82 | } 83 | } 84 | 85 | if (neighbor.cluster == DBSCANLabeledPoint.Unknown) { 86 | neighbor.cluster = cluster 87 | neighbor.flag = Flag.Border 88 | } 89 | 90 | }) 91 | 92 | } 93 | 94 | } 95 | 96 | private def inRange(point: DBSCANPoint)(entry: Entry[DBSCANLabeledPoint]): Boolean = { 97 | entry.value.distanceSquared(point) <= minDistanceSquared 98 | } 99 | 100 | private def toBoundingBox(point: DBSCANPoint): Box = { 101 | Box( 102 | (point.x - eps).toFloat, 103 | (point.y - eps).toFloat, 104 | (point.x + eps).toFloat, 105 | (point.y + eps).toFloat) 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/LocalDBSCANNaive.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | 4 | import scala.collection.mutable.Queue 5 | import org.apache.spark.internal.Logging 6 | import org.apache.spark.ml.dbscan.DBSCANLabeledPoint.Flag 7 | import org.apache.spark.ml.linalg.Vectors 8 | 9 | /** 10 | * A naive implementation of DBSCAN. It has O(n2) complexity 11 | * but uses no extra memory. This implementation is not used 12 | * by the parallel version of DBSCAN. 
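  * A minimal usage sketch (eps and minPoints values are illustrative):
  * {{{
  *   val labeled = new LocalDBSCANNaive(eps = 0.3, minPoints = 10)
  *     .fit(vectors.map(DBSCANPoint(_))) // vectors: Iterable[org.apache.spark.ml.linalg.Vector]
  *   labeled.foreach(p => println(s"${p.vector} -> cluster ${p.cluster} (${p.flag})"))
  * }}}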
13 | * 14 | */ 15 | class LocalDBSCANNaive(eps: Double, minPoints: Int) extends Logging { 16 | 17 | val minDistanceSquared = eps * eps 18 | 19 | def samplePoint: Array[DBSCANLabeledPoint] = 20 | Array(new DBSCANLabeledPoint(Vectors.dense(Array(0D, 0D)))) 21 | 22 | def fit(points: Iterable[DBSCANPoint]): Iterable[DBSCANLabeledPoint] = { 23 | 24 | logInfo(s"About to start fitting") 25 | 26 | val labeledPoints = points.map { new DBSCANLabeledPoint(_) }.toArray 27 | 28 | val totalClusters = 29 | labeledPoints 30 | .foldLeft(DBSCANLabeledPoint.Unknown)( 31 | (cluster, point) => { 32 | if (!point.visited) { 33 | point.visited = true 34 | 35 | val neighbors = findNeighbors(point, labeledPoints) 36 | 37 | if (neighbors.size < minPoints) { 38 | point.flag = Flag.Noise 39 | cluster 40 | } else { 41 | expandCluster(point, neighbors, labeledPoints, cluster + 1) 42 | cluster + 1 43 | } 44 | } else { 45 | cluster 46 | } 47 | }) 48 | 49 | logInfo(s"found: $totalClusters clusters") 50 | 51 | labeledPoints 52 | 53 | } 54 | 55 | private def findNeighbors( 56 | point: DBSCANPoint, 57 | all: Array[DBSCANLabeledPoint]): Iterable[DBSCANLabeledPoint] = 58 | all.view.filter(other => { 59 | point.distanceSquared(other) <= minDistanceSquared 60 | }) 61 | 62 | def expandCluster( 63 | point: DBSCANLabeledPoint, 64 | neighbors: Iterable[DBSCANLabeledPoint], 65 | all: Array[DBSCANLabeledPoint], 66 | cluster: Int): Unit = { 67 | 68 | point.flag = Flag.Core 69 | point.cluster = cluster 70 | 71 | var allNeighbors = Queue(neighbors) 72 | 73 | while (allNeighbors.nonEmpty) { 74 | allNeighbors.dequeue().foreach(neighbor => { 75 | if (!neighbor.visited) { 76 | 77 | neighbor.visited = true 78 | neighbor.cluster = cluster 79 | 80 | val neighborNeighbors = findNeighbors(neighbor, all) 81 | 82 | if (neighborNeighbors.size >= minPoints) { 83 | neighbor.flag = Flag.Core 84 | allNeighbors.enqueue(neighborNeighbors) 85 | } else { 86 | neighbor.flag = Flag.Border 87 | } 88 | 89 | if (neighbor.cluster == DBSCANLabeledPoint.Unknown) { 90 | neighbor.cluster = cluster 91 | neighbor.flag = Flag.Border 92 | } 93 | } 94 | 95 | }) 96 | 97 | } 98 | 99 | } 100 | 101 | } 102 | 103 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/sampling/UnderSampling.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.sampling 2 | 3 | import org.apache.spark.ml.Transformer 4 | import org.apache.spark.ml.param._ 5 | import org.apache.spark.ml.util.Identifiable 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.types.StructType 8 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 9 | 10 | /** 11 | * Created by endy on 16-12-8. 
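  * Down-samples classes that are over-represented relative to the primary class: a class whose
  * row count exceeds threshold * primaryClassCount is sampled down to roughly that size, while
  * smaller classes pass through unchanged. A usage sketch (column name and values illustrative):
  * {{{
  *   val balanced = new UnderSampling()
  *     .setDependentColName("label")
  *     .setPrimaryClass(1.0)
  *     .setThreshold(2.0)
  *     .setWithReplacement(false)
  *     .transform(df)
  * }}}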
12 | */ 13 | 14 | trait UnderSamplingParams extends Params{ 15 | final val threshold = new DoubleParam(this, "threshold", "The threshold whether to " + 16 | "undersampling sample of a class", (x: Double) => x > 1) 17 | def setThreshold(value: Double): this.type = set(threshold, value) 18 | 19 | final val dependentColName = new Param[String](this, "dependentColName", "The column that " + 20 | "provide label values") 21 | def setDependentColName(value: String): this.type = set(dependentColName, value) 22 | 23 | final val withReplacement = new BooleanParam(this, "withReplacement", "") 24 | def setWithReplacement(value: Boolean): this.type = set(withReplacement, value) 25 | 26 | final val primaryClass = new DoubleParam(this, "primaryClass", "primary class that to under " + 27 | "sampling") 28 | def setPrimaryClass(value: Double): this.type = set(primaryClass, value) 29 | } 30 | 31 | 32 | class UnderSampling(override val uid: String) extends Transformer with UnderSamplingParams{ 33 | 34 | def this() = this(Identifiable.randomUID("UnderSampling")) 35 | /** 36 | * Transforms the input dataset. 37 | */ 38 | override def transform(dataset: Dataset[_]): DataFrame = { 39 | 40 | val labelCountPair = dataset.groupBy($(dependentColName)).count().collect() 41 | 42 | val primaryClassCount = labelCountPair 43 | .filter{ case Row(label: Double, count: Long) => label == ${primaryClass}} 44 | .map(x => x.get(1)).headOption.getOrElse(-1L).asInstanceOf[Long] 45 | 46 | if (primaryClassCount == -1) throw new Exception("The label is not exist") 47 | 48 | val res = labelCountPair.zipWithIndex.map { 49 | case (Row(label: Double, count: Long), index: Int) => 50 | val ratio = count / primaryClassCount.toDouble 51 | 52 | /** 53 | * if ratio < threshold, only return samples of this label, 54 | * otherwise we sample the data from the samples of this label. 55 | * 56 | * The desired number of samples is : num = primaryClassCount * threshold 57 | * so the fraction of sample method is: num / count = threshold / ratio 58 | */ 59 | val df = if (ratio < ${threshold}) dataset.filter(col($(dependentColName)) === label) 60 | else dataset.filter(col($(dependentColName)) === label) 61 | .sample(${withReplacement}, ${threshold} / ratio) 62 | 63 | df.toDF() 64 | }.reduce(_ union _) 65 | 66 | res 67 | } 68 | 69 | override def copy(extra: ParamMap): Transformer = defaultCopy(extra) 70 | 71 | /** 72 | * :: DeveloperApi :: 73 | * 74 | * Check transform validity and derive the output schema from the input schema. 75 | * 76 | * Typical implementation should first conduct verification on schema change and parameter 77 | * validity, including complex parameter interaction checks. 78 | */ 79 | override def transformSchema(schema: StructType): StructType = { 80 | schema 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/MatrixUtil.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries 2 | 3 | import breeze.linalg.{CSCMatrix, DenseMatrix, DenseVector, Matrix, SliceVector, SparseVector, Vector} 4 | import io.transwarp.hubble.error.HubbleErrors 5 | import org.apache.spark.ml.linalg.{DenseMatrix => SDM, Matrix => SM, SparseMatrix => SSM} 6 | import org.apache.spark.ml.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV} 7 | /** 8 | * Created by endy on 16-12-16. 
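  * Conversions between Spark ml.linalg matrices/vectors and their Breeze counterparts. Dense
  * conversions wrap the existing value arrays, so the returned objects share storage with the
  * originals; unsupported types raise an error.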
9 | */ 10 | object MatrixUtil { 11 | 12 | def matToRowArrs(mat: SM): Array[Array[Double]] = { 13 | val arrs = new Array[Array[Double]](mat.numRows) 14 | for (r <- 0 until mat.numRows) { 15 | arrs(r) = toBreeze(mat)(r to r, 0 until mat.numCols).toDenseMatrix.toArray 16 | } 17 | arrs 18 | } 19 | 20 | def toBreeze(sparkMatrix: SM): Matrix[Double] = { 21 | sparkMatrix match { 22 | case dm: SDM => 23 | if (!dm.isTransposed) { 24 | new DenseMatrix[Double](dm.numRows, dm.numCols, dm.values) 25 | } else { 26 | val breezeMatrix = new DenseMatrix[Double](dm.numCols, dm.numRows, dm.values) 27 | breezeMatrix.t 28 | } 29 | case sm: SSM => 30 | if (!sm.isTransposed) { 31 | new CSCMatrix[Double](sm.values, sm.numRows, sm.numCols, sm.colPtrs, sm.rowIndices) 32 | } else { 33 | val breezeMatrix = 34 | new CSCMatrix[Double](sm.values, sm.numCols, sm.numRows, sm.colPtrs, sm.rowIndices) 35 | breezeMatrix.t 36 | } 37 | case _ => 38 | throw HubbleErrors.typeNotSupported( 39 | s"Do not support conversion from type ${sparkMatrix.getClass.getName}.") 40 | } 41 | } 42 | 43 | def toBreeze(sparkVector: SV): Vector[Double] = { 44 | sparkVector match { 45 | case v: SDV => 46 | new DenseVector[Double](v.values) 47 | case v: SSV => 48 | new SparseVector[Double](v.indices, v.values, v.size) 49 | } 50 | } 51 | 52 | 53 | def fromBreeze(breeze: Matrix[Double]): SM = { 54 | breeze match { 55 | case dm: DenseMatrix[Double] => 56 | new SDM(dm.rows, dm.cols, dm.data, dm.isTranspose) 57 | case sm: CSCMatrix[Double] => 58 | // There is no isTranspose flag for sparse matrices in Breeze 59 | new SSM(sm.rows, sm.cols, sm.colPtrs, sm.rowIndices, sm.data) 60 | case _ => 61 | throw HubbleErrors.typeNotSupported( 62 | s"Do not support conversion from type ${breeze.getClass.getName}.") 63 | } 64 | } 65 | 66 | def fromBreeze(breezeVector: Vector[Double]): SV = { 67 | breezeVector match { 68 | case v: DenseVector[Double] => 69 | if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { 70 | new SDV(v.data) 71 | } else { 72 | new SDV(v.toArray) // Can't use underlying array directly, so make a new one 73 | } 74 | case v: SparseVector[Double] => 75 | if (v.index.length == v.used) { 76 | new SSV(v.length, v.index, v.data) 77 | } else { 78 | new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) 79 | } 80 | case v: SliceVector[_, Double] => 81 | new SDV(v.toArray) 82 | case v: Vector[_] => 83 | throw HubbleErrors.typeNotSupported("Unsupported Breeze vector type: " + v.getClass.getName) 84 | } 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/DBSCAN2.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import org.apache.spark.ml.{Estimator, Model} 4 | import org.apache.spark.ml.param.{DoubleParam, IntParam, ParamMap, Params} 5 | import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 8 | import org.apache.spark.sql.types.{IntegerType, StructType} 9 | import org.apache.spark.ml.linalg.{Vector, VectorUDT} 10 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils} 11 | import org.apache.spark.sql.functions.{col, udf} 12 | 13 | /** 14 | * Created by endy on 17-12-5. 
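  * Pipeline-style wrapper around DBSCAN.train. A usage sketch (parameter values are
  * illustrative; the features column must contain 2-D Vectors):
  * {{{
  *   val model = new DBSCAN2()
  *     .setEps(0.3)
  *     .setMinPoints(10)
  *     .setMaxPointsPerPartition(250)
  *     .fit(df)
  *   val clustered = model.transform(df) // cluster ids in the prediction column
  * }}}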
15 | */ 16 | 17 | trait DBSCANParams extends Params with HasFeaturesCol with HasPredictionCol{ 18 | final val eps = new DoubleParam(this, "eps", "the maximum distance between two points" + 19 | " for them to be considered as part of the same region") 20 | def getEps: Double = ${eps} 21 | 22 | final val minPoints = new IntParam(this, "minPoints", "the minimum number of" + 23 | " points required to form a dense region") 24 | def getMinPoints: Int = ${minPoints} 25 | 26 | final val maxPointsPerPartition = new IntParam(this, "maxPointsPerPartition", 27 | "the largest number of points in a single partition") 28 | 29 | def getMaxPointsPerPartition: Int = ${maxPointsPerPartition} 30 | 31 | protected def validateAndTransformSchema(schema: StructType): StructType = { 32 | SchemaUtils.checkColumnType(schema, ${featuresCol}, new VectorUDT) 33 | SchemaUtils.appendColumn(schema, ${predictionCol}, IntegerType) 34 | } 35 | } 36 | 37 | class DBSCAN2(override val uid: String) extends Estimator[DBSCAN2Model] with DBSCANParams{ 38 | 39 | setDefault(eps -> 0.3, minPoints -> 10, maxPointsPerPartition -> 250) 40 | 41 | def this() = this(Identifiable.randomUID("dbscan")) 42 | 43 | def setEps(value: Double): this.type = set(eps, value) 44 | 45 | def setMinPoints(value: Int): this.type = set(minPoints, value) 46 | 47 | def setMaxPointsPerPartition(value: Int): this.type = set(maxPointsPerPartition, value) 48 | 49 | override def fit(dataset: Dataset[_]): DBSCAN2Model = { 50 | val instances: RDD[Vector] = dataset.select(col(${featuresCol})).rdd.map { 51 | case Row(point: Vector) => point 52 | } 53 | 54 | val dbscan = DBSCAN.train(instances, ${eps}, ${minPoints}, ${maxPointsPerPartition}) 55 | 56 | new DBSCAN2Model(uid, dbscan) 57 | } 58 | 59 | override def copy(extra: ParamMap): Estimator[DBSCAN2Model] = defaultCopy(extra) 60 | 61 | override def transformSchema(schema: StructType): StructType = { 62 | validateAndTransformSchema(schema) 63 | } 64 | } 65 | 66 | class DBSCAN2Model(override val uid: String, val model: DBSCAN) extends 67 | Model[DBSCAN2Model] with DBSCANParams{ 68 | 69 | override def copy(extra: ParamMap): DBSCAN2Model = defaultCopy(extra) 70 | 71 | override def transform(dataset: Dataset[_]): DataFrame = { 72 | val clustered = model.labeledPoints 73 | .map(p => (p.vector(0), p.vector(1), p.vector, p.cluster)) 74 | 75 | dataset.sparkSession.createDataFrame(clustered) 76 | .toDF(dataset.schema.fieldNames(0), 77 | dataset.schema.fieldNames(1), 78 | ${featuresCol}, ${predictionCol}) 79 | } 80 | 81 | override def transformSchema(schema: StructType): StructType = { 82 | validateAndTransformSchema(schema) 83 | } 84 | } 85 | 86 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/sampling/OverSampling.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.sampling 2 | 3 | import org.apache.spark.ml.Transformer 4 | import org.apache.spark.ml.param._ 5 | import org.apache.spark.ml.util.Identifiable 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 8 | import org.apache.spark.sql.types.StructType 9 | 10 | /** 11 | * Created by endy on 16-12-8. 
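  * Over-samples classes that are under-represented relative to the primary class: a class with
  * fewer than primaryClassCount / threshold rows is sampled with replacement up to roughly that
  * size, while larger classes pass through unchanged. A usage sketch (values illustrative):
  * {{{
  *   val balanced = new OverSampling()
  *     .setDependentColName("label")
  *     .setPrimaryClass(0.0)
  *     .setThreshold(2.0)
  *     .transform(df)
  * }}}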
12 | */ 13 | 14 | trait OverSamplingParams extends Params{ 15 | final val threshold = new DoubleParam(this, "threshold", "The threshold whether to " + 16 | "undersampling sample of a class", (x: Double) => x > 1) 17 | def setThreshold(value: Double): this.type = set(threshold, value) 18 | 19 | final val dependentColName = new Param[String](this, "dependentColName", "The column that " + 20 | "provide label values") 21 | def setDependentColName(value: String): this.type = set(dependentColName, value) 22 | 23 | final val primaryClass = new DoubleParam(this, "primaryClass", "primary class that to under " + 24 | "sampling") 25 | def setPrimaryClass(value: Double): this.type = set(primaryClass, value) 26 | } 27 | 28 | 29 | class OverSampling(override val uid: String) extends Transformer with OverSamplingParams { 30 | def this() = this(Identifiable.randomUID("OverSampling")) 31 | 32 | /** 33 | * Transforms the input dataset. 34 | */ 35 | override def transform(dataset: Dataset[_]): DataFrame = { 36 | val labelCountPair = dataset.groupBy($(dependentColName)).count().collect() 37 | 38 | val primaryClassCount = labelCountPair 39 | .filter{ case Row(label: Double, count: Long) => label == ${primaryClass}} 40 | .map(x => x.get(1)).headOption.getOrElse(-1L).asInstanceOf[Long] 41 | 42 | if (primaryClassCount == -1) throw new Exception("The label is not exist") 43 | 44 | val res = labelCountPair.zipWithIndex 45 | .map { 46 | case (Row(label: Double, count: Long), index: Int) => 47 | val ratio = primaryClassCount / count.toDouble 48 | 49 | /** 50 | * if ratio < threshold, only return samples of this label, 51 | * otherwise we sample the data from the samples of this label. 52 | * 53 | * The desired number of samples is : num = primaryClassCount * threshold 54 | * so the fraction of sample method is: num / count = ratio / threshold. 55 | * Because fraction > 1, the value of 'withReplacement' parameter must be true 56 | */ 57 | val df = if (ratio < ${threshold}) { 58 | dataset.filter(col($(dependentColName)) === label) 59 | } else { 60 | val desiredFraction = ratio / ${threshold} 61 | dataset.filter(col($(dependentColName)) === label) 62 | .sample(withReplacement = true, desiredFraction) 63 | } 64 | df.toDF() 65 | }.reduce(_ union _) 66 | 67 | res 68 | } 69 | 70 | override def copy(extra: ParamMap): Transformer = defaultCopy(extra) 71 | 72 | /** 73 | * :: DeveloperApi :: 74 | * 75 | * Check transform validity and derive the output schema from the input schema. 76 | * 77 | * Typical implementation should first conduct verification on schema change and parameter 78 | * validity, including complex parameter interaction checks. 
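  * This transformer samples rows without altering columns, so the input schema is returned
  * unchanged.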
79 | */ 80 | override def transformSchema(schema: StructType): StructType = { 81 | schema 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/knn_is/KNN_ISSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.knn_is 2 | 3 | import org.apache.spark.SparkFunSuite 4 | import org.apache.spark.ml.feature.LabeledPoint 5 | import org.apache.spark.ml.linalg.Vectors 6 | import org.apache.spark.ml.util.DefaultReadWriteTest 7 | import org.apache.spark.mllib.evaluation.MulticlassMetrics 8 | import org.apache.spark.mllib.util.MLlibTestSparkContext 9 | import org.apache.spark.rdd.RDD 10 | import org.apache.spark.sql.{Dataset, Row} 11 | 12 | import scala.util.Random 13 | 14 | class KNN_ISSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { 15 | @transient var dataset: Dataset[_] = _ 16 | 17 | override def beforeAll(): Unit = { 18 | super.beforeAll() 19 | dataset = spark.createDataFrame(KNN_ISSuite.generateKnnInput(1.0, 1.0, 20 | nPoints = 1000, seed = 42)) 21 | } 22 | 23 | test("knn: default params") { 24 | val knn_is = new KNN_ISClassifier() 25 | assert(knn_is.getLabelCol === "label") 26 | assert(knn_is.getFeaturesCol === "features") 27 | assert(knn_is.getPredictionCol === "prediction") 28 | assert(knn_is.getK == 1) 29 | assert(knn_is.getDistanceType == 1) 30 | assert(knn_is.getNumSamplesTest == 1) 31 | assert(knn_is.getNumClass == 1) 32 | assert(knn_is.getNumIter == 1) 33 | assert(knn_is.getInc == 0) 34 | assert(knn_is.getSubdel == 0) 35 | assert(knn_is.getTopdel == 0) 36 | } 37 | 38 | test("train"){ 39 | val knn_is = new KNN_ISClassifier() 40 | knn_is.fit(dataset) 41 | } 42 | 43 | test("transform: one iterationNum"){ 44 | val knn_is = new KNN_ISClassifier() 45 | .setNumClass(2) 46 | .setNumSamplesTest(dataset.count().toInt) 47 | .setK(5) 48 | 49 | val model = knn_is.fit(dataset) 50 | 51 | val results = model.transform(dataset) 52 | assert(results.count() == dataset.count()) 53 | 54 | val source = dataset.select("label").rdd.map{case Row(x: Double) => x} 55 | val res = results.select("prediction").rdd.map{case Row(x: Double) => x} 56 | 57 | val predictions = source.zip(res.asInstanceOf[RDD[Double]]) 58 | val metrics = new MulticlassMetrics(predictions) 59 | val precision = metrics.accuracy 60 | assert(precision == 0.64) 61 | } 62 | 63 | test("transform: more than one iterationNum"){ 64 | val knn_is = new KNN_ISClassifier() 65 | .setNumClass(2) 66 | .setNumSamplesTest(dataset.count().toInt) 67 | .setNumIter(3) 68 | .setK(5) 69 | 70 | val model = knn_is.fit(dataset) 71 | 72 | val results = model.transform(dataset) 73 | assert(results.count() == dataset.count()) 74 | 75 | val source = dataset.select("label") 76 | .rdd.map{case Row(x: Double) => x}.repartition(1) 77 | val res = results.select("prediction") 78 | .rdd.map{case Row(x: Double) => x}.repartition(1) 79 | 80 | val predictions = source.zip(res.asInstanceOf[RDD[Double]]) 81 | val metrics = new MulticlassMetrics(predictions) 82 | val precision = metrics.accuracy 83 | assert(precision == 0.648) 84 | } 85 | } 86 | 87 | object KNN_ISSuite { 88 | def generateKnnInput(offset: Double, 89 | scale: Double, 90 | nPoints: Int, 91 | seed: Int): Seq[LabeledPoint] = { 92 | val rnd = new Random(seed) 93 | val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian()) 94 | 95 | val y = (0 until nPoints).map { i => 96 | val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i)))) 97 | if 
(rnd.nextDouble() < p) 1.0 else 0.0 98 | } 99 | 100 | val testData = (0 until nPoints).map(i => LabeledPoint(y(i), Vectors.dense(Array(x1(i))))) 101 | testData 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/impl/BHTSNE.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne.impl 2 | 3 | import breeze.linalg._ 4 | import breeze.stats.distributions.Rand 5 | import org.apache.spark.ml.tsne.tree.SPTree 6 | import org.apache.spark.ml.tsne.{TSNEGradient, TSNEHelper, TSNEParam, X2P} 7 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 8 | import org.apache.spark.storage.StorageLevel 9 | import org.slf4j.LoggerFactory 10 | 11 | import scala.util.Random 12 | 13 | object BHTSNE { 14 | private def logger = LoggerFactory.getLogger(BHTSNE.getClass) 15 | 16 | def tsne( 17 | input: RowMatrix, 18 | noDims: Int = 2, 19 | maxIterations: Int = 1000, 20 | perplexity: Double = 30, 21 | theta: Double = 0.5, 22 | reportLoss: Int => Boolean = {i => i % 10 == 0}, 23 | callback: (Int, DenseMatrix[Double], Option[Double]) => Unit = {case _ => }, 24 | seed: Long = Random.nextLong() 25 | ): DenseMatrix[Double] = { 26 | if(input.rows.getStorageLevel == StorageLevel.NONE) { 27 | logger.warn("Input is not persisted and performance could be bad") 28 | } 29 | 30 | Rand.generator.setSeed(seed) 31 | 32 | val tsneParam = TSNEParam() 33 | import tsneParam._ 34 | 35 | val n = input.numRows().toInt 36 | val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian(0, 1)) :/ 1e4 37 | val iY = DenseMatrix.zeros[Double](n, noDims) 38 | val gains = DenseMatrix.ones[Double](n, noDims) 39 | 40 | // approximate p_{j|i} 41 | val p_ji = X2P(input, 1e-5, perplexity) 42 | val P = TSNEHelper.computeP(p_ji, n).glom() 43 | .map(rows => rows.map { 44 | case (i, data) => 45 | (i, data.map(_._1).toSeq, DenseVector(data.map(_._2 * exaggeration_factor).toArray)) 46 | }) 47 | .cache() 48 | 49 | var iteration = 1 50 | while(iteration <= maxIterations) { 51 | val bcY = P.context.broadcast(Y) 52 | val bcTree = P.context.broadcast(SPTree(Y)) 53 | 54 | val initialValue = (DenseMatrix.zeros[Double](n, noDims), DenseMatrix.zeros[Double](n, noDims), 0.0) 55 | val (posF, negF, sumQ) = P.treeAggregate(initialValue)( 56 | seqOp = (c, v) => { 57 | // c: (pos, neg, sumQ), v: Array[(i, Seq(j), vec(Distance))] 58 | TSNEGradient.computeEdgeForces(v, bcY.value, c._1) 59 | val q = TSNEGradient.computeNonEdgeForces(bcTree.value, bcY.value, theta, c._2, v.map(_._1): _*) 60 | (c._1, c._2, c._3 + q) 61 | }, 62 | combOp = (c1, c2) => { 63 | // c: (grad, loss) 64 | (c1._1 + c2._1, c1._2 + c2._2, c1._3 + c2._3) 65 | }) 66 | val dY: DenseMatrix[Double] = posF :- (negF :/ sumQ) 67 | 68 | TSNEHelper.update(Y, dY, iY, gains, iteration, tsneParam) 69 | 70 | if(reportLoss(iteration)) { 71 | val loss = P.treeAggregate(0.0)( 72 | seqOp = (c, v) => { 73 | TSNEGradient.computeLoss(v, bcY.value, sumQ) 74 | }, 75 | combOp = _ + _ 76 | ) 77 | logger.debug(s"Iteration $iteration finished with $loss") 78 | callback(iteration, Y.copy, Some(loss)) 79 | } else { 80 | logger.debug(s"Iteration $iteration finished") 81 | callback(iteration, Y.copy, None) 82 | } 83 | 84 | bcY.destroy() 85 | bcTree.destroy() 86 | 87 | //undo early exaggeration 88 | if(iteration == early_exaggeration) { 89 | P.foreach { 90 | rows => rows.foreach { 91 | case (_, _, vec) => vec.foreachPair { case (i, v) => vec.update(i, v / 
exaggeration_factor) } 92 | } 93 | } 94 | } 95 | 96 | iteration += 1 97 | } 98 | 99 | Y 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/util/SparkUtils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.util 2 | 3 | import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} 4 | import breeze.storage.Zero 5 | import org.apache.hadoop.fs.{FileSystem, Path} 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.deploy.SparkHadoopUtil 8 | import org.apache.spark.mllib.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV} 9 | 10 | import scala.language.implicitConversions 11 | import scala.reflect.ClassTag 12 | 13 | 14 | object SparkUtils { 15 | implicit def toBreeze(sv: SV): BV[Double] = { 16 | sv match { 17 | case SDV(data) => 18 | new BDV(data) 19 | case SSV(size, indices, values) => 20 | new BSV(indices, values, size) 21 | } 22 | } 23 | 24 | implicit def fromBreeze(breezeVector: BV[Double]): SV = { 25 | breezeVector match { 26 | case v: BDV[Double] => 27 | if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { 28 | new SDV(v.data) 29 | } else { 30 | new SDV(v.toArray) // Can't use underlying array directly, so make a new one 31 | } 32 | case v: BSV[Double] => 33 | if (v.index.length == v.used) { 34 | new SSV(v.length, v.index, v.data) 35 | } else { 36 | new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) 37 | } 38 | case v: BV[_] => 39 | sys.error("Unsupported Breeze vector type: " + v.getClass.getName) 40 | } 41 | } 42 | 43 | def toBreezeConv[T: ClassTag](sv: SV)(implicit num: Numeric[T]): BV[T] = { 44 | val zero = num.zero 45 | implicit val conv: Array[Double] => Array[T] = (data) => { 46 | data.map(ele => (zero match { 47 | case zero: Double => ele 48 | case zero: Float => ele.toFloat 49 | case zero: Int => ele.toInt 50 | case zero: Long => ele.toLong 51 | }).asInstanceOf[T]).array 52 | } 53 | sv match { 54 | case SDV(data) => 55 | new BDV[T](data) 56 | case SSV(size, indices, values) => 57 | new BSV[T](indices, values, size)(Zero[T](zero)) 58 | } 59 | } 60 | 61 | def fromBreezeConv[T: ClassTag](breezeVector: BV[T])(implicit num: Numeric[T]): SV = { 62 | implicit val conv: Array[T] => Array[Double] = (data) => { 63 | data.map(num.toDouble).array 64 | } 65 | breezeVector match { 66 | case v: BDV[T] => 67 | if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { 68 | new SDV(v.data) 69 | } else { 70 | new SDV(v.toArray) // Can't use underlying array directly, so make a new one 71 | } 72 | case v: BSV[T] => 73 | if (v.index.length == v.used) { 74 | new SSV(v.length, v.index, v.data) 75 | } else { 76 | new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) 77 | } 78 | case v: BV[T] => 79 | sys.error("Unsupported Breeze vector type: " + v.getClass.getName) 80 | } 81 | } 82 | 83 | def getFileSystem(conf: SparkConf, path: Path): FileSystem = { 84 | val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) 85 | if (sys.env.contains("HADOOP_CONF_DIR") || sys.env.contains("YARN_CONF_DIR")) { 86 | val hdfsConfPath = if (sys.env.get("HADOOP_CONF_DIR").isDefined) { 87 | sys.env.get("HADOOP_CONF_DIR").get + "/core-site.xml" 88 | } else { 89 | sys.env.get("YARN_CONF_DIR").get + "/core-site.xml" 90 | } 91 | hadoopConf.addResource(new Path(hdfsConfPath)) 92 | } 93 | path.getFileSystem(hadoopConf) 94 | } 95 | 96 | def deleteChkptDirs(conf: SparkConf, dirs: 
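// ---- Editor's sketch (illustrative, not part of SparkUtils): using the implicit Breeze conversions above ----
// A minimal sketch, assuming the implicits in this object are imported; it moves an MLlib vector into
// Breeze, does the arithmetic there, and lets fromBreeze turn the result back into an MLlib vector.
//   import org.apache.spark.ml.util.SparkUtils._
//   import org.apache.spark.mllib.linalg.{Vectors => SVectors, Vector => SV}
//   import breeze.linalg.{Vector => BV}
//   val sv: SV = SVectors.dense(1.0, 2.0, 3.0)
//   val bv: BV[Double] = sv          // implicit toBreeze
//   val doubled: SV = bv + bv        // Breeze addition, then implicit fromBreeze on the way back
// ---- end sketch ----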
Array[String]): Unit = { 97 | val fs = getFileSystem(conf, new Path(dirs(0))) 98 | dirs.foreach(dir => { 99 | fs.delete(new Path(dir), true) 100 | }) 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/ARGARCHSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.random.MersenneTwister 4 | import org.apache.spark.SparkFunSuite 5 | import org.apache.spark.ml.linalg.DenseVector 6 | import org.apache.spark.ml.timeseries.MatrixUtil 7 | import org.apache.spark.ml.util.DefaultReadWriteTest 8 | import org.apache.spark.mllib.util.MLlibTestSparkContext 9 | import org.apache.spark.mllib.util.TestingUtils._ 10 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 11 | import org.apache.spark.sql.{Row, _} 12 | 13 | /** 14 | * Created by endy on 16-12-22. 15 | */ 16 | class ARGARCHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest{ 17 | test("fit model") { 18 | val omega = 0.2 19 | val alpha = 0.3 20 | val beta = 0.5 21 | val genModel = new ARGARCHModel(0.0, 0.0, alpha, beta, omega) 22 | val rand = new MersenneTwister(5L) 23 | val n = 10000 24 | 25 | val ts = genModel.sample(n, rand) 26 | val data = genDf(ts) 27 | 28 | val model = new GARCH().fit(data) 29 | assert(model.omega - omega < .1) // TODO: we should be able to be more accurate 30 | assert(model.alpha - alpha < .02) 31 | assert(model.beta - beta < .02) 32 | } 33 | 34 | 35 | test("fit model 2") { 36 | val arr = Array[Double](0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 37 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 38 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 39 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 40 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 41 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 42 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 43 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 44 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 45 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 46 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 47 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 48 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 49 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 50 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 51 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 52 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 53 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 54 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 55 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 
0.0, -0.01, 0.00, -0.1) 56 | val ts = genDf(arr) 57 | 58 | val model = new ARGARCH().fit(ts) 59 | 60 | assert(model.alpha ~== -0.106 absTol 0.001) 61 | assert(model.beta ~== -1.012 absTol 0.001) 62 | assert(model.omega ~== 0.190 absTol 0.01) 63 | assert(model.c ~== -0.0355 absTol 0.01) 64 | assert(model.phi ~== -0.339 absTol 0.01) 65 | } 66 | 67 | test("standardize and filter") { 68 | val model = new ARGARCHModel(40.0, .4, .2, .3, .4) 69 | val rand = new MersenneTwister(5L) 70 | val n = 10000 71 | 72 | val ts = new DenseVector(model.sample(n, rand)) 73 | 74 | // de-heteroskedasticize 75 | val standardized = model.removeTimeDependentEffects(ts) 76 | // heteroskedasticize 77 | val filtered = model.addTimeDependentEffects(standardized) 78 | 79 | assert((MatrixUtil.toBreeze(filtered) - MatrixUtil.toBreeze(ts)).toArray.forall(math.abs(_) < 80 | .001)) 81 | } 82 | 83 | def genDf(array: Array[Double]): DataFrame = { 84 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 85 | DoubleType))) 86 | 87 | val rdd = spark.sparkContext.parallelize( 88 | array.zipWithIndex.map(x => Row(x._2.formatted("%010d"), x._1))) 89 | 90 | spark.createDataFrame(rdd, schema) 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/util/LoaderUtils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.util 2 | 3 | import org.apache.hadoop.fs._ 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.sql.catalyst.ScalaReflection 7 | import org.apache.spark.sql.types.{DataType, StructField, StructType} 8 | import org.json4s._ 9 | import org.json4s.jackson.JsonMethods._ 10 | 11 | import scala.reflect.ClassTag 12 | import scala.reflect.runtime.universe.TypeTag 13 | 14 | // copy form Spark MLlib 15 | /** 16 | * Helper methods for loading models from files. 17 | */ 18 | private[ml] object LoaderUtils { 19 | 20 | /** Returns URI for path/data using the Hadoop filesystem */ 21 | def dataPath(path: String): String = new Path(path, "data").toUri.toString 22 | 23 | /** Returns URI for path/metadata using the Hadoop filesystem */ 24 | def metadataPath(path: String): String = new Path(path, "metadata").toUri.toString 25 | 26 | /** 27 | * Check the schema of loaded model data. 28 | * 29 | * This checks every field in the expected schema to make sure that a field with the same 30 | * name and DataType appears in the loaded schema. Note that this does NOT check metadata 31 | * or containsNull. 32 | * 33 | * @param loadedSchema Schema for model data loaded from file. 34 | * @tparam Data Expected data type from which an expected schema can be derived. 35 | */ 36 | def checkSchema[Data: TypeTag](loadedSchema: StructType): Unit = { 37 | // Check schema explicitly since erasure makes it hard to use match-case for checking. 38 | val expectedFields: Array[StructField] = 39 | ScalaReflection.schemaFor[Data].dataType.asInstanceOf[StructType].fields 40 | val loadedFields: Map[String, DataType] = 41 | loadedSchema.map(field => field.name -> field.dataType).toMap 42 | expectedFields.foreach { field => 43 | assert(loadedFields.contains(field.name), s"Unable to parse model data." + 44 | s" Expected field with name ${field.name} was missing in loaded schema:" + 45 | s" ${loadedFields.mkString(", ")}") 46 | } 47 | } 48 | 49 | /** 50 | * Load metadata from the given path. 
51 | * @return (class name, version, metadata) 52 | */ 53 | def loadMetadata(sc: SparkContext, path: String): (String, String, JValue) = { 54 | implicit val formats = DefaultFormats 55 | val metadata = parse(sc.textFile(metadataPath(path)).first()) 56 | val clazz = (metadata \ "class").extract[String] 57 | val version = (metadata \ "version").extract[String] 58 | (clazz, version, metadata) 59 | } 60 | 61 | /** 62 | * Save an RDD to one HDFS file 63 | * @param sc SparkContext 64 | * @param rdd The RDD to save 65 | * @param outPathStr The HDFS file path of String 66 | * @param header Header line of HDFS file, used for storing some metadata 67 | * @param mapEle The function mapping each element of RDD to a line of String 68 | */ 69 | def RDD2HDFSFile[T: ClassTag](sc: SparkContext, 70 | rdd: RDD[T], 71 | outPathStr: String, 72 | header: => String, 73 | mapEle: T => String): Unit = { 74 | val hdpconf = sc.hadoopConfiguration 75 | val fs = FileSystem.get(hdpconf) 76 | val outPath = new Path(outPathStr) 77 | if (fs.exists(outPath)) { 78 | throw new InvalidPathException(s"Output path $outPathStr already exists.") 79 | } 80 | val fout = fs.create(outPath) 81 | fout.write(header.getBytes) 82 | fout.write("\n".getBytes) 83 | rdd.toLocalIterator.foreach(e => { 84 | fout.write(mapEle(e).getBytes) 85 | fout.write("\n".getBytes) 86 | }) 87 | fout.close() 88 | } 89 | 90 | /** 91 | * Load an RDD from one HDFS file 92 | * @param sc SparkContext 93 | * @param inPathStr The HDFS file path of String 94 | * @param init_f The function used for initialization after reading header 95 | * @param lineParser The function parses each line in HDFS file to an element of RDD 96 | */ 97 | def HDFSFile2RDD[T: ClassTag, M: ClassTag](sc: SparkContext, 98 | inPathStr: String, 99 | init_f: String => M, 100 | lineParser: (M, String) => T): (M, RDD[T]) = { 101 | val rawrdd = sc.textFile(inPathStr) 102 | val header = rawrdd.first() 103 | val meta = init_f(header) 104 | val rdd: RDD[T] = rawrdd.mapPartitions(iter => { 105 | val first = iter.next() 106 | if (first == header) { 107 | iter 108 | } else { 109 | Iterator.single(first) ++ iter 110 | } 111 | }.map(lineParser(meta, _))) 112 | (meta, rdd) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/Lag.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries 2 | 3 | import org.apache.spark.ml.linalg.{DenseMatrix, Matrix, Vector} 4 | 5 | /** 6 | * Created by endy on 16-12-16. 7 | */ 8 | object Lag { 9 | /** 10 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and 11 | * columns so that every element in the matrix is full. 12 | */ 13 | def lagMatTrimBoth(x: Array[Double], maxLag: Int): Array[Array[Double]] = { 14 | lagMatTrimBoth(x, maxLag, false) 15 | } 16 | 17 | /** 18 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and 19 | * columns so that every element in the matrix is full. 
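// ---- Editor's sketch (illustrative): round-tripping an RDD with the LoaderUtils helpers shown earlier ----
// A minimal sketch, assuming an existing SparkContext `sc`, a writable path, and calling code that lives
// inside the org.apache.spark.ml package (LoaderUtils is private[ml]); the record type and path are made up.
//   val rdd = sc.parallelize(Seq(1 -> 0.5, 2 -> 1.5))
//   LoaderUtils.RDD2HDFSFile[(Int, Double)](sc, rdd, "/tmp/demo-model", header = "id,value",
//     mapEle = { case (id, v) => s"$id,$v" })
//   val (header, reloaded) = LoaderUtils.HDFSFile2RDD[(Int, Double), String](sc, "/tmp/demo-model",
//     init_f = identity, lineParser = (_, line) => { val Array(id, v) = line.split(","); (id.toInt, v.toDouble) })
// ---- end sketch ----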
20 | */ 21 | def lagMatTrimBoth(x: Array[Double], maxLag: Int, includeOriginal: Boolean) 22 | : Array[Array[Double]] = { 23 | val numObservations = x.length 24 | val numRows = numObservations - maxLag 25 | val numCols = maxLag + (if (includeOriginal) 1 else 0) 26 | val lagMat = Array.ofDim[Double](numRows, numCols) 27 | 28 | val initialLag = if (includeOriginal) 0 else 1 29 | 30 | for (r <- 0 until numRows) { 31 | for (c <- initialLag to maxLag) { 32 | lagMat(r)(c - initialLag) = x(r + maxLag - c) 33 | } 34 | } 35 | lagMat 36 | } 37 | 38 | /** 39 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and 40 | * columns so that every element in the matrix is full. 41 | */ 42 | def lagMatTrimBoth(x: Vector, maxLag: Int): Matrix = { 43 | lagMatTrimBoth(x, maxLag, false) 44 | } 45 | 46 | /** 47 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and 48 | * columns so that every element in the matrix is full. 49 | */ 50 | def lagMatTrimBoth(x: Vector, maxLag: Int, includeOriginal: Boolean): Matrix = { 51 | val numObservations = x.size 52 | val numRows = numObservations - maxLag 53 | val numCols = maxLag + (if (includeOriginal) 1 else 0) 54 | val lagMat = new DenseMatrix(numRows, numCols, new Array[Double](numRows * numCols)) 55 | 56 | lagMatTrimBoth(x, lagMat, maxLag, includeOriginal, 0) 57 | lagMat 58 | } 59 | 60 | /** 61 | * @param x Vector to be lagged. 62 | * @param outputMat Matrix to place the lagged vector into, as a column. 63 | * @param numLags The number of times to lag the vector. E.g. if this is 2, the output matrix 64 | * will include one column that is the vector lagged by 1, and another column to 65 | * the right that is the vector lagged by 2. 66 | * @param includeOriginal Whether to place the original time series into the matrix as well. 67 | * @param colOffset The offset to start placing columns in the output mat. 68 | */ 69 | def lagMatTrimBoth( 70 | x: Vector, 71 | outputMat: DenseMatrix, 72 | numLags: Int, 73 | includeOriginal: Boolean, 74 | colOffset: Int): Unit = { 75 | val numRows = outputMat.numRows 76 | val numTruncatedRows = x.size - numRows 77 | 78 | val initialLag = if (includeOriginal) 0 else 1 79 | 80 | val breezeOutputMat = MatrixUtil.toBreeze(outputMat) 81 | for (r <- 0 until numRows) { 82 | for (lag <- initialLag to numLags) { 83 | val c = colOffset + lag - initialLag 84 | breezeOutputMat(r, c) = x(r + numTruncatedRows - lag) 85 | } 86 | } 87 | } 88 | 89 | /** 90 | * Creates a lagged matrix from a current matrix (represented in row-array form). 91 | * Lags each column the appropriate amount of times and then concatenates the columns. 92 | * So given a matrix [a b c], where a/b/c are column vectors, and calling with lag of 2, 93 | * becomes a matrix of the form [a_-1 a_-2 b_-1 b_-2 c_-1 c_-2] 94 | */ 95 | def lagMatTrimBoth( 96 | x: Array[Array[Double]], 97 | maxLag: Int, 98 | includeOriginal: Boolean): Array[Array[Double]] = { 99 | val xt = x.transpose 100 | // one matrix per column, consisting of all its lags 101 | val matrices = for (col <- xt) yield { 102 | Lag.lagMatTrimBoth(col, maxLag, includeOriginal) 103 | } 104 | // merge the matrices into 1 matrix by concatenating col-wise 105 | matrices.transpose.map(_.reduceLeft(_ ++ _)) 106 | } 107 | 108 | /** 109 | * Creates a lagged matrix from a current matrix (represented in row-array form). 110 | * Lags each column the appropriate amount of times and then concatenates the columns. 
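// ---- Editor's note: a small worked example of lagMatTrimBoth ----
// For x = [1, 2, 3, 4, 5] and maxLag = 2 the first maxLag observations are trimmed, leaving
// 5 - 2 = 3 rows; row r holds the lagged values that explain observation x(r + maxLag).
//   lagMatTrimBoth(Array(1.0, 2.0, 3.0, 4.0, 5.0), 2)         // [[2,1], [3,2], [4,3]]       (lag-1, lag-2)
//   lagMatTrimBoth(Array(1.0, 2.0, 3.0, 4.0, 5.0), 2, true)   // [[3,2,1], [4,3,2], [5,4,3]] (original, lag-1, lag-2)
// ---- end note ----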
111 | * So given a matrix [a b c], where a/b/c are column vectors, and calling with lag of 2, 112 | * becomes a matrix of the form [a_-1 a_-2 b_-1 b_-2 c_-1 c_-2] 113 | * The original time series is not included in the matrix. 114 | */ 115 | def lagMatTrimBoth(x: Array[Array[Double]], maxLag: Int): Array[Array[Double]] = { 116 | lagMatTrimBoth(x, maxLag, false) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/impl/LBFGSTSNE.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne.impl 2 | 3 | import breeze.linalg._ 4 | import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS} 5 | import breeze.stats.distributions.Rand 6 | import org.apache.spark.ml.tsne.{TSNEGradient, X2P} 7 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.storage.StorageLevel 10 | import org.slf4j.LoggerFactory 11 | 12 | import scala.util.Random 13 | 14 | /** 15 | * TODO: This doesn't work at all (yet or ever). 16 | */ 17 | object LBFGSTSNE { 18 | private def logger = LoggerFactory.getLogger(LBFGSTSNE.getClass) 19 | 20 | def tsne( 21 | input: RowMatrix, 22 | noDims: Int = 2, 23 | maxNumIterations: Int = 1000, 24 | numCorrections: Int = 10, 25 | convergenceTol: Double = 1e-4, 26 | perplexity: Double = 30, 27 | seed: Long = Random.nextLong()): DenseMatrix[Double] = { 28 | if(input.rows.getStorageLevel == StorageLevel.NONE) { 29 | logger.warn("Input is not persisted and performance could be bad") 30 | } 31 | 32 | Rand.generator.setSeed(seed) 33 | 34 | val n = input.numRows().toInt 35 | val early_exaggeration = 100 36 | val t_momentum = 250 37 | val initial_momentum = 0.5 38 | val final_momentum = 0.8 39 | val eta = 500.0 40 | val min_gain = 0.01 41 | 42 | val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian) //:* .0001 43 | val iY = DenseMatrix.zeros[Double](n, noDims) 44 | val gains = DenseMatrix.ones[Double](n, noDims) 45 | 46 | // approximate p_{j|i} 47 | val p_ji = X2P(input, 1e-5, perplexity) 48 | //logInfo(p_ji.toRowMatrix().rows.collect().toList.toString) 49 | // p_ij = (p_{i|j} + p_{j|i}) / 2n 50 | val P = p_ji.transpose().entries.union(p_ji.entries) 51 | .map(e => ((e.i.toInt, e.j.toInt), e.value)) 52 | .reduceByKey(_ + _) 53 | .map{case ((i, j), v) => (i, (j, v / 2 / n)) } 54 | .groupByKey() 55 | .glom() 56 | .cache() 57 | 58 | var iteration = 1 59 | 60 | { 61 | val costFun = new CostFun(P, n, noDims, true) 62 | val lbfgs = new LBFGS[DenseVector[Double]](maxNumIterations, numCorrections, convergenceTol) 63 | val states = lbfgs.iterations(new CachedDiffFunction(costFun), new DenseVector(Y.data)) 64 | 65 | while (states.hasNext) { 66 | val state = states.next() 67 | val loss = state.value 68 | //logInfo(state.convergedReason.get.toString) 69 | logger.debug(s"Iteration $iteration finished with $loss") 70 | 71 | Y := asDenseMatrix(state.x, n, noDims) 72 | //subscriber.onNext((iteration, Y.copy, Some(loss))) 73 | iteration += 1 74 | } 75 | } 76 | 77 | { 78 | val costFun = new CostFun(P, n, noDims, false) 79 | val lbfgs = new LBFGS[DenseVector[Double]](maxNumIterations, numCorrections, convergenceTol) 80 | val states = lbfgs.iterations(new CachedDiffFunction(costFun), new DenseVector(Y.data)) 81 | 82 | while (states.hasNext) { 83 | val state = states.next() 84 | val loss = state.value 85 | //logInfo(state.convergedReason.get.toString) 86 | 
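// ---- Editor's note: the objective shared by the t-SNE implementations ----
// X2P returns conditional affinities p_{j|i}; the P built above symmetrises them as
// p_ij = (p_{i|j} + p_{j|i}) / (2n). SimpleTSNE, BHTSNE and this L-BFGS variant all minimise the
// KL divergence C = sum_{i != j} p_ij * log(p_ij / q_ij), with the Student-t embedding affinity
// q_ij = (1 + ||y_i - y_j||^2)^-1 / sum_{k != l} (1 + ||y_k - y_l||^2)^-1.
// ---- end note ----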
logger.debug(s"Iteration $iteration finished with $loss") 87 | 88 | Y := asDenseMatrix(state.x, n, noDims) 89 | //subscriber.onNext((iteration, Y.copy, Some(loss))) 90 | iteration += 1 91 | } 92 | } 93 | 94 | Y 95 | } 96 | 97 | private[this] def asDenseMatrix(v: DenseVector[Double], n: Int, noDims: Int) = { 98 | v.asDenseMatrix.reshape(n, noDims) 99 | } 100 | 101 | private class CostFun( 102 | P: RDD[Array[(Int, Iterable[(Int, Double)])]], 103 | n: Int, 104 | noDims: Int, 105 | exaggeration: Boolean) extends DiffFunction[DenseVector[Double]] { 106 | 107 | override def calculate(weights: DenseVector[Double]): (Double, DenseVector[Double]) = { 108 | val bcY = P.context.broadcast(asDenseMatrix(weights, n, noDims)) 109 | val bcExaggeration = P.context.broadcast(exaggeration) 110 | 111 | val numerator = P.map{ arr => TSNEGradient.computeNumerator(bcY.value, arr.map(_._1): _*) }.cache() 112 | val bcNumerator = P.context.broadcast({ 113 | numerator.treeAggregate(0.0)(seqOp = (x, v) => x + sum(v), combOp = _ + _) 114 | }) 115 | 116 | val (dY, loss) = P.zip(numerator).treeAggregate((DenseMatrix.zeros[Double](n, noDims), 0.0))( 117 | seqOp = (c, v) => { 118 | // c: (grad, loss), v: (Array[(i, Iterable(j, Distance))], numerator) 119 | // TODO: See if we can include early_exaggeration 120 | val l = TSNEGradient.compute(v._1, bcY.value, v._2, bcNumerator.value, c._1, bcExaggeration.value) 121 | (c._1, c._2 + l) 122 | }, 123 | combOp = (c1, c2) => { 124 | // c: (grad, loss) 125 | (c1._1 += c2._1, c1._2 + c2._2) 126 | }) 127 | 128 | numerator.unpersist() 129 | 130 | (loss, new DenseVector(dY.data)) 131 | } 132 | } 133 | } -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | io.enme 4 | enme 5 | 1.0 6 | 7 | 8 | 2.11.8 9 | 2.2.0 10 | 2.11 11 | 0.4.0 12 | 13 | 14 | 15 | 16 | org.scala-lang 17 | scala-library 18 | ${scala.version} 19 | 20 | 21 | org.apache.spark 22 | spark-mllib_${scala.binary.version} 23 | ${spark.version} 24 | 25 | 26 | org.apache.spark 27 | spark-mllib_${scala.binary.version} 28 | ${spark.version} 29 | test-jar 30 | test 31 | 32 | 33 | org.apache.spark 34 | spark-core_${scala.binary.version} 35 | ${spark.version} 36 | test-jar 37 | test 38 | 39 | 40 | com.meetup 41 | archery_${scala.binary.version} 42 | ${archery.version} 43 | 44 | 45 | 46 | 47 | 48 | 49 | net.alchim31.maven 50 | scala-maven-plugin 51 | 3.2.1 52 | 53 | 54 | org.apache.maven.plugins 55 | maven-compiler-plugin 56 | 2.0.2 57 | 58 | 1.7 59 | 1.7 60 | utf8 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | org.codehaus.mojo 69 | build-helper-maven-plugin 70 | 1.7 71 | 72 | 73 | add-source 74 | generate-sources 75 | 76 | add-source 77 | 78 | 79 | 80 | src/main/java 81 | 82 | 83 | 84 | 85 | 86 | 87 | net.alchim31.maven 88 | scala-maven-plugin 89 | 3.2.0 90 | 91 | 92 | compile-scala-first 93 | process-resources 94 | 95 | add-source 96 | compile 97 | 98 | 99 | 100 | test-compile-scala 101 | process-test-resources 102 | 103 | add-source 104 | testCompile 105 | 106 | 107 | 108 | 109 | ${scala.version} 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | org.scala-tools 118 | maven-scala-plugin 119 | 120 | ${scala.version} 121 | 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/models/Autoregression.scala: -------------------------------------------------------------------------------- 1 | package 
org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression 4 | import org.apache.spark.ml.{Estimator, Model} 5 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 6 | import org.apache.spark.ml.param.{Param, ParamMap, Params} 7 | import org.apache.spark.ml.timeseries.{Lag, MatrixUtil} 8 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams 9 | import org.apache.spark.ml.util.Identifiable 10 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 11 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType} 12 | 13 | /** 14 | * Created by endy on 16-12-16. 15 | */ 16 | 17 | trait AutoregressionParams extends TimeSeriesParams { 18 | 19 | final val maxLag = new Param[Int](this, "maxLag", "max lag") 20 | def setMaxLag(value: Int): this.type = set(maxLag, value) 21 | 22 | final val noIntercept = new Param[Boolean](this, "noIntercept", "no intercept") 23 | def setNoIntercept(value: Boolean): this.type = set(noIntercept, value) 24 | } 25 | 26 | 27 | class Autoregression(override val uid: String) 28 | extends Estimator[ARModel] with AutoregressionParams{ 29 | 30 | def this() = this(Identifiable.randomUID("Autoregression")) 31 | 32 | setDefault(noIntercept -> false, maxLag -> 1, timeCol -> "time", 33 | timeSeriesCol -> "timeseries") 34 | /** 35 | * Fits a model to the input data. 36 | */ 37 | override def fit(dataset: Dataset[_]): ARModel = { 38 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 39 | case Row(time: String, value: Double) => (time, value) 40 | }.sortByKey().collect() 41 | 42 | val dataVector = Vectors.dense(data.map(x => x._2)) 43 | 44 | // Make left hand side 45 | val Y = MatrixUtil.toBreeze(dataVector)(${maxLag} until dataVector.size) 46 | // Make lagged right hand side 47 | val X = Lag.lagMatTrimBoth(dataVector, ${maxLag}) 48 | 49 | val regression = new OLSMultipleLinearRegression() 50 | regression.setNoIntercept(${noIntercept}) // drop intercept in regression 51 | regression.newSampleData(Y.toArray, MatrixUtil.matToRowArrs(X)) 52 | val params = regression.estimateRegressionParameters() 53 | val (c, coeffs) = if (${noIntercept}) (0.0, params) else (params.head, params.tail) 54 | 55 | new ARModel(c, coeffs) 56 | .setTimeCol(${timeCol}) 57 | .setTimeSeriesCol(${timeSeriesCol}) 58 | } 59 | 60 | override def copy(extra: ParamMap): Estimator[ARModel] = defaultCopy(extra) 61 | 62 | /** 63 | * :: DeveloperApi :: 64 | * 65 | * Check transform validity and derive the output schema from the input schema. 66 | * 67 | * Typical implementation should first conduct verification on schema change and parameter 68 | * validity, including complex parameter interaction checks. 69 | */ 70 | override def transformSchema(schema: StructType): StructType = { 71 | schema 72 | } 73 | } 74 | 75 | class ARModel(override val uid: String, val c: Double, val coefficients: Array[Double]) extends 76 | Model[ARModel] with AutoregressionParams { 77 | 78 | def this(c: Double, coefficients: Array[Double]) = this(Identifiable.randomUID("ARModel"), c, 79 | coefficients) 80 | 81 | /** 82 | * Transforms the input dataset. 
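// ---- Editor's sketch (illustrative, not part of this file): fitting the Autoregression estimator above ----
// A minimal sketch, assuming an existing SparkSession `spark`; it follows this package's input
// convention of a lexicographically sortable string "time" column and a double "timeseries" column.
//   import org.apache.spark.sql.Row
//   import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
//   val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", DoubleType)))
//   val values = Array(1.0, 1.2, 0.9, 1.1, 1.0, 0.8, 1.3, 1.05)
//   val rows = values.zipWithIndex.map { case (v, i) => Row(i.formatted("%010d"), v) }
//   val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
//   val arModel = new Autoregression().setMaxLag(2).fit(df)   // OLS of y_t on y_{t-1}, y_{t-2}
//   println(s"c = ${arModel.c}, coefficients = ${arModel.coefficients.mkString(", ")}")
// ---- end sketch ----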
83 | */ 84 | override def transform(dataset: Dataset[_]): DataFrame = { 85 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 86 | case Row(time: String, value: Double) => (time, value) 87 | }.sortByKey().collect() 88 | .map(x => x._2) 89 | 90 | val dataVector = Vectors.dense(data) 91 | 92 | val dest = addTimeDependentEffects(dataVector) 93 | 94 | val resRDD = dataset.sparkSession.sparkContext.parallelize(dest.toArray.map(x => Row(x))) 95 | 96 | val structType = transformSchema(dataset.schema) 97 | 98 | dataset.sparkSession.createDataFrame(resRDD, structType) 99 | } 100 | 101 | def removeTimeDependentEffects(ts: Vector): Vector = { 102 | val dest = new Array[Double](ts.size) 103 | var i = 0 104 | while (i < ts.size) { 105 | dest(i) = ts(i) - c 106 | var j = 0 107 | while (j < coefficients.length && i - j - 1 >= 0) { 108 | dest(i) -= ts(i - j - 1) * coefficients(j) 109 | j += 1 110 | } 111 | i += 1 112 | } 113 | new DenseVector(dest) 114 | } 115 | 116 | def addTimeDependentEffects(ts: Vector): Vector = { 117 | val dest = new Array[Double](ts.size) 118 | var i = 0 119 | while (i < ts.size) { 120 | dest(i) = c + ts(i) 121 | var j = 0 122 | while (j < coefficients.length && i - j - 1 >= 0) { 123 | dest(i) += dest(i - j - 1) * coefficients(j) 124 | j += 1 125 | } 126 | i += 1 127 | } 128 | new DenseVector(dest) 129 | } 130 | 131 | /** 132 | * :: DeveloperApi :: 133 | * 134 | * Check transform validity and derive the output schema from the input schema. 135 | * 136 | * Typical implementation should first conduct verification on schema change and parameter 137 | * validity, including complex parameter interaction checks. 138 | */ 139 | override def transformSchema(schema: StructType): StructType = { 140 | StructType(Array(StructField("Autoregression", DoubleType))) 141 | 142 | } 143 | 144 | override def copy(extra: ParamMap): ARModel = defaultCopy(extra) 145 | 146 | } 147 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/fm/FMModel.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.fm 2 | 3 | import org.apache.spark.ml.fm.FM._ 4 | import org.apache.spark.ml.util.LoaderUtils 5 | import org.apache.spark.ml.util.SparkUtils._ 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, RegressionMetrics} 8 | import org.apache.spark.mllib.linalg.{Vector => SV} 9 | import org.apache.spark.mllib.regression.LabeledPoint 10 | import org.apache.spark.mllib.util.{Loader, Saveable} 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql.{Row, SQLContext} 13 | import org.apache.spark.storage.StorageLevel 14 | import org.json4s.DefaultFormats 15 | import org.json4s.JsonDSL._ 16 | import org.json4s.jackson.JsonMethods._ 17 | 18 | class FMModel( 19 | val k: Int, 20 | val intercept: ED, 21 | val classification: Boolean, 22 | val factors: RDD[(Long, VD)]) extends Serializable with Saveable { 23 | def predict(data: RDD[(Long, SV)]): RDD[(Long, ED)] = { 24 | data.flatMap { case (sampleId, features) => 25 | features.activeIterator.filter(_._2 != 0.0).map { 26 | case (featureId, value) => 27 | (featureId.toLong, (sampleId, value)) 28 | } 29 | }.join(factors).map { case (featureId, ((sampleId, x), w)) => 30 | (sampleId, forwardInterval(k, x, w)) 31 | }.reduceByKey(reduceInterval).map { case (sampleId, arr) => 32 | var result = predictInterval(k, intercept, arr) 33 | if (classification) { 34 | result 
= 1.0 / (1.0 + math.exp(-result)) 35 | } 36 | (sampleId, result) 37 | } 38 | } 39 | 40 | def loss(data: RDD[(Long, LabeledPoint)]): Double = { 41 | // val minTarget = data.map(_._2.label).min() 42 | // val maxTarget = data.map(_._2.label).max() 43 | val perd = predict(data.map(t => (t._1, t._2.features))) 44 | val label = data.map(t => (t._1, t._2.label)) 45 | val scoreAndLabels = label.join(perd).map { case (_, (label, score)) => 46 | // var r = Math.max(score, minTarget) 47 | // r = Math.min(r, maxTarget) 48 | // pow(l - r, 2) 49 | (score, label) 50 | } 51 | scoreAndLabels.persist(StorageLevel.MEMORY_AND_DISK) 52 | val ret = if (classification) auc(scoreAndLabels) else rmse(scoreAndLabels) 53 | scoreAndLabels.unpersist(blocking = false) 54 | ret 55 | } 56 | 57 | def rmse(scoreAndLabels: RDD[(Double, Double)]): Double = { 58 | val metrics = new RegressionMetrics(scoreAndLabels) 59 | metrics.rootMeanSquaredError 60 | } 61 | 62 | def auc(scoreAndLabels: RDD[(Double, Double)]): Double = { 63 | val metrics = new BinaryClassificationMetrics(scoreAndLabels) 64 | metrics.areaUnderROC() 65 | } 66 | 67 | override def save(sc: SparkContext, path: String): Unit = { 68 | FMModel.SaveLoadV1_0.save(sc, path, k, intercept, classification, factors) 69 | } 70 | 71 | override protected def formatVersion: String = FMModel.SaveLoadV1_0.formatVersionV1_0 72 | } 73 | 74 | object FMModel extends Loader[FMModel] { 75 | 76 | override def load(sc: SparkContext, path: String): FMModel = { 77 | val (loadedClassName, version, metadata) = LoaderUtils.loadMetadata(sc, path) 78 | val versionV1_0 = SaveLoadV1_0.formatVersionV1_0 79 | val classNameV1_0 = SaveLoadV1_0.classNameV1_0 80 | if (loadedClassName == classNameV1_0 && version == versionV1_0) { 81 | implicit val formats = DefaultFormats 82 | val classification = (metadata \ "classification").extract[Boolean] 83 | val intercept = (metadata \ "intercept").extract[Double] 84 | val k = (metadata \ "k").extract[Int] 85 | val dataPath = LoaderUtils.dataPath(path) 86 | val sqlContext = new SQLContext(sc) 87 | val dataRDD = sqlContext.read.parquet(dataPath) 88 | val dataArray = dataRDD.select("featureId", "factors").take(1) 89 | assert(dataArray.length == 1, s"Unable to load $loadedClassName data from: $dataPath") 90 | val data = dataArray(0) 91 | assert(data.size == 2, s"Unable to load $loadedClassName data from: $dataPath") 92 | val factors = dataRDD.rdd.map { 93 | case Row(featureId: Long, factors: Seq[Double]) => 94 | (featureId, factors.toArray) 95 | } 96 | new FMModel(k, intercept, classification, factors) 97 | } else { 98 | throw new Exception( 99 | s"FMModel.load did not recognize model with (className, format version):" + 100 | s"($loadedClassName, $version). Supported:\n" + 101 | s" ($classNameV1_0, 1.0)") 102 | } 103 | 104 | } 105 | 106 | private object SaveLoadV1_0 { 107 | val formatVersionV1_0 = "1.0" 108 | val classNameV1_0 = "com.github.cloudml.zen.ml.recommendation.FMModel" 109 | 110 | def save( 111 | sc: SparkContext, 112 | path: String, 113 | k: Int, 114 | intercept: Double, 115 | classification: Boolean, 116 | factors: RDD[(Long, Array[Double])]): Unit = { 117 | val metadata = compact(render 118 | (("class" -> classNameV1_0) ~ ("version" -> formatVersionV1_0) ~ 119 | ("k" -> k) ~ ("intercept" -> intercept) ~ ("classification" -> classification))) 120 | sc.parallelize(Seq(metadata), 1).saveAsTextFile(LoaderUtils.metadataPath(path)) 121 | 122 | val sqlContext = new SQLContext(sc) 123 | import sqlContext.implicits._ 124 | // Create Parquet data. 
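// ---- Editor's sketch (illustrative, not part of FMModel): scoring and persisting a trained model ----
// A minimal sketch, assuming `model: FMModel` came from the FM trainer in this package and
// `samples: RDD[(Long, org.apache.spark.mllib.linalg.Vector)]` pairs a sample id with its features;
// all names and paths are illustrative.
//   val scores = model.predict(samples)               // RDD[(sampleId, score)], sigmoid-squashed when classification = true
//   model.save(sc, "/tmp/fm-model")                   // metadata JSON plus a Parquet table of per-feature factors
//   val reloaded = FMModel.load(sc, "/tmp/fm-model")  // rebuilds the model from that layout
// ---- end sketch ----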
125 | factors.toDF("featureId", "factors").write.parquet(LoaderUtils.dataPath(path)) 126 | } 127 | } 128 | 129 | } 130 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/UnivariateTimeSeriesSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries 2 | 3 | import org.apache.commons.math3.random.MersenneTwister 4 | import org.apache.spark.SparkFunSuite 5 | import org.apache.spark.ml.linalg.{DenseVector, Matrices, Vectors} 6 | import org.apache.spark.ml.util.DefaultReadWriteTest 7 | import org.apache.spark.mllib.util.MLlibTestSparkContext 8 | import org.apache.spark.mllib.util.TestingUtils._ 9 | 10 | 11 | /** 12 | * Created by endy on 16-12-21. 13 | */ 14 | class UnivariateTimeSeriesSuite extends SparkFunSuite with MLlibTestSparkContext 15 | with DefaultReadWriteTest { 16 | 17 | test("lagIncludeOriginalsTrue") { 18 | val lagMatrix = UnivariateTimeSeries.lag(Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0), 2, true) 19 | assert(lagMatrix === Matrices.dense(3, 3, Array(3.0, 4.0, 5.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0))) 20 | } 21 | 22 | test("lagIncludeOriginalsFalse") { 23 | val lagMatrix = UnivariateTimeSeries.lag(Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0), 2, false) 24 | assert(lagMatrix == Matrices.dense(3, 2, Array(2.0, 3.0, 4.0, 1.0, 2.0, 3.0))) 25 | } 26 | 27 | test("autocorr") { 28 | val rand = new MersenneTwister(5L) 29 | val iidAutocorr = UnivariateTimeSeries.autocorr(Array.fill(10000)(rand.nextDouble * 5.0), 3) 30 | iidAutocorr.foreach(x => assert(math.abs(x) < .03)) 31 | } 32 | 33 | test("upsampling") { 34 | // replicating upsampling examples 35 | // from http://www.mathworks.com/help/signal/ref/upsample.html?searchHighlight=upsample 36 | val y = new DenseVector(Array(1.0, 2.0, 3.0, 4.0)) 37 | val yUp1 = UnivariateTimeSeries.upsample(y, 3, useZero = true).toArray 38 | assert(yUp1 === Array(1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0, 0.0, 0.0)) 39 | 40 | val yUp2 = UnivariateTimeSeries.upsample(y, 3, useZero = true, phase = 2).toArray 41 | assert(yUp2 === Array(0.0, 0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0)) 42 | } 43 | 44 | test("downsampling") { 45 | // replicating downsampling examples 46 | // from http://www.mathworks.com/help/signal/ref/downsample.html?searchHighlight=downsample 47 | val y = new DenseVector((1 to 10).toArray.map(_.toDouble)) 48 | val yDown1 = UnivariateTimeSeries.downsample(y, 3).toArray 49 | assert(yDown1 === Array(1.0, 4.0, 7.0, 10.0)) 50 | 51 | val yDown2 = UnivariateTimeSeries.downsample(y, 3, phase = 2).toArray 52 | assert(yDown2 === Array(3.0, 6.0, 9.0)) 53 | } 54 | 55 | test("signal reconstruction with spline") { 56 | // If we have a frequent signal, downsample it (at a rate that doesn't cause aliasing) 57 | // and we upsample, and apply a filter (interpolation), then the result should be fairly 58 | // close to the original signal. 
In our case, we drop NAs that are not filled by interpolation 59 | // (i.e no extrapolation) 60 | 61 | val y = (1 to 1000).toArray.map(_.toDouble / 100.0).map(Math.sin) 62 | val vy = new DenseVector(y) 63 | val lessFreq = UnivariateTimeSeries.downsample(vy, 100) 64 | val moreFreq = UnivariateTimeSeries.upsample(lessFreq, 100) 65 | 66 | // work on copies 67 | val splineY = UnivariateTimeSeries.fillSpline(new DenseVector(moreFreq.toArray)).toArray 68 | val lineY = UnivariateTimeSeries.fillLinear(new DenseVector(moreFreq.toArray)).toArray 69 | 70 | val MSE = (est: Array[Double], obs: Array[Double]) => { 71 | val errs = est.zip(obs).filter(!_._1.isNaN).map { case (yhat, yi) => 72 | (yhat - yi) * (yhat - yi) 73 | } 74 | errs.sum / errs.length 75 | } 76 | 77 | val sE = MSE(splineY, y) 78 | val lE = MSE(lineY, y) 79 | 80 | // a cubic spline should be better than linear interpolation 81 | assert(sE < lE) 82 | } 83 | 84 | test("differencing at lag") { 85 | val rand = new MersenneTwister(10L) 86 | val n = 100 87 | val sampled = new DenseVector(Array.fill(n)(rand.nextGaussian)) 88 | val lag = 5 89 | val diffed = UnivariateTimeSeries.differencesAtLag(sampled, lag) 90 | val invDiffed = UnivariateTimeSeries.inverseDifferencesAtLag(diffed, lag) 91 | 92 | for (i <- 0 until n) { 93 | assert(sampled(i) ~== invDiffed(i) absTol 1e-6) 94 | } 95 | 96 | assert(diffed(10) == (sampled(10) - sampled(5))) 97 | assert(diffed(99) == (sampled(99) - sampled(94))) 98 | } 99 | 100 | test("differencing of order d") { 101 | val rand = new MersenneTwister(10L) 102 | val n = 100 103 | val sampled = new DenseVector(Array.fill(n)(rand.nextGaussian)) 104 | // differencing at order 1 and lag 1 should be the same 105 | val diffedOfOrder1 = UnivariateTimeSeries.differencesOfOrderD(sampled, 1) 106 | val diffedAtLag1 = UnivariateTimeSeries.differencesAtLag(sampled, 1) 107 | 108 | for (i <- 0 until n) { 109 | assert(diffedAtLag1(i) ~== diffedOfOrder1(i) absTol 1e-6) 110 | } 111 | 112 | // differencing at order and inversing should return the original series 113 | val diffedOfOrder5 = UnivariateTimeSeries.differencesOfOrderD(sampled, 5) 114 | val invDiffedOfOrder5 = UnivariateTimeSeries.inverseDifferencesOfOrderD(diffedOfOrder5, 5) 115 | 116 | for (i <- 0 until n) { 117 | assert(invDiffedOfOrder5(i) ~== sampled(i) absTol 1e-6) 118 | } 119 | 120 | // Differencing of order n + 1 should be the same as differencing one time a 121 | // vector that has already been differenced to order n 122 | val diffedOfOrder6 = UnivariateTimeSeries.differencesOfOrderD(sampled, 6) 123 | val diffedOneMore = UnivariateTimeSeries.differencesOfOrderD(diffedOfOrder5, 1) 124 | // compare start at index = 6 125 | for (i <- 6 until n) { 126 | assert(diffedOfOrder6(i) ~== diffedOneMore(i) absTol 1e-6) 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/TSNEGradient.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | import org.apache.spark.ml.tsne.tree.SPTree 6 | import org.slf4j.LoggerFactory 7 | 8 | object TSNEGradient { 9 | def logger = LoggerFactory.getLogger(TSNEGradient.getClass) 10 | 11 | /** 12 | * Compute the numerator from the matrix Y 13 | * 14 | * @param idx the index in the matrix to use. 
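// ---- Editor's note: the gradient assembled by `compute` further down ----
// For the exact (non Barnes-Hut) case, each embedded point receives
//   dC/dy_i = 4 * sum_j (p_ij - q_ij) * (1 + ||y_i - y_j||^2)^-1 * (y_i - y_j).
// `compute` realises this by turning q into (q - p), scaling by -num so every entry holds
// (p_ij - q_ij) * num_ij, folding each row sum back into column i, and finishing with -4 * (q * Y).
// ---- end note ----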
15 | * @param Y the matrix to analyze 16 | * @return the numerator 17 | */ 18 | def computeNumerator(Y: DenseMatrix[Double], idx: Int *): DenseMatrix[Double] = { 19 | // Y_sum = ||Y_i||^2 20 | val sumY = sum(pow(Y, 2).apply(*, ::)) // n * 1 21 | val subY = Y(idx, ::).toDenseMatrix // k * 1 22 | val y1: DenseMatrix[Double] = Y * (-2.0 :* subY.t) // n * k 23 | val num: DenseMatrix[Double] = (y1(::, *) + sumY).t // k * n 24 | num := 1.0 :/ (1.0 :+ (num(::, *) + sumY(idx).toDenseVector)) // k * n 25 | 26 | idx.indices.foreach(i => num.update(i, idx(i), 0.0)) // num(i, i) = 0 27 | 28 | num 29 | } 30 | 31 | /** 32 | * Compute the TSNE Gradient at i. Update the gradient through dY then return costs attributed at i. 33 | * 34 | * @param data data point for row i by list of pair of (j, p_ij) and 0 <= j < n 35 | * @param Y current Y [n * 2] 36 | * @param totalNum the common numerator that captures the t-distribution of Y 37 | * @param dY gradient of Y 38 | * @return loss attributed to row i 39 | */ 40 | def compute( 41 | data: Array[(Int, Iterable[(Int, Double)])], 42 | Y: DenseMatrix[Double], 43 | num: DenseMatrix[Double], 44 | totalNum: Double, 45 | dY: DenseMatrix[Double], 46 | exaggeration: Boolean): Double = { 47 | // q = (1 + ||Y_i - Y_j||^2)^-1 / sum(1 + ||Y_k - Y_l||^2)^-1 48 | val q: DenseMatrix[Double] = num / totalNum 49 | q.foreachPair{case ((i, j), v) => q.update(i, j, math.max(v, 1e-12))} 50 | 51 | // q = q - p 52 | val loss = data.zipWithIndex.flatMap { 53 | case ((_, itr), i) => 54 | itr.map{ 55 | case (j, p) => 56 | val exaggeratedP = if(exaggeration) p * 4 else p 57 | val qij = q(i, j) 58 | val l = exaggeratedP * math.log(exaggeratedP / qij) 59 | q.update(i, j, qij - exaggeratedP) 60 | if(l.isNaN) 0.0 else l 61 | } 62 | }.sum 63 | 64 | // l = [ (p_ij - q_ij) * (1 + ||Y_i - Y_j||^2)^-1 ] 65 | q :*= -num 66 | // l_sum = [0 0 ... sum(l) ... 
0] 67 | sum(q(*, ::)).foreachPair{ case (i, v) => q.update(i, data(i)._1, q(i, data(i)._1) - v) } 68 | 69 | // dY_i = -4 * (l - l_sum) * Y 70 | val dYi: DenseMatrix[Double] = -4.0 :* (q * Y) 71 | data.map(_._1).zipWithIndex.foreach{ 72 | case (i, idx) => dY(i, ::) := dYi(idx, ::) 73 | } 74 | 75 | loss 76 | } 77 | 78 | /** BH Tree related functions **/ 79 | 80 | /** 81 | * 82 | * @param data array of (row_id, Seq(col_id), Vector(P_ij)) 83 | * @param Y matrix 84 | * @param posF positive forces 85 | */ 86 | def computeEdgeForces(data: Array[(Int, Seq[Int], DenseVector[Double])], 87 | Y: DenseMatrix[Double], 88 | posF: DenseMatrix[Double]): Unit = { 89 | data.foreach { 90 | case (i, cols, vec) => 91 | // k x D - 1 x D => k x D 92 | val diff = Y(cols, ::).toDenseMatrix.apply(*, ::) - Y(i, ::).t 93 | // k x D => k x 1 94 | val qZ = 1.0 :+ sum(pow(diff, 2).apply(*, ::)) 95 | posF(i, ::) := (vec :/ qZ).t * (-diff) 96 | } 97 | } 98 | 99 | def computeNonEdgeForces(tree: SPTree, 100 | Y: DenseMatrix[Double], 101 | theta: Double, 102 | negF: DenseMatrix[Double], 103 | idx: Int *): Double = { 104 | idx.foldLeft(0.0)((acc, i) => acc + computeNonEdgeForce(tree, Y(i, ::).t, theta, negF, i)) 105 | } 106 | 107 | /** 108 | * Calcualte negative forces using BH approximation 109 | * 110 | * @param tree SPTree used for approximation 111 | * @param y y_i 112 | * @param theta threshold for correctness / speed 113 | * @param negF negative forces 114 | * @param i row 115 | * @return sum of Q 116 | */ 117 | private def computeNonEdgeForce(tree: SPTree, 118 | y: DenseVector[Double], 119 | theta: Double, 120 | negF: DenseMatrix[Double], 121 | i: Int): Double = { 122 | import tree._ 123 | if(getCount == 0 || (isLeaf && center.equals(y))) { 124 | 0.0 125 | } else { 126 | val diff = y - center 127 | val diffSq = sum(pow(diff, 2)) 128 | if(isLeaf || radiusSq / diffSq < theta) { 129 | val qZ = 1 / (1 + diffSq) 130 | val nqZ = getCount * qZ 131 | negF(i, ::) :+= (nqZ * qZ * diff).t 132 | nqZ 133 | } else { 134 | children.foldLeft(0.0)((acc, child) => acc + computeNonEdgeForce(child, y, theta, negF, i)) 135 | } 136 | } 137 | } 138 | 139 | def computeLoss(data: Array[(Int, Seq[Int], DenseVector[Double])], 140 | Y: DenseMatrix[Double], 141 | sumQ: Double): Double = { 142 | data.foldLeft(0.0){ 143 | case (acc, (i, cols, vec)) => 144 | val diff = Y(cols, ::).toDenseMatrix.apply(*, ::) - Y(i, ::).t 145 | val diffSq = sum(pow(diff, 2).apply(*, ::)) 146 | val Q = (1.0 :/ (1.0 :+ diffSq)) :/ sumQ 147 | sum(vec :* breeze.numerics.log(max(vec, 1e-12) :/ max(Q, 1e-12))) 148 | } 149 | } 150 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/mvm/MVMModel.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.mvm 2 | 3 | import org.apache.spark.ml.mvm.MVM._ 4 | import org.apache.spark.ml.util.LoaderUtils 5 | import org.apache.spark.ml.util.SparkUtils._ 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.mllib.evaluation.{RegressionMetrics, BinaryClassificationMetrics} 8 | import org.apache.spark.mllib.linalg.{Vector => SV} 9 | import org.apache.spark.mllib.regression.LabeledPoint 10 | import org.apache.spark.mllib.util.{Loader, Saveable} 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql.{Row, SQLContext} 13 | import org.apache.spark.storage.StorageLevel 14 | import org.json4s.DefaultFormats 15 | import org.json4s.JsonDSL._ 16 | import org.json4s.jackson.JsonMethods._ 17 
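// ---- Editor's note: the Barnes-Hut criterion used in computeNonEdgeForce above ----
// A tree cell is summarised by its centre of mass whenever radiusSq / ||y_i - center||^2 < theta,
// i.e. the cell looks small from y_i. Its getCount points then contribute count * qZ to the
// normalisation sum and count * qZ^2 * (y_i - center) to the repulsive force in one step instead of
// being visited individually; theta = 0 disables the approximation and recurses down to the leaves.
// ---- end note ----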
| 18 | import scala.math._ 19 | 20 | class MVMModel( 21 | val k: Int, 22 | val views: Array[Long], 23 | val classification: Boolean, 24 | val factors: RDD[(Long, VD)]) extends Serializable with Saveable { 25 | def predict(data: RDD[(Long, SV)]): RDD[(Long, ED)] = { 26 | val numFeatures = data.first()._2.size.toLong 27 | data.flatMap { case (sampleId, features) => 28 | features.activeIterator.filter(_._2 != 0.0).map { 29 | case (featureId, value) => 30 | (featureId.toLong, (sampleId, value)) 31 | } ++ views.indices.map { i => (numFeatures + i, (sampleId, 1D)) } 32 | }.join(factors).map { case (featureId, ((sampleId, x), w)) => 33 | val viewSize = views.length 34 | val viewId = featureId2viewId(featureId, views) 35 | (sampleId, forwardInterval(k, viewSize, viewId, x, w)) 36 | }.reduceByKey(reduceInterval).map { case (sampleId, arr) => 37 | var result = predictInterval(k, arr) 38 | if (classification) { 39 | result = 1.0 / (1.0 + math.exp(-result)) 40 | } 41 | (sampleId, result) 42 | } 43 | } 44 | 45 | def loss(data: RDD[(Long, LabeledPoint)]): Double = { 46 | // val minTarget = data.map(_._2.label).min() 47 | // val maxTarget = data.map(_._2.label).max() 48 | val perd = predict(data.map(t => (t._1, t._2.features))) 49 | val label = data.map(t => (t._1, t._2.label)) 50 | val scoreAndLabels = label.join(perd).map { case (_, (label, score)) => 51 | // var r = Math.max(score, minTarget) 52 | // r = Math.min(r, maxTarget) 53 | // pow(l - r, 2) 54 | (score, label) 55 | } 56 | scoreAndLabels.persist(StorageLevel.MEMORY_AND_DISK) 57 | val ret = if (classification) auc(scoreAndLabels) else rmse(scoreAndLabels) 58 | scoreAndLabels.unpersist(blocking = false) 59 | ret 60 | } 61 | 62 | def rmse(scoreAndLabels: RDD[(Double, Double)]): Double = { 63 | val metrics = new RegressionMetrics(scoreAndLabels) 64 | metrics.rootMeanSquaredError 65 | } 66 | 67 | def auc(scoreAndLabels: RDD[(Double, Double)]): Double = { 68 | val metrics = new BinaryClassificationMetrics(scoreAndLabels) 69 | metrics.areaUnderROC() 70 | } 71 | 72 | override def save(sc: SparkContext, path: String): Unit = { 73 | MVMModel.SaveLoadV1_0.save(sc, path, k, views, classification, factors) 74 | } 75 | 76 | override protected def formatVersion: String = MVMModel.SaveLoadV1_0.formatVersionV1_0 77 | } 78 | 79 | object MVMModel extends Loader[MVMModel] { 80 | 81 | override def load(sc: SparkContext, path: String): MVMModel = { 82 | val (loadedClassName, version, metadata) = LoaderUtils.loadMetadata(sc, path) 83 | val versionV1_0 = SaveLoadV1_0.formatVersionV1_0 84 | val classNameV1_0 = SaveLoadV1_0.classNameV1_0 85 | if (loadedClassName == classNameV1_0 && version == versionV1_0) { 86 | implicit val formats = DefaultFormats 87 | val classification = (metadata \ "classification").extract[Boolean] 88 | val views = (metadata \ "views").extract[String].split(",").map(_.toLong) 89 | val k = (metadata \ "k").extract[Int] 90 | val dataPath = LoaderUtils.dataPath(path) 91 | val sqlContext = new SQLContext(sc) 92 | val dataRDD = sqlContext.read.parquet(dataPath) 93 | val dataArray = dataRDD.select("featureId", "factors").take(1) 94 | assert(dataArray.size == 1, s"Unable to load $loadedClassName data from: $dataPath") 95 | val data = dataArray(0) 96 | assert(data.size == 2, s"Unable to load $loadedClassName data from: $dataPath") 97 | val factors = dataRDD.rdd.map { 98 | case Row(featureId: Long, factors: Seq[Double]) => 99 | (featureId, factors.toArray) 100 | } 101 | new MVMModel(k, views, classification, factors) 102 | } else { 103 | throw new 
Exception( 104 | s"FMModel.load did not recognize model with (className, format version):" + 105 | s"($loadedClassName, $version). Supported:\n" + 106 | s" ($classNameV1_0, 1.0)") 107 | } 108 | 109 | } 110 | 111 | private object SaveLoadV1_0 { 112 | val formatVersionV1_0 = "1.0" 113 | val classNameV1_0 = "com.github.cloudml.zen.ml.recommendation.MVMModel" 114 | 115 | def save( 116 | sc: SparkContext, 117 | path: String, 118 | k: Int, 119 | views: Array[Long], 120 | classification: Boolean, 121 | factors: RDD[(Long, Array[Double])]): Unit = { 122 | val metadata = compact(render 123 | (("class" -> classNameV1_0) ~ ("version" -> formatVersionV1_0) ~ 124 | ("k" -> k) ~ ("views" -> views.mkString(",")) ~ ("classification" -> classification))) 125 | sc.parallelize(Seq(metadata), 1).saveAsTextFile(LoaderUtils.metadataPath(path)) 126 | 127 | val sqlContext = new SQLContext(sc) 128 | import sqlContext.implicits._ 129 | // Create Parquet data. 130 | factors.toDF("featureId", "factors").write.parquet(LoaderUtils.dataPath(path)) 131 | } 132 | } 133 | 134 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/fm/BSFMModel.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.fm 2 | 3 | import org.apache.spark.ml.fm.BSFM._ 4 | import org.apache.spark.ml.util.LoaderUtils 5 | import org.apache.spark.ml.util.SparkUtils._ 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, RegressionMetrics} 8 | import org.apache.spark.mllib.linalg.{Vector => SV} 9 | import org.apache.spark.mllib.regression.LabeledPoint 10 | import org.apache.spark.mllib.util.{Loader, Saveable} 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql.{Row, SQLContext} 13 | import org.apache.spark.storage.StorageLevel 14 | import org.json4s.DefaultFormats 15 | import org.json4s.JsonDSL._ 16 | import org.json4s.jackson.JsonMethods._ 17 | 18 | import scala.math._ 19 | 20 | class BSFMModel( 21 | val k: Int, 22 | val intercept: ED, 23 | val views: Array[Long], 24 | val classification: Boolean, 25 | val factors: RDD[(Long, VD)]) extends Serializable with Saveable { 26 | def predict(data: RDD[(Long, SV)]): RDD[(Long, ED)] = { 27 | val numFeatures = data.first()._2.size.toLong 28 | data.flatMap { case (sampleId, features) => 29 | features.activeIterator.filter(_._2 != 0.0).map { 30 | case (featureId, value) => 31 | (featureId.toLong, (sampleId, value)) 32 | } ++ views.indices.map { i => (numFeatures + i, (sampleId, 1D)) } 33 | }.join(factors).map { case (featureId, ((sampleId, x), w)) => 34 | val viewSize = views.length 35 | val viewId = featureId2viewId(featureId, views) 36 | (sampleId, forwardInterval(k, viewSize, viewId, x, w)) 37 | }.reduceByKey(forwardReduceInterval).map { case (sampleId, arr) => 38 | var result = predictInterval(k, views.length, intercept, arr) 39 | if (classification) { 40 | result = 1.0 / (1.0 + math.exp(-result)) 41 | } 42 | (sampleId, result) 43 | } 44 | } 45 | 46 | def loss(data: RDD[(Long, LabeledPoint)]): Double = { 47 | // val minTarget = data.map(_._2.label).min() 48 | // val maxTarget = data.map(_._2.label).max() 49 | val perd = predict(data.map(t => (t._1, t._2.features))) 50 | val label = data.map(t => (t._1, t._2.label)) 51 | val scoreAndLabels = label.join(perd).map { case (_, (label, score)) => 52 | // var r = Math.max(score, minTarget) 53 | // r = Math.min(r, maxTarget) 54 | // pow(l - r, 2) 55 | (score, 
label) 56 | } 57 | scoreAndLabels.persist(StorageLevel.MEMORY_AND_DISK) 58 | val ret = if (classification) auc(scoreAndLabels) else rmse(scoreAndLabels) 59 | scoreAndLabels.unpersist(blocking = false) 60 | ret 61 | } 62 | 63 | def rmse(scoreAndLabels: RDD[(Double, Double)]): Double = { 64 | val metrics = new RegressionMetrics(scoreAndLabels) 65 | metrics.rootMeanSquaredError 66 | } 67 | 68 | def auc(scoreAndLabels: RDD[(Double, Double)]): Double = { 69 | val metrics = new BinaryClassificationMetrics(scoreAndLabels) 70 | metrics.areaUnderROC() 71 | } 72 | 73 | override def save(sc: SparkContext, path: String): Unit = { 74 | BSFMModel.SaveLoadV1_0.save(sc, path, k, intercept, views, classification, factors) 75 | } 76 | 77 | override protected def formatVersion: String = BSFMModel.SaveLoadV1_0.formatVersionV1_0 78 | } 79 | 80 | object BSFMModel extends Loader[BSFMModel] { 81 | 82 | override def load(sc: SparkContext, path: String): BSFMModel = { 83 | val (loadedClassName, version, metadata) = LoaderUtils.loadMetadata(sc, path) 84 | val versionV1_0 = SaveLoadV1_0.formatVersionV1_0 85 | val classNameV1_0 = SaveLoadV1_0.classNameV1_0 86 | if (loadedClassName == classNameV1_0 && version == versionV1_0) { 87 | implicit val formats = DefaultFormats 88 | val classification = (metadata \ "classification").extract[Boolean] 89 | val intercept = (metadata \ "intercept").extract[Double] 90 | val views = (metadata \ "views").extract[String].split(",").map(_.toLong) 91 | val k = (metadata \ "k").extract[Int] 92 | val dataPath = LoaderUtils.dataPath(path) 93 | val sqlContext = new SQLContext(sc) 94 | val dataRDD = sqlContext.read.parquet(dataPath) 95 | val dataArray = dataRDD.select("featureId", "factors").take(1) 96 | assert(dataArray.size == 1, s"Unable to load $loadedClassName data from: $dataPath") 97 | val data = dataArray(0) 98 | assert(data.size == 2, s"Unable to load $loadedClassName data from: $dataPath") 99 | val factors = dataRDD.rdd.map { 100 | case Row(featureId: Long, factors: Seq[Double]) => 101 | (featureId, factors.toArray) 102 | } 103 | new BSFMModel(k, intercept, views, classification, factors) 104 | } else { 105 | throw new Exception( 106 | s"FMModel.load did not recognize model with (className, format version):" + 107 | s"($loadedClassName, $version). Supported:\n" + 108 | s" ($classNameV1_0, 1.0)") 109 | } 110 | 111 | } 112 | 113 | private object SaveLoadV1_0 { 114 | val formatVersionV1_0 = "1.0" 115 | val classNameV1_0 = "com.github.cloudml.zen.ml.recommendation.BSFMModel" 116 | 117 | def save( 118 | sc: SparkContext, 119 | path: String, 120 | k: Int, 121 | intercept: Double, 122 | views: Array[Long], 123 | classification: Boolean, 124 | factors: RDD[(Long, Array[Double])]): Unit = { 125 | val metadata = compact(render 126 | (("class" -> classNameV1_0) ~ ("version" -> formatVersionV1_0) ~ ("intercept" -> intercept) ~ 127 | ("k" -> k) ~ ("views" -> views.mkString(",")) ~ ("classification" -> classification))) 128 | sc.parallelize(Seq(metadata), 1).saveAsTextFile(LoaderUtils.metadataPath(path)) 129 | 130 | val sqlContext = new SQLContext(sc) 131 | import sqlContext.implicits._ 132 | // Create Parquet data. 
133 | factors.toDF("featureId", "factors").write.parquet(LoaderUtils.dataPath(path)) 134 | } 135 | } 136 | 137 | } 138 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/EvenSplitPartitioner.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import scala.annotation.tailrec 4 | 5 | import org.apache.spark.internal.Logging 6 | 7 | /** 8 | * Helper methods for calling the partitioner 9 | */ 10 | object EvenSplitPartitioner { 11 | 12 | def partition( 13 | toSplit: Set[(DBSCANRectangle, Int)], 14 | maxPointsPerPartition: Long, 15 | minimumRectangleSize: Double): List[(DBSCANRectangle, Int)] = { 16 | new EvenSplitPartitioner(maxPointsPerPartition, minimumRectangleSize) 17 | .findPartitions(toSplit) 18 | } 19 | 20 | } 21 | 22 | class EvenSplitPartitioner( 23 | maxPointsPerPartition: Long, 24 | minimumRectangleSize: Double) extends Logging { 25 | 26 | type RectangleWithCount = (DBSCANRectangle, Int) 27 | 28 | def findPartitions(toSplit: Set[RectangleWithCount]): List[RectangleWithCount] = { 29 | 30 | val boundingRectangle = findBoundingRectangle(toSplit) 31 | 32 | def pointsIn = pointsInRectangle(toSplit, _: DBSCANRectangle) 33 | 34 | val toPartition = List((boundingRectangle, pointsIn(boundingRectangle))) 35 | val partitioned = List[RectangleWithCount]() 36 | 37 | logTrace("About to start partitioning") 38 | val partitions = partition(toPartition, partitioned, pointsIn) 39 | logTrace("Done") 40 | 41 | // remove empty partitions 42 | partitions.filter({ case (partition, count) => count > 0 }) 43 | } 44 | 45 | @tailrec 46 | private def partition( 47 | remaining: List[RectangleWithCount], 48 | partitioned: List[RectangleWithCount], 49 | pointsIn: (DBSCANRectangle) => Int): List[RectangleWithCount] = { 50 | 51 | remaining match { 52 | case (rectangle, count) :: rest => 53 | if (count > maxPointsPerPartition) { 54 | 55 | if (canBeSplit(rectangle)) { 56 | logTrace(s"About to split: $rectangle") 57 | def cost = (r: DBSCANRectangle) => ((pointsIn(rectangle) / 2) - pointsIn(r)).abs 58 | val (split1, split2) = split(rectangle, cost) 59 | logTrace(s"Found split: $split1, $split2") 60 | val s1 = (split1, pointsIn(split1)) 61 | val s2 = (split2, pointsIn(split2)) 62 | partition(s1 :: s2 :: rest, partitioned, pointsIn) 63 | 64 | } else { 65 | logWarning(s"Can't split: ($rectangle -> $count) (maxSize: $maxPointsPerPartition)") 66 | partition(rest, (rectangle, count) :: partitioned, pointsIn) 67 | } 68 | 69 | } else { 70 | partition(rest, (rectangle, count) :: partitioned, pointsIn) 71 | } 72 | 73 | case Nil => partitioned 74 | 75 | } 76 | 77 | } 78 | 79 | def split( 80 | rectangle: DBSCANRectangle, 81 | cost: (DBSCANRectangle) => Int): (DBSCANRectangle, DBSCANRectangle) = { 82 | 83 | val smallestSplit = 84 | findPossibleSplits(rectangle) 85 | .reduceLeft { 86 | (smallest, current) => 87 | 88 | if (cost(current) < cost(smallest)) { 89 | current 90 | } else { 91 | smallest 92 | } 93 | 94 | } 95 | 96 | (smallestSplit, (complement(smallestSplit, rectangle))) 97 | 98 | } 99 | 100 | /** 101 | * Returns the box that covers the space inside boundary that is not covered by box 102 | */ 103 | private def complement(box: DBSCANRectangle, boundary: DBSCANRectangle): DBSCANRectangle = 104 | if (box.x == boundary.x && box.y == boundary.y) { 105 | if (boundary.x2 >= box.x2 && boundary.y2 >= box.y2) { 106 | if (box.y2 == boundary.y2) { 107 | DBSCANRectangle(box.x2, 
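// ---- Editor's sketch (illustrative, not part of this file): driving EvenSplitPartitioner directly ----
// A minimal sketch: the input is a set of grid cells with their point counts (as produced by the
// DBSCAN gridding step) and the output is a list of larger rectangles holding at most
// maxPointsPerPartition points each; the numbers below are made up.
//   val cells: Set[(DBSCANRectangle, Int)] = Set(
//     (DBSCANRectangle(0.0, 0.0, 1.0, 1.0), 400),
//     (DBSCANRectangle(1.0, 0.0, 2.0, 1.0), 350),
//     (DBSCANRectangle(0.0, 1.0, 1.0, 2.0), 250))
//   val partitions = EvenSplitPartitioner.partition(cells, maxPointsPerPartition = 500L, minimumRectangleSize = 1.0)
//   partitions.foreach { case (rect, count) => println(s"$rect -> $count points") }
// ---- end sketch ----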
box.y, boundary.x2, boundary.y2) 108 | } else if (box.x2 == boundary.x2) { 109 | DBSCANRectangle(box.x, box.y2, boundary.x2, boundary.y2) 110 | } else { 111 | throw new IllegalArgumentException("rectangle is not a proper sub-rectangle") 112 | } 113 | } else { 114 | throw new IllegalArgumentException("rectangle is smaller than boundary") 115 | } 116 | } else { 117 | throw new IllegalArgumentException("unequal rectangle") 118 | } 119 | 120 | /** 121 | * Returns all the possible ways in which the given box can be split 122 | */ 123 | private def findPossibleSplits(box: DBSCANRectangle): Set[DBSCANRectangle] = { 124 | 125 | val xSplits = (box.x + minimumRectangleSize) until box.x2 by minimumRectangleSize 126 | 127 | val ySplits = (box.y + minimumRectangleSize) until box.y2 by minimumRectangleSize 128 | 129 | val splits = 130 | xSplits.map(x => DBSCANRectangle(box.x, box.y, x, box.y2)) ++ 131 | ySplits.map(y => DBSCANRectangle(box.x, box.y, box.x2, y)) 132 | 133 | logTrace(s"Possible splits: $splits") 134 | 135 | splits.toSet 136 | } 137 | 138 | /** 139 | * Returns true if the given rectangle can be split into at least two rectangles of minimum size 140 | */ 141 | private def canBeSplit(box: DBSCANRectangle): Boolean = { 142 | (box.x2 - box.x > minimumRectangleSize * 2 || 143 | box.y2 - box.y > minimumRectangleSize * 2) 144 | } 145 | 146 | def pointsInRectangle(space: Set[RectangleWithCount], rectangle: DBSCANRectangle): Int = { 147 | space.view 148 | .filter({ case (current, _) => rectangle.contains(current) }) 149 | .foldLeft(0) { 150 | case (total, (_, count)) => total + count 151 | } 152 | } 153 | 154 | def findBoundingRectangle(rectanglesWithCount: Set[RectangleWithCount]): DBSCANRectangle = { 155 | 156 | val invertedRectangle = 157 | DBSCANRectangle(Double.MaxValue, Double.MaxValue, Double.MinValue, Double.MinValue) 158 | 159 | rectanglesWithCount.foldLeft(invertedRectangle) { 160 | case (bounding, (c, _)) => 161 | DBSCANRectangle( 162 | bounding.x.min(c.x), bounding.y.min(c.y), 163 | bounding.x2.max(c.x2), bounding.y2.max(c.y2)) 164 | } 165 | 166 | } 167 | 168 | } 169 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/models/ARGARCH.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import io.transwarp.discover.timeseries.params.TimeSeriesParams 4 | import org.apache.commons.math3.random.RandomGenerator 5 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 6 | import org.apache.spark.ml.param.ParamMap 7 | import org.apache.spark.ml.util.Identifiable 8 | import org.apache.spark.ml.{Estimator, Model} 9 | import org.apache.spark.sql._ 10 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 11 | 12 | /** 13 | * Created by endy on 16-12-22. 14 | */ 15 | 16 | class ARGARCH(override val uid: String) extends Estimator[ARGARCHModel] with TimeSeriesParams { 17 | setDefault(timeCol -> "time", 18 | timeSeriesCol -> "timeseries") 19 | 20 | def this() = this(Identifiable.randomUID("ARGARCH")) 21 | /** 22 | * Fits a model to the input data. 
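ARGARCH.fit below composes two estimators: it fits an autoregressive model, removes its time-dependent effects to obtain residuals, and then fits a GARCH(1,1) model to those residuals, combining c and phi from the AR step with omega, alpha and beta from the GARCH step. A minimal usage sketch, where the input DataFrame `df` (a String time column plus a Double value column) is an assumption for illustration:

```scala
// Sketch only: fit an AR + GARCH(1,1) model to a (time, value) DataFrame.
// "time" and "timeseries" are the default column names declared above.
val model = new ARGARCH()
  .setTimeCol("time")
  .setTimeSeriesCol("timeseries")
  .fit(df)

// c and phi come from the AR fit, omega/alpha/beta from the GARCH fit on the residuals.
println(s"c=${model.c} phi=${model.phi} omega=${model.omega} alpha=${model.alpha} beta=${model.beta}")
```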
23 | */ 24 | override def fit(dataset: Dataset[_]): ARGARCHModel = { 25 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 26 | case Row(time: String, value: Double) => (time, value) 27 | }.sortByKey().collect() 28 | 29 | val dataVector = Vectors.dense(data.map(x => x._2)) 30 | 31 | val arModel = new Autoregression().fit(dataset) 32 | val residuals = arModel.removeTimeDependentEffects(dataVector) 33 | val dataFrame = generateDf(dataset.sparkSession, residuals.toArray) 34 | val garchModel = new GARCH().fit(dataFrame) 35 | 36 | new ARGARCHModel(arModel.c, arModel.coefficients(0), garchModel.omega, garchModel.alpha, 37 | garchModel.beta) 38 | } 39 | 40 | override def copy(extra: ParamMap): Estimator[ARGARCHModel] = defaultCopy(extra) 41 | 42 | /** 43 | * :: DeveloperApi :: 44 | * 45 | * Check transform validity and derive the output schema from the input schema. 46 | * 47 | * Typical implementation should first conduct verification on schema change and parameter 48 | * validity, including complex parameter interaction checks. 49 | */ 50 | override def transformSchema(schema: StructType): StructType = schema 51 | 52 | private def generateDf(sparkSession: SparkSession, array: Array[Double]): DataFrame = { 53 | val schema = StructType(Array(StructField(${timeCol}, StringType), StructField(${timeSeriesCol}, 54 | DoubleType))) 55 | 56 | val rdd = sparkSession.sparkContext.parallelize( 57 | array.zipWithIndex.map(x => Row(x._2.formatted("%010d"), x._1))) 58 | 59 | sparkSession.createDataFrame(rdd, schema) 60 | } 61 | } 62 | 63 | class ARGARCHModel(override val uid: String, val c: Double, val phi: Double, val omega: Double, 64 | val alpha: Double, val beta: Double) extends 65 | Model[ARGARCHModel] with TimeSeriesParams { 66 | 67 | def this(c: Double, phi: Double, omega: Double, alpha: Double, beta: Double) = 68 | this(Identifiable.randomUID("ARGARCHModel"), c, phi, omega, alpha, beta) 69 | 70 | override def copy(extra: ParamMap): ARGARCHModel = defaultCopy(extra) 71 | 72 | /** 73 | * Transforms the input dataset. 74 | */ 75 | override def transform(dataset: Dataset[_]): DataFrame = { 76 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 77 | case Row(time: String, value: Double) => (time, value) 78 | }.sortByKey().collect() 79 | 80 | val dataVector = Vectors.dense(data.map(x => x._2)) 81 | 82 | val dest = addTimeDependentEffects(dataVector) 83 | 84 | val resRDD = dataset.sparkSession.sparkContext.parallelize(dest.toArray.map(x => Row(x))) 85 | 86 | val structType = transformSchema(dataset.schema) 87 | 88 | dataset.sparkSession.createDataFrame(resRDD, structType) 89 | } 90 | 91 | /** 92 | * :: DeveloperApi :: 93 | * 94 | * Check transform validity and derive the output schema from the input schema. 95 | * 96 | * Typical implementation should first conduct verification on schema change and parameter 97 | * validity, including complex parameter interaction checks. 
98 | */ 99 | override def transformSchema(schema: StructType): StructType = { 100 | StructType(Array(StructField("ARGARCH", DoubleType))) 101 | } 102 | 103 | def removeTimeDependentEffects(ts: Vector): Vector = { 104 | val destArr = new Array[Double](ts.size) 105 | var prevEta = ts(0) - c 106 | var prevVariance = omega / (1.0 - alpha - beta) 107 | destArr(0) = prevEta / math.sqrt(prevVariance) 108 | for (i <- 1 until ts.size) { 109 | val variance = omega + alpha * prevEta * prevEta + beta * prevVariance 110 | val eta = ts(i) - c - phi * ts(i - 1) 111 | destArr(i) = eta / math.sqrt(variance) 112 | 113 | prevEta = eta 114 | prevVariance = variance 115 | } 116 | new DenseVector(destArr) 117 | } 118 | 119 | def addTimeDependentEffects(ts: Vector): Vector = { 120 | val destArr = new Array[Double](ts.size) 121 | var prevVariance = omega / (1.0 - alpha - beta) 122 | var prevEta = ts(0) * math.sqrt(prevVariance) 123 | destArr(0) = c + prevEta 124 | for (i <- 1 until ts.size) { 125 | val variance = omega + alpha * prevEta * prevEta + beta * prevVariance 126 | val standardizedEta = ts(i) 127 | val eta = standardizedEta * math.sqrt(variance) 128 | destArr(i) = c + phi * destArr(i - 1) + eta 129 | 130 | prevEta = eta 131 | prevVariance = variance 132 | } 133 | new DenseVector(destArr) 134 | } 135 | 136 | private def sampleWithVariances(n: Int, rand: RandomGenerator): (Array[Double], Array[Double]) = { 137 | val ts = new Array[Double](n) 138 | val variances = new Array[Double](n) 139 | variances(0) = omega / (1 - alpha - beta) 140 | var eta = math.sqrt(variances(0)) * rand.nextGaussian() 141 | for (i <- 1 until n) { 142 | variances(i) = omega + beta * variances(i-1) + alpha * eta * eta 143 | eta = math.sqrt(variances(i)) * rand.nextGaussian() 144 | ts(i) = c + phi * ts(i - 1) + eta 145 | } 146 | 147 | (ts, variances) 148 | } 149 | 150 | /** 151 | * Samples a random time series of a given length with the properties of the model. 152 | * 153 | * @param n The length of the time series to sample. 154 | * @param rand The random generator used to generate the observations. 155 | * @return The samples time series. 156 | */ 157 | def sample(n: Int, rand: RandomGenerator): Array[Double] = sampleWithVariances(n, rand)._1 158 | } 159 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/knn/KNNClassifier.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.classification 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.ml.param.ParamMap 5 | import org.apache.spark.ml.param.shared.HasWeightCol 6 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils} 7 | import org.apache.spark.ml.linalg._ 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.sql.types.{DoubleType, StructType} 10 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 11 | import org.apache.spark.storage.StorageLevel 12 | import org.apache.spark.ml.feature.LabeledPoint 13 | 14 | import scala.collection.mutable.ArrayBuffer 15 | 16 | /** 17 | * Created by endy on 17-1-9. 
18 | */ 19 | class KNNClassifier(override val uid: String) extends 20 | ProbabilisticClassifier[Vector, KNNClassifier, KNNClassificationModel] 21 | with KNNParams { 22 | 23 | def this() = this(Identifiable.randomUID("KNNClassifier")) 24 | 25 | def setK(value: Int): this.type = set(k, value) 26 | 27 | def setTopTreeSize(value: Int): this.type = set(topTreeSize, value) 28 | 29 | def setTopTreeLeafSize(value: Int): this.type = set(topTreeLeafSize, value) 30 | 31 | def setSubTreeLeafSize(value: Int): this.type = set(subTreeLeafSize, value) 32 | 33 | def setBufferSizeSampleSizes(value: Array[Int]): this.type = set(bufferSizeSampleSizes, value) 34 | 35 | def setBalanceThreshold(value: Double): this.type = set(balanceThreshold, value) 36 | 37 | def setSeed(value: Long): this.type = set(seed, value) 38 | 39 | override protected def train(dataset: Dataset[_]): KNNClassificationModel = { 40 | // Extract columns from data. If dataset is persisted, do not persist oldDataset. 41 | val instances = extractLabeledPoints(dataset).map { 42 | case LabeledPoint(label: Double, features: Vector) => (label, features) 43 | } 44 | val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE 45 | if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) 46 | 47 | val labelSummarizer = instances.treeAggregate( 48 | new MultiClassSummarizer)( 49 | seqOp = (c, v) => (c, v) match { 50 | case (labelSummarizer: MultiClassSummarizer, (label: Double, features: Vector)) => 51 | labelSummarizer.add(label) 52 | }, 53 | combOp = (c1, c2) => (c1, c2) match { 54 | case (classSummarizer1: MultiClassSummarizer, classSummarizer2: MultiClassSummarizer) => 55 | classSummarizer1.merge(classSummarizer2) 56 | }) 57 | 58 | val histogram = labelSummarizer.histogram 59 | val numInvalid = labelSummarizer.countInvalid 60 | val numClasses = histogram.length 61 | 62 | if (numInvalid != 0) { 63 | val msg = s"Classification labels should be in {0 to ${numClasses - 1}}. " + 64 | s"Found $numInvalid invalid labels." 65 | throw new IllegalArgumentException(msg) 66 | } 67 | 68 | val knnModel = copyValues(new KNN()).fit(dataset) 69 | knnModel.toNewClassificationModel(uid, numClasses) 70 | } 71 | 72 | override def fit(dataset: Dataset[_]): KNNClassificationModel = { 73 | // Need to overwrite this method because we need to manually overwrite the buffer size 74 | // because it is not supposed to stay the same as the Classifier if user sets it to -1.
75 | transformSchema(dataset.schema, logging = true) 76 | val model = train(dataset) 77 | val bufferSize = model.getBufferSize 78 | copyValues(model.setParent(this)).setBufferSize(bufferSize) 79 | } 80 | 81 | override def copy(extra: ParamMap): KNNClassifier = defaultCopy(extra) 82 | } 83 | 84 | class KNNClassificationModel(override val uid: String, val topTree: Broadcast[Tree], 85 | val subTrees: RDD[Tree], val _numClasses: Int) extends 86 | ProbabilisticClassificationModel[Vector, KNNClassificationModel] 87 | with KNNModelParams with HasWeightCol with Serializable { 88 | require(subTrees.getStorageLevel != StorageLevel.NONE, 89 | "KNNModel is not designed to work with Trees that have not been cached") 90 | 91 | /** @group setParam */ 92 | def setK(value: Int): this.type = set(k, value) 93 | 94 | /** @group setParam */ 95 | def setBufferSize(value: Double): this.type = set(bufferSize, value) 96 | 97 | override def numClasses: Int = _numClasses 98 | 99 | override def transform(dataset: Dataset[_]): DataFrame = { 100 | val getWeight: Row => Double = r => 1.0 101 | 102 | val merged = transform(dataset, topTree, subTrees).map { 103 | case (id, labels) => 104 | val vector = new Array[Double](numClasses) 105 | var i = 0 106 | while (i < labels.length) { 107 | vector(labels(i).getDouble(0).toInt) += getWeight(labels(i)) 108 | i += 1 109 | } 110 | val rawPrediction = Vectors.dense(vector) 111 | lazy val probability = raw2probability(rawPrediction) 112 | lazy val prediction = probability2prediction(probability) 113 | 114 | val values = new ArrayBuffer[Any] 115 | if ($(rawPredictionCol).nonEmpty) { 116 | values.append(rawPrediction) 117 | } 118 | if ($(probabilityCol).nonEmpty) { 119 | values.append(probability) 120 | } 121 | if ($(predictionCol).nonEmpty) { 122 | values.append(prediction) 123 | } 124 | (id, values) 125 | } 126 | 127 | dataset.sqlContext.createDataFrame( 128 | dataset.rdd.zipWithIndex().map { case (row, i) => (i, row) } 129 | .leftOuterJoin(merged) // make sure we don't lose any observations 130 | .map { 131 | case (i, (row, values)) => Row.fromSeq(row.asInstanceOf[Row].toSeq ++ values.get) 132 | }, 133 | transformSchema(dataset.schema) 134 | ) 135 | } 136 | 137 | override def transformSchema(schema: StructType): StructType = { 138 | var transformed = schema 139 | if ($(rawPredictionCol).nonEmpty) { 140 | transformed = SchemaUtils.appendColumn(transformed, $(rawPredictionCol), new VectorUDT) 141 | } 142 | if ($(probabilityCol).nonEmpty) { 143 | transformed = SchemaUtils.appendColumn(transformed, $(probabilityCol), new VectorUDT) 144 | } 145 | if ($(predictionCol).nonEmpty) { 146 | transformed = SchemaUtils.appendColumn(transformed, $(predictionCol), DoubleType) 147 | } 148 | transformed 149 | } 150 | 151 | override def copy(extra: ParamMap): KNNClassificationModel = { 152 | val copied = new KNNClassificationModel(uid, topTree, subTrees, numClasses) 153 | copyValues(copied, extra).setParent(parent) 154 | } 155 | 156 | override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { 157 | 158 | rawPrediction match { 159 | case dv: DenseVector => 160 | val size = dv.size 161 | val sum = dv.toArray.sum 162 | 163 | var i = 0 164 | while (i < size) { 165 | dv.values(i) /= sum 166 | i += 1 167 | } 168 | 169 | dv 170 | case sv: SparseVector => 171 | throw new RuntimeException("raw2probabilityInPlace does not support SparseVector raw predictions") 172 | } 173 | } 174 | 175 | override protected def predictRaw(features: Vector): Vector = { 176 | throw new UnsupportedOperationException("predictRaw is not implemented; use transform instead") 177 | } 178 | } 179 |
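A minimal end-to-end sketch for the classifier defined above; the training and test DataFrames with the usual spark.ml `label`/`features` columns, and the parameter values, are assumptions chosen for illustration:

```scala
// Sketch only: train and apply the distributed KNN classifier.
val knn = new KNNClassifier()
  .setK(5)                // neighbours used to vote on the label
  .setTopTreeSize(1000)   // points sampled to build the top-level metric tree

val model = knn.fit(trainDF)            // trainDF: DataFrame with "label" and "features"
val scored = model.transform(testDF)    // appends rawPrediction, probability and prediction columns
scored.select("prediction").show(5)
```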
-------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/models/EWMA.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.analysis.{MultivariateFunction, MultivariateVectorFunction} 4 | import org.apache.commons.math3.optim.{InitialGuess, MaxEval, MaxIter, SimpleValueChecker} 5 | import org.apache.commons.math3.optim.nonlinear.scalar.{GoalType, ObjectiveFunction, ObjectiveFunctionGradient} 6 | import org.apache.commons.math3.optim.nonlinear.scalar.gradient.NonLinearConjugateGradientOptimizer 7 | import org.apache.spark.ml.{Estimator, Model} 8 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 9 | import org.apache.spark.ml.param.{Param, ParamMap} 10 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams 11 | import org.apache.spark.ml.util.Identifiable 12 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 13 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType} 14 | 15 | /** 16 | * Fits an Exponentially Weight Moving Average model (EWMA) to a time series. 17 | */ 18 | 19 | trait EWMAParams extends TimeSeriesParams { 20 | final val maxEval = new Param[Int](this, "maxEval", "max eval") 21 | def setMaxEval(value: Int): this.type = set(maxEval, value) 22 | 23 | final val maxIter = new Param[Int](this, "maxIter", "max iteration") 24 | def setMaxIter(value: Int): this.type = set(maxIter, value) 25 | 26 | final val initPoint = new Param[Double](this, "initPoint", "init point") 27 | def setInitPoint(value: Double): this.type = set(initPoint, value) 28 | } 29 | 30 | class EWMA(override val uid: String) extends Estimator[EWMAModel] with EWMAParams{ 31 | 32 | setDefault(timeCol -> "time", 33 | timeSeriesCol -> "timeseries") 34 | 35 | def this() = this(Identifiable.randomUID("EWMA")) 36 | 37 | /** 38 | * Fits an EWMA model to a time series. Uses the first point in the time series as a starting 39 | * value. Uses sum squared error as an objective function to optimize to find smoothing parameter 40 | * The model for EWMA is recursively defined as S_t = (1 - a) * X_t + a * S_{t-1}, where 41 | * a is the smoothing parameter, X is the original series, and S is the smoothed series 42 | * Note that the optimization is performed as unbounded optimization, although in its formal 43 | * definition the smoothing parameter is <= 1, which corresponds to an inequality bounded 44 | * optimization. 
Given this, the resulting smoothing parameter should always be sanity checked 45 | * https://en.wikipedia.org/wiki/Exponential_smoothing 46 | * @param dataset the time series dataset to which we want to fit an EWMA model 47 | * @return EWMA model 48 | */ 49 | override def fit(dataset: Dataset[_]): EWMAModel = { 50 | 51 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 52 | case Row(time: String, value: Double) => (time, value) 53 | }.sortByKey().collect() 54 | .map(x => x._2) 55 | 56 | val dataVector = Vectors.dense(data) 57 | 58 | val optimizer = new NonLinearConjugateGradientOptimizer( 59 | NonLinearConjugateGradientOptimizer.Formula.FLETCHER_REEVES, 60 | new SimpleValueChecker(1e-6, 1e-6)) 61 | 62 | 63 | val gradient = new ObjectiveFunctionGradient(new MultivariateVectorFunction() { 64 | def value(params: Array[Double]): Array[Double] = { 65 | val g = new EWMAModel(params(0)).gradient(dataVector) 66 | Array(g) 67 | } 68 | }) 69 | 70 | val objectiveFunction = new ObjectiveFunction(new MultivariateFunction() { 71 | def value(params: Array[Double]): Double = { 72 | new EWMAModel(params(0)).sse(dataVector) 73 | } 74 | }) 75 | // optimization parameters 76 | val initGuess = new InitialGuess(Array(${initPoint})) 77 | val goal = GoalType.MINIMIZE 78 | // optimization step 79 | val optimal = optimizer.optimize(objectiveFunction, goal, gradient, initGuess, 80 | new MaxIter(${maxIter}), new MaxEval(${maxEval})) 81 | val params = optimal.getPoint 82 | 83 | new EWMAModel(params(0)) 84 | .setTimeCol(${timeCol}) 85 | .setTimeSeriesCol(${timeSeriesCol}) 86 | 87 | } 88 | 89 | override def copy(extra: ParamMap): Estimator[EWMAModel] = defaultCopy(extra) 90 | 91 | /** 92 | * Check transform validity and derive the output schema from the input schema. 93 | * 94 | * Typical implementation should first conduct verification on schema change and parameter 95 | * validity, including complex parameter interaction checks. 
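Unlike GARCH and HoltWinters later in this listing, EWMA only declares defaults for the two column names, so maxIter, maxEval and initPoint must be set before calling fit. A usage sketch with illustrative values and an assumed DataFrame `df`:

```scala
// Sketch only: estimate the EWMA smoothing parameter by minimizing the SSE.
val model = new EWMA()
  .setTimeCol("time")
  .setTimeSeriesCol("timeseries")
  .setInitPoint(0.5)      // starting guess for the smoothing parameter
  .setMaxIter(10000)
  .setMaxEval(10000)
  .fit(df)

println(s"fitted smoothing = ${model.smoothing}")
```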
96 | */ 97 | override def transformSchema(schema: StructType): StructType = { 98 | schema 99 | } 100 | } 101 | 102 | 103 | class EWMAModel(override val uid: String, val smoothing: Double) 104 | extends Model[EWMAModel] with EWMAParams{ 105 | 106 | def this(smoothing: Double) = this(Identifiable.randomUID("EWMAModel"), smoothing) 107 | 108 | /** 109 | * Calculates the SSE for a given timeseries ts given 110 | * the smoothing parameter of the current model 111 | * The forecast for the observation at period t + 1 is the smoothed value at time t 112 | * Source: http://people.duke.edu/~rnau/411avg.htm 113 | * @param ts the time series to fit a EWMA model to 114 | * @return Sum Squared Error 115 | */ 116 | def sse(ts: Vector): Double = { 117 | val n = ts.size 118 | 119 | val smoothed = addTimeDependentEffects(ts) 120 | var i = 0 121 | var error = 0.0 122 | var sqrErrors = 0.0 123 | while (i < n - 1) { 124 | error = ts(i + 1) - smoothed(i) 125 | sqrErrors += error * error 126 | i += 1 127 | } 128 | 129 | sqrErrors 130 | } 131 | 132 | /** 133 | * Calculates the gradient of the SSE cost function for our EWMA model 134 | * @return gradient 135 | */ 136 | def gradient(ts: Vector): Double = { 137 | val n = ts.size 138 | // val smoothed = new DenseVector(Array.fill(n)(0.0)) 139 | val smoothed = addTimeDependentEffects(ts) 140 | 141 | var error = 0.0 142 | var prevSmoothed = ts(0) 143 | var prevDSda = 0.0 // derivative of the EWMA function at time t - 1: (d S(t - 1)/ d smoothing) 144 | var dSda = 0.0 // derivative of the EWMA function at time t: (d S(t) / d smoothing) 145 | var dJda = 0.0 // derivative of our SSE cost function 146 | var i = 0 147 | 148 | while (i < n - 1) { 149 | error = ts(i + 1) - smoothed(i) 150 | dSda = ts(i) - prevSmoothed + (1 - smoothing) * prevDSda 151 | dJda += error * dSda 152 | prevDSda = dSda 153 | prevSmoothed = smoothed(i) 154 | i += 1 155 | } 156 | 2 * dJda 157 | } 158 | 159 | def addTimeDependentEffects(ts: Vector): Vector = { 160 | val arr = Array.fill(ts.size)(0.0) 161 | arr(0) = ts(0) // by definition in our model S_0 = X_0 162 | for (i <- 1 until ts.size) { 163 | arr(i) = smoothing * ts(i) + (1 - smoothing) * arr(i - 1) 164 | } 165 | new DenseVector(arr) 166 | } 167 | 168 | 169 | override def copy(extra: ParamMap): EWMAModel = defaultCopy(extra) 170 | 171 | /** 172 | * Transforms the input dataset. 173 | */ 174 | override def transform(dataset: Dataset[_]): DataFrame = { 175 | 176 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 177 | case Row(time: String, value: Double) => (time, value) 178 | }.sortByKey().collect() 179 | .map(x => x._2) 180 | 181 | val dataVector = Vectors.dense(data) 182 | 183 | val res = addTimeDependentEffects(dataVector) 184 | 185 | val resRDD = dataset.sparkSession.sparkContext.parallelize(res.toArray.map(x => Row(x))) 186 | 187 | val structType = transformSchema(dataset.schema) 188 | 189 | dataset.sparkSession.createDataFrame(resRDD, structType) 190 | } 191 | 192 | /** 193 | * Check transform validity and derive the output schema from the input schema. 194 | * 195 | * Typical implementation should first conduct verification on schema change and parameter 196 | * validity, including complex parameter interaction checks. 
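As a quick check of the recursion implemented in addTimeDependentEffects (S_0 = X_0 and S_t = a * X_t + (1 - a) * S_{t-1}), a smoothing value of 0.5 applied to the series (1, 2, 3) yields (1.0, 1.5, 2.25):

```scala
// Illustrative check of the smoothing recursion, computed by hand:
//   S_0 = 1.0
//   S_1 = 0.5 * 2.0 + 0.5 * 1.0 = 1.5
//   S_2 = 0.5 * 3.0 + 0.5 * 1.5 = 2.25
val smoothed = new EWMAModel(0.5).addTimeDependentEffects(Vectors.dense(1.0, 2.0, 3.0))
// smoothed == DenseVector(1.0, 1.5, 2.25)
```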
197 | */ 198 | override def transformSchema(schema: StructType): StructType = { 199 | StructType(Array(StructField("EMA", DoubleType))) 200 | } 201 | } 202 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/models/GARCH.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import io.transwarp.discover.timeseries.params.TimeSeriesParams 4 | import io.transwarp.midas.constant.midas.params.timeseries.{GARCHParams, TimeSeriesParams} 5 | import org.apache.commons.math3.analysis.{MultivariateFunction, MultivariateVectorFunction} 6 | import org.apache.commons.math3.optim.{InitialGuess, MaxEval, MaxIter, SimpleValueChecker} 7 | import org.apache.commons.math3.optim.nonlinear.scalar.{ObjectiveFunction, ObjectiveFunctionGradient} 8 | import org.apache.commons.math3.optim.nonlinear.scalar.gradient.NonLinearConjugateGradientOptimizer 9 | import org.apache.commons.math3.random.RandomGenerator 10 | import org.apache.spark.ml.{Estimator, Model} 11 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 12 | import org.apache.spark.ml.param.{Param, ParamMap} 13 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams 14 | import org.apache.spark.ml.util.Identifiable 15 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 16 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType} 17 | 18 | /** 19 | * Created by endy on 16-12-22. 20 | */ 21 | 22 | trait GARCHParams extends TimeSeriesParams { 23 | final val maxEval = new Param[Int](this, "maxEval", "max eval") 24 | def setMaxEval(value: Int): this.type = set(maxEval, value) 25 | 26 | final val maxIter = new Param[Int](this, "maxIter", "max iteration") 27 | def setMaxIter(value: Int): this.type = set(maxIter, value) 28 | } 29 | 30 | class GARCH(override val uid: String) extends Estimator[GARCHModel] with GARCHParams{ 31 | 32 | setDefault(timeCol -> "time", 33 | timeSeriesCol -> "timeseries", 34 | maxEval -> 10000, 35 | maxIter -> 10000) 36 | 37 | def this() = this(Identifiable.randomUID("GARCH")) 38 | 39 | /** 40 | * Fits a model to the input data. 
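The variance recursion used below is the standard GARCH(1,1) form h_t = omega + alpha * x_{t-1}^2 + beta * h_{t-1}, initialized with h_0 = omega / (1 - alpha - beta). A usage sketch with an assumed DataFrame `df`; maxIter and maxEval default to 10000 in this estimator:

```scala
// Sketch only: fit GARCH(1,1) parameters to a (time, value) DataFrame.
val model = new GARCH()
  .setTimeCol("time")
  .setTimeSeriesCol("timeseries")
  .fit(df)

println(s"omega=${model.omega} alpha=${model.alpha} beta=${model.beta}")
```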
41 | */ 42 | override def fit(dataset: Dataset[_]): GARCHModel = { 43 | 44 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 45 | case Row(time: String, value: Double) => (time, value) 46 | }.sortByKey().collect() 47 | 48 | val dataVector = Vectors.dense(data.map(x => x._2)) 49 | 50 | val optimizer = new NonLinearConjugateGradientOptimizer( 51 | NonLinearConjugateGradientOptimizer.Formula.FLETCHER_REEVES, 52 | new SimpleValueChecker(1e-6, 1e-6)) 53 | 54 | val gradient = new ObjectiveFunctionGradient(new MultivariateVectorFunction() { 55 | def value(params: Array[Double]): Array[Double] = { 56 | new GARCHModel(params(0), params(1), params(2)).gradient(dataVector) 57 | } 58 | }) 59 | val objectiveFunction = new ObjectiveFunction(new MultivariateFunction() { 60 | def value(params: Array[Double]): Double = { 61 | new GARCHModel(params(0), params(1), params(2)).logLikelihood(dataVector) 62 | } 63 | }) 64 | 65 | val initialGuess = new InitialGuess(Array(.2, .2, .2)) // TODO: make this smarter 66 | 67 | val optimal = optimizer.optimize(objectiveFunction, gradient, initialGuess, 68 | new MaxIter(${maxIter}), new MaxEval(${maxEval})) 69 | 70 | val params = optimal.getPoint 71 | new GARCHModel(params(0), params(1), params(2)) 72 | .setTimeCol(${timeCol}).setTimeSeriesCol(${timeSeriesCol}) 73 | 74 | } 75 | 76 | override def copy(extra: ParamMap): Estimator[GARCHModel] = defaultCopy(extra) 77 | 78 | /** 79 | * :: DeveloperApi :: 80 | * 81 | * Check transform validity and derive the output schema from the input schema. 82 | * 83 | * Typical implementation should first conduct verification on schema change and parameter 84 | * validity, including complex parameter interaction checks. 85 | */ 86 | override def transformSchema(schema: StructType): StructType = schema 87 | } 88 | 89 | class GARCHModel(override val uid: String, val omega: Double, val alpha: Double, val beta: Double) 90 | extends Model[GARCHModel] with GARCHParams { 91 | 92 | def this(omega: Double, alpha: Double, beta: Double) = this(Identifiable.randomUID("GARCH"), 93 | omega, alpha, beta) 94 | 95 | override def copy(extra: ParamMap): GARCHModel = defaultCopy(extra) 96 | 97 | /** 98 | * Transforms the input dataset. 99 | */ 100 | override def transform(dataset: Dataset[_]): DataFrame = { 101 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 102 | case Row(time: String, value: Double) => (time, value) 103 | }.sortByKey().collect() 104 | 105 | val dataVector = Vectors.dense(data.map(x => x._2)) 106 | 107 | val dest = addTimeDependentEffects(dataVector) 108 | 109 | val resRDD = dataset.sparkSession.sparkContext.parallelize(dest.toArray.map(x => Row(x))) 110 | 111 | val structType = transformSchema(dataset.schema) 112 | 113 | dataset.sparkSession.createDataFrame(resRDD, structType) 114 | } 115 | 116 | /** 117 | * :: DeveloperApi :: 118 | * 119 | * Check transform validity and derive the output schema from the input schema. 120 | * 121 | * Typical implementation should first conduct verification on schema change and parameter 122 | * validity, including complex parameter interaction checks. 123 | */ 124 | override def transformSchema(schema: StructType): StructType = 125 | StructType(Array(StructField("GARCH", DoubleType))) 126 | /** 127 | * Returns the log likelihood of the parameters on the given time series. 
128 | * 129 | * Based on https://pdfs.semanticscholar.org/7da8/bfa5295375c1141d797e80065a599153c19d.pdf 130 | */ 131 | def logLikelihood(ts: Vector): Double = { 132 | var sum = 0.0 133 | iterateWithHAndEta(ts) { (i, h, eta, prevH, prevEta) => 134 | sum += -.5 * math.log(h) - .5 * eta * eta / h 135 | } 136 | sum + -.5 * math.log(2 * math.Pi) * (ts.size - 1) 137 | } 138 | 139 | private def iterateWithHAndEta(ts: Vector) 140 | (fn: (Int, Double, Double, Double, Double) => Unit): Unit = { 141 | var prevH = omega / (1 - alpha - beta) 142 | var i = 1 143 | while (i < ts.size) { 144 | val h = omega + alpha * ts(i - 1) * ts(i - 1) + beta * prevH 145 | fn(i, h, ts(i), prevH, ts(i - 1)) 146 | prevH = h 147 | i += 1 148 | } 149 | } 150 | 151 | def gradient(ts: Vector): Array[Double] = { 152 | var omegaGradient = 0.0 153 | var alphaGradient = 0.0 154 | var betaGradient = 0.0 155 | var omegaDhdtheta = 0.0 156 | var alphaDhdtheta = 0.0 157 | var betaDhdtheta = 0.0 158 | iterateWithHAndEta(ts) { (i, h, eta, prevH, prevEta) => 159 | omegaDhdtheta = 1 + beta * omegaDhdtheta 160 | alphaDhdtheta = prevEta * prevEta + beta * alphaDhdtheta 161 | betaDhdtheta = prevH + beta * betaDhdtheta 162 | 163 | val multiplier = (eta * eta / (h * h)) - (1 / h) 164 | omegaGradient += multiplier * omegaDhdtheta 165 | alphaGradient += multiplier * alphaDhdtheta 166 | betaGradient += multiplier * betaDhdtheta 167 | } 168 | Array(omegaGradient * .5, alphaGradient * .5, betaGradient * .5) 169 | } 170 | 171 | def addTimeDependentEffects(ts: Vector): Vector = { 172 | 173 | val destArr = new Array[Double](ts.size) 174 | 175 | var prevVariance = omega / (1.0 - alpha - beta) 176 | var prevEta = ts(0) * math.sqrt(prevVariance) 177 | 178 | destArr(0) = prevEta 179 | for (i <- 1 until ts.size) { 180 | val variance = omega + alpha * prevEta * prevEta + beta * prevVariance 181 | val standardizedEta = ts(i) 182 | val eta = standardizedEta * math.sqrt(variance) 183 | destArr(i) = eta 184 | 185 | prevEta = eta 186 | prevVariance = variance 187 | } 188 | new DenseVector(destArr) 189 | } 190 | 191 | private def sampleWithVariances(n: Int, rand: RandomGenerator): (Array[Double], Array[Double]) = { 192 | val ts = new Array[Double](n) 193 | val variances = new Array[Double](n) 194 | variances(0) = omega / (1 - alpha - beta) 195 | var eta = math.sqrt(variances(0)) * rand.nextGaussian() 196 | for (i <- 1 until n) { 197 | variances(i) = omega + beta * variances(i-1) + alpha * eta * eta 198 | eta = math.sqrt(variances(i)) * rand.nextGaussian() 199 | ts(i) = eta 200 | } 201 | 202 | (ts, variances) 203 | } 204 | 205 | /** 206 | * Samples a random time series of a given length with the properties of the model. 207 | * 208 | * @param n The length of the time series to sample. 209 | * @param rand The random generator used to generate the observations. 210 | * @return The samples time series. 
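A reproducible simulated series can be drawn by passing a seeded commons-math3 generator; the parameter values below are arbitrary illustrative choices (with alpha + beta < 1 so the unconditional variance stays finite):

```scala
import org.apache.commons.math3.random.MersenneTwister

// Sketch only: simulate 500 observations from a GARCH(1,1) model.
val model = new GARCHModel(0.1, 0.2, 0.7)   // omega, alpha, beta
val simulated: Array[Double] = model.sample(500, new MersenneTwister(42L))
```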
211 | */ 212 | def sample(n: Int, rand: RandomGenerator): Array[Double] = sampleWithVariances(n, rand)._1 213 | } 214 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/HoltWintersSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.spark.SparkFunSuite 4 | import org.apache.spark.ml.util.DefaultReadWriteTest 5 | import org.apache.spark.mllib.util.MLlibTestSparkContext 6 | import org.apache.spark.mllib.util.TestingUtils._ 7 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 8 | import org.apache.spark.sql.{Dataset, Row} 9 | 10 | /** 11 | * Created by endy on 16-12-21. 12 | */ 13 | class HoltWintersSuite extends SparkFunSuite with MLlibTestSparkContext 14 | with DefaultReadWriteTest { 15 | 16 | @transient var dataSet: Dataset[_] = _ 17 | @transient var dataSet2: Dataset[_] = _ 18 | 19 | val tsAirPassengers = Array( 20 | 112.0, 118.0, 132.0, 129.0, 121.0, 135.0, 148.0, 148.0, 136.0, 119.0, 104.0, 118.0, 115.0, 21 | 126.0, 141.0, 135.0, 125.0, 149.0, 170.0, 170.0, 158.0, 133.0, 114.0, 140.0, 145.0, 150.0, 22 | 178.0, 163.0, 172.0, 178.0, 199.0, 199.0, 184.0, 162.0, 146.0, 166.0, 171.0, 180.0, 193.0, 23 | 181.0, 183.0, 218.0, 230.0, 242.0, 209.0, 191.0, 172.0, 194.0, 196.0, 196.0, 236.0, 235.0, 24 | 229.0, 243.0, 264.0, 272.0, 237.0, 211.0, 180.0, 201.0, 204.0, 188.0, 235.0, 227.0, 234.0, 25 | 264.0, 302.0, 293.0, 259.0, 229.0, 203.0, 229.0, 242.0, 233.0, 267.0, 269.0, 270.0, 315.0, 26 | 364.0, 347.0, 312.0, 274.0, 237.0, 278.0, 284.0, 277.0, 317.0, 313.0, 318.0, 374.0, 413.0, 27 | 405.0, 355.0, 306.0, 271.0, 306.0, 315.0, 301.0, 356.0, 348.0, 355.0, 422.0, 465.0, 467.0, 28 | 404.0, 347.0, 305.0, 336.0, 340.0, 318.0, 362.0, 348.0, 363.0, 435.0, 491.0, 505.0, 404.0, 29 | 359.0, 310.0, 337.0, 360.0, 342.0, 406.0, 396.0, 420.0, 472.0, 548.0, 559.0, 463.0, 407.0, 30 | 362.0, 405.0, 417.0, 391.0, 419.0, 461.0, 472.0, 535.0, 622.0, 606.0, 508.0, 461.0, 390.0, 31 | 432.0) 32 | 33 | val tsCO2 = Array( 34 | 315.42, 316.31, 316.50, 317.56, 318.13, 318.00, 316.39, 314.65, 313.68, 313.18, 314.66, 315.43, 35 | 316.27, 316.81, 317.42, 318.87, 319.87, 319.43, 318.01, 315.74, 314.00, 313.68, 314.84, 316.03, 36 | 316.73, 317.54, 318.38, 319.31, 320.42, 319.61, 318.42, 316.63, 314.83, 315.16, 315.94, 316.85, 37 | 317.78, 318.40, 319.53, 320.42, 320.85, 320.45, 319.45, 317.25, 316.11, 315.27, 316.53, 317.53, 38 | 318.58, 318.92, 319.70, 321.22, 322.08, 321.31, 319.58, 317.61, 316.05, 315.83, 316.91, 318.20, 39 | 319.41, 320.07, 320.74, 321.40, 322.06, 321.73, 320.27, 318.54, 316.54, 316.71, 317.53, 318.55, 40 | 319.27, 320.28, 320.73, 321.97, 322.00, 321.71, 321.05, 318.71, 317.66, 317.14, 318.70, 319.25, 41 | 320.46, 321.43, 322.23, 323.54, 323.91, 323.59, 322.24, 320.20, 318.48, 317.94, 319.63, 320.87, 42 | 322.17, 322.34, 322.88, 324.25, 324.83, 323.93, 322.38, 320.76, 319.10, 319.24, 320.56, 321.80, 43 | 322.40, 322.99, 323.73, 324.86, 325.40, 325.20, 323.98, 321.95, 320.18, 320.09, 321.16, 322.74, 44 | 323.83, 324.26, 325.47, 326.50, 327.21, 326.54, 325.72, 323.50, 322.22, 321.62, 322.69, 323.95, 45 | 324.89, 325.82, 326.77, 327.97, 327.91, 327.50, 326.18, 324.53, 322.93, 322.90, 323.85, 324.96, 46 | 326.01, 326.51, 327.01, 327.62, 328.76, 328.40, 327.20, 325.27, 323.20, 323.40, 324.63, 325.85, 47 | 326.60, 327.47, 327.58, 329.56, 329.90, 328.92, 327.88, 326.16, 324.68, 
325.04, 326.34, 327.39, 48 | 328.37, 329.40, 330.14, 331.33, 332.31, 331.90, 330.70, 329.15, 327.35, 327.02, 327.99, 328.48, 49 | 329.18, 330.55, 331.32, 332.48, 332.92, 332.08, 331.01, 329.23, 327.27, 327.21, 328.29, 329.41, 50 | 330.23, 331.25, 331.87, 333.14, 333.80, 333.43, 331.73, 329.90, 328.40, 328.17, 329.32, 330.59, 51 | 331.58, 332.39, 333.33, 334.41, 334.71, 334.17, 332.89, 330.77, 329.14, 328.78, 330.14, 331.52, 52 | 332.75, 333.24, 334.53, 335.90, 336.57, 336.10, 334.76, 332.59, 331.42, 330.98, 332.24, 333.68, 53 | 334.80, 335.22, 336.47, 337.59, 337.84, 337.72, 336.37, 334.51, 332.60, 332.38, 333.75, 334.78, 54 | 336.05, 336.59, 337.79, 338.71, 339.30, 339.12, 337.56, 335.92, 333.75, 333.70, 335.12, 336.56, 55 | 337.84, 338.19, 339.91, 340.60, 341.29, 341.00, 339.39, 337.43, 335.72, 335.84, 336.93, 338.04, 56 | 339.06, 340.30, 341.21, 342.33, 342.74, 342.08, 340.32, 338.26, 336.52, 336.68, 338.19, 339.44, 57 | 340.57, 341.44, 342.53, 343.39, 343.96, 343.18, 341.88, 339.65, 337.81, 337.69, 339.09, 340.32, 58 | 341.20, 342.35, 342.93, 344.77, 345.58, 345.14, 343.81, 342.21, 339.69, 339.82, 340.98, 342.82, 59 | 343.52, 344.33, 345.11, 346.88, 347.25, 346.62, 345.22, 343.11, 340.90, 341.18, 342.80, 344.04, 60 | 344.79, 345.82, 347.25, 348.17, 348.74, 348.07, 346.38, 344.51, 342.92, 342.62, 344.06, 345.38, 61 | 346.11, 346.78, 347.68, 349.37, 350.03, 349.37, 347.76, 345.73, 344.68, 343.99, 345.48, 346.72, 62 | 347.84, 348.29, 349.23, 350.80, 351.66, 351.07, 349.33, 347.92, 346.27, 346.18, 347.64, 348.78, 63 | 350.25, 351.54, 352.05, 353.41, 354.04, 353.62, 352.22, 350.27, 348.55, 348.72, 349.91, 351.18, 64 | 352.60, 352.92, 353.53, 355.26, 355.52, 354.97, 353.75, 351.52, 349.64, 349.83, 351.14, 352.37, 65 | 353.50, 354.55, 355.23, 356.04, 357.00, 356.07, 354.67, 352.76, 350.82, 351.04, 352.69, 354.07, 66 | 354.59, 355.63, 357.03, 358.48, 359.22, 358.12, 356.06, 353.92, 352.05, 352.11, 353.64, 354.89, 67 | 355.88, 356.63, 357.72, 359.07, 359.58, 359.17, 356.94, 354.92, 352.94, 353.23, 354.09, 355.33, 68 | 356.63, 357.10, 358.32, 359.41, 360.23, 359.55, 357.53, 355.48, 353.67, 353.95, 355.30, 356.78, 69 | 358.34, 358.89, 359.95, 361.25, 361.67, 360.94, 359.55, 357.49, 355.84, 356.00, 357.59, 359.05, 70 | 359.98, 361.03, 361.66, 363.48, 363.82, 363.30, 361.94, 359.50, 358.11, 357.80, 359.61, 360.74, 71 | 362.09, 363.29, 364.06, 364.76, 365.45, 365.01, 363.70, 361.54, 359.51, 359.65, 360.80, 362.38, 72 | 363.23, 364.06, 364.61, 366.40, 366.84, 365.68, 364.52, 362.57, 360.24, 360.83, 362.49, 364.34 73 | ) 74 | 75 | override def beforeAll(): Unit = { 76 | super.beforeAll() 77 | 78 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 79 | DoubleType))) 80 | 81 | var data = tsAirPassengers.zipWithIndex.map(x => (x._2.formatted("%011d"), x._1)) 82 | val rdd = sc.parallelize(data.map(x => Row(x._1, x._2))) 83 | dataSet = spark.createDataFrame(rdd, schema) 84 | 85 | data = tsCO2.zipWithIndex.map(x => (x._2.formatted("%011d"), x._1)) 86 | val rdd2 = sc.parallelize(data.map(x => Row(x._1, x._2))) 87 | dataSet2 = spark.createDataFrame(rdd2, schema) 88 | } 89 | 90 | test("Optimal Paramaters alpha beta gamma - Additive Model") { 91 | val model = new HoltWinters() 92 | .setTimeCol("time") 93 | .setTimeSeriesCol("timeseries") 94 | .setModelType("additive") 95 | .setPeriod(12) 96 | .setMaxIter(30000) 97 | .setMaxEval(30000) 98 | .fit(dataSet) 99 | 100 | assert(model.alpha ~== 0.24796 absTol 0.01 ) 101 | assert(model.beta ~== 0.03453 absTol 0.01 ) 102 | 
assert(model.gamma ~== 1.0 absTol 0.01 ) 103 | } 104 | 105 | test("Forecast - Additive Model") { 106 | val model = new HoltWinters() 107 | .setTimeCol("time") 108 | .setTimeSeriesCol("timeseries") 109 | .setModelType("additive") 110 | .setPeriod(12) 111 | .setMaxIter(30000) 112 | .setMaxEval(30000) 113 | .fit(dataSet) 114 | 115 | val forecasted = model.transform(dataSet).collect().map{ 116 | case Row(x: Double) => x 117 | } 118 | 119 | val actualForecasted = new Array[Double](12) 120 | actualForecasted(0) = 453.4977 121 | actualForecasted(1) = 429.3906 122 | actualForecasted(2) = 467.0361 123 | actualForecasted(3) = 503.2574 124 | actualForecasted(4) = 512.3395 125 | actualForecasted(5) = 571.8880 126 | actualForecasted(6) = 652.6095 127 | actualForecasted(7) = 637.4623 128 | actualForecasted(8) = 539.7548 129 | actualForecasted(9) = 490.7250 130 | actualForecasted(10) = 424.4593 131 | actualForecasted(11) = 469.5315 132 | 133 | for (i <- 0 until 12) { 134 | assert(forecasted(i) ~== actualForecasted(i) absTol 10) 135 | } 136 | } 137 | 138 | 139 | test("Optimal Paramaters alpha beta gamma - Multiplicative Model") { 140 | val model = new HoltWinters() 141 | .setTimeCol("time") 142 | .setTimeSeriesCol("timeseries") 143 | .setModelType("multiplicative") 144 | .setPeriod(12) 145 | .setMaxIter(30000) 146 | .setMaxEval(30000) 147 | .fit(dataSet2) 148 | 149 | assert(model.alpha ~== 0.51265 absTol 0.01 ) 150 | assert(model.beta ~== 0.00949 absTol 0.01 ) 151 | assert(model.gamma ~== 0.47289 absTol 0.1 ) 152 | } 153 | 154 | test("Forecast - Multiplicative Model") { 155 | val model = new HoltWinters() 156 | .setTimeCol("time") 157 | .setTimeSeriesCol("timeseries") 158 | .setModelType("multiplicative") 159 | .setPeriod(12) 160 | .setMaxIter(30000) 161 | .setMaxEval(30000) 162 | .fit(dataSet2) 163 | 164 | val forecasted = model.transform(dataSet2).collect().map{ 165 | case Row(x: Double) => x 166 | } 167 | 168 | val actualForecasted = new Array[Double](12) 169 | actualForecasted(0) = 365.1079 170 | actualForecasted(1) = 365.9664 171 | actualForecasted(2) = 366.7343 172 | actualForecasted(3) = 368.1364 173 | actualForecasted(4) = 368.6674 174 | actualForecasted(5) = 367.9508 175 | actualForecasted(6) = 366.5318 176 | actualForecasted(7) = 364.3799 177 | actualForecasted(8) = 362.4731 178 | actualForecasted(9) = 362.7520 179 | actualForecasted(10) = 364.2203 180 | actualForecasted(11) = 365.6741 181 | 182 | for (i <- 0 until 12) { 183 | assert(forecasted(i) ~== actualForecasted(i) absTol 10) 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/ARIMASuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.random.{MersenneTwister, RandomGenerator} 4 | import org.apache.spark.SparkFunSuite 5 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 6 | import org.apache.spark.ml.timeseries.UnivariateTimeSeries 7 | import org.apache.spark.ml.util.DefaultReadWriteTest 8 | import org.apache.spark.mllib.util.MLlibTestSparkContext 9 | import org.apache.spark.mllib.util.TestingUtils._ 10 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 11 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 12 | 13 | 14 | /** 15 | * Created by endy on 16-12-20. 
16 | */ 17 | class ARIMASuite extends SparkFunSuite with MLlibTestSparkContext 18 | with DefaultReadWriteTest { 19 | 20 | @transient var dataSet: Dataset[_] = _ 21 | test("compare with R") { 22 | // > R.Version()$version.string 23 | // [1] "R version 3.2.0 (2015-04-16)" 24 | // > set.seed(456) 25 | // y <- arima.sim(n=250,list(ar=0.3,ma=0.7),mean = 5) 26 | // write.table(y, file = "resources/R_ARIMA_DataSet1.csv", row.names = FALSE, col.names = FALSE) 27 | val dataFile = getClass.getResource("/timeseries/R_ARIMA_DataSet1.csv").toString 28 | 29 | val rawData = sc.textFile(dataFile).map(line => line.toDouble) 30 | .collect().zipWithIndex 31 | 32 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 33 | DoubleType))) 34 | 35 | val rdd = sc.parallelize(rawData.map(x => Row(x._2.formatted("%05d"), x._1))) 36 | val dataset = spark.createDataFrame(rdd, schema) 37 | 38 | val model = new ARIMA() 39 | .setP(1) 40 | .setD(0) 41 | .setQ(1) 42 | .setTimeCol("time") 43 | .setTimeSeriesCol("timeseries") 44 | .fit(dataset) 45 | 46 | val Array(c, ar, ma) = model.coefficient 47 | assert(ar ~== 0.3 absTol 0.05) 48 | assert(ma ~== 0.7 absTol 0.05) 49 | } 50 | 51 | test("Data sampled from a given model should result in similar model if fit") { 52 | val rand = new MersenneTwister(10L) 53 | val model = new ARIMAModel(2, 1, 2, Array(8.2, 0.2, 0.5, 0.3, 0.1)) 54 | val (_, sampled) = sample(1000, rand, model) 55 | 56 | val newModel = new ARIMA() 57 | .setP(2) 58 | .setD(1) 59 | .setQ(2) 60 | .setTimeCol("time") 61 | .setTimeSeriesCol("timeseries") 62 | .fit(sampled) 63 | 64 | val Array(c, ar1, ar2, ma1, ma2) = model.coefficient 65 | val Array(cTest, ar1Test, ar2Test, ma1Test, ma2Test) = newModel.coefficient 66 | 67 | // intercept is given more leeway 68 | assert(c ~== cTest absTol 1) 69 | assert(ar1Test ~== ar1 absTol 0.1) 70 | assert(ma1Test ~== ma1 absTol 0.1) 71 | assert(ar2Test ~== ar2 absTol 0.1) 72 | assert(ma2Test ~== ma2 absTol 0.1) 73 | } 74 | 75 | test("Fitting CSS with BOBYQA and conjugate gradient descent should be fairly similar") { 76 | val rand = new MersenneTwister(10L) 77 | val model = new ARIMAModel(2, 1, 2, Array(8.2, 0.2, 0.5, 0.3, 0.1)) 78 | val (_, sampled) = sample(1000, rand, model) 79 | 80 | val fitWithBOBYQA = new ARIMA() 81 | .setP(2) 82 | .setD(1) 83 | .setQ(2) 84 | .setTimeCol("time") 85 | .setTimeSeriesCol("timeseries") 86 | .setMethod("css-bobyqa") 87 | .fit(sampled) 88 | 89 | val fitWithCGD = new ARIMA() 90 | .setP(2) 91 | .setD(1) 92 | .setQ(2) 93 | .setTimeCol("time") 94 | .setTimeSeriesCol("timeseries") 95 | .setMethod("css-cgd") 96 | .fit(sampled) 97 | 98 | val Array(c, ar1, ar2, ma1, ma2) = fitWithBOBYQA.coefficient 99 | val Array(cCGD, ar1CGD, ar2CGD, ma1CGD, ma2CGD) = fitWithCGD.coefficient 100 | 101 | // give more leeway for intercept 102 | assert(cCGD ~== c absTol 1) 103 | assert(ar1CGD ~== ar1 absTol 0.1) 104 | assert(ar2CGD ~== ar2 absTol 0.1) 105 | assert(ma1CGD ~== ma1 absTol 0.1) 106 | assert(ma2CGD ~== ma2 absTol 0.1) 107 | } 108 | 109 | test("Fitting ARIMA(p, d, q) should be the same as fitting a d-order differenced ARMA(p, q)") { 110 | val rand = new MersenneTwister(10L) 111 | val model = new ARIMAModel(1, 1, 2, Array(0.3, 0.7, 0.1), hasIntercept = false) 112 | val (vec, sampled) = sample(1000, rand, model) 113 | 114 | val arimaModel = new ARIMA() 115 | .setP(1) 116 | .setD(1) 117 | .setQ(2) 118 | .setTimeCol("time") 119 | .setTimeSeriesCol("timeseries") 120 | .setIncludeIntercept(false) 121 | .fit(sampled) 122 | 123 | 124 | val 
differenceSample = UnivariateTimeSeries.differencesOfOrderD(vec, 1).toArray.drop(1) 125 | 126 | val dataFrame = genDf(differenceSample) 127 | 128 | val armaModel = new ARIMA() 129 | .setP(1) 130 | .setD(0) 131 | .setQ(2) 132 | .setTimeCol("time") 133 | .setTimeSeriesCol("timeseries") 134 | .setIncludeIntercept(false) 135 | .fit(dataFrame) 136 | 137 | val Array(refAR, refMA1, refMA2) = model.coefficient 138 | val Array(iAR, iMA1, iMA2) = arimaModel.coefficient 139 | val Array(ar, ma1, ma2) = armaModel.coefficient 140 | 141 | // ARIMA model should match parameters used to sample, to some extent 142 | assert(iAR ~== refAR absTol 0.05) 143 | assert(iMA1 ~== refMA1 absTol 0.05) 144 | assert(iMA2 ~== refMA2 absTol 0.05) 145 | 146 | // ARMA model parameters of differenced sample should be equal to ARIMA model parameters 147 | assert(ar == iAR) 148 | assert(ma1 == iMA1) 149 | assert(ma2 == iMA2) 150 | } 151 | 152 | test("Fitting ARIMA(0, 0, 0) with intercept term results in model with average as parameter") { 153 | val rand = new MersenneTwister(10L) 154 | val (vec, sampled) = sample(100, rand) 155 | 156 | val model = new ARIMA() 157 | .setP(0) 158 | .setD(0) 159 | .setQ(0) 160 | .setTimeCol("time") 161 | .setTimeSeriesCol("timeseries") 162 | .fit(sampled) 163 | 164 | val mean = vec.toArray.sum / vec.size 165 | 166 | assert(model.coefficient(0) ~== mean absTol 1e-4) 167 | } 168 | 169 | test("Fitting ARIMA(0, 0, 0) with intercept term results in model with average as the forecast") { 170 | val rand = new MersenneTwister(10L) 171 | val (vec, sampled) = sample(100, rand) 172 | val model = new ARIMA() 173 | .setP(0) 174 | .setD(0) 175 | .setQ(0) 176 | .setTimeCol("time") 177 | .setTimeSeriesCol("timeseries") 178 | .fit(sampled) 179 | 180 | val mean = vec.toArray.sum / vec.size 181 | 182 | assert(model.coefficient(0) ~== mean absTol 1e-4) 183 | val forecast = model 184 | .setNFuture(10).transform(sampled).collect() 185 | .map{case Row(s: Double) => s} 186 | 187 | for(i <- 100 until 110) { 188 | assert(forecast(i) ~== mean absTol 1e-4) 189 | } 190 | } 191 | 192 | test("Fitting an integrated time series of order 3") { 193 | // > set.seed(10) 194 | // > vals <- arima.sim(list(ma = c(0.2), order = c(0, 3, 1)), 200) 195 | // > arima(order = c(0, 3, 1), vals, method = "CSS") 196 | // 197 | // Call: 198 | // arima(x = vals, order = c(0, 3, 1), method = "CSS") 199 | // 200 | // Coefficients: 201 | // ma1 202 | // 0.2523 203 | // s.e. 0.0623 204 | // 205 | // sigma^2 estimated as 0.9218: part log likelihood = -275.65 206 | // > write.table(y, file = "resources/R_ARIMA_DataSet2.csv", row.names = FALSE, col.names = 207 | // FALSE) 208 | val dataFile = getClass.getResource("/timeseries/R_ARIMA_DataSet2.csv").toString 209 | val rawData = sc.textFile(dataFile).map(line => line.toDouble) 210 | .collect().zipWithIndex 211 | 212 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 213 | DoubleType))) 214 | 215 | val rdd = sc.parallelize(rawData.map(x => Row(x._2.formatted("%05d"), x._1))) 216 | val dataset = spark.createDataFrame(rdd, schema) 217 | val model = new ARIMA() 218 | .setP(0) 219 | .setD(3) 220 | .setQ(1) 221 | .setTimeCol("time") 222 | .setTimeSeriesCol("timeseries") 223 | .fit(dataset) 224 | 225 | val Array(c, ma) = model.coefficient 226 | assert(ma ~== 0.2 absTol 0.05) 227 | } 228 | /** 229 | * Sample a series of size n assuming an ARIMA(p, d, q) process. 
230 | * 231 | * @param n size of sample 232 | * @return series reflecting ARIMA(p, d, q) process 233 | */ 234 | def sample(n: Int, rand: RandomGenerator, model: ARIMAModel): (Vector, DataFrame) = { 235 | val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian)) 236 | val res = model.addTimeDependentEffects(vec, vec).toArray 237 | 238 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 239 | DoubleType))) 240 | 241 | val rdd = sc.parallelize(res.zipWithIndex.map(x => Row(x._2.formatted("%05d"), x._1))) 242 | 243 | (Vectors.dense(res), spark.createDataFrame(rdd, schema)) 244 | } 245 | 246 | /** 247 | * Sample a series of size n assuming an ARIMA(p, d, q) process. 248 | * 249 | * @param n size of sample 250 | * @return series reflecting ARIMA(p, d, q) process 251 | */ 252 | def sample(n: Int, rand: RandomGenerator): (Vector, DataFrame) = { 253 | val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian)).toArray 254 | 255 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 256 | DoubleType))) 257 | 258 | val rdd = sc.parallelize(vec.zipWithIndex.map(x => Row(x._2.formatted("%05d"), x._1))) 259 | 260 | (Vectors.dense(vec), spark.createDataFrame(rdd, schema)) 261 | } 262 | 263 | def genDf(array: Array[Double]): DataFrame = { 264 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 265 | DoubleType))) 266 | 267 | val rdd = spark.sparkContext.parallelize( 268 | array.zipWithIndex.map(x => Row(x._2.formatted("%010d"), x._1))) 269 | 270 | spark.createDataFrame(rdd, schema) 271 | } 272 | 273 | } 274 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/DBSCAN.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import org.apache.spark.ml.dbscan.DBSCANLabeledPoint.Flag 4 | import org.apache.spark.internal.Logging 5 | import org.apache.spark.ml.linalg.Vector 6 | import org.apache.spark.rdd.RDD 7 | 8 | /** 9 | * Top level method for calling DBSCAN 10 | */ 11 | object DBSCAN { 12 | 13 | /** 14 | * Train a DBSCAN Model using the given set of parameters 15 | * 16 | * @param data training points stored as `RDD[Vector]` 17 | * only the first two points of the vector are taken into consideration 18 | * @param eps the maximum distance between two points for them to be considered as part 19 | * of the same region 20 | * @param minPoints the minimum number of points required to form a dense region 21 | * @param maxPointsPerPartition the largest number of points in a single partition 22 | */ 23 | def train( 24 | data: RDD[Vector], 25 | eps: Double, 26 | minPoints: Int, 27 | maxPointsPerPartition: Int): DBSCAN = { 28 | 29 | new DBSCAN(eps, minPoints, maxPointsPerPartition, null, null).train(data) 30 | 31 | } 32 | 33 | } 34 | 35 | /** 36 | * A parallel implementation of DBSCAN clustering. The implementation will split the data space 37 | * into a number of partitions, making a best effort to keep the number of points in each 38 | * partition under `maxPointsPerPartition`. After partitioning, traditional DBSCAN 39 | * clustering will be run in parallel for each partition and finally the results 40 | * of each partition will be merged to identify global clusters. 41 | * 42 | * This is an iterative algorithm that will make multiple passes over the data, 43 | * any given RDDs should be cached by the user. 
44 | */ 45 | class DBSCAN private ( val eps: Double, 46 | val minPoints: Int, 47 | val maxPointsPerPartition: Int, 48 | @transient val partitions: List[(Int, DBSCANRectangle)], 49 | @transient private val labeledPartitionedPoints: 50 | RDD[(Int, DBSCANLabeledPoint)]) 51 | 52 | extends Serializable with Logging { 53 | 54 | type Margins = (DBSCANRectangle, DBSCANRectangle, DBSCANRectangle) 55 | type ClusterId = (Int, Int) 56 | 57 | def minimumRectangleSize: Double = 2 * eps 58 | 59 | def labeledPoints: RDD[DBSCANLabeledPoint] = { 60 | labeledPartitionedPoints.values 61 | } 62 | 63 | private def train(vectors: RDD[Vector]): DBSCAN = { 64 | // generate the smallest rectangles that split the space 65 | // and count how many points are contained in each one of them 66 | val minimumRectanglesWithCount = 67 | vectors 68 | .map(toMinimumBoundingRectangle) 69 | .map((_, 1)) 70 | .aggregateByKey(0)(_ + _, _ + _) 71 | .collect() 72 | .toSet 73 | 74 | // find the best partitions for the data space 75 | val localPartitions = EvenSplitPartitioner 76 | .partition(minimumRectanglesWithCount, maxPointsPerPartition, minimumRectangleSize) 77 | 78 | logDebug("Found partitions: ") 79 | localPartitions.foreach(p => logDebug(p.toString)) 80 | 81 | // grow partitions to include eps 82 | val localMargins = 83 | localPartitions 84 | .map({ case (p, _) => (p.shrink(eps), p, p.shrink(-eps)) }) 85 | .zipWithIndex 86 | 87 | val margins = vectors.context.broadcast(localMargins) 88 | 89 | // assign each point to its proper partition 90 | val duplicated = for { 91 | point <- vectors.map(DBSCANPoint) 92 | ((inner, main, outer), id) <- margins.value 93 | if outer.contains(point) 94 | } yield (id, point) 95 | 96 | val numOfPartitions = localPartitions.size 97 | 98 | // perform local dbscan 99 | val clustered = 100 | duplicated 101 | .groupByKey(numOfPartitions) 102 | .flatMapValues(points => 103 | new LocalDBSCANNaive(eps, minPoints).fit(points)) 104 | .cache() 105 | 106 | // find all candidate points for merging clusters and group them 107 | val mergePoints = 108 | clustered 109 | .flatMap({ 110 | case (partition, point) => 111 | margins.value 112 | .filter({ 113 | case ((inner, main, _), _) => main.contains(point) && !inner.almostContains(point) 114 | }) 115 | .map({ 116 | case (_, newPartition) => (newPartition, (partition, point)) 117 | }) 118 | }) 119 | .groupByKey() 120 | 121 | logDebug("About to find adjacencies") 122 | // find all clusters with aliases from merging candidates 123 | val adjacencies = 124 | mergePoints 125 | .flatMapValues(findAdjacencies) 126 | .values 127 | .collect() 128 | 129 | // generated adjacency graph 130 | val adjacencyGraph = adjacencies.foldLeft(DBSCANGraph[ClusterId]()) { 131 | case (graph, (from, to)) => graph.connect(from, to) 132 | } 133 | 134 | logDebug("About to find all cluster ids") 135 | // find all cluster ids 136 | val localClusterIds = 137 | clustered 138 | .filter({ case (_, point) => point.flag != Flag.Noise }) 139 | .mapValues(_.cluster) 140 | .distinct() 141 | .collect() 142 | .toList 143 | 144 | // assign a global Id to all clusters, where connected clusters get the same id 145 | val (total, clusterIdToGlobalId) = localClusterIds.foldLeft((0, Map[ClusterId, Int]())) { 146 | case ((id, map), clusterId) => { 147 | 148 | map.get(clusterId) match { 149 | case None => { 150 | val nextId = id + 1 151 | val connectedClusters = adjacencyGraph.getConnected(clusterId) + clusterId 152 | logDebug(s"Connected clusters $connectedClusters") 153 | val toadd = connectedClusters.map((_, 
nextId)).toMap 154 | (nextId, map ++ toadd) 155 | } 156 | case Some(x) => 157 | (id, map) 158 | } 159 | 160 | } 161 | } 162 | 163 | logDebug("Global Clusters") 164 | clusterIdToGlobalId.foreach(e => logDebug(e.toString)) 165 | logInfo(s"Total Clusters: ${localClusterIds.size}, Unique: $total") 166 | 167 | val clusterIds = vectors.context.broadcast(clusterIdToGlobalId) 168 | 169 | logDebug("About to relabel inner points") 170 | // relabel non-duplicated points 171 | val labeledInner = 172 | clustered 173 | .filter(isInnerPoint(_, margins.value)) 174 | .map { 175 | case (partition, point) => { 176 | 177 | if (point.flag != Flag.Noise) { 178 | point.cluster = clusterIds.value((partition, point.cluster)) 179 | } 180 | 181 | (partition, point) 182 | } 183 | } 184 | 185 | logDebug("About to relabel outer points") 186 | // de-duplicate and label merge points 187 | val labeledOuter = 188 | mergePoints.flatMapValues(partition => { 189 | partition.foldLeft(Map[DBSCANPoint, DBSCANLabeledPoint]())({ 190 | case (all, (partition, point)) => 191 | 192 | if (point.flag != Flag.Noise) { 193 | point.cluster = clusterIds.value((partition, point.cluster)) 194 | } 195 | 196 | all.get(point) match { 197 | case None => all + (point -> point) 198 | case Some(prev) => { 199 | // override previous entry unless new entry is noise 200 | if (point.flag != Flag.Noise) { 201 | prev.flag = point.flag 202 | prev.cluster = point.cluster 203 | } 204 | all 205 | } 206 | } 207 | 208 | }).values 209 | }) 210 | 211 | val finalPartitions = localMargins.map { 212 | case ((_, p, _), index) => (index, p) 213 | } 214 | logDebug("Done") 215 | new DBSCAN( 216 | eps, 217 | minPoints, 218 | maxPointsPerPartition, 219 | finalPartitions, 220 | labeledInner.union(labeledOuter)) 221 | 222 | } 223 | 224 | /** 225 | * Find the appropriate label to the given `vector` 226 | * 227 | * This method is not yet implemented 228 | */ 229 | def predict(vector: Vector): Double = { 230 | var centerid = 0 231 | partitions.foreach{x => 232 | if (x._2.contains(DBSCANPoint(vector))){ 233 | centerid = x._1 234 | } 235 | } 236 | centerid.toDouble 237 | } 238 | 239 | private def isInnerPoint( 240 | entry: (Int, DBSCANLabeledPoint), 241 | margins: List[(Margins, Int)]): Boolean = { 242 | entry match { 243 | case (partition, point) => 244 | val ((inner, _, _), _) = margins.filter({ 245 | case (_, id) => id == partition 246 | }).head 247 | 248 | inner.almostContains(point) 249 | } 250 | } 251 | 252 | private def findAdjacencies(partition: Iterable[(Int, DBSCANLabeledPoint)]): 253 | Set[((Int, Int), (Int, Int))] = { 254 | 255 | val zero = (Map[DBSCANPoint, ClusterId](), Set[(ClusterId, ClusterId)]()) 256 | 257 | val (seen, adjacencies) = partition.foldLeft(zero)({ 258 | case ((seen, adjacencies), (partition, point)) => 259 | // noise points are not relevant for adjacencies 260 | if (point.flag == Flag.Noise) { 261 | (seen, adjacencies) 262 | } else { 263 | val clusterId = (partition, point.cluster) 264 | seen.get(point) match { 265 | case None => (seen + (point -> clusterId), adjacencies) 266 | case Some(prevClusterId) => (seen, adjacencies + ((prevClusterId, clusterId))) 267 | } 268 | 269 | } 270 | }) 271 | 272 | adjacencies 273 | } 274 | 275 | private def toMinimumBoundingRectangle(vector: Vector): DBSCANRectangle = { 276 | val point = DBSCANPoint(vector) 277 | val x = corner(point.x) 278 | val y = corner(point.y) 279 | DBSCANRectangle(x, y, x + minimumRectangleSize, y + minimumRectangleSize) 280 | } 281 | 282 | private def corner(p: Double): Double = 283 | 
(shiftIfNegative(p) / minimumRectangleSize).intValue * minimumRectangleSize 284 | 285 | private def shiftIfNegative(p: Double): Double = 286 | if (p < 0) p - minimumRectangleSize else p 287 | 288 | } 289 | 290 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/models/HoltWinters.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | 4 | 5 | import org.apache.commons.math3.analysis.MultivariateFunction 6 | import org.apache.commons.math3.optim.{InitialGuess, MaxEval, MaxIter, SimpleBounds} 7 | import org.apache.commons.math3.optim.nonlinear.scalar.{GoalType, ObjectiveFunction} 8 | import org.apache.commons.math3.optim.nonlinear.scalar.noderiv.BOBYQAOptimizer 9 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 10 | import org.apache.spark.ml.{Estimator, Model} 11 | import org.apache.spark.ml.param.{Param, ParamMap} 12 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams 13 | import org.apache.spark.ml.util.Identifiable 14 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 15 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType} 16 | 17 | /** 18 | * Triple exponential smoothing takes into account seasonal changes as well as trends. 19 | * Seasonality is defined to be the tendency of time-series data to exhibit behavior that repeats 20 | * itself every L periods, much like any harmonic function. 21 | * 22 | * The Holt-Winters method is a popular and effective approach to forecasting seasonal time series. 23 | * 24 | * See https://en.wikipedia.org/wiki/Exponential_smoothing#Triple_exponential_smoothing 25 | * for more information on triple exponential smoothing. 26 | * See https://www.otexts.org/fpp/7/5 and 27 | * https://stat.ethz.ch/R-manual/R-devel/library/stats/html/HoltWinters.html 28 | * for more information on the Holt-Winters method. 29 | */ 30 | 31 | trait HoltWintersParams extends TimeSeriesParams { 32 | final val maxEval = new Param[Int](this, "maxEval", "maximum number of function evaluations allowed to the optimizer") 33 | def setMaxEval(value: Int): this.type = set(maxEval, value) 34 | 35 | final val maxIter = new Param[Int](this, "maxIter", "maximum number of optimizer iterations") 36 | def setMaxIter(value: Int): this.type = set(maxIter, value) 37 | 38 | final val period = new Param[Int](this, "period", "seasonality of the data, i.e. the number of observations per seasonal cycle") 39 | def setPeriod(value: Int): this.type = set(period, value) 40 | 41 | final val modelType = new Param[String](this, "modelType", "The two variations " + 42 | "differ in the nature of the seasonal component. The additive method is preferred when seasonal " + 43 | "variations are roughly constant through the series; the multiplicative method is preferred when " + 44 | "the seasonal variations change proportionally to the level of the series") 45 | def setModelType(value: String): this.type = set(modelType, value) 46 | } 47 | 48 | class HoltWinters(override val uid: String) extends Estimator[HoltWintersModel] with 49 | HoltWintersParams { 50 | 51 | setDefault(timeCol -> "time", 52 | timeSeriesCol -> "timeseries", 53 | maxEval -> 10000, 54 | maxIter -> 10000) 55 | 56 | def this() = this(Identifiable.randomUID("HoltWinters")) 57 | /** 58 | * Fits a model to the input data.
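 *
 * A minimal usage sketch, assuming a DataFrame `trainDF` (hypothetical name) that has the
 * default columns configured above: a string "time" column and a double "timeseries" column.
 * The period and model type have no default and must be set; the values shown are only
 * illustrative:
 * {{{
 *   val hw = new HoltWinters()
 *     .setPeriod(12)
 *     .setModelType("additive")
 *   val model    = hw.fit(trainDF)
 *   val forecast = model.transform(trainDF)   // one seasonal period of forecast values
 * }}}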
59 | */ 60 | override def fit(dataset: Dataset[_]): HoltWintersModel = { 61 | 62 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 63 | case Row(time: String, value: Double) => (time, value) 64 | }.sortByKey().collect() 65 | 66 | val dataVector = Vectors.dense(data.map(x => x._2)) 67 | val optimizer = new BOBYQAOptimizer(7) 68 | 69 | val objectiveFunction = new ObjectiveFunction(new MultivariateFunction() { 70 | def value(params: Array[Double]): Double = { 71 | new HoltWintersModel(params(0), params(1), params(2)) 72 | .setModelType(${modelType}) 73 | .setPeriod(${period}) 74 | .sse(dataVector) 75 | } 76 | }) 77 | 78 | // The starting guesses in R's stats:HoltWinters 79 | val initGuess = new InitialGuess(Array(0.3, 0.1, 0.1)) 80 | val goal = GoalType.MINIMIZE 81 | val bounds = new SimpleBounds(Array(0.0, 0.0, 0.0), Array(1.0, 1.0, 1.0)) 82 | val optimal = optimizer.optimize(objectiveFunction, goal, bounds, initGuess, 83 | new MaxIter(${maxIter}), new MaxEval(${maxEval})) 84 | val params = optimal.getPoint 85 | new HoltWintersModel(params(0), params(1), params(2)) 86 | .setModelType(${modelType}) 87 | .setPeriod (${period}) 88 | .setTimeCol(${timeCol}) 89 | .setTimeSeriesCol(${timeSeriesCol}) 90 | } 91 | 92 | override def copy(extra: ParamMap): Estimator[HoltWintersModel] = defaultCopy(extra) 93 | 94 | /** 95 | * :: DeveloperApi :: 96 | * 97 | * Check transform validity and derive the output schema from the input schema. 98 | * 99 | * Typical implementation should first conduct verification on schema change and parameter 100 | * validity, including complex parameter interaction checks. 101 | */ 102 | override def transformSchema(schema: StructType): StructType = schema 103 | } 104 | 105 | class HoltWintersModel(override val uid: String, 106 | val alpha: Double, val beta: Double, val gamma: Double) 107 | extends Model[HoltWintersModel] with HoltWintersParams { 108 | 109 | def this(alpha: Double, beta: Double, gamma: Double) = this(Identifiable.randomUID 110 | ("HoltWintersModel"), alpha, beta, gamma) 111 | 112 | override def copy(extra: ParamMap): HoltWintersModel = defaultCopy(extra) 113 | 114 | /** 115 | * Transforms the input dataset. 116 | */ 117 | override def transform(dataset: Dataset[_]): DataFrame = { 118 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 119 | case Row(time: String, value: Double) => (time, value) 120 | }.sortByKey().collect() 121 | 122 | val dataVector = Vectors.dense(data.map(x => x._2)) 123 | 124 | val destArr = new Array[Double](${period}) 125 | val (_, level, trend, season) = getHoltWintersComponents(dataVector) 126 | val n = dataVector.size 127 | 128 | val finalLevel = level(n - ${period}) 129 | val finalTrend = trend(n - ${period}) 130 | val finalSeason = new Array[Double](${period}) 131 | 132 | for (i <- 0 until ${period}) { 133 | finalSeason(i) = season(i + n - ${period}) 134 | } 135 | 136 | for (i <- 0 until ${period}) { 137 | destArr(i) = if (${modelType}.equalsIgnoreCase("additive")) { 138 | (finalLevel + (i + 1) * finalTrend) + finalSeason(i % ${period}) 139 | } else { 140 | (finalLevel + (i + 1) * finalTrend) * finalSeason(i % ${period}) 141 | } 142 | } 143 | 144 | val resRDD = dataset.sparkSession.sparkContext.parallelize(destArr.map(x => Row(x))) 145 | 146 | val structType = transformSchema(dataset.schema) 147 | 148 | dataset.sparkSession.createDataFrame(resRDD, structType) 149 | } 150 | 151 | /** 152 | * :: DeveloperApi :: 153 | * 154 | * Check transform validity and derive the output schema from the input schema. 
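 * For this model the output schema is a single double column named "HoltWinters" holding the
 * `period` forecast values produced by `transform`.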
155 | * 156 | * Typical implementation should first conduct verification on schema change and parameter 157 | * validity, including complex parameter interaction checks. 158 | */ 159 | override def transformSchema(schema: StructType): StructType = { 160 | StructType(Array(StructField("HoltWinters", DoubleType))) 161 | } 162 | 163 | /** 164 | * Calculates sum of squared errors, used to estimate the alpha and beta parameters 165 | * 166 | * @param ts A time series for which we want to calculate the SSE, given the current parameters 167 | * @return SSE 168 | */ 169 | def sse(ts: Vector): Double = { 170 | val n = ts.size 171 | val smoothed = addTimeDependentEffects(ts) 172 | 173 | var error = 0.0 174 | var sqrErrors = 0.0 175 | 176 | // We predict only from period by using the first period - 1 elements. 177 | for(i <- ${period} until n) { 178 | error = ts(i) - smoothed(i) 179 | sqrErrors += error * error 180 | } 181 | 182 | sqrErrors 183 | } 184 | 185 | def addTimeDependentEffects(ts: Vector): Vector = { 186 | val destArr = Array.fill(ts.size)(0.0) 187 | val fitted = getHoltWintersComponents(ts)._1 188 | for (i <- 0 until ts.size) { 189 | destArr(i) = fitted(i) 190 | } 191 | Vectors.dense(destArr) 192 | } 193 | 194 | /** 195 | * Start from the intial parameters and then iterate to find the final parameters 196 | * using the equations of HoltWinter Method. 197 | * See https://www.otexts.org/fpp/7/5 and 198 | * https://stat.ethz.ch/R-manual/R-devel/library/stats/html/HoltWinters.html 199 | * for more information on Holt Winter Method equations. 200 | * 201 | * @param ts A time series for which we want the HoltWinter parameters level,trend and season. 202 | * @return (level trend season). Final vectors of level trend and season are returned. 203 | */ 204 | def getHoltWintersComponents(ts: Vector): (Vector, Vector, Vector, Vector) = { 205 | val n = ts.size 206 | require(n >= 2, "Requires length of at least 2") 207 | 208 | val dest = new Array[Double](n) 209 | 210 | val level = new Array[Double](n) 211 | val trend = new Array[Double](n) 212 | val season = new Array[Double](n) 213 | 214 | val (initLevel, initTrend, initSeason) = initHoltWinters(ts) 215 | level(0) = initLevel 216 | trend(0) = initTrend 217 | for (i <- 0 until initSeason.size){ 218 | season(i) = initSeason(i) 219 | } 220 | 221 | for (i <- 0 until (n - ${period})) { 222 | dest(i + ${period}) = level(i) + trend(i) 223 | 224 | // Add the seasonal factor for additive and multiply for multiplicative model. 
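// The fitted value and the state updates below follow the standard Holt-Winters recurrences.
// Additive form shown; in the multiplicative form each subtraction becomes a division and the
// seasonal factor multiplies the fitted value:
//   fitted(i+period) = level(i) + trend(i) + season(i)
//   level(i+1)       = alpha * (y(i+period) - season(i))  + (1 - alpha) * (level(i) + trend(i))
//   trend(i+1)       = beta  * (level(i+1) - level(i))    + (1 - beta)  * trend(i)
//   season(i+period) = gamma * (y(i+period) - level(i+1)) + (1 - gamma) * season(i)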
225 | if (${modelType}.equalsIgnoreCase("additive")) { 226 | dest(i + ${period}) += season(i) 227 | } else { 228 | dest(i + ${period}) *= season(i) 229 | } 230 | 231 | val levelWeight = if (${modelType}.equalsIgnoreCase("additive")) { 232 | ts(i + ${period}) - season(i) 233 | } else { 234 | ts(i + ${period}) / season(i) 235 | } 236 | 237 | level(i + 1) = alpha * levelWeight + (1 - alpha) * (level(i) + trend(i)) 238 | 239 | trend(i + 1) = beta * (level(i + 1) - level(i)) + (1 - beta) * trend(i) 240 | 241 | val seasonWeight = if (${modelType}.equalsIgnoreCase("additive")) { 242 | ts(i + ${period}) - level(i + 1) 243 | } else { 244 | ts(i + ${period}) / level(i + 1) 245 | } 246 | season(i + ${period}) = gamma * seasonWeight + (1 - gamma) * season(i) 247 | } 248 | 249 | (Vectors.dense(dest), Vectors.dense(level), Vectors.dense(trend), Vectors.dense(season)) 250 | } 251 | 252 | def getKernel: (Array[Double]) = { 253 | if (${period} % 2 == 0){ 254 | val kernel = Array.fill(${period} + 1)(1.0 / ${period}) 255 | kernel(0) = 0.5 / ${period} 256 | kernel(${period}) = 0.5 / ${period} 257 | kernel 258 | } else { 259 | Array.fill(${period})(1.0 / ${period}) 260 | } 261 | } 262 | 263 | /** 264 | * Function to calculate the Weighted moving average/convolution using above kernel/weights 265 | * for input data. 266 | * See http://robjhyndman.com/papers/movingaverage.pdf for more information 267 | * @param inData Series on which you want to do moving average 268 | * @param kernel Weight vector for weighted moving average 269 | */ 270 | def convolve(inData: Array[Double], kernel: Array[Double]): (Array[Double]) = { 271 | val kernelSize = kernel.length 272 | val dataSize = inData.length 273 | 274 | val outData = new Array[Double](dataSize - kernelSize + 1) 275 | 276 | var end = 0 277 | while (end <= (dataSize - kernelSize)) { 278 | var sum = 0.0 279 | for (i <- 0 until kernelSize) { 280 | sum += kernel(i) * inData(end + i) 281 | } 282 | outData(end) = sum 283 | end += 1 284 | } 285 | outData 286 | } 287 | 288 | /** 289 | * Function to get the initial level, trend and season using method suggested in 290 | * http://robjhyndman.com/hyndsight/hw-initialization/ 291 | * @param ts 292 | */ 293 | def initHoltWinters(ts: Vector): (Double, Double, Array[Double]) = { 294 | val arrTs = ts.toArray 295 | 296 | // Decompose a window of time series into level trend and seasonal using convolution 297 | val kernel = getKernel 298 | val kernelSize = kernel.size 299 | val trend = convolve(arrTs.take(${period} * 2), kernel) 300 | 301 | // Remove the trend from time series. Subtract for additive and divide for multiplicative 302 | val n = (kernelSize -1) / 2 303 | val removeTrend = arrTs.take(${period} * 2).zip( 304 | Array.fill(n)(0.0) ++ trend ++ Array.fill(n)(0.0)).map{ 305 | case (a, t) => 306 | if (t != 0){ 307 | if (${modelType}.equalsIgnoreCase("additive")) { 308 | a - t 309 | } else { 310 | a / t 311 | } 312 | } else { 313 | 0 314 | } 315 | } 316 | 317 | // seasonal mean is sum of mean of all season values of that period 318 | val seasonalMean = removeTrend.splitAt(${period}).zipped.map { case (prevx, x) => 319 | if (prevx == 0 || x == 0) x + prevx else (x + prevx) / 2 320 | } 321 | 322 | val meanOfFigures = seasonalMean.sum / ${period} 323 | 324 | // The seasonal mean is then centered and removed to get season. 325 | // Subtract for additive and divide for multiplicative. 
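// Centering makes the initial seasonal factors sum to zero in the additive case and average
// to one in the multiplicative case.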
326 | val initSeason = if (${modelType}.equalsIgnoreCase("additive")) { 327 | seasonalMean.map(_ - meanOfFigures ) 328 | } else { 329 | seasonalMean.map(_ / meanOfFigures ) 330 | } 331 | 332 | // Do Simple Linear Regression to find the initial level and trend 333 | val indices = 1 to trend.length 334 | val xbar = (indices.sum: Double) / indices.size 335 | val ybar = trend.sum / trend.length 336 | 337 | val xxbar = indices.map( x => (x - xbar) * (x - xbar) ).sum 338 | val xybar = indices.zip(trend).map { 339 | case (x, y) => (x - xbar) * (y - ybar) 340 | }.sum 341 | 342 | val initTrend = xybar / xxbar 343 | val initLevel = ybar - (initTrend * xbar) 344 | 345 | (initLevel, initTrend, initSeason) 346 | } 347 | } 348 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/UnivariateTimeSeries.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries 2 | 3 | import java.util.Arrays 4 | 5 | import breeze.stats._ 6 | import org.apache.commons.math3.analysis.interpolation.SplineInterpolator 7 | import org.apache.spark.ml.linalg.{DenseVector, Matrix, Vector, Vectors} 8 | 9 | /** 10 | * Created by endy on 16-12-20. 11 | */ 12 | object UnivariateTimeSeries { 13 | 14 | /** 15 | * Lags the univariate time series 16 | * 17 | * Example input vector: (1.0, 2.0, 3.0, 4.0, 5.0) 18 | * 19 | * With lag 2 and includeOriginal = true should give output matrix: 20 | * 21 | * 3.0 2.0 1.0 22 | * 4.0 3.0 2.0 23 | * 5.0 4.0 3.0 24 | */ 25 | def lag(ts: Vector, maxLag: Int, includeOriginal: Boolean): Matrix = { 26 | Lag.lagMatTrimBoth(ts, maxLag, includeOriginal) 27 | } 28 | 29 | def autocorr(ts: Array[Double], numLags: Int): Array[Double] = { 30 | autocorr(new DenseVector(ts), numLags).toArray 31 | } 32 | 33 | /** 34 | * Computes the sample autocorrelation of the given series. 35 | */ 36 | def autocorr(ts: Vector, numLags: Int): Vector = { 37 | val corrs = new Array[Double](numLags) 38 | var i = 1 39 | val breezeTs = MatrixUtil.toBreeze(ts) 40 | while (i <= numLags) { 41 | val slice1 = breezeTs(i until ts.size) 42 | val slice2 = breezeTs(0 until ts.size - i) 43 | val mean1 = mean(slice1) 44 | val mean2 = mean(slice2) 45 | var variance1 = 0.0 46 | var variance2 = 0.0 47 | var covariance = 0.0 48 | var j = 0 49 | while (j < ts.size - i) { 50 | val diff1 = slice1(j) - mean1 51 | val diff2 = slice2(j) - mean2 52 | variance1 += diff1 * diff1 53 | variance2 += diff2 * diff2 54 | covariance += diff1 * diff2 55 | j += 1 56 | } 57 | 58 | corrs(i - 1) = covariance / (math.sqrt(variance1) * math.sqrt(variance2)) 59 | i += 1 60 | } 61 | new DenseVector(corrs) 62 | } 63 | 64 | def quotients(ts: Vector, lag: Int): Vector = { 65 | val ret = new Array[Double](ts.size - lag) 66 | var i = 0 67 | while (i < ret.length) { 68 | ret(i) = ts(i + lag) / ts(i) 69 | i += 1 70 | } 71 | new DenseVector(ret) 72 | } 73 | 74 | def price2ret(ts: Vector, lag: Int): Vector = { 75 | val ret = new Array[Double](ts.size - lag) 76 | var i = 0 77 | while (i < ret.length) { 78 | ret(i) = ts(i + lag) / ts(i) - 1.0 79 | i += 1 80 | } 81 | new DenseVector(ret) 82 | } 83 | 84 | /** 85 | * Trim leading NaNs from a series. 86 | */ 87 | def trimLeading(ts: Vector): Vector = { 88 | val start = firstNotNaN(ts) 89 | if (start < ts.size) { 90 | Vectors.dense(Arrays.copyOfRange(ts.toArray, start, ts.size)) 91 | } else { 92 | Vectors.zeros(0) 93 | } 94 | } 95 | 96 | /** 97 | * Trim trailing NaNs from a series. 
98 | */ 99 | def trimTrailing(ts: Vector): Vector = { 100 | val end = lastNotNaN(ts) 101 | if (end > 0) { 102 | Vectors.dense(Arrays.copyOfRange(ts.toArray, 0, end)) 103 | } else { 104 | Vectors.zeros(0) 105 | } 106 | } 107 | 108 | def firstNotNaN(ts: Vector): Int = { 109 | var i = 0 110 | while (i < ts.size) { 111 | if (!java.lang.Double.isNaN(ts(i))) { 112 | return i 113 | } 114 | i += 1 115 | } 116 | i 117 | } 118 | 119 | def lastNotNaN(ts: Vector): Int = { 120 | var i = ts.size - 1 121 | while (i >= 0) { 122 | if (!java.lang.Double.isNaN(ts(i))) { 123 | return i 124 | } 125 | i -= 1 126 | } 127 | i 128 | } 129 | 130 | def fillts(ts: Vector, fillMethod: String): Vector = { 131 | fillMethod match { 132 | case "linear" => fillLinear(ts) 133 | case "nearest" => fillNearest(ts) 134 | case "next" => fillNext(ts) 135 | case "previous" => fillPrevious(ts) 136 | case "spline" => fillSpline(ts) 137 | case "zero" => fillValue(ts, 0) 138 | case _ => throw new UnsupportedOperationException() 139 | } 140 | } 141 | 142 | /** 143 | * Replace all NaNs with a specific value 144 | */ 145 | def fillValue(values: Array[Double], filler: Double): Array[Double] = { 146 | fillValue(new DenseVector(values), filler).toArray 147 | } 148 | 149 | /** 150 | * Replace all NaNs with a specific value 151 | */ 152 | def fillValue(values: Vector, filler: Double): DenseVector = { 153 | val result = values.copy.toArray 154 | var i = 0 155 | while (i < result.size) { 156 | if (result(i).isNaN) result(i) = filler 157 | i += 1 158 | } 159 | new DenseVector(result) 160 | } 161 | 162 | def fillNearest(values: Array[Double]): Array[Double] = { 163 | fillNearest(new DenseVector(values)).toArray 164 | } 165 | 166 | def fillNearest(values: Vector): DenseVector = { 167 | val result = values.copy.toArray 168 | var lastExisting = -1 169 | var nextExisting = -1 170 | var i = 1 171 | while (i < result.length) { 172 | if (result(i).isNaN) { 173 | if (nextExisting < i) { 174 | nextExisting = i + 1 175 | while (nextExisting < result.length && result(nextExisting).isNaN) { 176 | nextExisting += 1 177 | } 178 | } 179 | 180 | if (lastExisting < 0 && nextExisting >= result.size) { 181 | throw new IllegalArgumentException("Input is all NaNs!") 182 | } else if (nextExisting >= result.size || // TODO: check this 183 | (lastExisting >= 0 && i - lastExisting < nextExisting - i)) { 184 | result(i) = result(lastExisting) 185 | } else { 186 | result(i) = result(nextExisting) 187 | } 188 | } else { 189 | lastExisting = i 190 | } 191 | i += 1 192 | } 193 | new DenseVector(result) 194 | } 195 | 196 | def fillPrevious(values: Array[Double]): Array[Double] = { 197 | fillPrevious(new DenseVector(values)).toArray 198 | } 199 | 200 | /** 201 | * fills in NaN with the previously available not NaN, scanning from left to right. 202 | * 1 NaN NaN 2 Nan -> 1 1 1 2 2 203 | */ 204 | def fillPrevious(values: Vector): DenseVector = { 205 | val result = values.copy.toArray 206 | var filler = Double.NaN // initial value, maintains invariant 207 | var i = 0 208 | while (i < result.length) { 209 | filler = if (result(i).isNaN) filler else result(i) 210 | result(i) = filler 211 | i += 1 212 | } 213 | new DenseVector(result) 214 | } 215 | 216 | def fillNext(values: Array[Double]): Array[Double] = { 217 | fillNext(new DenseVector(values)).toArray 218 | } 219 | 220 | /** 221 | * fills in NaN with the next available not NaN, scanning from right to left. 
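 * Trailing NaNs that have no later non-NaN value are left unchanged, as in the example below: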
222 | * 1 NaN NaN 2 Nan -> 1 2 2 2 NaN 223 | */ 224 | def fillNext(values: Vector): DenseVector = { 225 | val result = values.copy.toArray 226 | var filler = Double.NaN // initial value, maintains invariant 227 | var i = result.length - 1 228 | while (i >= 0) { 229 | filler = if (result(i).isNaN) filler else result(i) 230 | result(i) = filler 231 | i -= 1 232 | } 233 | new DenseVector(result) 234 | } 235 | 236 | def fillWithDefault(values: Array[Double], filler: Double): Array[Double] = { 237 | fillWithDefault(new DenseVector(values), filler).toArray 238 | } 239 | 240 | /** 241 | * fills in NaN with a default value 242 | */ 243 | def fillWithDefault(values: Vector, filler: Double): DenseVector = { 244 | val result = values.copy.toArray 245 | var i = 0 246 | while (i < result.length) { 247 | result(i) = if (result(i).isNaN) filler else result(i) 248 | i += 1 249 | } 250 | new DenseVector(result) 251 | } 252 | 253 | def fillLinear(values: Array[Double]): Array[Double] = { 254 | fillLinear(new DenseVector(values)).toArray 255 | } 256 | 257 | def fillLinear(values: Vector): DenseVector = { 258 | val result = values.copy.toArray 259 | var i = 1 260 | while (i < result.length - 1) { 261 | val rangeStart = i 262 | while (i < result.length - 1 && result(i).isNaN) { 263 | i += 1 264 | } 265 | val before = result(rangeStart - 1) 266 | val after = result(i) 267 | if (i != rangeStart && !before.isNaN && !after.isNaN) { 268 | val increment = (after - before) / (i - (rangeStart - 1)) 269 | for (j <- rangeStart until i) { 270 | result(j) = result(j - 1) + increment 271 | } 272 | } 273 | i += 1 274 | } 275 | new DenseVector(result) 276 | } 277 | 278 | def fillSpline(values: Array[Double]): Array[Double] = { 279 | fillSpline(new DenseVector(values)).toArray 280 | } 281 | 282 | /** 283 | * Fill in NaN values using a natural cubic spline. 
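 * Only NaNs lying between the first and the last non-NaN value are interpolated; leading and
 * trailing NaNs are left unchanged and should be handled with one of the other fill methods.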
284 | * @param values Vector to interpolate 285 | * @return Interpolated vector 286 | */ 287 | def fillSpline(values: Vector): DenseVector = { 288 | val result = values.copy.toArray 289 | val interp = new SplineInterpolator() 290 | val knotsAndValues = values.toArray.zipWithIndex.filter(!_._1.isNaN) 291 | // Note that the type of unzip is missed up in scala 10.4 as per 292 | // https://issues.scala-lang.org/browse/SI-8081 293 | // given that this project is using scala 10.4, we cannot use unzip, so unpack manually 294 | val knotsX = knotsAndValues.map(_._2.toDouble) 295 | val knotsY = knotsAndValues.map(_._1) 296 | val filler = interp.interpolate(knotsX, knotsY) 297 | 298 | // values that we can interpolate between, others need to be filled w/ other function 299 | var i = knotsX(0).toInt 300 | val end = knotsX.last.toInt 301 | 302 | while (i < end) { 303 | result(i) = filler.value(i.toDouble) 304 | i += 1 305 | } 306 | new DenseVector(result) 307 | } 308 | 309 | 310 | /** 311 | * Down sample by taking every nth element starting from offset phase 312 | * @param values Vector to down sample 313 | * @param n take every nth element 314 | * @param phase offset from starting index 315 | * @return downsampled vector with appropriate length 316 | */ 317 | def downsample(values: Vector, n: Int, phase: Int = 0): DenseVector = { 318 | val origLen = values.size 319 | val newLen = Math.ceil((values.size - phase) / n.toDouble).toInt 320 | val sampledValues = Array.fill(newLen)(0.0) 321 | var i = phase 322 | var j = 0 323 | 324 | while (j < newLen) { 325 | sampledValues(j) = values(i) 326 | i += n 327 | j += 1 328 | } 329 | new DenseVector(sampledValues) 330 | } 331 | 332 | /** 333 | * Up sample by inserting n - 1 elements into the original values vector, starting at index phase 334 | * @param values the original data vector 335 | * @param n the number of insertions between elements 336 | * @param phase the offset to begin 337 | * @param useZero fill with zeros rather than NaN 338 | * @return upsampled vector filled with zeros or NaN, as specified by user 339 | */ 340 | def upsample(values: Vector, n: Int, phase: Int = 0, useZero: Boolean = false): DenseVector = { 341 | val filler = if (useZero) 0 else Double.NaN 342 | val origLen = values.size 343 | val newLen = origLen * n 344 | val sampledValues = Array.fill(newLen)(filler) 345 | var i = phase 346 | var j = 0 347 | 348 | while (j < origLen) { 349 | sampledValues(i) = values(j) 350 | i += n 351 | j += 1 352 | } 353 | new DenseVector(sampledValues) 354 | } 355 | 356 | /** 357 | * Difference a vector with respect to the m-th prior element. Size-preserving by leaving first 358 | * `m` elements intact. This is the inverse of the `inverseDifferences` function. 359 | * @param ts Series to difference 360 | * @param destTs Series to store the differenced values (and return for convenience) 361 | * @param lag The difference lag (e.g. x means destTs(i) = ts(i) - ts(i - x), etc) 362 | * @param startIndex the starting index for the differencing. 
Must be at least equal to lag 363 | * @return the differenced vector, for convenience 364 | */ 365 | def differencesAtLag(ts: Vector, destTs: Vector, lag: Int, startIndex: Int): Vector = { 366 | require(startIndex >= lag, "starting index cannot be less than lag") 367 | val diffedTs = if (destTs == null) ts.copy else destTs 368 | if (lag == 0) { 369 | diffedTs 370 | } else { 371 | val arr = diffedTs.toArray 372 | val n = ts.size 373 | var i = 0 374 | 375 | while (i < n) { 376 | // elements prior to starting point are copied over without modification 377 | arr(i) = if (i < startIndex) ts(i) else ts(i) - ts(i - lag) 378 | i += 1 379 | } 380 | diffedTs 381 | } 382 | } 383 | 384 | /** 385 | * Convenience wrapper around `differencesAtLag[Vector[Double], Vector[Double], Int, Int]` 386 | * @param ts vector to difference 387 | * @param lag the difference lag (e.g. x means destTs(i) = ts(i) - ts(i - x), etc) 388 | * @return the differenced vector, for convenience 389 | */ 390 | def differencesAtLag(ts: Vector, lag: Int): Vector = { 391 | differencesAtLag(ts, null, lag, lag) 392 | } 393 | 394 | /** 395 | * Calculate an "inverse-differenced" vector of a given lag. Size-preserving by leaving first 396 | * `startIndex` elements intact. This is the inverse of the `differences` function. 397 | * @param diffedTs differenced vector that we want to inverse 398 | * @param destTs Series to store the added up values (and return for convenience) 399 | * @param lag The difference lag (e.g. x means destTs(i) = diffedTs(i) + destTs(i - x), etc) 400 | * @param startIndex the starting index for the differencing. Must be at least equal to lag 401 | * @return the inverse differenced vector, for convenience 402 | */ 403 | def inverseDifferencesAtLag(diffedTs: Vector, destTs: Vector, lag: Int, 404 | startIndex: Int): Vector = { 405 | require(startIndex >= lag, "starting index cannot be less than lag") 406 | val addedTs = if (destTs == null) diffedTs.copy else destTs 407 | if (lag == 0) { 408 | addedTs 409 | } else { 410 | val n = diffedTs.size 411 | var i = 0 412 | 413 | val arr = addedTs.toArray 414 | while (i < n) { 415 | // elements prior to starting point are copied over without modification 416 | arr(i) = if (i < startIndex) diffedTs(i) else diffedTs(i) + addedTs(i - lag) 417 | i += 1 418 | } 419 | addedTs 420 | } 421 | } 422 | 423 | /** 424 | * Convenience wrapper around `inverseDifferencesAtLag[Vector[Double], Vector[Double], Int, Int]` 425 | * @param diffedTs differenced vector that we want to inverse 426 | * @param lag the difference lag (e.g. x means destTs(i) = ts(i) - ts(i - x), etc) 427 | * @return the inverse differenced vector, for convenience 428 | */ 429 | def inverseDifferencesAtLag(diffedTs: Vector, lag: Int): Vector = { 430 | inverseDifferencesAtLag(diffedTs, null, lag, lag) 431 | } 432 | 433 | /** 434 | * Performs differencing of order `d`. This means we recursively difference a vector a total of 435 | * d-times. So that d = 2 is a vector of the differences of differences. Note that for each 436 | * difference level, d_i, the element at ts(d_i - 1) corresponds to the value in the prior 437 | * iteration. 
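 *
 * For example, differencing `Vectors.dense(1.0, 4.0, 9.0, 16.0)` to order `d = 2` yields
 * `(1.0, 3.0, 2.0, 2.0)`: element 0 keeps the original value, element 1 holds the first-order
 * difference, and the remaining elements hold the second-order differences.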
438 | * @param ts time series to difference 439 | * @param d order of differencing 440 | * @return a vector of the same length differenced to order d 441 | */ 442 | def differencesOfOrderD(ts: Vector, d: Int): Vector = { 443 | // we create 2 copies to avoid copying with every call, and simply swap them as necessary 444 | // for higher order differencing 445 | var (diffedTs, origTs) = (ts.copy, ts.copy) 446 | var swap: Vector = null 447 | for (i <- 1 to d) { 448 | swap = origTs 449 | origTs = diffedTs 450 | diffedTs = swap 451 | differencesAtLag(origTs, diffedTs, 1, i) 452 | } 453 | diffedTs 454 | } 455 | 456 | /** 457 | * Inverses differencing of order `d`. 458 | * @param diffedTs time series to reverse differencing process 459 | * @param d order of differencing 460 | * @return a vector of the same length, which when differenced to order ts, yields the original 461 | * vector provided 462 | */ 463 | def inverseDifferencesOfOrderD(diffedTs: Vector, d: Int): Vector = { 464 | val addedTs = diffedTs.copy 465 | for (i <- d to 1 by -1) { 466 | inverseDifferencesAtLag(addedTs, addedTs, 1, i) 467 | } 468 | addedTs 469 | } 470 | 471 | def rollSum(ts: Vector, n: Int): Vector = { 472 | new DenseVector(ts.toArray.sliding(n).toList.map(_.sum).toIndexedSeq.toArray[Double]) 473 | } 474 | } 475 | --------------------------------------------------------------------------------
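Taken together, the helpers in UnivariateTimeSeries cover the usual preprocessing chain for the time-series models in this package: fill gaps, difference to the order a model requires, then invert the differencing to bring results back to the original scale. A minimal sketch using only the methods defined in this object; the sample values are illustrative:

    import org.apache.spark.ml.linalg.Vectors
    import org.apache.spark.ml.timeseries.UnivariateTimeSeries

    val raw      = Vectors.dense(1.0, Double.NaN, 3.0, 6.0, 10.0)
    val filled   = UnivariateTimeSeries.fillts(raw, "linear")                  // (1.0, 2.0, 3.0, 6.0, 10.0)
    val diffed   = UnivariateTimeSeries.differencesOfOrderD(filled, 1)         // (1.0, 1.0, 1.0, 3.0, 4.0)
    val restored = UnivariateTimeSeries.inverseDifferencesOfOrderD(diffed, 1)  // recovers the filled series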