├── .idea └── vcs.xml ├── src ├── main │ └── scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── ml │ │ ├── util │ │ ├── Utils.scala │ │ ├── XORShiftRandom.scala │ │ ├── DBHPartitioner.scala │ │ ├── SparkUtils.scala │ │ └── LoaderUtils.scala │ │ ├── dbscan │ │ ├── DBSCANPoint.scala │ │ ├── DBSCANLabeledPoint.scala │ │ ├── DBSCANRectangle.scala │ │ ├── DBSCANGraph.scala │ │ ├── LocalDBSCANArchery.scala │ │ ├── LocalDBSCANNaive.scala │ │ ├── DBSCAN2.scala │ │ ├── EvenSplitPartitioner.scala │ │ └── DBSCAN.scala │ │ ├── tsne │ │ ├── TSNEParam.scala │ │ ├── TSNEHelper.scala │ │ ├── impl │ │ │ ├── SimpleTSNE.scala │ │ │ ├── BHTSNE.scala │ │ │ └── LBFGSTSNE.scala │ │ ├── tree │ │ │ └── SPTree.scala │ │ ├── X2P.scala │ │ └── TSNEGradient.scala │ │ ├── timeseries │ │ ├── params │ │ │ └── TimeSeriesParams.scala │ │ ├── MatrixUtil.scala │ │ ├── Lag.scala │ │ ├── models │ │ │ ├── Autoregression.scala │ │ │ ├── ARGARCH.scala │ │ │ ├── EWMA.scala │ │ │ ├── GARCH.scala │ │ │ └── HoltWinters.scala │ │ └── UnivariateTimeSeries.scala │ │ ├── knn │ │ ├── Distance.scala │ │ └── KNNClassifier.scala │ │ ├── sampling │ │ ├── UnderSampling.scala │ │ └── OverSampling.scala │ │ ├── fm │ │ ├── FMModel.scala │ │ └── BSFMModel.scala │ │ └── mvm │ │ └── MVMModel.scala └── test │ └── scala │ └── org │ └── apache │ └── spark │ └── ml │ ├── timeseries │ ├── MatrixUtilSuite.scala │ ├── models │ │ ├── GARCHSuite.scala │ │ ├── EWMASuite.scala │ │ ├── AutoregressionSuite.scala │ │ ├── ARGARCHSuite.scala │ │ ├── HoltWintersSuite.scala │ │ └── ARIMASuite.scala │ └── UnivariateTimeSeriesSuite.scala │ └── knn_is │ └── KNN_ISSuite.scala ├── README.md └── pom.xml /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/util/Utils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.util 2 | 3 | import java.util.Random 4 | 5 | object Utils { 6 | val random = new Random() 7 | def log1pExp(x: Double): Double = { 8 | if (x > 0) { 9 | x + math.log1p(math.exp(-x)) 10 | } else { 11 | math.log1p(math.exp(x)) 12 | } 13 | } 14 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/DBSCANPoint.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import org.apache.spark.ml.linalg.Vector 4 | 5 | case class DBSCANPoint(val vector: Vector) { 6 | 7 | def x: Double = vector(0) 8 | def y: Double = vector(1) 9 | 10 | def distanceSquared(other: DBSCANPoint): Double = { 11 | val dx = other.x - x 12 | val dy = other.y - y 13 | (dx * dx) + (dy * dy) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/TSNEParam.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne 2 | 3 | case class TSNEParam( 4 | early_exaggeration: Int = 100, 5 | exaggeration_factor: Double = 4.0, 6 | t_momentum: Int = 25, 7 | initial_momentum: Double = 0.5, 8 | final_momentum: Double = 0.8, 9 | eta: Double = 500.0, 10 | min_gain: Double = 0.01 11 | ) -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/params/TimeSeriesParams.scala: 
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.params
2 | 
3 | import org.apache.spark.ml.param.{Param, Params}
4 | 
5 | /**
6 |   * Created by endy on 16-12-22.
7 |   */
8 | trait TimeSeriesParams extends Params {
9 |   final val timeCol = new Param[String](this, "timeCol",
10 |     "The column that stored time value")
11 |   def setTimeCol(value: String): this.type = set(timeCol, value)
12 | 
13 |   final val timeSeriesCol = new Param[String](this, "timeSeriesCol",
14 |     "The column that stored time series value")
15 |   def setTimeSeriesCol(value: String): this.type = set(timeSeriesCol, value)
16 | }
17 | 
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/dbscan/DBSCANLabeledPoint.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.dbscan
2 | 
3 | import org.apache.spark.ml.linalg.Vector
4 | 
5 | /**
6 |   * Companion constants for labeled points
7 |   */
8 | object DBSCANLabeledPoint {
9 | 
10 |   val Unknown = 0
11 | 
12 |   object Flag extends Enumeration {
13 |     type Flag = Value
14 |     val Border, Core, Noise, NotFlagged = Value
15 |   }
16 | 
17 | }
18 | 
19 | class DBSCANLabeledPoint(vector: Vector) extends DBSCANPoint(vector) {
20 | 
21 |   def this(point: DBSCANPoint) = this(point.vector)
22 | 
23 |   var flag = DBSCANLabeledPoint.Flag.NotFlagged
24 |   var cluster = DBSCANLabeledPoint.Unknown
25 |   var visited = false
26 | 
27 |   override def toString(): String = {
28 |     s"$vector,$cluster,$flag"
29 |   }
30 | 
31 | }
32 | 
33 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Distributed Algorithms On Spark
2 | 
3 | This project implements some popular algorithms on Spark. You can read the corresponding papers for the details of each algorithm.
4 | 
5 | Currently it supports the following algorithms; more will be added in the future.
6 | 
7 | - Distributed KNN
8 | - Down Sampling
9 | - Over Sampling
10 | - Affinity Propagation
11 | - Distributed t-SNE
12 | - Factorization Machines
13 | - Multi-view Machines
14 | - Block Structures Factorization Machines
15 | - Time series models
16 | - DBSCAN
17 | 
18 | 
19 | This project supports Spark 2.x.
20 | 
21 | ## References
22 | 
23 | - https://github.com/viirya/SparkAffinityPropagation
24 | - https://github.com/saurfang/spark-tsne
25 | - https://github.com/cloudml/zen
26 | - https://github.com/sryza/spark-timeseries
27 | - https://github.com/irvingc/dbscan-on-spark
28 | - http://mlwiki.org/index.php/Metric_Trees
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/ml/timeseries/MatrixUtilSuite.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries
2 | 
3 | import org.apache.spark.SparkFunSuite
4 | import org.apache.spark.ml.linalg.{Matrices, Vectors}
5 | import org.apache.spark.ml.util.DefaultReadWriteTest
6 | import org.apache.spark.mllib.util.MLlibTestSparkContext
7 | 
8 | /**
9 |   * Created by endy on 16-12-21.
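  * Verifies that MatrixUtil.toBreeze returns Breeze views backed by the same arrays as the
  * original Spark vector/matrix, so in-place mutation of the Breeze view is visible through
  * the Spark object.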
10 | */ 11 | class MatrixUtilSuite extends SparkFunSuite with MLlibTestSparkContext 12 | with DefaultReadWriteTest { 13 | test("modifying toBreeze version modifies original tensor") { 14 | val vec = Vectors.dense(1.0, 2.0, 3.0) 15 | val breezeVec = MatrixUtil.toBreeze(vec) 16 | breezeVec(1) = 4.0 17 | assert(vec(1) == 4.0) 18 | 19 | val mat = Matrices.zeros(3, 4) 20 | val breezeMat = MatrixUtil.toBreeze(mat) 21 | breezeMat(0, 1) = 2.0 22 | assert(mat(0, 1) == 2.0) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/DBSCANRectangle.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | /** 4 | * A rectangle with a left corner of (x, y) and a right upper corner of (x2, y2) 5 | */ 6 | case class DBSCANRectangle(x: Double, y: Double, x2: Double, y2: Double) { 7 | 8 | /** 9 | * Returns whether other is contained by this box 10 | */ 11 | def contains(other: DBSCANRectangle): Boolean = { 12 | x <= other.x && other.x2 <= x2 && y <= other.y && other.y2 <= y2 13 | } 14 | 15 | /** 16 | * Returns whether point is contained by this box 17 | */ 18 | def contains(point: DBSCANPoint): Boolean = { 19 | x <= point.x && point.x <= x2 && y <= point.y && point.y <= y2 20 | } 21 | 22 | /** 23 | * Returns a new box from shrinking this box by the given amount 24 | */ 25 | def shrink(amount: Double): DBSCANRectangle = { 26 | DBSCANRectangle(x + amount, y + amount, x2 - amount, y2 - amount) 27 | } 28 | 29 | /** 30 | * Returns a whether the rectangle contains the point, and the point 31 | * is not in the rectangle's border 32 | */ 33 | def almostContains(point: DBSCANPoint): Boolean = { 34 | x < point.x && point.x < x2 && y < point.y && point.y < y2 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/knn/Distance.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.classification 2 | 3 | import org.apache.spark.ml.linalg.Vector 4 | 5 | object Distance extends Enumeration { 6 | 7 | val Euclidean, Manhattan = Value 8 | 9 | /** 10 | * Computes the (Manhattan or Euclidean) distance between instance x and instance y. 11 | * The type of the distance used is determined by the value of distanceType. 12 | * 13 | * @param x instance x 14 | * @param y instance y 15 | * @param distanceType type of the distance used (Distance.Euclidean or Distance.Manhattan) 16 | * @return Distance 17 | */ 18 | def apply(x: Vector, y: Vector, distanceType: Distance.Value): Double = { 19 | distanceType match { 20 | case Euclidean => euclidean(x, y) 21 | case Manhattan => manhattan(x, y) 22 | case _ => euclidean(x, y) 23 | } 24 | } 25 | 26 | /** 27 | * Computes the Euclidean distance between instance x and instance y. 28 | * The type of the distance used is determined by the value of distanceType. 29 | * 30 | * @param x instance x 31 | * @param y instance y 32 | * @return Euclidean distance 33 | */ 34 | private def euclidean(x: Vector, y: Vector): Double = { 35 | var sum = 0.0 36 | val size = x.size 37 | 38 | for (i <- 0 until size) sum += (x(i) - y(i)) * (x(i) - y(i)) 39 | 40 | Math.sqrt(sum) 41 | } 42 | 43 | /** 44 | * Computes the Manhattan distance between instance x and instance y. 45 | * The type of the distance used is determined by the value of distanceType. 
46 | * 47 | * @param x instance x 48 | * @param y instance y 49 | * @return Manhattan distance 50 | */ 51 | private def manhattan(x: Vector, y: Vector): Double = { 52 | var sum = 0.0 53 | val size = x.size 54 | 55 | for (i <- 0 until size) sum += Math.abs(x(i) - y(i)) 56 | 57 | sum 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/util/XORShiftRandom.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.util 2 | 3 | import java.nio.ByteBuffer 4 | import java.util.{Random => JavaRandom} 5 | 6 | import scala.util.hashing.MurmurHash3 7 | 8 | /** 9 | * This class implements a XORShift random number generator algorithm 10 | * Source: 11 | * Marsaglia, G. (2003). Xorshift RNGs. Journal of Statistical Software, Vol. 8, Issue 14. 12 | * @see Paper 13 | * This implementation is approximately 3.5 times faster than 14 | * { @link java.util.Random java.util.Random}, partly because of the algorithm, but also due 15 | * to renouncing thread safety. JDK's implementation uses an AtomicLong seed, this class 16 | * uses a regular Long. We can forgo thread safety since we use a new instance of the RNG 17 | * for each thread. 18 | */ 19 | class XORShiftRandom(init: Long) extends JavaRandom(init) { 20 | 21 | def this() = this(System.nanoTime) 22 | 23 | private var seed = XORShiftRandom.hashSeed(init) 24 | 25 | // we need to just override next - this will be called by nextInt, nextDouble, 26 | // nextGaussian, nextLong, etc. 27 | override protected def next(bits: Int): Int = { 28 | var nextSeed = seed ^ (seed << 21) 29 | nextSeed ^= (nextSeed >>> 35) 30 | nextSeed ^= (nextSeed << 4) 31 | seed = nextSeed 32 | (nextSeed & ((1L << bits) - 1)).asInstanceOf[Int] 33 | } 34 | 35 | override def setSeed(s: Long) { 36 | seed = XORShiftRandom.hashSeed(s) 37 | } 38 | } 39 | 40 | /** Contains benchmark method and main method to run benchmark of the RNG */ 41 | object XORShiftRandom { 42 | 43 | /** Hash seeds to have 0/1 bits throughout. */ 44 | private def hashSeed(seed: Long): Long = { 45 | val bytes = ByteBuffer.allocate(java.lang.Long.SIZE).putLong(seed).array() 46 | MurmurHash3.bytesHash(bytes) 47 | } 48 | 49 | } -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/GARCHSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.random.MersenneTwister 4 | import org.apache.spark.SparkFunSuite 5 | import org.apache.spark.ml.linalg.DenseVector 6 | import org.apache.spark.ml.util.DefaultReadWriteTest 7 | import org.apache.spark.mllib.util.MLlibTestSparkContext 8 | 9 | /** 10 | * Created by endy on 16-12-22. 
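  * Checks that, for data sampled from a GARCH(1, 1) model, the log likelihood is highest at the
  * generating parameters and that the gradient at perturbed parameters points back toward them.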
11 | */ 12 | class GARCHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest{ 13 | 14 | test("GARCH log likelihood") { 15 | val model = new GARCHModel(.2, .3, .4) 16 | val rand = new MersenneTwister(5L) 17 | val n = 10000 18 | 19 | val ts = new DenseVector(model.sample(n, rand)) 20 | val logLikelihoodWithRightModel = model.logLikelihood(ts) 21 | 22 | val logLikelihoodWithWrongModel1 = new GARCHModel(.3, .4, .5).logLikelihood(ts) 23 | val logLikelihoodWithWrongModel2 = new GARCHModel(.25, .35, .45).logLikelihood(ts) 24 | val logLikelihoodWithWrongModel3 = new GARCHModel(.1, .2, .3).logLikelihood(ts) 25 | 26 | assert(logLikelihoodWithRightModel > logLikelihoodWithWrongModel1) 27 | assert(logLikelihoodWithRightModel > logLikelihoodWithWrongModel2) 28 | assert(logLikelihoodWithRightModel > logLikelihoodWithWrongModel3) 29 | assert(logLikelihoodWithWrongModel2 > logLikelihoodWithWrongModel1) 30 | } 31 | 32 | test("gradient") { 33 | val alpha = 0.3 34 | val beta = 0.4 35 | val omega = 0.2 36 | val genModel = new GARCHModel(omega, alpha, beta) 37 | val rand = new MersenneTwister(5L) 38 | val n = 10000 39 | 40 | val ts = new DenseVector(genModel.sample(n, rand)) 41 | 42 | val gradient1 = new GARCHModel(omega + .1, alpha + .05, beta + .1).gradient(ts) 43 | assert(gradient1.forall(_ < 0.0)) 44 | val gradient2 = new GARCHModel(omega - .1, alpha - .05, beta - .1).gradient(ts) 45 | assert(gradient2.forall(_ > 0.0)) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/TSNEHelper.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne 2 | 3 | import breeze.linalg._ 4 | import breeze.stats._ 5 | import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix 6 | import org.apache.spark.rdd.RDD 7 | 8 | object TSNEHelper { 9 | // p_ij = (p_{i|j} + p_{j|i}) / 2n 10 | def computeP(p_ji: CoordinateMatrix, n: Int): RDD[(Int, Iterable[(Int, Double)])] = { 11 | p_ji.entries 12 | .flatMap(e => Seq( 13 | ((e.i.toInt, e.j.toInt), e.value), 14 | ((e.j.toInt, e.i.toInt), e.value) 15 | )) 16 | .reduceByKey(_ + _) // p + p' 17 | .map{case ((i, j), v) => (i, (j, math.max(v / 2 / n, 1e-12))) } // p / 2n 18 | .groupByKey() 19 | } 20 | 21 | /** 22 | * Update Y via gradient dY 23 | * @param Y current Y 24 | * @param dY gradient dY 25 | * @param iY stored y_i - y_{i-1} 26 | * @param gains adaptive learning rates 27 | * @param iteration n 28 | * @param param [[TSNEParam]] 29 | * @return 30 | */ 31 | def update(Y: DenseMatrix[Double], 32 | dY: DenseMatrix[Double], 33 | iY: DenseMatrix[Double], 34 | gains: DenseMatrix[Double], 35 | iteration: Int, 36 | param: TSNEParam): DenseMatrix[Double] = { 37 | import param._ 38 | val momentum = if (iteration <= t_momentum) initial_momentum else final_momentum 39 | gains.foreachPair { 40 | case ((i, j), old_gain) => 41 | val new_gain = math.max(min_gain, 42 | if ((dY.unsafeValueAt(i, j) > 0.0) != (iY.unsafeValueAt(i, j) > 0.0)) 43 | old_gain + 0.2 44 | else 45 | old_gain * 0.8 46 | ) 47 | gains.unsafeUpdate(i, j, new_gain) 48 | 49 | val new_iY = momentum * iY.unsafeValueAt(i, j) - eta * new_gain * dY.unsafeValueAt(i, j) 50 | iY.unsafeUpdate(i, j, new_iY) 51 | 52 | Y.unsafeUpdate(i, j, Y.unsafeValueAt(i, j) + new_iY) // Y += iY 53 | } 54 | Y := Y(*, ::) - (mean(Y(::, *)): DenseMatrix[Double]).toDenseVector 55 | } 56 | } -------------------------------------------------------------------------------- 
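For reference, the element-wise rule applied by TSNEHelper.update above is the usual t-SNE
momentum update with adaptive gains. The scalar sketch below is illustrative only; the object
and parameter names are not part of this repository:

object TSNEUpdateSketch {
  /** One coordinate of the momentum-with-adaptive-gains step performed in TSNEHelper.update. */
  def step(y: Double, dy: Double, iy: Double, gain: Double,
           momentum: Double, eta: Double, minGain: Double): (Double, Double, Double) = {
    // Grow the gain when the gradient and the current velocity disagree in sign,
    // otherwise decay it, never letting it fall below minGain (same 0.2 / 0.8 constants as above).
    val newGain = math.max(minGain, if ((dy > 0.0) != (iy > 0.0)) gain + 0.2 else gain * 0.8)
    val newIY = momentum * iy - eta * newGain * dy // velocity update
    (y + newIY, newIY, newGain)                    // new position, velocity and gain
  }
}

The full method then re-centres Y by subtracting the per-dimension mean, as in its last line.
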
/src/main/scala/org/apache/spark/ml/dbscan/DBSCANGraph.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import scala.annotation.tailrec 4 | 5 | /** 6 | * Top level method for creating a DBSCANGraph 7 | */ 8 | object DBSCANGraph { 9 | 10 | /** 11 | * Create an empty graph 12 | */ 13 | def apply[T](): DBSCANGraph[T] = new DBSCANGraph(Map[T, Set[T]]()) 14 | 15 | } 16 | 17 | /** 18 | * An immutable unweighted graph with vertexes and edges 19 | */ 20 | class DBSCANGraph[T] private (nodes: Map[T, Set[T]]) extends Serializable { 21 | 22 | /** 23 | * Add the given vertex `v` to the graph 24 | * 25 | */ 26 | def addVertex(v: T): DBSCANGraph[T] = { 27 | nodes.get(v) match { 28 | case None => new DBSCANGraph(nodes + (v -> Set())) 29 | case Some(_) => this 30 | } 31 | } 32 | 33 | /** 34 | * Insert an edge from `from` to `to` 35 | */ 36 | def insertEdge(from: T, to: T): DBSCANGraph[T] = { 37 | nodes.get(from) match { 38 | case None => new DBSCANGraph(nodes + (from -> Set(to))) 39 | case Some(edge) => new DBSCANGraph(nodes + (from -> (edge + to))) 40 | } 41 | } 42 | 43 | /** 44 | * Insert a vertex from `one` to `another`, and from `another` to `one` 45 | * 46 | */ 47 | def connect(one: T, another: T): DBSCANGraph[T] = { 48 | insertEdge(one, another).insertEdge(another, one) 49 | } 50 | 51 | /** 52 | * Find all vertexes that are reachable from `from` 53 | */ 54 | def getConnected(from: T): Set[T] = { 55 | getAdjacent(Set(from), Set[T](), Set[T]()) - from 56 | } 57 | 58 | @tailrec 59 | private def getAdjacent(tovisit: Set[T], visited: Set[T], adjacent: Set[T]): Set[T] = { 60 | 61 | tovisit.headOption match { 62 | case Some(current) => 63 | nodes.get(current) match { 64 | case Some(edges) => 65 | getAdjacent(edges.diff(visited) ++ tovisit.tail, visited + current, adjacent ++ edges) 66 | case None => getAdjacent(tovisit.tail, visited, adjacent) 67 | } 68 | case None => adjacent 69 | } 70 | 71 | } 72 | 73 | } 74 | 75 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/util/DBHPartitioner.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.util 2 | 3 | import scala.reflect.ClassTag 4 | 5 | import org.apache.spark.HashPartitioner 6 | import org.apache.spark.graphx._ 7 | import org.apache.spark.graphx.impl.GraphImpl 8 | import org.apache.spark.storage.StorageLevel 9 | 10 | /** 11 | * Degree-Based Hashing, the paper: 12 | * Distributed Power-law Graph Computing: Theoretical and Empirical Analysis 13 | */ 14 | class DBHPartitioner(val partitions: Int, val threshold: Int = 0) 15 | extends HashPartitioner(partitions) { 16 | /** 17 | * Default DBH doesn't consider the situation where both the degree of src and 18 | * dst vertices are both small than a given threshold value 19 | */ 20 | def getKey(et: EdgeTriplet[Int, _]): Long = { 21 | val srcId = et.srcId 22 | val dstId = et.dstId 23 | val srcDeg = et.srcAttr 24 | val dstDeg = et.dstAttr 25 | val maxDeg = math.max(srcDeg, dstDeg) 26 | val minDegId = if (maxDeg == srcDeg) dstId else srcId 27 | val maxDegId = if (maxDeg == srcDeg) srcId else dstId 28 | if (maxDeg < threshold) { 29 | maxDegId 30 | } else { 31 | minDegId 32 | } 33 | } 34 | 35 | override def equals(other: Any): Boolean = other match { 36 | case dbh: DBHPartitioner => 37 | dbh.numPartitions == numPartitions 38 | case _ => 39 | false 40 | } 41 | } 42 | 43 | object DBHPartitioner { 44 | 
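  /**
   * Repartitions the edges of `input` with DBH: each edge is keyed by the id of its
   * lower-degree endpoint (or by the higher-degree endpoint when both degrees are below
   * the threshold) and hash-partitioned on that key, which balances load on power-law graphs.
   */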
def partitionByDBH[VD: ClassTag, ED: ClassTag](input: Graph[VD, ED], 45 | storageLevel: StorageLevel): Graph[VD, ED] = { 46 | val edges = input.edges 47 | val conf = edges.context.getConf 48 | val numPartitions = conf.getInt("", edges.partitions.length) 49 | val dbh = new DBHPartitioner(numPartitions, 0) 50 | val degGraph = GraphImpl(input.degrees, edges) 51 | val newEdges = degGraph.triplets.mapPartitions(_.map(et => 52 | (dbh.getKey(et), Edge(et.srcId, et.dstId, et.attr)) 53 | )).partitionBy(dbh).map(_._2) 54 | GraphImpl(input.vertices, newEdges, null.asInstanceOf[VD], storageLevel, storageLevel) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/EWMASuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.spark.SparkFunSuite 4 | import org.apache.spark.ml.util.DefaultReadWriteTest 5 | import org.apache.spark.mllib.util.MLlibTestSparkContext 6 | import org.apache.spark.mllib.util.TestingUtils._ 7 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 8 | import org.apache.spark.sql.{Dataset, Row} 9 | 10 | class EWMASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest{ 11 | @transient var dataSet: Dataset[_] = _ 12 | @transient var dataSet1: Dataset[_] = _ 13 | 14 | override def beforeAll(): Unit = { 15 | super.beforeAll() 16 | 17 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 18 | DoubleType))) 19 | 20 | val smoothed = Array( 21 | Array("201512", 7.0), Array("201601", 8.0), Array("201602", 9.0), 22 | Array("201509", 4.0), Array("201510", 5.0), Array("201511", 6.0), 23 | Array("201506", 1.0), Array("201507", 2.0), Array("201508", 3.0), 24 | Array("201603", 10.0)) 25 | 26 | val orig1 = sc.parallelize(smoothed.map(x => Row(x: _*))) 27 | dataSet = spark.createDataFrame(orig1, schema) 28 | 29 | val oil = Array( 30 | Array("201506", 446.7), Array("201507", 454.5), Array("201508", 455.7), 31 | Array("201512", 425.3), Array("201601", 485.1), Array("201602", 506.0), 32 | Array("201509", 423.6), Array("201510", 456.3), Array("201511", 440.6), 33 | Array("201603", 526.8), Array("201604", 514.3), Array("201605", 494.2)) 34 | 35 | val orig2 = sc.parallelize(oil.map(x => Row(x: _*))) 36 | dataSet1 = spark.createDataFrame(orig2, schema) 37 | } 38 | 39 | 40 | test("add time dependent effects") { 41 | 42 | val m1 = new EWMAModel(0.2).setTimeCol("time").setTimeSeriesCol("timeseries") 43 | val res = m1.transform(dataSet).collect().map{case Row(x: Double) => x} 44 | 45 | assert(res(0) == 1.0) 46 | assert(res(1) ~== 1.2 absTol 10E-5) 47 | } 48 | 49 | test("fitting EWMA model") { 50 | val model = new EWMA() 51 | .setTimeCol("time") 52 | .setTimeSeriesCol("timeseries") 53 | .setMaxIter(10000) 54 | .setMaxEval(10000) 55 | .setInitPoint(.94) 56 | .fit(dataSet1) 57 | 58 | assert(model.smoothing ~== 0.89 absTol 0.01) // approximately 0.89 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/AutoregressionSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.random.{MersenneTwister, RandomGenerator} 4 | import org.apache.spark.SparkFunSuite 5 | import 
org.apache.spark.ml.linalg.DenseVector 6 | import org.apache.spark.ml.util.DefaultReadWriteTest 7 | import org.apache.spark.mllib.util.MLlibTestSparkContext 8 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 9 | import org.apache.spark.sql.{Dataset, Row} 10 | 11 | /** 12 | * Created by endy on 16-12-19. 13 | */ 14 | class AutoregressionSuite extends SparkFunSuite with MLlibTestSparkContext 15 | with DefaultReadWriteTest { 16 | 17 | @transient var dataSet: Dataset[_] = _ 18 | 19 | override def beforeAll(): Unit = { 20 | super.beforeAll() 21 | } 22 | 23 | test("fit AR(1) model") { 24 | val ts = sample(5000, new MersenneTwister(10L), 1.5, Array(.2)) 25 | 26 | val fittedModel = new Autoregression() 27 | .setTimeCol("time") 28 | .setTimeSeriesCol("timeseries") 29 | .setMaxLag(1) 30 | .setNoIntercept(false) 31 | .fit(ts) 32 | 33 | assert(fittedModel.coefficients.length == 1) 34 | assert(math.abs(fittedModel.c - 1.5) < .07) 35 | assert(math.abs(fittedModel.coefficients(0) - .2) < .03) 36 | } 37 | 38 | test("fit AR(2) model") { 39 | 40 | val ts = sample(5000, new MersenneTwister(10L), 1.5, Array(.2, .3)) 41 | val fittedModel = new Autoregression() 42 | .setTimeCol("time") 43 | .setTimeSeriesCol("timeseries") 44 | .setMaxLag(2) 45 | .setNoIntercept(false) 46 | .fit(ts) 47 | 48 | assert(fittedModel.coefficients.length == 2) 49 | assert(math.abs(fittedModel.c - 1.5) < .15) 50 | assert(math.abs(fittedModel.coefficients(0) - .2) < .03) 51 | assert(math.abs(fittedModel.coefficients(1) - .3) < .03) 52 | } 53 | 54 | def sample(n: Int, rand: RandomGenerator, c: Double, coefficients: Array[Double]): Dataset[_] = { 55 | val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian())) 56 | val res = new ARModel(c, coefficients).addTimeDependentEffects(vec).toArray 57 | .zipWithIndex 58 | 59 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 60 | DoubleType))) 61 | 62 | val rdd = sc.parallelize(res.map(x => Row(x._2.formatted("%05d"), x._1))) 63 | 64 | spark.createDataFrame(rdd, schema) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/impl/SimpleTSNE.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne.impl 2 | 3 | import breeze.linalg._ 4 | import breeze.stats.distributions.Rand 5 | import org.apache.spark.ml.tsne.{TSNEGradient, TSNEHelper, TSNEParam, X2P} 6 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 7 | import org.apache.spark.storage.StorageLevel 8 | import org.slf4j.LoggerFactory 9 | 10 | import scala.util.Random 11 | 12 | object SimpleTSNE { 13 | private def logger = LoggerFactory.getLogger(SimpleTSNE.getClass) 14 | 15 | def tsne( 16 | input: RowMatrix, 17 | noDims: Int = 2, 18 | maxIterations: Int = 1000, 19 | perplexity: Double = 30, 20 | callback: (Int, DenseMatrix[Double], Option[Double]) => Unit = {case _ => }, 21 | seed: Long = Random.nextLong()): DenseMatrix[Double] = { 22 | if(input.rows.getStorageLevel == StorageLevel.NONE) { 23 | logger.warn("Input is not persisted and performance could be bad") 24 | } 25 | 26 | Rand.generator.setSeed(seed) 27 | 28 | val tsneParam = TSNEParam() 29 | import tsneParam._ 30 | 31 | val n = input.numRows().toInt 32 | val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian(0, 1)) 33 | val iY = DenseMatrix.zeros[Double](n, noDims) 34 | val gains = DenseMatrix.ones[Double](n, noDims) 35 | 36 | // 
approximate p_{j|i} 37 | val p_ji = X2P(input, 1e-5, perplexity) 38 | val P = TSNEHelper.computeP(p_ji, n).glom().cache() 39 | 40 | var iteration = 1 41 | while(iteration <= maxIterations) { 42 | val bcY = P.context.broadcast(Y) 43 | 44 | val numerator = P.map{ arr => TSNEGradient.computeNumerator(bcY.value, arr.map(_._1): _*) }.cache() 45 | val bcNumerator = P.context.broadcast({ 46 | numerator.treeAggregate(0.0)(seqOp = (x, v) => x + sum(v), combOp = _ + _) 47 | }) 48 | 49 | val (dY, loss) = P.zip(numerator).treeAggregate((DenseMatrix.zeros[Double](n, noDims), 0.0))( 50 | seqOp = (c, v) => { 51 | // c: (grad, loss), v: (Array[(i, Iterable(j, Distance))], numerator) 52 | val l = TSNEGradient.compute(v._1, bcY.value, v._2, bcNumerator.value, c._1, iteration <= early_exaggeration) 53 | (c._1, c._2 + l) 54 | }, 55 | combOp = (c1, c2) => { 56 | // c: (grad, loss) 57 | (c1._1 + c2._1, c1._2 + c2._2) 58 | }) 59 | 60 | bcY.destroy() 61 | bcNumerator.destroy() 62 | numerator.unpersist() 63 | 64 | TSNEHelper.update(Y, dY, iY, gains, iteration, tsneParam) 65 | 66 | logger.debug(s"Iteration $iteration finished with $loss") 67 | callback(iteration, Y.copy, Some(loss)) 68 | iteration += 1 69 | } 70 | Y 71 | } 72 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/tree/SPTree.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne.tree 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | 6 | import scala.annotation.tailrec 7 | 8 | 9 | class SPTree private[tree](val dimension: Int, 10 | val corner: DenseVector[Double], 11 | val width: DenseVector[Double]) extends Serializable { 12 | private[this] val childWidth: DenseVector[Double] = width :/ 2.0 13 | lazy val radiusSq: Double = sum(pow(width, 2)) 14 | private[tree] val totalMass: DenseVector[Double] = DenseVector.zeros(dimension) 15 | private var count: Int = 0 16 | private var leaf: Boolean = true 17 | val center: DenseVector[Double] = DenseVector.zeros(dimension) 18 | 19 | lazy val children: Array[SPTree] = { 20 | (0 until pow(2, dimension)).toArray.map { 21 | i => 22 | val bits = DenseVector(s"%0${dimension}d".format(i.toBinaryString.toInt).toArray.map(_.toDouble - '0'.toDouble)) 23 | val childCorner: DenseVector[Double] = corner + (bits :* childWidth) 24 | new SPTree(dimension, childCorner, childWidth) 25 | } 26 | } 27 | 28 | final def insert(vector: DenseVector[Double], finalize: Boolean = false): SPTree = { 29 | totalMass += vector 30 | count += 1 31 | 32 | if(leaf) { 33 | if(count == 1) { // first to leaf 34 | center := vector 35 | } else if(!vector.equals(center)) { 36 | (1 until count).foreach(_ => getCell(center).insert(center, finalize)) //subdivide 37 | leaf = false 38 | } 39 | } 40 | 41 | if(finalize) computeCenter(false) 42 | 43 | if(leaf) this else getCell(vector).insert(vector, finalize) 44 | } 45 | 46 | def computeCenter(recursive: Boolean = true): Unit = { 47 | if(count > 0) { 48 | center := totalMass / count.toDouble 49 | if(recursive) children.foreach(_.computeCenter()) 50 | } 51 | } 52 | 53 | def getCell(vector: DenseVector[Double]): SPTree = { 54 | val idx = ((vector - corner) :/ childWidth).data 55 | children(idx.foldLeft(0)((acc, i) => acc * 2 + min(max(i.ceil.toInt - 1, 0), 1))) 56 | } 57 | 58 | def getCount: Int = count 59 | 60 | def isLeaf: Boolean = leaf 61 | } 62 | 63 | object SPTree { 64 | def apply(Y: DenseMatrix[Double]): SPTree = { 65 | val d = Y.cols 66 | val minMaxs 
= minMax(Y(::, *)).toDenseVector 67 | val mins = minMaxs.mapValues(_._1) 68 | val maxs = minMaxs.mapValues(_._2) 69 | 70 | val tree = new SPTree(Y.cols, mins, maxs - mins) 71 | 72 | // insert points but wait till end to compute all centers 73 | //Y(*, ::).foreach(tree.insert(_, finalize = false)) 74 | (0 until Y.rows).foreach(i => tree.insert(Y(i, ::).t, finalize = false)) 75 | // compute all center of mass 76 | tree.computeCenter() 77 | 78 | tree 79 | } 80 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/X2P.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne 2 | 3 | import breeze.linalg.DenseVector 4 | import org.apache.spark.mllib.X2PHelper._ 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry, RowMatrix} 7 | import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ 8 | import org.slf4j.LoggerFactory 9 | 10 | object X2P { 11 | 12 | private def logger = LoggerFactory.getLogger(X2P.getClass) 13 | 14 | def apply(x: RowMatrix, tol: Double = 1e-5, perplexity: Double = 30.0): CoordinateMatrix = { 15 | require(tol >= 0, "Tolerance must be non-negative") 16 | require(perplexity > 0, "Perplexity must be positive") 17 | 18 | val mu = (3 * perplexity).toInt //TODO: Expose this as parameter 19 | val logU = Math.log(perplexity) 20 | val norms = x.rows.map(Vectors.norm(_, 2.0)) 21 | norms.persist() 22 | val rowsWithNorm = x.rows.zip(norms).map{ case (v, norm) => VectorWithNorm(v, norm) } 23 | val neighbors = rowsWithNorm.zipWithIndex() 24 | .cartesian(rowsWithNorm.zipWithIndex()) 25 | .flatMap { 26 | case ((u, i), (v, j)) => 27 | if(i < j) { 28 | val dist = fastSquaredDistance(u, v) 29 | Seq((i, (j, dist)), (j, (i, dist))) 30 | } else Seq.empty 31 | } 32 | .topByKey(mu)(Ordering.by(e => -e._2)) 33 | 34 | val p_betas = 35 | neighbors.map { 36 | case (i, arr) => 37 | var betamin = Double.NegativeInfinity 38 | var betamax = Double.PositiveInfinity 39 | var beta = 1.0 40 | 41 | val d = DenseVector(arr.map(_._2)) 42 | var (h, p) = Hbeta(d, beta) 43 | 44 | //logInfo("data was " + d.toArray.toList) 45 | //logInfo("array P was " + p.toList) 46 | 47 | // Evaluate whether the perplexity is within tolerance 48 | def Hdiff = h - logU 49 | var tries = 0 50 | while (Math.abs(Hdiff) > tol && tries < 50) { 51 | //If not, increase or decrease precision 52 | if (Hdiff > 0) { 53 | betamin = beta 54 | beta = if (betamax.isInfinite) beta * 2 else (beta + betamax) / 2 55 | } else { 56 | betamax = beta 57 | beta = if (betamin.isInfinite) beta / 2 else (beta + betamin) / 2 58 | } 59 | 60 | // Recompute the values 61 | val HP = Hbeta(d, beta) 62 | h = HP._1 63 | p = HP._2 64 | tries = tries + 1 65 | } 66 | 67 | //logInfo("array P is " + p.toList) 68 | 69 | (arr.map(_._1).zip(p.toArray).map { case (j, v) => MatrixEntry(i, j, v) }, beta) 70 | } 71 | 72 | logger.info("Mean value of sigma: " + p_betas.map(x => math.sqrt(1 / x._2)).mean) 73 | new CoordinateMatrix(p_betas.flatMap(_._1)) 74 | } 75 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/LocalDBSCANArchery.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import scala.collection.mutable.Queue 4 | import org.apache.spark.internal.Logging 5 | import archery.Box 6 | import archery.Entry 7 | 
import archery.Point 8 | import archery.RTree 9 | import org.apache.spark.ml.dbscan.DBSCANLabeledPoint.Flag 10 | 11 | /** 12 | * An implementation of DBSCAN using an R-Tree to improve its running time 13 | */ 14 | class LocalDBSCANArchery(eps: Double, minPoints: Int) extends Logging { 15 | 16 | val minDistanceSquared = eps * eps 17 | 18 | def fit(points: Iterable[DBSCANPoint]): Iterable[DBSCANLabeledPoint] = { 19 | 20 | val tree = points.foldLeft(RTree[DBSCANLabeledPoint]())( 21 | (tempTree, p) => 22 | tempTree.insert( 23 | Entry(Point(p.x.toFloat, p.y.toFloat), new DBSCANLabeledPoint(p)))) 24 | 25 | var cluster = DBSCANLabeledPoint.Unknown 26 | 27 | tree.entries.foreach(entry => { 28 | 29 | val point = entry.value 30 | 31 | if (!point.visited) { 32 | point.visited = true 33 | 34 | val neighbors = tree.search(toBoundingBox(point), inRange(point)) 35 | 36 | if (neighbors.size < minPoints) { 37 | point.flag = Flag.Noise 38 | } else { 39 | cluster += 1 40 | expandCluster(point, neighbors, tree, cluster) 41 | } 42 | 43 | } 44 | 45 | }) 46 | 47 | logDebug(s"total: $cluster") 48 | 49 | tree.entries.map(_.value).toIterable 50 | 51 | } 52 | 53 | private def expandCluster( 54 | point: DBSCANLabeledPoint, 55 | neighbors: Seq[Entry[DBSCANLabeledPoint]], 56 | tree: RTree[DBSCANLabeledPoint], 57 | cluster: Int): Unit = { 58 | 59 | point.flag = Flag.Core 60 | point.cluster = cluster 61 | 62 | val left = Queue(neighbors) 63 | 64 | while (left.nonEmpty) { 65 | 66 | left.dequeue().foreach(neighborEntry => { 67 | 68 | val neighbor = neighborEntry.value 69 | 70 | if (!neighbor.visited) { 71 | 72 | neighbor.visited = true 73 | neighbor.cluster = cluster 74 | 75 | val neighborNeighbors = tree.search(toBoundingBox(neighbor), inRange(neighbor)) 76 | 77 | if (neighborNeighbors.size >= minPoints) { 78 | neighbor.flag = Flag.Core 79 | left.enqueue(neighborNeighbors) 80 | } else { 81 | neighbor.flag = Flag.Border 82 | } 83 | } 84 | 85 | if (neighbor.cluster == DBSCANLabeledPoint.Unknown) { 86 | neighbor.cluster = cluster 87 | neighbor.flag = Flag.Border 88 | } 89 | 90 | }) 91 | 92 | } 93 | 94 | } 95 | 96 | private def inRange(point: DBSCANPoint)(entry: Entry[DBSCANLabeledPoint]): Boolean = { 97 | entry.value.distanceSquared(point) <= minDistanceSquared 98 | } 99 | 100 | private def toBoundingBox(point: DBSCANPoint): Box = { 101 | Box( 102 | (point.x - eps).toFloat, 103 | (point.y - eps).toFloat, 104 | (point.x + eps).toFloat, 105 | (point.y + eps).toFloat) 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/LocalDBSCANNaive.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | 4 | import scala.collection.mutable.Queue 5 | import org.apache.spark.internal.Logging 6 | import org.apache.spark.ml.dbscan.DBSCANLabeledPoint.Flag 7 | import org.apache.spark.ml.linalg.Vectors 8 | 9 | /** 10 | * A naive implementation of DBSCAN. It has O(n2) complexity 11 | * but uses no extra memory. This implementation is not used 12 | * by the parallel version of DBSCAN. 
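  * A minimal usage sketch (eps and minPoints values are illustrative):
  * {{{
  *   val labeled = new LocalDBSCANNaive(eps = 0.3, minPoints = 10)
  *     .fit(vectors.map(DBSCANPoint(_))) // vectors: Iterable[org.apache.spark.ml.linalg.Vector]
  *   labeled.foreach(p => println(s"${p.vector} -> cluster ${p.cluster} (${p.flag})"))
  * }}}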
13 | * 14 | */ 15 | class LocalDBSCANNaive(eps: Double, minPoints: Int) extends Logging { 16 | 17 | val minDistanceSquared = eps * eps 18 | 19 | def samplePoint: Array[DBSCANLabeledPoint] = 20 | Array(new DBSCANLabeledPoint(Vectors.dense(Array(0D, 0D)))) 21 | 22 | def fit(points: Iterable[DBSCANPoint]): Iterable[DBSCANLabeledPoint] = { 23 | 24 | logInfo(s"About to start fitting") 25 | 26 | val labeledPoints = points.map { new DBSCANLabeledPoint(_) }.toArray 27 | 28 | val totalClusters = 29 | labeledPoints 30 | .foldLeft(DBSCANLabeledPoint.Unknown)( 31 | (cluster, point) => { 32 | if (!point.visited) { 33 | point.visited = true 34 | 35 | val neighbors = findNeighbors(point, labeledPoints) 36 | 37 | if (neighbors.size < minPoints) { 38 | point.flag = Flag.Noise 39 | cluster 40 | } else { 41 | expandCluster(point, neighbors, labeledPoints, cluster + 1) 42 | cluster + 1 43 | } 44 | } else { 45 | cluster 46 | } 47 | }) 48 | 49 | logInfo(s"found: $totalClusters clusters") 50 | 51 | labeledPoints 52 | 53 | } 54 | 55 | private def findNeighbors( 56 | point: DBSCANPoint, 57 | all: Array[DBSCANLabeledPoint]): Iterable[DBSCANLabeledPoint] = 58 | all.view.filter(other => { 59 | point.distanceSquared(other) <= minDistanceSquared 60 | }) 61 | 62 | def expandCluster( 63 | point: DBSCANLabeledPoint, 64 | neighbors: Iterable[DBSCANLabeledPoint], 65 | all: Array[DBSCANLabeledPoint], 66 | cluster: Int): Unit = { 67 | 68 | point.flag = Flag.Core 69 | point.cluster = cluster 70 | 71 | var allNeighbors = Queue(neighbors) 72 | 73 | while (allNeighbors.nonEmpty) { 74 | allNeighbors.dequeue().foreach(neighbor => { 75 | if (!neighbor.visited) { 76 | 77 | neighbor.visited = true 78 | neighbor.cluster = cluster 79 | 80 | val neighborNeighbors = findNeighbors(neighbor, all) 81 | 82 | if (neighborNeighbors.size >= minPoints) { 83 | neighbor.flag = Flag.Core 84 | allNeighbors.enqueue(neighborNeighbors) 85 | } else { 86 | neighbor.flag = Flag.Border 87 | } 88 | 89 | if (neighbor.cluster == DBSCANLabeledPoint.Unknown) { 90 | neighbor.cluster = cluster 91 | neighbor.flag = Flag.Border 92 | } 93 | } 94 | 95 | }) 96 | 97 | } 98 | 99 | } 100 | 101 | } 102 | 103 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/sampling/UnderSampling.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.sampling 2 | 3 | import org.apache.spark.ml.Transformer 4 | import org.apache.spark.ml.param._ 5 | import org.apache.spark.ml.util.Identifiable 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.types.StructType 8 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 9 | 10 | /** 11 | * Created by endy on 16-12-8. 
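  * Down-samples classes that are over-represented relative to the primary class: a class whose
  * row count exceeds threshold * primaryClassCount is sampled down to roughly that size, while
  * smaller classes pass through unchanged. A usage sketch (column name and values illustrative):
  * {{{
  *   val balanced = new UnderSampling()
  *     .setDependentColName("label")
  *     .setPrimaryClass(1.0)
  *     .setThreshold(2.0)
  *     .setWithReplacement(false)
  *     .transform(df)
  * }}}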
12 | */ 13 | 14 | trait UnderSamplingParams extends Params{ 15 | final val threshold = new DoubleParam(this, "threshold", "The threshold whether to " + 16 | "undersampling sample of a class", (x: Double) => x > 1) 17 | def setThreshold(value: Double): this.type = set(threshold, value) 18 | 19 | final val dependentColName = new Param[String](this, "dependentColName", "The column that " + 20 | "provide label values") 21 | def setDependentColName(value: String): this.type = set(dependentColName, value) 22 | 23 | final val withReplacement = new BooleanParam(this, "withReplacement", "") 24 | def setWithReplacement(value: Boolean): this.type = set(withReplacement, value) 25 | 26 | final val primaryClass = new DoubleParam(this, "primaryClass", "primary class that to under " + 27 | "sampling") 28 | def setPrimaryClass(value: Double): this.type = set(primaryClass, value) 29 | } 30 | 31 | 32 | class UnderSampling(override val uid: String) extends Transformer with UnderSamplingParams{ 33 | 34 | def this() = this(Identifiable.randomUID("UnderSampling")) 35 | /** 36 | * Transforms the input dataset. 37 | */ 38 | override def transform(dataset: Dataset[_]): DataFrame = { 39 | 40 | val labelCountPair = dataset.groupBy($(dependentColName)).count().collect() 41 | 42 | val primaryClassCount = labelCountPair 43 | .filter{ case Row(label: Double, count: Long) => label == ${primaryClass}} 44 | .map(x => x.get(1)).headOption.getOrElse(-1L).asInstanceOf[Long] 45 | 46 | if (primaryClassCount == -1) throw new Exception("The label is not exist") 47 | 48 | val res = labelCountPair.zipWithIndex.map { 49 | case (Row(label: Double, count: Long), index: Int) => 50 | val ratio = count / primaryClassCount.toDouble 51 | 52 | /** 53 | * if ratio < threshold, only return samples of this label, 54 | * otherwise we sample the data from the samples of this label. 55 | * 56 | * The desired number of samples is : num = primaryClassCount * threshold 57 | * so the fraction of sample method is: num / count = threshold / ratio 58 | */ 59 | val df = if (ratio < ${threshold}) dataset.filter(col($(dependentColName)) === label) 60 | else dataset.filter(col($(dependentColName)) === label) 61 | .sample(${withReplacement}, ${threshold} / ratio) 62 | 63 | df.toDF() 64 | }.reduce(_ union _) 65 | 66 | res 67 | } 68 | 69 | override def copy(extra: ParamMap): Transformer = defaultCopy(extra) 70 | 71 | /** 72 | * :: DeveloperApi :: 73 | * 74 | * Check transform validity and derive the output schema from the input schema. 75 | * 76 | * Typical implementation should first conduct verification on schema change and parameter 77 | * validity, including complex parameter interaction checks. 78 | */ 79 | override def transformSchema(schema: StructType): StructType = { 80 | schema 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/MatrixUtil.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries 2 | 3 | import breeze.linalg.{CSCMatrix, DenseMatrix, DenseVector, Matrix, SliceVector, SparseVector, Vector} 4 | import io.transwarp.hubble.error.HubbleErrors 5 | import org.apache.spark.ml.linalg.{DenseMatrix => SDM, Matrix => SM, SparseMatrix => SSM} 6 | import org.apache.spark.ml.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV} 7 | /** 8 | * Created by endy on 16-12-16. 
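  * Conversions between Spark ml.linalg matrices/vectors and their Breeze counterparts. Dense
  * conversions wrap the existing value arrays, so the returned objects share storage with the
  * originals; unsupported types raise an error.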
9 | */ 10 | object MatrixUtil { 11 | 12 | def matToRowArrs(mat: SM): Array[Array[Double]] = { 13 | val arrs = new Array[Array[Double]](mat.numRows) 14 | for (r <- 0 until mat.numRows) { 15 | arrs(r) = toBreeze(mat)(r to r, 0 until mat.numCols).toDenseMatrix.toArray 16 | } 17 | arrs 18 | } 19 | 20 | def toBreeze(sparkMatrix: SM): Matrix[Double] = { 21 | sparkMatrix match { 22 | case dm: SDM => 23 | if (!dm.isTransposed) { 24 | new DenseMatrix[Double](dm.numRows, dm.numCols, dm.values) 25 | } else { 26 | val breezeMatrix = new DenseMatrix[Double](dm.numCols, dm.numRows, dm.values) 27 | breezeMatrix.t 28 | } 29 | case sm: SSM => 30 | if (!sm.isTransposed) { 31 | new CSCMatrix[Double](sm.values, sm.numRows, sm.numCols, sm.colPtrs, sm.rowIndices) 32 | } else { 33 | val breezeMatrix = 34 | new CSCMatrix[Double](sm.values, sm.numCols, sm.numRows, sm.colPtrs, sm.rowIndices) 35 | breezeMatrix.t 36 | } 37 | case _ => 38 | throw HubbleErrors.typeNotSupported( 39 | s"Do not support conversion from type ${sparkMatrix.getClass.getName}.") 40 | } 41 | } 42 | 43 | def toBreeze(sparkVector: SV): Vector[Double] = { 44 | sparkVector match { 45 | case v: SDV => 46 | new DenseVector[Double](v.values) 47 | case v: SSV => 48 | new SparseVector[Double](v.indices, v.values, v.size) 49 | } 50 | } 51 | 52 | 53 | def fromBreeze(breeze: Matrix[Double]): SM = { 54 | breeze match { 55 | case dm: DenseMatrix[Double] => 56 | new SDM(dm.rows, dm.cols, dm.data, dm.isTranspose) 57 | case sm: CSCMatrix[Double] => 58 | // There is no isTranspose flag for sparse matrices in Breeze 59 | new SSM(sm.rows, sm.cols, sm.colPtrs, sm.rowIndices, sm.data) 60 | case _ => 61 | throw HubbleErrors.typeNotSupported( 62 | s"Do not support conversion from type ${breeze.getClass.getName}.") 63 | } 64 | } 65 | 66 | def fromBreeze(breezeVector: Vector[Double]): SV = { 67 | breezeVector match { 68 | case v: DenseVector[Double] => 69 | if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { 70 | new SDV(v.data) 71 | } else { 72 | new SDV(v.toArray) // Can't use underlying array directly, so make a new one 73 | } 74 | case v: SparseVector[Double] => 75 | if (v.index.length == v.used) { 76 | new SSV(v.length, v.index, v.data) 77 | } else { 78 | new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) 79 | } 80 | case v: SliceVector[_, Double] => 81 | new SDV(v.toArray) 82 | case v: Vector[_] => 83 | throw HubbleErrors.typeNotSupported("Unsupported Breeze vector type: " + v.getClass.getName) 84 | } 85 | } 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/DBSCAN2.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import org.apache.spark.ml.{Estimator, Model} 4 | import org.apache.spark.ml.param.{DoubleParam, IntParam, ParamMap, Params} 5 | import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol} 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 8 | import org.apache.spark.sql.types.{IntegerType, StructType} 9 | import org.apache.spark.ml.linalg.{Vector, VectorUDT} 10 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils} 11 | import org.apache.spark.sql.functions.{col, udf} 12 | 13 | /** 14 | * Created by endy on 17-12-5. 
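  * Pipeline-style wrapper around DBSCAN.train. A usage sketch (parameter values are
  * illustrative; the features column must contain 2-D Vectors):
  * {{{
  *   val model = new DBSCAN2()
  *     .setEps(0.3)
  *     .setMinPoints(10)
  *     .setMaxPointsPerPartition(250)
  *     .fit(df)
  *   val clustered = model.transform(df) // cluster ids in the prediction column
  * }}}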
15 | */ 16 | 17 | trait DBSCANParams extends Params with HasFeaturesCol with HasPredictionCol{ 18 | final val eps = new DoubleParam(this, "eps", "the maximum distance between two points" + 19 | " for them to be considered as part of the same region") 20 | def getEps: Double = ${eps} 21 | 22 | final val minPoints = new IntParam(this, "minPoints", "the minimum number of" + 23 | " points required to form a dense region") 24 | def getMinPoints: Int = ${minPoints} 25 | 26 | final val maxPointsPerPartition = new IntParam(this, "maxPointsPerPartition", 27 | "the largest number of points in a single partition") 28 | 29 | def getMaxPointsPerPartition: Int = ${maxPointsPerPartition} 30 | 31 | protected def validateAndTransformSchema(schema: StructType): StructType = { 32 | SchemaUtils.checkColumnType(schema, ${featuresCol}, new VectorUDT) 33 | SchemaUtils.appendColumn(schema, ${predictionCol}, IntegerType) 34 | } 35 | } 36 | 37 | class DBSCAN2(override val uid: String) extends Estimator[DBSCAN2Model] with DBSCANParams{ 38 | 39 | setDefault(eps -> 0.3, minPoints -> 10, maxPointsPerPartition -> 250) 40 | 41 | def this() = this(Identifiable.randomUID("dbscan")) 42 | 43 | def setEps(value: Double): this.type = set(eps, value) 44 | 45 | def setMinPoints(value: Int): this.type = set(minPoints, value) 46 | 47 | def setMaxPointsPerPartition(value: Int): this.type = set(maxPointsPerPartition, value) 48 | 49 | override def fit(dataset: Dataset[_]): DBSCAN2Model = { 50 | val instances: RDD[Vector] = dataset.select(col(${featuresCol})).rdd.map { 51 | case Row(point: Vector) => point 52 | } 53 | 54 | val dbscan = DBSCAN.train(instances, ${eps}, ${minPoints}, ${maxPointsPerPartition}) 55 | 56 | new DBSCAN2Model(uid, dbscan) 57 | } 58 | 59 | override def copy(extra: ParamMap): Estimator[DBSCAN2Model] = defaultCopy(extra) 60 | 61 | override def transformSchema(schema: StructType): StructType = { 62 | validateAndTransformSchema(schema) 63 | } 64 | } 65 | 66 | class DBSCAN2Model(override val uid: String, val model: DBSCAN) extends 67 | Model[DBSCAN2Model] with DBSCANParams{ 68 | 69 | override def copy(extra: ParamMap): DBSCAN2Model = defaultCopy(extra) 70 | 71 | override def transform(dataset: Dataset[_]): DataFrame = { 72 | val clustered = model.labeledPoints 73 | .map(p => (p.vector(0), p.vector(1), p.vector, p.cluster)) 74 | 75 | dataset.sparkSession.createDataFrame(clustered) 76 | .toDF(dataset.schema.fieldNames(0), 77 | dataset.schema.fieldNames(1), 78 | ${featuresCol}, ${predictionCol}) 79 | } 80 | 81 | override def transformSchema(schema: StructType): StructType = { 82 | validateAndTransformSchema(schema) 83 | } 84 | } 85 | 86 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/sampling/OverSampling.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.sampling 2 | 3 | import org.apache.spark.ml.Transformer 4 | import org.apache.spark.ml.param._ 5 | import org.apache.spark.ml.util.Identifiable 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 8 | import org.apache.spark.sql.types.StructType 9 | 10 | /** 11 | * Created by endy on 16-12-8. 
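  * Over-samples classes that are under-represented relative to the primary class: a class with
  * fewer than primaryClassCount / threshold rows is sampled with replacement up to roughly that
  * size, while larger classes pass through unchanged. A usage sketch (values illustrative):
  * {{{
  *   val balanced = new OverSampling()
  *     .setDependentColName("label")
  *     .setPrimaryClass(0.0)
  *     .setThreshold(2.0)
  *     .transform(df)
  * }}}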
12 | */ 13 | 14 | trait OverSamplingParams extends Params{ 15 | final val threshold = new DoubleParam(this, "threshold", "The threshold whether to " + 16 | "undersampling sample of a class", (x: Double) => x > 1) 17 | def setThreshold(value: Double): this.type = set(threshold, value) 18 | 19 | final val dependentColName = new Param[String](this, "dependentColName", "The column that " + 20 | "provide label values") 21 | def setDependentColName(value: String): this.type = set(dependentColName, value) 22 | 23 | final val primaryClass = new DoubleParam(this, "primaryClass", "primary class that to under " + 24 | "sampling") 25 | def setPrimaryClass(value: Double): this.type = set(primaryClass, value) 26 | } 27 | 28 | 29 | class OverSampling(override val uid: String) extends Transformer with OverSamplingParams { 30 | def this() = this(Identifiable.randomUID("OverSampling")) 31 | 32 | /** 33 | * Transforms the input dataset. 34 | */ 35 | override def transform(dataset: Dataset[_]): DataFrame = { 36 | val labelCountPair = dataset.groupBy($(dependentColName)).count().collect() 37 | 38 | val primaryClassCount = labelCountPair 39 | .filter{ case Row(label: Double, count: Long) => label == ${primaryClass}} 40 | .map(x => x.get(1)).headOption.getOrElse(-1L).asInstanceOf[Long] 41 | 42 | if (primaryClassCount == -1) throw new Exception("The label is not exist") 43 | 44 | val res = labelCountPair.zipWithIndex 45 | .map { 46 | case (Row(label: Double, count: Long), index: Int) => 47 | val ratio = primaryClassCount / count.toDouble 48 | 49 | /** 50 | * if ratio < threshold, only return samples of this label, 51 | * otherwise we sample the data from the samples of this label. 52 | * 53 | * The desired number of samples is : num = primaryClassCount * threshold 54 | * so the fraction of sample method is: num / count = ratio / threshold. 55 | * Because fraction > 1, the value of 'withReplacement' parameter must be true 56 | */ 57 | val df = if (ratio < ${threshold}) { 58 | dataset.filter(col($(dependentColName)) === label) 59 | } else { 60 | val desiredFraction = ratio / ${threshold} 61 | dataset.filter(col($(dependentColName)) === label) 62 | .sample(withReplacement = true, desiredFraction) 63 | } 64 | df.toDF() 65 | }.reduce(_ union _) 66 | 67 | res 68 | } 69 | 70 | override def copy(extra: ParamMap): Transformer = defaultCopy(extra) 71 | 72 | /** 73 | * :: DeveloperApi :: 74 | * 75 | * Check transform validity and derive the output schema from the input schema. 76 | * 77 | * Typical implementation should first conduct verification on schema change and parameter 78 | * validity, including complex parameter interaction checks. 
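  * This transformer samples rows without altering columns, so the input schema is returned
  * unchanged.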
79 | */ 80 | override def transformSchema(schema: StructType): StructType = { 81 | schema 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/knn_is/KNN_ISSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.knn_is 2 | 3 | import org.apache.spark.SparkFunSuite 4 | import org.apache.spark.ml.feature.LabeledPoint 5 | import org.apache.spark.ml.linalg.Vectors 6 | import org.apache.spark.ml.util.DefaultReadWriteTest 7 | import org.apache.spark.mllib.evaluation.MulticlassMetrics 8 | import org.apache.spark.mllib.util.MLlibTestSparkContext 9 | import org.apache.spark.rdd.RDD 10 | import org.apache.spark.sql.{Dataset, Row} 11 | 12 | import scala.util.Random 13 | 14 | class KNN_ISSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { 15 | @transient var dataset: Dataset[_] = _ 16 | 17 | override def beforeAll(): Unit = { 18 | super.beforeAll() 19 | dataset = spark.createDataFrame(KNN_ISSuite.generateKnnInput(1.0, 1.0, 20 | nPoints = 1000, seed = 42)) 21 | } 22 | 23 | test("knn: default params") { 24 | val knn_is = new KNN_ISClassifier() 25 | assert(knn_is.getLabelCol === "label") 26 | assert(knn_is.getFeaturesCol === "features") 27 | assert(knn_is.getPredictionCol === "prediction") 28 | assert(knn_is.getK == 1) 29 | assert(knn_is.getDistanceType == 1) 30 | assert(knn_is.getNumSamplesTest == 1) 31 | assert(knn_is.getNumClass == 1) 32 | assert(knn_is.getNumIter == 1) 33 | assert(knn_is.getInc == 0) 34 | assert(knn_is.getSubdel == 0) 35 | assert(knn_is.getTopdel == 0) 36 | } 37 | 38 | test("train"){ 39 | val knn_is = new KNN_ISClassifier() 40 | knn_is.fit(dataset) 41 | } 42 | 43 | test("transform: one iterationNum"){ 44 | val knn_is = new KNN_ISClassifier() 45 | .setNumClass(2) 46 | .setNumSamplesTest(dataset.count().toInt) 47 | .setK(5) 48 | 49 | val model = knn_is.fit(dataset) 50 | 51 | val results = model.transform(dataset) 52 | assert(results.count() == dataset.count()) 53 | 54 | val source = dataset.select("label").rdd.map{case Row(x: Double) => x} 55 | val res = results.select("prediction").rdd.map{case Row(x: Double) => x} 56 | 57 | val predictions = source.zip(res.asInstanceOf[RDD[Double]]) 58 | val metrics = new MulticlassMetrics(predictions) 59 | val precision = metrics.accuracy 60 | assert(precision == 0.64) 61 | } 62 | 63 | test("transform: more than one iterationNum"){ 64 | val knn_is = new KNN_ISClassifier() 65 | .setNumClass(2) 66 | .setNumSamplesTest(dataset.count().toInt) 67 | .setNumIter(3) 68 | .setK(5) 69 | 70 | val model = knn_is.fit(dataset) 71 | 72 | val results = model.transform(dataset) 73 | assert(results.count() == dataset.count()) 74 | 75 | val source = dataset.select("label") 76 | .rdd.map{case Row(x: Double) => x}.repartition(1) 77 | val res = results.select("prediction") 78 | .rdd.map{case Row(x: Double) => x}.repartition(1) 79 | 80 | val predictions = source.zip(res.asInstanceOf[RDD[Double]]) 81 | val metrics = new MulticlassMetrics(predictions) 82 | val precision = metrics.accuracy 83 | assert(precision == 0.648) 84 | } 85 | } 86 | 87 | object KNN_ISSuite { 88 | def generateKnnInput(offset: Double, 89 | scale: Double, 90 | nPoints: Int, 91 | seed: Int): Seq[LabeledPoint] = { 92 | val rnd = new Random(seed) 93 | val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian()) 94 | 95 | val y = (0 until nPoints).map { i => 96 | val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i)))) 97 | if 
(rnd.nextDouble() < p) 1.0 else 0.0 98 | } 99 | 100 | val testData = (0 until nPoints).map(i => LabeledPoint(y(i), Vectors.dense(Array(x1(i))))) 101 | testData 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/impl/BHTSNE.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne.impl 2 | 3 | import breeze.linalg._ 4 | import breeze.stats.distributions.Rand 5 | import org.apache.spark.ml.tsne.tree.SPTree 6 | import org.apache.spark.ml.tsne.{TSNEGradient, TSNEHelper, TSNEParam, X2P} 7 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 8 | import org.apache.spark.storage.StorageLevel 9 | import org.slf4j.LoggerFactory 10 | 11 | import scala.util.Random 12 | 13 | object BHTSNE { 14 | private def logger = LoggerFactory.getLogger(BHTSNE.getClass) 15 | 16 | def tsne( 17 | input: RowMatrix, 18 | noDims: Int = 2, 19 | maxIterations: Int = 1000, 20 | perplexity: Double = 30, 21 | theta: Double = 0.5, 22 | reportLoss: Int => Boolean = {i => i % 10 == 0}, 23 | callback: (Int, DenseMatrix[Double], Option[Double]) => Unit = {case _ => }, 24 | seed: Long = Random.nextLong() 25 | ): DenseMatrix[Double] = { 26 | if(input.rows.getStorageLevel == StorageLevel.NONE) { 27 | logger.warn("Input is not persisted and performance could be bad") 28 | } 29 | 30 | Rand.generator.setSeed(seed) 31 | 32 | val tsneParam = TSNEParam() 33 | import tsneParam._ 34 | 35 | val n = input.numRows().toInt 36 | val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian(0, 1)) :/ 1e4 37 | val iY = DenseMatrix.zeros[Double](n, noDims) 38 | val gains = DenseMatrix.ones[Double](n, noDims) 39 | 40 | // approximate p_{j|i} 41 | val p_ji = X2P(input, 1e-5, perplexity) 42 | val P = TSNEHelper.computeP(p_ji, n).glom() 43 | .map(rows => rows.map { 44 | case (i, data) => 45 | (i, data.map(_._1).toSeq, DenseVector(data.map(_._2 * exaggeration_factor).toArray)) 46 | }) 47 | .cache() 48 | 49 | var iteration = 1 50 | while(iteration <= maxIterations) { 51 | val bcY = P.context.broadcast(Y) 52 | val bcTree = P.context.broadcast(SPTree(Y)) 53 | 54 | val initialValue = (DenseMatrix.zeros[Double](n, noDims), DenseMatrix.zeros[Double](n, noDims), 0.0) 55 | val (posF, negF, sumQ) = P.treeAggregate(initialValue)( 56 | seqOp = (c, v) => { 57 | // c: (pos, neg, sumQ), v: Array[(i, Seq(j), vec(Distance))] 58 | TSNEGradient.computeEdgeForces(v, bcY.value, c._1) 59 | val q = TSNEGradient.computeNonEdgeForces(bcTree.value, bcY.value, theta, c._2, v.map(_._1): _*) 60 | (c._1, c._2, c._3 + q) 61 | }, 62 | combOp = (c1, c2) => { 63 | // c: (grad, loss) 64 | (c1._1 + c2._1, c1._2 + c2._2, c1._3 + c2._3) 65 | }) 66 | val dY: DenseMatrix[Double] = posF :- (negF :/ sumQ) 67 | 68 | TSNEHelper.update(Y, dY, iY, gains, iteration, tsneParam) 69 | 70 | if(reportLoss(iteration)) { 71 | val loss = P.treeAggregate(0.0)( 72 | seqOp = (c, v) => { 73 | TSNEGradient.computeLoss(v, bcY.value, sumQ) 74 | }, 75 | combOp = _ + _ 76 | ) 77 | logger.debug(s"Iteration $iteration finished with $loss") 78 | callback(iteration, Y.copy, Some(loss)) 79 | } else { 80 | logger.debug(s"Iteration $iteration finished") 81 | callback(iteration, Y.copy, None) 82 | } 83 | 84 | bcY.destroy() 85 | bcTree.destroy() 86 | 87 | //undo early exaggeration 88 | if(iteration == early_exaggeration) { 89 | P.foreach { 90 | rows => rows.foreach { 91 | case (_, _, vec) => vec.foreachPair { case (i, v) => vec.update(i, v / 
exaggeration_factor) } 92 | } 93 | } 94 | } 95 | 96 | iteration += 1 97 | } 98 | 99 | Y 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/util/SparkUtils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.util 2 | 3 | import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} 4 | import breeze.storage.Zero 5 | import org.apache.hadoop.fs.{FileSystem, Path} 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.deploy.SparkHadoopUtil 8 | import org.apache.spark.mllib.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV} 9 | 10 | import scala.language.implicitConversions 11 | import scala.reflect.ClassTag 12 | 13 | 14 | object SparkUtils { 15 | implicit def toBreeze(sv: SV): BV[Double] = { 16 | sv match { 17 | case SDV(data) => 18 | new BDV(data) 19 | case SSV(size, indices, values) => 20 | new BSV(indices, values, size) 21 | } 22 | } 23 | 24 | implicit def fromBreeze(breezeVector: BV[Double]): SV = { 25 | breezeVector match { 26 | case v: BDV[Double] => 27 | if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { 28 | new SDV(v.data) 29 | } else { 30 | new SDV(v.toArray) // Can't use underlying array directly, so make a new one 31 | } 32 | case v: BSV[Double] => 33 | if (v.index.length == v.used) { 34 | new SSV(v.length, v.index, v.data) 35 | } else { 36 | new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) 37 | } 38 | case v: BV[_] => 39 | sys.error("Unsupported Breeze vector type: " + v.getClass.getName) 40 | } 41 | } 42 | 43 | def toBreezeConv[T: ClassTag](sv: SV)(implicit num: Numeric[T]): BV[T] = { 44 | val zero = num.zero 45 | implicit val conv: Array[Double] => Array[T] = (data) => { 46 | data.map(ele => (zero match { 47 | case zero: Double => ele 48 | case zero: Float => ele.toFloat 49 | case zero: Int => ele.toInt 50 | case zero: Long => ele.toLong 51 | }).asInstanceOf[T]).array 52 | } 53 | sv match { 54 | case SDV(data) => 55 | new BDV[T](data) 56 | case SSV(size, indices, values) => 57 | new BSV[T](indices, values, size)(Zero[T](zero)) 58 | } 59 | } 60 | 61 | def fromBreezeConv[T: ClassTag](breezeVector: BV[T])(implicit num: Numeric[T]): SV = { 62 | implicit val conv: Array[T] => Array[Double] = (data) => { 63 | data.map(num.toDouble).array 64 | } 65 | breezeVector match { 66 | case v: BDV[T] => 67 | if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { 68 | new SDV(v.data) 69 | } else { 70 | new SDV(v.toArray) // Can't use underlying array directly, so make a new one 71 | } 72 | case v: BSV[T] => 73 | if (v.index.length == v.used) { 74 | new SSV(v.length, v.index, v.data) 75 | } else { 76 | new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) 77 | } 78 | case v: BV[T] => 79 | sys.error("Unsupported Breeze vector type: " + v.getClass.getName) 80 | } 81 | } 82 | 83 | def getFileSystem(conf: SparkConf, path: Path): FileSystem = { 84 | val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) 85 | if (sys.env.contains("HADOOP_CONF_DIR") || sys.env.contains("YARN_CONF_DIR")) { 86 | val hdfsConfPath = if (sys.env.get("HADOOP_CONF_DIR").isDefined) { 87 | sys.env.get("HADOOP_CONF_DIR").get + "/core-site.xml" 88 | } else { 89 | sys.env.get("YARN_CONF_DIR").get + "/core-site.xml" 90 | } 91 | hadoopConf.addResource(new Path(hdfsConfPath)) 92 | } 93 | path.getFileSystem(hadoopConf) 94 | } 95 | 96 | def deleteChkptDirs(conf: SparkConf, dirs: 
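// ---- Editor's sketch (illustrative, not part of SparkUtils): using the implicit Breeze conversions above ----
// A minimal sketch, assuming the implicits in this object are imported; it moves an MLlib vector into
// Breeze, does the arithmetic there, and lets fromBreeze turn the result back into an MLlib vector.
//   import org.apache.spark.ml.util.SparkUtils._
//   import org.apache.spark.mllib.linalg.{Vectors => SVectors, Vector => SV}
//   import breeze.linalg.{Vector => BV}
//   val sv: SV = SVectors.dense(1.0, 2.0, 3.0)
//   val bv: BV[Double] = sv          // implicit toBreeze
//   val doubled: SV = bv + bv        // Breeze addition, then implicit fromBreeze on the way back
// ---- end sketch ----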
Array[String]): Unit = { 97 | val fs = getFileSystem(conf, new Path(dirs(0))) 98 | dirs.foreach(dir => { 99 | fs.delete(new Path(dir), true) 100 | }) 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/ARGARCHSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.random.MersenneTwister 4 | import org.apache.spark.SparkFunSuite 5 | import org.apache.spark.ml.linalg.DenseVector 6 | import org.apache.spark.ml.timeseries.MatrixUtil 7 | import org.apache.spark.ml.util.DefaultReadWriteTest 8 | import org.apache.spark.mllib.util.MLlibTestSparkContext 9 | import org.apache.spark.mllib.util.TestingUtils._ 10 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 11 | import org.apache.spark.sql.{Row, _} 12 | 13 | /** 14 | * Created by endy on 16-12-22. 15 | */ 16 | class ARGARCHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest{ 17 | test("fit model") { 18 | val omega = 0.2 19 | val alpha = 0.3 20 | val beta = 0.5 21 | val genModel = new ARGARCHModel(0.0, 0.0, alpha, beta, omega) 22 | val rand = new MersenneTwister(5L) 23 | val n = 10000 24 | 25 | val ts = genModel.sample(n, rand) 26 | val data = genDf(ts) 27 | 28 | val model = new GARCH().fit(data) 29 | assert(model.omega - omega < .1) // TODO: we should be able to be more accurate 30 | assert(model.alpha - alpha < .02) 31 | assert(model.beta - beta < .02) 32 | } 33 | 34 | 35 | test("fit model 2") { 36 | val arr = Array[Double](0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 37 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 38 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 39 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 40 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 41 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 42 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 43 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 44 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 45 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 46 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 47 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 48 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 49 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 50 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 51 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 52 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 53 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 54 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 55 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 
0.0, -0.01, 0.00, -0.1) 56 | val ts = genDf(arr) 57 | 58 | val model = new ARGARCH().fit(ts) 59 | 60 | assert(model.alpha ~== -0.106 absTol 0.001) 61 | assert(model.beta ~== -1.012 absTol 0.001) 62 | assert(model.omega ~== 0.190 absTol 0.01) 63 | assert(model.c ~== -0.0355 absTol 0.01) 64 | assert(model.phi ~== -0.339 absTol 0.01) 65 | } 66 | 67 | test("standardize and filter") { 68 | val model = new ARGARCHModel(40.0, .4, .2, .3, .4) 69 | val rand = new MersenneTwister(5L) 70 | val n = 10000 71 | 72 | val ts = new DenseVector(model.sample(n, rand)) 73 | 74 | // de-heteroskedasticize 75 | val standardized = model.removeTimeDependentEffects(ts) 76 | // heteroskedasticize 77 | val filtered = model.addTimeDependentEffects(standardized) 78 | 79 | assert((MatrixUtil.toBreeze(filtered) - MatrixUtil.toBreeze(ts)).toArray.forall(math.abs(_) < 80 | .001)) 81 | } 82 | 83 | def genDf(array: Array[Double]): DataFrame = { 84 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 85 | DoubleType))) 86 | 87 | val rdd = spark.sparkContext.parallelize( 88 | array.zipWithIndex.map(x => Row(x._2.formatted("%010d"), x._1))) 89 | 90 | spark.createDataFrame(rdd, schema) 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/util/LoaderUtils.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.util 2 | 3 | import org.apache.hadoop.fs._ 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.sql.catalyst.ScalaReflection 7 | import org.apache.spark.sql.types.{DataType, StructField, StructType} 8 | import org.json4s._ 9 | import org.json4s.jackson.JsonMethods._ 10 | 11 | import scala.reflect.ClassTag 12 | import scala.reflect.runtime.universe.TypeTag 13 | 14 | // copy form Spark MLlib 15 | /** 16 | * Helper methods for loading models from files. 17 | */ 18 | private[ml] object LoaderUtils { 19 | 20 | /** Returns URI for path/data using the Hadoop filesystem */ 21 | def dataPath(path: String): String = new Path(path, "data").toUri.toString 22 | 23 | /** Returns URI for path/metadata using the Hadoop filesystem */ 24 | def metadataPath(path: String): String = new Path(path, "metadata").toUri.toString 25 | 26 | /** 27 | * Check the schema of loaded model data. 28 | * 29 | * This checks every field in the expected schema to make sure that a field with the same 30 | * name and DataType appears in the loaded schema. Note that this does NOT check metadata 31 | * or containsNull. 32 | * 33 | * @param loadedSchema Schema for model data loaded from file. 34 | * @tparam Data Expected data type from which an expected schema can be derived. 35 | */ 36 | def checkSchema[Data: TypeTag](loadedSchema: StructType): Unit = { 37 | // Check schema explicitly since erasure makes it hard to use match-case for checking. 38 | val expectedFields: Array[StructField] = 39 | ScalaReflection.schemaFor[Data].dataType.asInstanceOf[StructType].fields 40 | val loadedFields: Map[String, DataType] = 41 | loadedSchema.map(field => field.name -> field.dataType).toMap 42 | expectedFields.foreach { field => 43 | assert(loadedFields.contains(field.name), s"Unable to parse model data." + 44 | s" Expected field with name ${field.name} was missing in loaded schema:" + 45 | s" ${loadedFields.mkString(", ")}") 46 | } 47 | } 48 | 49 | /** 50 | * Load metadata from the given path. 
51 | * @return (class name, version, metadata) 52 | */ 53 | def loadMetadata(sc: SparkContext, path: String): (String, String, JValue) = { 54 | implicit val formats = DefaultFormats 55 | val metadata = parse(sc.textFile(metadataPath(path)).first()) 56 | val clazz = (metadata \ "class").extract[String] 57 | val version = (metadata \ "version").extract[String] 58 | (clazz, version, metadata) 59 | } 60 | 61 | /** 62 | * Save an RDD to one HDFS file 63 | * @param sc SparkContext 64 | * @param rdd The RDD to save 65 | * @param outPathStr The HDFS file path of String 66 | * @param header Header line of HDFS file, used for storing some metadata 67 | * @param mapEle The function mapping each element of RDD to a line of String 68 | */ 69 | def RDD2HDFSFile[T: ClassTag](sc: SparkContext, 70 | rdd: RDD[T], 71 | outPathStr: String, 72 | header: => String, 73 | mapEle: T => String): Unit = { 74 | val hdpconf = sc.hadoopConfiguration 75 | val fs = FileSystem.get(hdpconf) 76 | val outPath = new Path(outPathStr) 77 | if (fs.exists(outPath)) { 78 | throw new InvalidPathException(s"Output path $outPathStr already exists.") 79 | } 80 | val fout = fs.create(outPath) 81 | fout.write(header.getBytes) 82 | fout.write("\n".getBytes) 83 | rdd.toLocalIterator.foreach(e => { 84 | fout.write(mapEle(e).getBytes) 85 | fout.write("\n".getBytes) 86 | }) 87 | fout.close() 88 | } 89 | 90 | /** 91 | * Load an RDD from one HDFS file 92 | * @param sc SparkContext 93 | * @param inPathStr The HDFS file path of String 94 | * @param init_f The function used for initialization after reading header 95 | * @param lineParser The function parses each line in HDFS file to an element of RDD 96 | */ 97 | def HDFSFile2RDD[T: ClassTag, M: ClassTag](sc: SparkContext, 98 | inPathStr: String, 99 | init_f: String => M, 100 | lineParser: (M, String) => T): (M, RDD[T]) = { 101 | val rawrdd = sc.textFile(inPathStr) 102 | val header = rawrdd.first() 103 | val meta = init_f(header) 104 | val rdd: RDD[T] = rawrdd.mapPartitions(iter => { 105 | val first = iter.next() 106 | if (first == header) { 107 | iter 108 | } else { 109 | Iterator.single(first) ++ iter 110 | } 111 | }.map(lineParser(meta, _))) 112 | (meta, rdd) 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/Lag.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries 2 | 3 | import org.apache.spark.ml.linalg.{DenseMatrix, Matrix, Vector} 4 | 5 | /** 6 | * Created by endy on 16-12-16. 7 | */ 8 | object Lag { 9 | /** 10 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and 11 | * columns so that every element in the matrix is full. 12 | */ 13 | def lagMatTrimBoth(x: Array[Double], maxLag: Int): Array[Array[Double]] = { 14 | lagMatTrimBoth(x, maxLag, false) 15 | } 16 | 17 | /** 18 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and 19 | * columns so that every element in the matrix is full. 
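// ---- Editor's sketch (illustrative): round-tripping an RDD with the LoaderUtils helpers shown earlier ----
// A minimal sketch, assuming an existing SparkContext `sc`, a writable path, and calling code that lives
// inside the org.apache.spark.ml package (LoaderUtils is private[ml]); the record type and path are made up.
//   val rdd = sc.parallelize(Seq(1 -> 0.5, 2 -> 1.5))
//   LoaderUtils.RDD2HDFSFile[(Int, Double)](sc, rdd, "/tmp/demo-model", header = "id,value",
//     mapEle = { case (id, v) => s"$id,$v" })
//   val (header, reloaded) = LoaderUtils.HDFSFile2RDD[(Int, Double), String](sc, "/tmp/demo-model",
//     init_f = identity, lineParser = (_, line) => { val Array(id, v) = line.split(","); (id.toInt, v.toDouble) })
// ---- end sketch ----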
20 | */ 21 | def lagMatTrimBoth(x: Array[Double], maxLag: Int, includeOriginal: Boolean) 22 | : Array[Array[Double]] = { 23 | val numObservations = x.length 24 | val numRows = numObservations - maxLag 25 | val numCols = maxLag + (if (includeOriginal) 1 else 0) 26 | val lagMat = Array.ofDim[Double](numRows, numCols) 27 | 28 | val initialLag = if (includeOriginal) 0 else 1 29 | 30 | for (r <- 0 until numRows) { 31 | for (c <- initialLag to maxLag) { 32 | lagMat(r)(c - initialLag) = x(r + maxLag - c) 33 | } 34 | } 35 | lagMat 36 | } 37 | 38 | /** 39 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and 40 | * columns so that every element in the matrix is full. 41 | */ 42 | def lagMatTrimBoth(x: Vector, maxLag: Int): Matrix = { 43 | lagMatTrimBoth(x, maxLag, false) 44 | } 45 | 46 | /** 47 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and 48 | * columns so that every element in the matrix is full. 49 | */ 50 | def lagMatTrimBoth(x: Vector, maxLag: Int, includeOriginal: Boolean): Matrix = { 51 | val numObservations = x.size 52 | val numRows = numObservations - maxLag 53 | val numCols = maxLag + (if (includeOriginal) 1 else 0) 54 | val lagMat = new DenseMatrix(numRows, numCols, new Array[Double](numRows * numCols)) 55 | 56 | lagMatTrimBoth(x, lagMat, maxLag, includeOriginal, 0) 57 | lagMat 58 | } 59 | 60 | /** 61 | * @param x Vector to be lagged. 62 | * @param outputMat Matrix to place the lagged vector into, as a column. 63 | * @param numLags The number of times to lag the vector. E.g. if this is 2, the output matrix 64 | * will include one column that is the vector lagged by 1, and another column to 65 | * the right that is the vector lagged by 2. 66 | * @param includeOriginal Whether to place the original time series into the matrix as well. 67 | * @param colOffset The offset to start placing columns in the output mat. 68 | */ 69 | def lagMatTrimBoth( 70 | x: Vector, 71 | outputMat: DenseMatrix, 72 | numLags: Int, 73 | includeOriginal: Boolean, 74 | colOffset: Int): Unit = { 75 | val numRows = outputMat.numRows 76 | val numTruncatedRows = x.size - numRows 77 | 78 | val initialLag = if (includeOriginal) 0 else 1 79 | 80 | val breezeOutputMat = MatrixUtil.toBreeze(outputMat) 81 | for (r <- 0 until numRows) { 82 | for (lag <- initialLag to numLags) { 83 | val c = colOffset + lag - initialLag 84 | breezeOutputMat(r, c) = x(r + numTruncatedRows - lag) 85 | } 86 | } 87 | } 88 | 89 | /** 90 | * Creates a lagged matrix from a current matrix (represented in row-array form). 91 | * Lags each column the appropriate amount of times and then concatenates the columns. 92 | * So given a matrix [a b c], where a/b/c are column vectors, and calling with lag of 2, 93 | * becomes a matrix of the form [a_-1 a_-2 b_-1 b_-2 c_-1 c_-2] 94 | */ 95 | def lagMatTrimBoth( 96 | x: Array[Array[Double]], 97 | maxLag: Int, 98 | includeOriginal: Boolean): Array[Array[Double]] = { 99 | val xt = x.transpose 100 | // one matrix per column, consisting of all its lags 101 | val matrices = for (col <- xt) yield { 102 | Lag.lagMatTrimBoth(col, maxLag, includeOriginal) 103 | } 104 | // merge the matrices into 1 matrix by concatenating col-wise 105 | matrices.transpose.map(_.reduceLeft(_ ++ _)) 106 | } 107 | 108 | /** 109 | * Creates a lagged matrix from a current matrix (represented in row-array form). 110 | * Lags each column the appropriate amount of times and then concatenates the columns. 
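// ---- Editor's note: a small worked example of lagMatTrimBoth ----
// For x = [1, 2, 3, 4, 5] and maxLag = 2 the first maxLag observations are trimmed, leaving
// 5 - 2 = 3 rows; row r holds the lagged values that explain observation x(r + maxLag).
//   lagMatTrimBoth(Array(1.0, 2.0, 3.0, 4.0, 5.0), 2)         // [[2,1], [3,2], [4,3]]       (lag-1, lag-2)
//   lagMatTrimBoth(Array(1.0, 2.0, 3.0, 4.0, 5.0), 2, true)   // [[3,2,1], [4,3,2], [5,4,3]] (original, lag-1, lag-2)
// ---- end note ----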
111 | * So given a matrix [a b c], where a/b/c are column vectors, and calling with lag of 2, 112 | * becomes a matrix of the form [a_-1 a_-2 b_-1 b_-2 c_-1 c_-2] 113 | * The original time series is not included in the matrix. 114 | */ 115 | def lagMatTrimBoth(x: Array[Array[Double]], maxLag: Int): Array[Array[Double]] = { 116 | lagMatTrimBoth(x, maxLag, false) 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/impl/LBFGSTSNE.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne.impl 2 | 3 | import breeze.linalg._ 4 | import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS} 5 | import breeze.stats.distributions.Rand 6 | import org.apache.spark.ml.tsne.{TSNEGradient, X2P} 7 | import org.apache.spark.mllib.linalg.distributed.RowMatrix 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.storage.StorageLevel 10 | import org.slf4j.LoggerFactory 11 | 12 | import scala.util.Random 13 | 14 | /** 15 | * TODO: This doesn't work at all (yet or ever). 16 | */ 17 | object LBFGSTSNE { 18 | private def logger = LoggerFactory.getLogger(LBFGSTSNE.getClass) 19 | 20 | def tsne( 21 | input: RowMatrix, 22 | noDims: Int = 2, 23 | maxNumIterations: Int = 1000, 24 | numCorrections: Int = 10, 25 | convergenceTol: Double = 1e-4, 26 | perplexity: Double = 30, 27 | seed: Long = Random.nextLong()): DenseMatrix[Double] = { 28 | if(input.rows.getStorageLevel == StorageLevel.NONE) { 29 | logger.warn("Input is not persisted and performance could be bad") 30 | } 31 | 32 | Rand.generator.setSeed(seed) 33 | 34 | val n = input.numRows().toInt 35 | val early_exaggeration = 100 36 | val t_momentum = 250 37 | val initial_momentum = 0.5 38 | val final_momentum = 0.8 39 | val eta = 500.0 40 | val min_gain = 0.01 41 | 42 | val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian) //:* .0001 43 | val iY = DenseMatrix.zeros[Double](n, noDims) 44 | val gains = DenseMatrix.ones[Double](n, noDims) 45 | 46 | // approximate p_{j|i} 47 | val p_ji = X2P(input, 1e-5, perplexity) 48 | //logInfo(p_ji.toRowMatrix().rows.collect().toList.toString) 49 | // p_ij = (p_{i|j} + p_{j|i}) / 2n 50 | val P = p_ji.transpose().entries.union(p_ji.entries) 51 | .map(e => ((e.i.toInt, e.j.toInt), e.value)) 52 | .reduceByKey(_ + _) 53 | .map{case ((i, j), v) => (i, (j, v / 2 / n)) } 54 | .groupByKey() 55 | .glom() 56 | .cache() 57 | 58 | var iteration = 1 59 | 60 | { 61 | val costFun = new CostFun(P, n, noDims, true) 62 | val lbfgs = new LBFGS[DenseVector[Double]](maxNumIterations, numCorrections, convergenceTol) 63 | val states = lbfgs.iterations(new CachedDiffFunction(costFun), new DenseVector(Y.data)) 64 | 65 | while (states.hasNext) { 66 | val state = states.next() 67 | val loss = state.value 68 | //logInfo(state.convergedReason.get.toString) 69 | logger.debug(s"Iteration $iteration finished with $loss") 70 | 71 | Y := asDenseMatrix(state.x, n, noDims) 72 | //subscriber.onNext((iteration, Y.copy, Some(loss))) 73 | iteration += 1 74 | } 75 | } 76 | 77 | { 78 | val costFun = new CostFun(P, n, noDims, false) 79 | val lbfgs = new LBFGS[DenseVector[Double]](maxNumIterations, numCorrections, convergenceTol) 80 | val states = lbfgs.iterations(new CachedDiffFunction(costFun), new DenseVector(Y.data)) 81 | 82 | while (states.hasNext) { 83 | val state = states.next() 84 | val loss = state.value 85 | //logInfo(state.convergedReason.get.toString) 86 | 
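// ---- Editor's note: the objective shared by the t-SNE implementations ----
// X2P returns conditional affinities p_{j|i}; the P built above symmetrises them as
// p_ij = (p_{i|j} + p_{j|i}) / (2n). SimpleTSNE, BHTSNE and this L-BFGS variant all minimise the
// KL divergence C = sum_{i != j} p_ij * log(p_ij / q_ij), with the Student-t embedding affinity
// q_ij = (1 + ||y_i - y_j||^2)^-1 / sum_{k != l} (1 + ||y_k - y_l||^2)^-1.
// ---- end note ----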
logger.debug(s"Iteration $iteration finished with $loss") 87 | 88 | Y := asDenseMatrix(state.x, n, noDims) 89 | //subscriber.onNext((iteration, Y.copy, Some(loss))) 90 | iteration += 1 91 | } 92 | } 93 | 94 | Y 95 | } 96 | 97 | private[this] def asDenseMatrix(v: DenseVector[Double], n: Int, noDims: Int) = { 98 | v.asDenseMatrix.reshape(n, noDims) 99 | } 100 | 101 | private class CostFun( 102 | P: RDD[Array[(Int, Iterable[(Int, Double)])]], 103 | n: Int, 104 | noDims: Int, 105 | exaggeration: Boolean) extends DiffFunction[DenseVector[Double]] { 106 | 107 | override def calculate(weights: DenseVector[Double]): (Double, DenseVector[Double]) = { 108 | val bcY = P.context.broadcast(asDenseMatrix(weights, n, noDims)) 109 | val bcExaggeration = P.context.broadcast(exaggeration) 110 | 111 | val numerator = P.map{ arr => TSNEGradient.computeNumerator(bcY.value, arr.map(_._1): _*) }.cache() 112 | val bcNumerator = P.context.broadcast({ 113 | numerator.treeAggregate(0.0)(seqOp = (x, v) => x + sum(v), combOp = _ + _) 114 | }) 115 | 116 | val (dY, loss) = P.zip(numerator).treeAggregate((DenseMatrix.zeros[Double](n, noDims), 0.0))( 117 | seqOp = (c, v) => { 118 | // c: (grad, loss), v: (Array[(i, Iterable(j, Distance))], numerator) 119 | // TODO: See if we can include early_exaggeration 120 | val l = TSNEGradient.compute(v._1, bcY.value, v._2, bcNumerator.value, c._1, bcExaggeration.value) 121 | (c._1, c._2 + l) 122 | }, 123 | combOp = (c1, c2) => { 124 | // c: (grad, loss) 125 | (c1._1 += c2._1, c1._2 + c2._2) 126 | }) 127 | 128 | numerator.unpersist() 129 | 130 | (loss, new DenseVector(dY.data)) 131 | } 132 | } 133 | } -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | io.enme 4 | enme 5 | 1.0 6 | 7 | 8 | 2.11.8 9 | 2.2.0 10 | 2.11 11 | 0.4.0 12 | 13 | 14 | 15 | 16 | org.scala-lang 17 | scala-library 18 | ${scala.version} 19 | 20 | 21 | org.apache.spark 22 | spark-mllib_${scala.binary.version} 23 | ${spark.version} 24 | 25 | 26 | org.apache.spark 27 | spark-mllib_${scala.binary.version} 28 | ${spark.version} 29 | test-jar 30 | test 31 | 32 | 33 | org.apache.spark 34 | spark-core_${scala.binary.version} 35 | ${spark.version} 36 | test-jar 37 | test 38 | 39 | 40 | com.meetup 41 | archery_${scala.binary.version} 42 | ${archery.version} 43 | 44 | 45 | 46 | 47 | 48 | 49 | net.alchim31.maven 50 | scala-maven-plugin 51 | 3.2.1 52 | 53 | 54 | org.apache.maven.plugins 55 | maven-compiler-plugin 56 | 2.0.2 57 | 58 | 1.7 59 | 1.7 60 | utf8 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | org.codehaus.mojo 69 | build-helper-maven-plugin 70 | 1.7 71 | 72 | 73 | add-source 74 | generate-sources 75 | 76 | add-source 77 | 78 | 79 | 80 | src/main/java 81 | 82 | 83 | 84 | 85 | 86 | 87 | net.alchim31.maven 88 | scala-maven-plugin 89 | 3.2.0 90 | 91 | 92 | compile-scala-first 93 | process-resources 94 | 95 | add-source 96 | compile 97 | 98 | 99 | 100 | test-compile-scala 101 | process-test-resources 102 | 103 | add-source 104 | testCompile 105 | 106 | 107 | 108 | 109 | ${scala.version} 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | org.scala-tools 118 | maven-scala-plugin 119 | 120 | ${scala.version} 121 | 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/models/Autoregression.scala: -------------------------------------------------------------------------------- 1 | package 
org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression 4 | import org.apache.spark.ml.{Estimator, Model} 5 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 6 | import org.apache.spark.ml.param.{Param, ParamMap, Params} 7 | import org.apache.spark.ml.timeseries.{Lag, MatrixUtil} 8 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams 9 | import org.apache.spark.ml.util.Identifiable 10 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 11 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType} 12 | 13 | /** 14 | * Created by endy on 16-12-16. 15 | */ 16 | 17 | trait AutoregressionParams extends TimeSeriesParams { 18 | 19 | final val maxLag = new Param[Int](this, "maxLag", "max lag") 20 | def setMaxLag(value: Int): this.type = set(maxLag, value) 21 | 22 | final val noIntercept = new Param[Boolean](this, "noIntercept", "no intercept") 23 | def setNoIntercept(value: Boolean): this.type = set(noIntercept, value) 24 | } 25 | 26 | 27 | class Autoregression(override val uid: String) 28 | extends Estimator[ARModel] with AutoregressionParams{ 29 | 30 | def this() = this(Identifiable.randomUID("Autoregression")) 31 | 32 | setDefault(noIntercept -> false, maxLag -> 1, timeCol -> "time", 33 | timeSeriesCol -> "timeseries") 34 | /** 35 | * Fits a model to the input data. 36 | */ 37 | override def fit(dataset: Dataset[_]): ARModel = { 38 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 39 | case Row(time: String, value: Double) => (time, value) 40 | }.sortByKey().collect() 41 | 42 | val dataVector = Vectors.dense(data.map(x => x._2)) 43 | 44 | // Make left hand side 45 | val Y = MatrixUtil.toBreeze(dataVector)(${maxLag} until dataVector.size) 46 | // Make lagged right hand side 47 | val X = Lag.lagMatTrimBoth(dataVector, ${maxLag}) 48 | 49 | val regression = new OLSMultipleLinearRegression() 50 | regression.setNoIntercept(${noIntercept}) // drop intercept in regression 51 | regression.newSampleData(Y.toArray, MatrixUtil.matToRowArrs(X)) 52 | val params = regression.estimateRegressionParameters() 53 | val (c, coeffs) = if (${noIntercept}) (0.0, params) else (params.head, params.tail) 54 | 55 | new ARModel(c, coeffs) 56 | .setTimeCol(${timeCol}) 57 | .setTimeSeriesCol(${timeSeriesCol}) 58 | } 59 | 60 | override def copy(extra: ParamMap): Estimator[ARModel] = defaultCopy(extra) 61 | 62 | /** 63 | * :: DeveloperApi :: 64 | * 65 | * Check transform validity and derive the output schema from the input schema. 66 | * 67 | * Typical implementation should first conduct verification on schema change and parameter 68 | * validity, including complex parameter interaction checks. 69 | */ 70 | override def transformSchema(schema: StructType): StructType = { 71 | schema 72 | } 73 | } 74 | 75 | class ARModel(override val uid: String, val c: Double, val coefficients: Array[Double]) extends 76 | Model[ARModel] with AutoregressionParams { 77 | 78 | def this(c: Double, coefficients: Array[Double]) = this(Identifiable.randomUID("ARModel"), c, 79 | coefficients) 80 | 81 | /** 82 | * Transforms the input dataset. 
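// ---- Editor's sketch (illustrative, not part of this file): fitting the Autoregression estimator above ----
// A minimal sketch, assuming an existing SparkSession `spark`; it follows this package's input
// convention of a lexicographically sortable string "time" column and a double "timeseries" column.
//   import org.apache.spark.sql.Row
//   import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
//   val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", DoubleType)))
//   val values = Array(1.0, 1.2, 0.9, 1.1, 1.0, 0.8, 1.3, 1.05)
//   val rows = values.zipWithIndex.map { case (v, i) => Row(i.formatted("%010d"), v) }
//   val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
//   val arModel = new Autoregression().setMaxLag(2).fit(df)   // OLS of y_t on y_{t-1}, y_{t-2}
//   println(s"c = ${arModel.c}, coefficients = ${arModel.coefficients.mkString(", ")}")
// ---- end sketch ----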
83 | */ 84 | override def transform(dataset: Dataset[_]): DataFrame = { 85 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 86 | case Row(time: String, value: Double) => (time, value) 87 | }.sortByKey().collect() 88 | .map(x => x._2) 89 | 90 | val dataVector = Vectors.dense(data) 91 | 92 | val dest = addTimeDependentEffects(dataVector) 93 | 94 | val resRDD = dataset.sparkSession.sparkContext.parallelize(dest.toArray.map(x => Row(x))) 95 | 96 | val structType = transformSchema(dataset.schema) 97 | 98 | dataset.sparkSession.createDataFrame(resRDD, structType) 99 | } 100 | 101 | def removeTimeDependentEffects(ts: Vector): Vector = { 102 | val dest = new Array[Double](ts.size) 103 | var i = 0 104 | while (i < ts.size) { 105 | dest(i) = ts(i) - c 106 | var j = 0 107 | while (j < coefficients.length && i - j - 1 >= 0) { 108 | dest(i) -= ts(i - j - 1) * coefficients(j) 109 | j += 1 110 | } 111 | i += 1 112 | } 113 | new DenseVector(dest) 114 | } 115 | 116 | def addTimeDependentEffects(ts: Vector): Vector = { 117 | val dest = new Array[Double](ts.size) 118 | var i = 0 119 | while (i < ts.size) { 120 | dest(i) = c + ts(i) 121 | var j = 0 122 | while (j < coefficients.length && i - j - 1 >= 0) { 123 | dest(i) += dest(i - j - 1) * coefficients(j) 124 | j += 1 125 | } 126 | i += 1 127 | } 128 | new DenseVector(dest) 129 | } 130 | 131 | /** 132 | * :: DeveloperApi :: 133 | * 134 | * Check transform validity and derive the output schema from the input schema. 135 | * 136 | * Typical implementation should first conduct verification on schema change and parameter 137 | * validity, including complex parameter interaction checks. 138 | */ 139 | override def transformSchema(schema: StructType): StructType = { 140 | StructType(Array(StructField("Autoregression", DoubleType))) 141 | 142 | } 143 | 144 | override def copy(extra: ParamMap): ARModel = defaultCopy(extra) 145 | 146 | } 147 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/fm/FMModel.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.fm 2 | 3 | import org.apache.spark.ml.fm.FM._ 4 | import org.apache.spark.ml.util.LoaderUtils 5 | import org.apache.spark.ml.util.SparkUtils._ 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, RegressionMetrics} 8 | import org.apache.spark.mllib.linalg.{Vector => SV} 9 | import org.apache.spark.mllib.regression.LabeledPoint 10 | import org.apache.spark.mllib.util.{Loader, Saveable} 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql.{Row, SQLContext} 13 | import org.apache.spark.storage.StorageLevel 14 | import org.json4s.DefaultFormats 15 | import org.json4s.JsonDSL._ 16 | import org.json4s.jackson.JsonMethods._ 17 | 18 | class FMModel( 19 | val k: Int, 20 | val intercept: ED, 21 | val classification: Boolean, 22 | val factors: RDD[(Long, VD)]) extends Serializable with Saveable { 23 | def predict(data: RDD[(Long, SV)]): RDD[(Long, ED)] = { 24 | data.flatMap { case (sampleId, features) => 25 | features.activeIterator.filter(_._2 != 0.0).map { 26 | case (featureId, value) => 27 | (featureId.toLong, (sampleId, value)) 28 | } 29 | }.join(factors).map { case (featureId, ((sampleId, x), w)) => 30 | (sampleId, forwardInterval(k, x, w)) 31 | }.reduceByKey(reduceInterval).map { case (sampleId, arr) => 32 | var result = predictInterval(k, intercept, arr) 33 | if (classification) { 34 | result 
= 1.0 / (1.0 + math.exp(-result)) 35 | } 36 | (sampleId, result) 37 | } 38 | } 39 | 40 | def loss(data: RDD[(Long, LabeledPoint)]): Double = { 41 | // val minTarget = data.map(_._2.label).min() 42 | // val maxTarget = data.map(_._2.label).max() 43 | val perd = predict(data.map(t => (t._1, t._2.features))) 44 | val label = data.map(t => (t._1, t._2.label)) 45 | val scoreAndLabels = label.join(perd).map { case (_, (label, score)) => 46 | // var r = Math.max(score, minTarget) 47 | // r = Math.min(r, maxTarget) 48 | // pow(l - r, 2) 49 | (score, label) 50 | } 51 | scoreAndLabels.persist(StorageLevel.MEMORY_AND_DISK) 52 | val ret = if (classification) auc(scoreAndLabels) else rmse(scoreAndLabels) 53 | scoreAndLabels.unpersist(blocking = false) 54 | ret 55 | } 56 | 57 | def rmse(scoreAndLabels: RDD[(Double, Double)]): Double = { 58 | val metrics = new RegressionMetrics(scoreAndLabels) 59 | metrics.rootMeanSquaredError 60 | } 61 | 62 | def auc(scoreAndLabels: RDD[(Double, Double)]): Double = { 63 | val metrics = new BinaryClassificationMetrics(scoreAndLabels) 64 | metrics.areaUnderROC() 65 | } 66 | 67 | override def save(sc: SparkContext, path: String): Unit = { 68 | FMModel.SaveLoadV1_0.save(sc, path, k, intercept, classification, factors) 69 | } 70 | 71 | override protected def formatVersion: String = FMModel.SaveLoadV1_0.formatVersionV1_0 72 | } 73 | 74 | object FMModel extends Loader[FMModel] { 75 | 76 | override def load(sc: SparkContext, path: String): FMModel = { 77 | val (loadedClassName, version, metadata) = LoaderUtils.loadMetadata(sc, path) 78 | val versionV1_0 = SaveLoadV1_0.formatVersionV1_0 79 | val classNameV1_0 = SaveLoadV1_0.classNameV1_0 80 | if (loadedClassName == classNameV1_0 && version == versionV1_0) { 81 | implicit val formats = DefaultFormats 82 | val classification = (metadata \ "classification").extract[Boolean] 83 | val intercept = (metadata \ "intercept").extract[Double] 84 | val k = (metadata \ "k").extract[Int] 85 | val dataPath = LoaderUtils.dataPath(path) 86 | val sqlContext = new SQLContext(sc) 87 | val dataRDD = sqlContext.read.parquet(dataPath) 88 | val dataArray = dataRDD.select("featureId", "factors").take(1) 89 | assert(dataArray.length == 1, s"Unable to load $loadedClassName data from: $dataPath") 90 | val data = dataArray(0) 91 | assert(data.size == 2, s"Unable to load $loadedClassName data from: $dataPath") 92 | val factors = dataRDD.rdd.map { 93 | case Row(featureId: Long, factors: Seq[Double]) => 94 | (featureId, factors.toArray) 95 | } 96 | new FMModel(k, intercept, classification, factors) 97 | } else { 98 | throw new Exception( 99 | s"FMModel.load did not recognize model with (className, format version):" + 100 | s"($loadedClassName, $version). Supported:\n" + 101 | s" ($classNameV1_0, 1.0)") 102 | } 103 | 104 | } 105 | 106 | private object SaveLoadV1_0 { 107 | val formatVersionV1_0 = "1.0" 108 | val classNameV1_0 = "com.github.cloudml.zen.ml.recommendation.FMModel" 109 | 110 | def save( 111 | sc: SparkContext, 112 | path: String, 113 | k: Int, 114 | intercept: Double, 115 | classification: Boolean, 116 | factors: RDD[(Long, Array[Double])]): Unit = { 117 | val metadata = compact(render 118 | (("class" -> classNameV1_0) ~ ("version" -> formatVersionV1_0) ~ 119 | ("k" -> k) ~ ("intercept" -> intercept) ~ ("classification" -> classification))) 120 | sc.parallelize(Seq(metadata), 1).saveAsTextFile(LoaderUtils.metadataPath(path)) 121 | 122 | val sqlContext = new SQLContext(sc) 123 | import sqlContext.implicits._ 124 | // Create Parquet data. 
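// ---- Editor's sketch (illustrative, not part of FMModel): scoring and persisting a trained model ----
// A minimal sketch, assuming `model: FMModel` came from the FM trainer in this package and
// `samples: RDD[(Long, org.apache.spark.mllib.linalg.Vector)]` pairs a sample id with its features;
// all names and paths are illustrative.
//   val scores = model.predict(samples)               // RDD[(sampleId, score)], sigmoid-squashed when classification = true
//   model.save(sc, "/tmp/fm-model")                   // metadata JSON plus a Parquet table of per-feature factors
//   val reloaded = FMModel.load(sc, "/tmp/fm-model")  // rebuilds the model from that layout
// ---- end sketch ----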
125 | factors.toDF("featureId", "factors").write.parquet(LoaderUtils.dataPath(path)) 126 | } 127 | } 128 | 129 | } 130 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/UnivariateTimeSeriesSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries 2 | 3 | import org.apache.commons.math3.random.MersenneTwister 4 | import org.apache.spark.SparkFunSuite 5 | import org.apache.spark.ml.linalg.{DenseVector, Matrices, Vectors} 6 | import org.apache.spark.ml.util.DefaultReadWriteTest 7 | import org.apache.spark.mllib.util.MLlibTestSparkContext 8 | import org.apache.spark.mllib.util.TestingUtils._ 9 | 10 | 11 | /** 12 | * Created by endy on 16-12-21. 13 | */ 14 | class UnivariateTimeSeriesSuite extends SparkFunSuite with MLlibTestSparkContext 15 | with DefaultReadWriteTest { 16 | 17 | test("lagIncludeOriginalsTrue") { 18 | val lagMatrix = UnivariateTimeSeries.lag(Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0), 2, true) 19 | assert(lagMatrix === Matrices.dense(3, 3, Array(3.0, 4.0, 5.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0))) 20 | } 21 | 22 | test("lagIncludeOriginalsFalse") { 23 | val lagMatrix = UnivariateTimeSeries.lag(Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0), 2, false) 24 | assert(lagMatrix == Matrices.dense(3, 2, Array(2.0, 3.0, 4.0, 1.0, 2.0, 3.0))) 25 | } 26 | 27 | test("autocorr") { 28 | val rand = new MersenneTwister(5L) 29 | val iidAutocorr = UnivariateTimeSeries.autocorr(Array.fill(10000)(rand.nextDouble * 5.0), 3) 30 | iidAutocorr.foreach(x => assert(math.abs(x) < .03)) 31 | } 32 | 33 | test("upsampling") { 34 | // replicating upsampling examples 35 | // from http://www.mathworks.com/help/signal/ref/upsample.html?searchHighlight=upsample 36 | val y = new DenseVector(Array(1.0, 2.0, 3.0, 4.0)) 37 | val yUp1 = UnivariateTimeSeries.upsample(y, 3, useZero = true).toArray 38 | assert(yUp1 === Array(1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0, 0.0, 0.0)) 39 | 40 | val yUp2 = UnivariateTimeSeries.upsample(y, 3, useZero = true, phase = 2).toArray 41 | assert(yUp2 === Array(0.0, 0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0)) 42 | } 43 | 44 | test("downsampling") { 45 | // replicating downsampling examples 46 | // from http://www.mathworks.com/help/signal/ref/downsample.html?searchHighlight=downsample 47 | val y = new DenseVector((1 to 10).toArray.map(_.toDouble)) 48 | val yDown1 = UnivariateTimeSeries.downsample(y, 3).toArray 49 | assert(yDown1 === Array(1.0, 4.0, 7.0, 10.0)) 50 | 51 | val yDown2 = UnivariateTimeSeries.downsample(y, 3, phase = 2).toArray 52 | assert(yDown2 === Array(3.0, 6.0, 9.0)) 53 | } 54 | 55 | test("signal reconstruction with spline") { 56 | // If we have a frequent signal, downsample it (at a rate that doesn't cause aliasing) 57 | // and we upsample, and apply a filter (interpolation), then the result should be fairly 58 | // close to the original signal. 
In our case, we drop NAs that are not filled by interpolation 59 | // (i.e no extrapolation) 60 | 61 | val y = (1 to 1000).toArray.map(_.toDouble / 100.0).map(Math.sin) 62 | val vy = new DenseVector(y) 63 | val lessFreq = UnivariateTimeSeries.downsample(vy, 100) 64 | val moreFreq = UnivariateTimeSeries.upsample(lessFreq, 100) 65 | 66 | // work on copies 67 | val splineY = UnivariateTimeSeries.fillSpline(new DenseVector(moreFreq.toArray)).toArray 68 | val lineY = UnivariateTimeSeries.fillLinear(new DenseVector(moreFreq.toArray)).toArray 69 | 70 | val MSE = (est: Array[Double], obs: Array[Double]) => { 71 | val errs = est.zip(obs).filter(!_._1.isNaN).map { case (yhat, yi) => 72 | (yhat - yi) * (yhat - yi) 73 | } 74 | errs.sum / errs.length 75 | } 76 | 77 | val sE = MSE(splineY, y) 78 | val lE = MSE(lineY, y) 79 | 80 | // a cubic spline should be better than linear interpolation 81 | assert(sE < lE) 82 | } 83 | 84 | test("differencing at lag") { 85 | val rand = new MersenneTwister(10L) 86 | val n = 100 87 | val sampled = new DenseVector(Array.fill(n)(rand.nextGaussian)) 88 | val lag = 5 89 | val diffed = UnivariateTimeSeries.differencesAtLag(sampled, lag) 90 | val invDiffed = UnivariateTimeSeries.inverseDifferencesAtLag(diffed, lag) 91 | 92 | for (i <- 0 until n) { 93 | assert(sampled(i) ~== invDiffed(i) absTol 1e-6) 94 | } 95 | 96 | assert(diffed(10) == (sampled(10) - sampled(5))) 97 | assert(diffed(99) == (sampled(99) - sampled(94))) 98 | } 99 | 100 | test("differencing of order d") { 101 | val rand = new MersenneTwister(10L) 102 | val n = 100 103 | val sampled = new DenseVector(Array.fill(n)(rand.nextGaussian)) 104 | // differencing at order 1 and lag 1 should be the same 105 | val diffedOfOrder1 = UnivariateTimeSeries.differencesOfOrderD(sampled, 1) 106 | val diffedAtLag1 = UnivariateTimeSeries.differencesAtLag(sampled, 1) 107 | 108 | for (i <- 0 until n) { 109 | assert(diffedAtLag1(i) ~== diffedOfOrder1(i) absTol 1e-6) 110 | } 111 | 112 | // differencing at order and inversing should return the original series 113 | val diffedOfOrder5 = UnivariateTimeSeries.differencesOfOrderD(sampled, 5) 114 | val invDiffedOfOrder5 = UnivariateTimeSeries.inverseDifferencesOfOrderD(diffedOfOrder5, 5) 115 | 116 | for (i <- 0 until n) { 117 | assert(invDiffedOfOrder5(i) ~== sampled(i) absTol 1e-6) 118 | } 119 | 120 | // Differencing of order n + 1 should be the same as differencing one time a 121 | // vector that has already been differenced to order n 122 | val diffedOfOrder6 = UnivariateTimeSeries.differencesOfOrderD(sampled, 6) 123 | val diffedOneMore = UnivariateTimeSeries.differencesOfOrderD(diffedOfOrder5, 1) 124 | // compare start at index = 6 125 | for (i <- 6 until n) { 126 | assert(diffedOfOrder6(i) ~== diffedOneMore(i) absTol 1e-6) 127 | } 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/tsne/TSNEGradient.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.tsne 2 | 3 | import breeze.linalg._ 4 | import breeze.numerics._ 5 | import org.apache.spark.ml.tsne.tree.SPTree 6 | import org.slf4j.LoggerFactory 7 | 8 | object TSNEGradient { 9 | def logger = LoggerFactory.getLogger(TSNEGradient.getClass) 10 | 11 | /** 12 | * Compute the numerator from the matrix Y 13 | * 14 | * @param idx the index in the matrix to use. 
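// ---- Editor's note: the gradient assembled by `compute` further down ----
// For the exact (non Barnes-Hut) case, each embedded point receives
//   dC/dy_i = 4 * sum_j (p_ij - q_ij) * (1 + ||y_i - y_j||^2)^-1 * (y_i - y_j).
// `compute` realises this by turning q into (q - p), scaling by -num so every entry holds
// (p_ij - q_ij) * num_ij, folding each row sum back into column i, and finishing with -4 * (q * Y).
// ---- end note ----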
15 | * @param Y the matrix to analyze 16 | * @return the numerator 17 | */ 18 | def computeNumerator(Y: DenseMatrix[Double], idx: Int *): DenseMatrix[Double] = { 19 | // Y_sum = ||Y_i||^2 20 | val sumY = sum(pow(Y, 2).apply(*, ::)) // n * 1 21 | val subY = Y(idx, ::).toDenseMatrix // k * 1 22 | val y1: DenseMatrix[Double] = Y * (-2.0 :* subY.t) // n * k 23 | val num: DenseMatrix[Double] = (y1(::, *) + sumY).t // k * n 24 | num := 1.0 :/ (1.0 :+ (num(::, *) + sumY(idx).toDenseVector)) // k * n 25 | 26 | idx.indices.foreach(i => num.update(i, idx(i), 0.0)) // num(i, i) = 0 27 | 28 | num 29 | } 30 | 31 | /** 32 | * Compute the TSNE Gradient at i. Update the gradient through dY then return costs attributed at i. 33 | * 34 | * @param data data point for row i by list of pair of (j, p_ij) and 0 <= j < n 35 | * @param Y current Y [n * 2] 36 | * @param totalNum the common numerator that captures the t-distribution of Y 37 | * @param dY gradient of Y 38 | * @return loss attributed to row i 39 | */ 40 | def compute( 41 | data: Array[(Int, Iterable[(Int, Double)])], 42 | Y: DenseMatrix[Double], 43 | num: DenseMatrix[Double], 44 | totalNum: Double, 45 | dY: DenseMatrix[Double], 46 | exaggeration: Boolean): Double = { 47 | // q = (1 + ||Y_i - Y_j||^2)^-1 / sum(1 + ||Y_k - Y_l||^2)^-1 48 | val q: DenseMatrix[Double] = num / totalNum 49 | q.foreachPair{case ((i, j), v) => q.update(i, j, math.max(v, 1e-12))} 50 | 51 | // q = q - p 52 | val loss = data.zipWithIndex.flatMap { 53 | case ((_, itr), i) => 54 | itr.map{ 55 | case (j, p) => 56 | val exaggeratedP = if(exaggeration) p * 4 else p 57 | val qij = q(i, j) 58 | val l = exaggeratedP * math.log(exaggeratedP / qij) 59 | q.update(i, j, qij - exaggeratedP) 60 | if(l.isNaN) 0.0 else l 61 | } 62 | }.sum 63 | 64 | // l = [ (p_ij - q_ij) * (1 + ||Y_i - Y_j||^2)^-1 ] 65 | q :*= -num 66 | // l_sum = [0 0 ... sum(l) ... 
0] 67 | sum(q(*, ::)).foreachPair{ case (i, v) => q.update(i, data(i)._1, q(i, data(i)._1) - v) } 68 | 69 | // dY_i = -4 * (l - l_sum) * Y 70 | val dYi: DenseMatrix[Double] = -4.0 :* (q * Y) 71 | data.map(_._1).zipWithIndex.foreach{ 72 | case (i, idx) => dY(i, ::) := dYi(idx, ::) 73 | } 74 | 75 | loss 76 | } 77 | 78 | /** BH Tree related functions **/ 79 | 80 | /** 81 | * 82 | * @param data array of (row_id, Seq(col_id), Vector(P_ij)) 83 | * @param Y matrix 84 | * @param posF positive forces 85 | */ 86 | def computeEdgeForces(data: Array[(Int, Seq[Int], DenseVector[Double])], 87 | Y: DenseMatrix[Double], 88 | posF: DenseMatrix[Double]): Unit = { 89 | data.foreach { 90 | case (i, cols, vec) => 91 | // k x D - 1 x D => k x D 92 | val diff = Y(cols, ::).toDenseMatrix.apply(*, ::) - Y(i, ::).t 93 | // k x D => k x 1 94 | val qZ = 1.0 :+ sum(pow(diff, 2).apply(*, ::)) 95 | posF(i, ::) := (vec :/ qZ).t * (-diff) 96 | } 97 | } 98 | 99 | def computeNonEdgeForces(tree: SPTree, 100 | Y: DenseMatrix[Double], 101 | theta: Double, 102 | negF: DenseMatrix[Double], 103 | idx: Int *): Double = { 104 | idx.foldLeft(0.0)((acc, i) => acc + computeNonEdgeForce(tree, Y(i, ::).t, theta, negF, i)) 105 | } 106 | 107 | /** 108 | * Calcualte negative forces using BH approximation 109 | * 110 | * @param tree SPTree used for approximation 111 | * @param y y_i 112 | * @param theta threshold for correctness / speed 113 | * @param negF negative forces 114 | * @param i row 115 | * @return sum of Q 116 | */ 117 | private def computeNonEdgeForce(tree: SPTree, 118 | y: DenseVector[Double], 119 | theta: Double, 120 | negF: DenseMatrix[Double], 121 | i: Int): Double = { 122 | import tree._ 123 | if(getCount == 0 || (isLeaf && center.equals(y))) { 124 | 0.0 125 | } else { 126 | val diff = y - center 127 | val diffSq = sum(pow(diff, 2)) 128 | if(isLeaf || radiusSq / diffSq < theta) { 129 | val qZ = 1 / (1 + diffSq) 130 | val nqZ = getCount * qZ 131 | negF(i, ::) :+= (nqZ * qZ * diff).t 132 | nqZ 133 | } else { 134 | children.foldLeft(0.0)((acc, child) => acc + computeNonEdgeForce(child, y, theta, negF, i)) 135 | } 136 | } 137 | } 138 | 139 | def computeLoss(data: Array[(Int, Seq[Int], DenseVector[Double])], 140 | Y: DenseMatrix[Double], 141 | sumQ: Double): Double = { 142 | data.foldLeft(0.0){ 143 | case (acc, (i, cols, vec)) => 144 | val diff = Y(cols, ::).toDenseMatrix.apply(*, ::) - Y(i, ::).t 145 | val diffSq = sum(pow(diff, 2).apply(*, ::)) 146 | val Q = (1.0 :/ (1.0 :+ diffSq)) :/ sumQ 147 | sum(vec :* breeze.numerics.log(max(vec, 1e-12) :/ max(Q, 1e-12))) 148 | } 149 | } 150 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/mvm/MVMModel.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.mvm 2 | 3 | import org.apache.spark.ml.mvm.MVM._ 4 | import org.apache.spark.ml.util.LoaderUtils 5 | import org.apache.spark.ml.util.SparkUtils._ 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.mllib.evaluation.{RegressionMetrics, BinaryClassificationMetrics} 8 | import org.apache.spark.mllib.linalg.{Vector => SV} 9 | import org.apache.spark.mllib.regression.LabeledPoint 10 | import org.apache.spark.mllib.util.{Loader, Saveable} 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql.{Row, SQLContext} 13 | import org.apache.spark.storage.StorageLevel 14 | import org.json4s.DefaultFormats 15 | import org.json4s.JsonDSL._ 16 | import org.json4s.jackson.JsonMethods._ 17 
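// ---- Editor's note: the Barnes-Hut criterion used in computeNonEdgeForce above ----
// A tree cell is summarised by its centre of mass whenever radiusSq / ||y_i - center||^2 < theta,
// i.e. the cell looks small from y_i. Its getCount points then contribute count * qZ to the
// normalisation sum and count * qZ^2 * (y_i - center) to the repulsive force in one step instead of
// being visited individually; theta = 0 disables the approximation and recurses down to the leaves.
// ---- end note ----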
| 18 | import scala.math._ 19 | 20 | class MVMModel( 21 | val k: Int, 22 | val views: Array[Long], 23 | val classification: Boolean, 24 | val factors: RDD[(Long, VD)]) extends Serializable with Saveable { 25 | def predict(data: RDD[(Long, SV)]): RDD[(Long, ED)] = { 26 | val numFeatures = data.first()._2.size.toLong 27 | data.flatMap { case (sampleId, features) => 28 | features.activeIterator.filter(_._2 != 0.0).map { 29 | case (featureId, value) => 30 | (featureId.toLong, (sampleId, value)) 31 | } ++ views.indices.map { i => (numFeatures + i, (sampleId, 1D)) } 32 | }.join(factors).map { case (featureId, ((sampleId, x), w)) => 33 | val viewSize = views.length 34 | val viewId = featureId2viewId(featureId, views) 35 | (sampleId, forwardInterval(k, viewSize, viewId, x, w)) 36 | }.reduceByKey(reduceInterval).map { case (sampleId, arr) => 37 | var result = predictInterval(k, arr) 38 | if (classification) { 39 | result = 1.0 / (1.0 + math.exp(-result)) 40 | } 41 | (sampleId, result) 42 | } 43 | } 44 | 45 | def loss(data: RDD[(Long, LabeledPoint)]): Double = { 46 | // val minTarget = data.map(_._2.label).min() 47 | // val maxTarget = data.map(_._2.label).max() 48 | val perd = predict(data.map(t => (t._1, t._2.features))) 49 | val label = data.map(t => (t._1, t._2.label)) 50 | val scoreAndLabels = label.join(perd).map { case (_, (label, score)) => 51 | // var r = Math.max(score, minTarget) 52 | // r = Math.min(r, maxTarget) 53 | // pow(l - r, 2) 54 | (score, label) 55 | } 56 | scoreAndLabels.persist(StorageLevel.MEMORY_AND_DISK) 57 | val ret = if (classification) auc(scoreAndLabels) else rmse(scoreAndLabels) 58 | scoreAndLabels.unpersist(blocking = false) 59 | ret 60 | } 61 | 62 | def rmse(scoreAndLabels: RDD[(Double, Double)]): Double = { 63 | val metrics = new RegressionMetrics(scoreAndLabels) 64 | metrics.rootMeanSquaredError 65 | } 66 | 67 | def auc(scoreAndLabels: RDD[(Double, Double)]): Double = { 68 | val metrics = new BinaryClassificationMetrics(scoreAndLabels) 69 | metrics.areaUnderROC() 70 | } 71 | 72 | override def save(sc: SparkContext, path: String): Unit = { 73 | MVMModel.SaveLoadV1_0.save(sc, path, k, views, classification, factors) 74 | } 75 | 76 | override protected def formatVersion: String = MVMModel.SaveLoadV1_0.formatVersionV1_0 77 | } 78 | 79 | object MVMModel extends Loader[MVMModel] { 80 | 81 | override def load(sc: SparkContext, path: String): MVMModel = { 82 | val (loadedClassName, version, metadata) = LoaderUtils.loadMetadata(sc, path) 83 | val versionV1_0 = SaveLoadV1_0.formatVersionV1_0 84 | val classNameV1_0 = SaveLoadV1_0.classNameV1_0 85 | if (loadedClassName == classNameV1_0 && version == versionV1_0) { 86 | implicit val formats = DefaultFormats 87 | val classification = (metadata \ "classification").extract[Boolean] 88 | val views = (metadata \ "views").extract[String].split(",").map(_.toLong) 89 | val k = (metadata \ "k").extract[Int] 90 | val dataPath = LoaderUtils.dataPath(path) 91 | val sqlContext = new SQLContext(sc) 92 | val dataRDD = sqlContext.read.parquet(dataPath) 93 | val dataArray = dataRDD.select("featureId", "factors").take(1) 94 | assert(dataArray.size == 1, s"Unable to load $loadedClassName data from: $dataPath") 95 | val data = dataArray(0) 96 | assert(data.size == 2, s"Unable to load $loadedClassName data from: $dataPath") 97 | val factors = dataRDD.rdd.map { 98 | case Row(featureId: Long, factors: Seq[Double]) => 99 | (featureId, factors.toArray) 100 | } 101 | new MVMModel(k, views, classification, factors) 102 | } else { 103 | throw new 
Exception( 104 | s"FMModel.load did not recognize model with (className, format version):" + 105 | s"($loadedClassName, $version). Supported:\n" + 106 | s" ($classNameV1_0, 1.0)") 107 | } 108 | 109 | } 110 | 111 | private object SaveLoadV1_0 { 112 | val formatVersionV1_0 = "1.0" 113 | val classNameV1_0 = "com.github.cloudml.zen.ml.recommendation.MVMModel" 114 | 115 | def save( 116 | sc: SparkContext, 117 | path: String, 118 | k: Int, 119 | views: Array[Long], 120 | classification: Boolean, 121 | factors: RDD[(Long, Array[Double])]): Unit = { 122 | val metadata = compact(render 123 | (("class" -> classNameV1_0) ~ ("version" -> formatVersionV1_0) ~ 124 | ("k" -> k) ~ ("views" -> views.mkString(",")) ~ ("classification" -> classification))) 125 | sc.parallelize(Seq(metadata), 1).saveAsTextFile(LoaderUtils.metadataPath(path)) 126 | 127 | val sqlContext = new SQLContext(sc) 128 | import sqlContext.implicits._ 129 | // Create Parquet data. 130 | factors.toDF("featureId", "factors").write.parquet(LoaderUtils.dataPath(path)) 131 | } 132 | } 133 | 134 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/fm/BSFMModel.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.fm 2 | 3 | import org.apache.spark.ml.fm.BSFM._ 4 | import org.apache.spark.ml.util.LoaderUtils 5 | import org.apache.spark.ml.util.SparkUtils._ 6 | import org.apache.spark.SparkContext 7 | import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, RegressionMetrics} 8 | import org.apache.spark.mllib.linalg.{Vector => SV} 9 | import org.apache.spark.mllib.regression.LabeledPoint 10 | import org.apache.spark.mllib.util.{Loader, Saveable} 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql.{Row, SQLContext} 13 | import org.apache.spark.storage.StorageLevel 14 | import org.json4s.DefaultFormats 15 | import org.json4s.JsonDSL._ 16 | import org.json4s.jackson.JsonMethods._ 17 | 18 | import scala.math._ 19 | 20 | class BSFMModel( 21 | val k: Int, 22 | val intercept: ED, 23 | val views: Array[Long], 24 | val classification: Boolean, 25 | val factors: RDD[(Long, VD)]) extends Serializable with Saveable { 26 | def predict(data: RDD[(Long, SV)]): RDD[(Long, ED)] = { 27 | val numFeatures = data.first()._2.size.toLong 28 | data.flatMap { case (sampleId, features) => 29 | features.activeIterator.filter(_._2 != 0.0).map { 30 | case (featureId, value) => 31 | (featureId.toLong, (sampleId, value)) 32 | } ++ views.indices.map { i => (numFeatures + i, (sampleId, 1D)) } 33 | }.join(factors).map { case (featureId, ((sampleId, x), w)) => 34 | val viewSize = views.length 35 | val viewId = featureId2viewId(featureId, views) 36 | (sampleId, forwardInterval(k, viewSize, viewId, x, w)) 37 | }.reduceByKey(forwardReduceInterval).map { case (sampleId, arr) => 38 | var result = predictInterval(k, views.length, intercept, arr) 39 | if (classification) { 40 | result = 1.0 / (1.0 + math.exp(-result)) 41 | } 42 | (sampleId, result) 43 | } 44 | } 45 | 46 | def loss(data: RDD[(Long, LabeledPoint)]): Double = { 47 | // val minTarget = data.map(_._2.label).min() 48 | // val maxTarget = data.map(_._2.label).max() 49 | val perd = predict(data.map(t => (t._1, t._2.features))) 50 | val label = data.map(t => (t._1, t._2.label)) 51 | val scoreAndLabels = label.join(perd).map { case (_, (label, score)) => 52 | // var r = Math.max(score, minTarget) 53 | // r = Math.min(r, maxTarget) 54 | // pow(l - r, 2) 55 | (score, 
label) 56 | } 57 | scoreAndLabels.persist(StorageLevel.MEMORY_AND_DISK) 58 | val ret = if (classification) auc(scoreAndLabels) else rmse(scoreAndLabels) 59 | scoreAndLabels.unpersist(blocking = false) 60 | ret 61 | } 62 | 63 | def rmse(scoreAndLabels: RDD[(Double, Double)]): Double = { 64 | val metrics = new RegressionMetrics(scoreAndLabels) 65 | metrics.rootMeanSquaredError 66 | } 67 | 68 | def auc(scoreAndLabels: RDD[(Double, Double)]): Double = { 69 | val metrics = new BinaryClassificationMetrics(scoreAndLabels) 70 | metrics.areaUnderROC() 71 | } 72 | 73 | override def save(sc: SparkContext, path: String): Unit = { 74 | BSFMModel.SaveLoadV1_0.save(sc, path, k, intercept, views, classification, factors) 75 | } 76 | 77 | override protected def formatVersion: String = BSFMModel.SaveLoadV1_0.formatVersionV1_0 78 | } 79 | 80 | object BSFMModel extends Loader[BSFMModel] { 81 | 82 | override def load(sc: SparkContext, path: String): BSFMModel = { 83 | val (loadedClassName, version, metadata) = LoaderUtils.loadMetadata(sc, path) 84 | val versionV1_0 = SaveLoadV1_0.formatVersionV1_0 85 | val classNameV1_0 = SaveLoadV1_0.classNameV1_0 86 | if (loadedClassName == classNameV1_0 && version == versionV1_0) { 87 | implicit val formats = DefaultFormats 88 | val classification = (metadata \ "classification").extract[Boolean] 89 | val intercept = (metadata \ "intercept").extract[Double] 90 | val views = (metadata \ "views").extract[String].split(",").map(_.toLong) 91 | val k = (metadata \ "k").extract[Int] 92 | val dataPath = LoaderUtils.dataPath(path) 93 | val sqlContext = new SQLContext(sc) 94 | val dataRDD = sqlContext.read.parquet(dataPath) 95 | val dataArray = dataRDD.select("featureId", "factors").take(1) 96 | assert(dataArray.size == 1, s"Unable to load $loadedClassName data from: $dataPath") 97 | val data = dataArray(0) 98 | assert(data.size == 2, s"Unable to load $loadedClassName data from: $dataPath") 99 | val factors = dataRDD.rdd.map { 100 | case Row(featureId: Long, factors: Seq[Double]) => 101 | (featureId, factors.toArray) 102 | } 103 | new BSFMModel(k, intercept, views, classification, factors) 104 | } else { 105 | throw new Exception( 106 | s"FMModel.load did not recognize model with (className, format version):" + 107 | s"($loadedClassName, $version). Supported:\n" + 108 | s" ($classNameV1_0, 1.0)") 109 | } 110 | 111 | } 112 | 113 | private object SaveLoadV1_0 { 114 | val formatVersionV1_0 = "1.0" 115 | val classNameV1_0 = "com.github.cloudml.zen.ml.recommendation.BSFMModel" 116 | 117 | def save( 118 | sc: SparkContext, 119 | path: String, 120 | k: Int, 121 | intercept: Double, 122 | views: Array[Long], 123 | classification: Boolean, 124 | factors: RDD[(Long, Array[Double])]): Unit = { 125 | val metadata = compact(render 126 | (("class" -> classNameV1_0) ~ ("version" -> formatVersionV1_0) ~ ("intercept" -> intercept) ~ 127 | ("k" -> k) ~ ("views" -> views.mkString(",")) ~ ("classification" -> classification))) 128 | sc.parallelize(Seq(metadata), 1).saveAsTextFile(LoaderUtils.metadataPath(path)) 129 | 130 | val sqlContext = new SQLContext(sc) 131 | import sqlContext.implicits._ 132 | // Create Parquet data. 
133 | factors.toDF("featureId", "factors").write.parquet(LoaderUtils.dataPath(path)) 134 | } 135 | } 136 | 137 | } 138 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/EvenSplitPartitioner.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import scala.annotation.tailrec 4 | 5 | import org.apache.spark.internal.Logging 6 | 7 | /** 8 | * Helper methods for calling the partitioner 9 | */ 10 | object EvenSplitPartitioner { 11 | 12 | def partition( 13 | toSplit: Set[(DBSCANRectangle, Int)], 14 | maxPointsPerPartition: Long, 15 | minimumRectangleSize: Double): List[(DBSCANRectangle, Int)] = { 16 | new EvenSplitPartitioner(maxPointsPerPartition, minimumRectangleSize) 17 | .findPartitions(toSplit) 18 | } 19 | 20 | } 21 | 22 | class EvenSplitPartitioner( 23 | maxPointsPerPartition: Long, 24 | minimumRectangleSize: Double) extends Logging { 25 | 26 | type RectangleWithCount = (DBSCANRectangle, Int) 27 | 28 | def findPartitions(toSplit: Set[RectangleWithCount]): List[RectangleWithCount] = { 29 | 30 | val boundingRectangle = findBoundingRectangle(toSplit) 31 | 32 | def pointsIn = pointsInRectangle(toSplit, _: DBSCANRectangle) 33 | 34 | val toPartition = List((boundingRectangle, pointsIn(boundingRectangle))) 35 | val partitioned = List[RectangleWithCount]() 36 | 37 | logTrace("About to start partitioning") 38 | val partitions = partition(toPartition, partitioned, pointsIn) 39 | logTrace("Done") 40 | 41 | // remove empty partitions 42 | partitions.filter({ case (partition, count) => count > 0 }) 43 | } 44 | 45 | @tailrec 46 | private def partition( 47 | remaining: List[RectangleWithCount], 48 | partitioned: List[RectangleWithCount], 49 | pointsIn: (DBSCANRectangle) => Int): List[RectangleWithCount] = { 50 | 51 | remaining match { 52 | case (rectangle, count) :: rest => 53 | if (count > maxPointsPerPartition) { 54 | 55 | if (canBeSplit(rectangle)) { 56 | logTrace(s"About to split: $rectangle") 57 | def cost = (r: DBSCANRectangle) => ((pointsIn(rectangle) / 2) - pointsIn(r)).abs 58 | val (split1, split2) = split(rectangle, cost) 59 | logTrace(s"Found split: $split1, $split2") 60 | val s1 = (split1, pointsIn(split1)) 61 | val s2 = (split2, pointsIn(split2)) 62 | partition(s1 :: s2 :: rest, partitioned, pointsIn) 63 | 64 | } else { 65 | logWarning(s"Can't split: ($rectangle -> $count) (maxSize: $maxPointsPerPartition)") 66 | partition(rest, (rectangle, count) :: partitioned, pointsIn) 67 | } 68 | 69 | } else { 70 | partition(rest, (rectangle, count) :: partitioned, pointsIn) 71 | } 72 | 73 | case Nil => partitioned 74 | 75 | } 76 | 77 | } 78 | 79 | def split( 80 | rectangle: DBSCANRectangle, 81 | cost: (DBSCANRectangle) => Int): (DBSCANRectangle, DBSCANRectangle) = { 82 | 83 | val smallestSplit = 84 | findPossibleSplits(rectangle) 85 | .reduceLeft { 86 | (smallest, current) => 87 | 88 | if (cost(current) < cost(smallest)) { 89 | current 90 | } else { 91 | smallest 92 | } 93 | 94 | } 95 | 96 | (smallestSplit, (complement(smallestSplit, rectangle))) 97 | 98 | } 99 | 100 | /** 101 | * Returns the box that covers the space inside boundary that is not covered by box 102 | */ 103 | private def complement(box: DBSCANRectangle, boundary: DBSCANRectangle): DBSCANRectangle = 104 | if (box.x == boundary.x && box.y == boundary.y) { 105 | if (boundary.x2 >= box.x2 && boundary.y2 >= box.y2) { 106 | if (box.y2 == boundary.y2) { 107 | DBSCANRectangle(box.x2, 
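// ---- Editor's sketch (illustrative, not part of this file): driving EvenSplitPartitioner directly ----
// A minimal sketch: the input is a set of grid cells with their point counts (as produced by the
// DBSCAN gridding step) and the output is a list of larger rectangles holding at most
// maxPointsPerPartition points each; the numbers below are made up.
//   val cells: Set[(DBSCANRectangle, Int)] = Set(
//     (DBSCANRectangle(0.0, 0.0, 1.0, 1.0), 400),
//     (DBSCANRectangle(1.0, 0.0, 2.0, 1.0), 350),
//     (DBSCANRectangle(0.0, 1.0, 1.0, 2.0), 250))
//   val partitions = EvenSplitPartitioner.partition(cells, maxPointsPerPartition = 500L, minimumRectangleSize = 1.0)
//   partitions.foreach { case (rect, count) => println(s"$rect -> $count points") }
// ---- end sketch ----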
box.y, boundary.x2, boundary.y2) 108 | } else if (box.x2 == boundary.x2) { 109 | DBSCANRectangle(box.x, box.y2, boundary.x2, boundary.y2) 110 | } else { 111 | throw new IllegalArgumentException("rectangle is not a proper sub-rectangle") 112 | } 113 | } else { 114 | throw new IllegalArgumentException("rectangle is smaller than boundary") 115 | } 116 | } else { 117 | throw new IllegalArgumentException("unequal rectangle") 118 | } 119 | 120 | /** 121 | * Returns all the possible ways in which the given box can be split 122 | */ 123 | private def findPossibleSplits(box: DBSCANRectangle): Set[DBSCANRectangle] = { 124 | 125 | val xSplits = (box.x + minimumRectangleSize) until box.x2 by minimumRectangleSize 126 | 127 | val ySplits = (box.y + minimumRectangleSize) until box.y2 by minimumRectangleSize 128 | 129 | val splits = 130 | xSplits.map(x => DBSCANRectangle(box.x, box.y, x, box.y2)) ++ 131 | ySplits.map(y => DBSCANRectangle(box.x, box.y, box.x2, y)) 132 | 133 | logTrace(s"Possible splits: $splits") 134 | 135 | splits.toSet 136 | } 137 | 138 | /** 139 | * Returns true if the given rectangle can be split into at least two rectangles of minimum size 140 | */ 141 | private def canBeSplit(box: DBSCANRectangle): Boolean = { 142 | (box.x2 - box.x > minimumRectangleSize * 2 || 143 | box.y2 - box.y > minimumRectangleSize * 2) 144 | } 145 | 146 | def pointsInRectangle(space: Set[RectangleWithCount], rectangle: DBSCANRectangle): Int = { 147 | space.view 148 | .filter({ case (current, _) => rectangle.contains(current) }) 149 | .foldLeft(0) { 150 | case (total, (_, count)) => total + count 151 | } 152 | } 153 | 154 | def findBoundingRectangle(rectanglesWithCount: Set[RectangleWithCount]): DBSCANRectangle = { 155 | 156 | val invertedRectangle = 157 | DBSCANRectangle(Double.MaxValue, Double.MaxValue, Double.MinValue, Double.MinValue) 158 | 159 | rectanglesWithCount.foldLeft(invertedRectangle) { 160 | case (bounding, (c, _)) => 161 | DBSCANRectangle( 162 | bounding.x.min(c.x), bounding.y.min(c.y), 163 | bounding.x2.max(c.x2), bounding.y2.max(c.y2)) 164 | } 165 | 166 | } 167 | 168 | } 169 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/models/ARGARCH.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import io.transwarp.discover.timeseries.params.TimeSeriesParams 4 | import org.apache.commons.math3.random.RandomGenerator 5 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 6 | import org.apache.spark.ml.param.ParamMap 7 | import org.apache.spark.ml.util.Identifiable 8 | import org.apache.spark.ml.{Estimator, Model} 9 | import org.apache.spark.sql._ 10 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 11 | 12 | /** 13 | * Created by endy on 16-12-22. 14 | */ 15 | 16 | class ARGARCH(override val uid: String) extends Estimator[ARGARCHModel] with TimeSeriesParams { 17 | setDefault(timeCol -> "time", 18 | timeSeriesCol -> "timeseries") 19 | 20 | def this() = this(Identifiable.randomUID("ARGARCH")) 21 | /** 22 | * Fits a model to the input data. 
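ARGARCH.fit below composes two estimators: it fits an autoregressive model, removes its time-dependent effects to obtain residuals, and then fits a GARCH(1,1) model to those residuals, combining c and phi from the AR step with omega, alpha and beta from the GARCH step. A minimal usage sketch, where the input DataFrame `df` (a String time column plus a Double value column) is an assumption for illustration:

```scala
// Sketch only: fit an AR + GARCH(1,1) model to a (time, value) DataFrame.
// "time" and "timeseries" are the default column names declared above.
val model = new ARGARCH()
  .setTimeCol("time")
  .setTimeSeriesCol("timeseries")
  .fit(df)

// c and phi come from the AR fit, omega/alpha/beta from the GARCH fit on the residuals.
println(s"c=${model.c} phi=${model.phi} omega=${model.omega} alpha=${model.alpha} beta=${model.beta}")
```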
23 | */ 24 | override def fit(dataset: Dataset[_]): ARGARCHModel = { 25 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 26 | case Row(time: String, value: Double) => (time, value) 27 | }.sortByKey().collect() 28 | 29 | val dataVector = Vectors.dense(data.map(x => x._2)) 30 | 31 | val arModel = new Autoregression().fit(dataset) 32 | val residuals = arModel.removeTimeDependentEffects(dataVector) 33 | val dataFrame = generateDf(dataset.sparkSession, residuals.toArray) 34 | val garchModel = new GARCH().fit(dataFrame) 35 | 36 | new ARGARCHModel(arModel.c, arModel.coefficients(0), garchModel.omega, garchModel.alpha, 37 | garchModel.beta) 38 | } 39 | 40 | override def copy(extra: ParamMap): Estimator[ARGARCHModel] = defaultCopy(extra) 41 | 42 | /** 43 | * :: DeveloperApi :: 44 | * 45 | * Check transform validity and derive the output schema from the input schema. 46 | * 47 | * Typical implementation should first conduct verification on schema change and parameter 48 | * validity, including complex parameter interaction checks. 49 | */ 50 | override def transformSchema(schema: StructType): StructType = schema 51 | 52 | private def generateDf(sparkSession: SparkSession, array: Array[Double]): DataFrame = { 53 | val schema = StructType(Array(StructField(${timeCol}, StringType), StructField(${timeSeriesCol}, 54 | DoubleType))) 55 | 56 | val rdd = sparkSession.sparkContext.parallelize( 57 | array.zipWithIndex.map(x => Row(x._2.formatted("%010d"), x._1))) 58 | 59 | sparkSession.createDataFrame(rdd, schema) 60 | } 61 | } 62 | 63 | class ARGARCHModel(override val uid: String, val c: Double, val phi: Double, val omega: Double, 64 | val alpha: Double, val beta: Double) extends 65 | Model[ARGARCHModel] with TimeSeriesParams { 66 | 67 | def this(c: Double, phi: Double, omega: Double, alpha: Double, beta: Double) = 68 | this(Identifiable.randomUID("ARGARCHModel"), c, phi, omega, alpha, beta) 69 | 70 | override def copy(extra: ParamMap): ARGARCHModel = defaultCopy(extra) 71 | 72 | /** 73 | * Transforms the input dataset. 74 | */ 75 | override def transform(dataset: Dataset[_]): DataFrame = { 76 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 77 | case Row(time: String, value: Double) => (time, value) 78 | }.sortByKey().collect() 79 | 80 | val dataVector = Vectors.dense(data.map(x => x._2)) 81 | 82 | val dest = addTimeDependentEffects(dataVector) 83 | 84 | val resRDD = dataset.sparkSession.sparkContext.parallelize(dest.toArray.map(x => Row(x))) 85 | 86 | val structType = transformSchema(dataset.schema) 87 | 88 | dataset.sparkSession.createDataFrame(resRDD, structType) 89 | } 90 | 91 | /** 92 | * :: DeveloperApi :: 93 | * 94 | * Check transform validity and derive the output schema from the input schema. 95 | * 96 | * Typical implementation should first conduct verification on schema change and parameter 97 | * validity, including complex parameter interaction checks. 
98 | */ 99 | override def transformSchema(schema: StructType): StructType = { 100 | StructType(Array(StructField("ARGARCH", DoubleType))) 101 | } 102 | 103 | def removeTimeDependentEffects(ts: Vector): Vector = { 104 | val destArr = new Array[Double](ts.size) 105 | var prevEta = ts(0) - c 106 | var prevVariance = omega / (1.0 - alpha - beta) 107 | destArr(0) = prevEta / math.sqrt(prevVariance) 108 | for (i <- 1 until ts.size) { 109 | val variance = omega + alpha * prevEta * prevEta + beta * prevVariance 110 | val eta = ts(i) - c - phi * ts(i - 1) 111 | destArr(i) = eta / math.sqrt(variance) 112 | 113 | prevEta = eta 114 | prevVariance = variance 115 | } 116 | new DenseVector(destArr) 117 | } 118 | 119 | def addTimeDependentEffects(ts: Vector): Vector = { 120 | val destArr = new Array[Double](ts.size) 121 | var prevVariance = omega / (1.0 - alpha - beta) 122 | var prevEta = ts(0) * math.sqrt(prevVariance) 123 | destArr(0) = c + prevEta 124 | for (i <- 1 until ts.size) { 125 | val variance = omega + alpha * prevEta * prevEta + beta * prevVariance 126 | val standardizedEta = ts(i) 127 | val eta = standardizedEta * math.sqrt(variance) 128 | destArr(i) = c + phi * destArr(i - 1) + eta 129 | 130 | prevEta = eta 131 | prevVariance = variance 132 | } 133 | new DenseVector(destArr) 134 | } 135 | 136 | private def sampleWithVariances(n: Int, rand: RandomGenerator): (Array[Double], Array[Double]) = { 137 | val ts = new Array[Double](n) 138 | val variances = new Array[Double](n) 139 | variances(0) = omega / (1 - alpha - beta) 140 | var eta = math.sqrt(variances(0)) * rand.nextGaussian() 141 | for (i <- 1 until n) { 142 | variances(i) = omega + beta * variances(i-1) + alpha * eta * eta 143 | eta = math.sqrt(variances(i)) * rand.nextGaussian() 144 | ts(i) = c + phi * ts(i - 1) + eta 145 | } 146 | 147 | (ts, variances) 148 | } 149 | 150 | /** 151 | * Samples a random time series of a given length with the properties of the model. 152 | * 153 | * @param n The length of the time series to sample. 154 | * @param rand The random generator used to generate the observations. 155 | * @return The samples time series. 156 | */ 157 | def sample(n: Int, rand: RandomGenerator): Array[Double] = sampleWithVariances(n, rand)._1 158 | } 159 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/knn/KNNClassifier.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.classification 2 | 3 | import org.apache.spark.broadcast.Broadcast 4 | import org.apache.spark.ml.param.ParamMap 5 | import org.apache.spark.ml.param.shared.HasWeightCol 6 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils} 7 | import org.apache.spark.ml.linalg._ 8 | import org.apache.spark.rdd.RDD 9 | import org.apache.spark.sql.types.{DoubleType, StructType} 10 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 11 | import org.apache.spark.storage.StorageLevel 12 | import org.apache.spark.ml.feature.LabeledPoint 13 | 14 | import scala.collection.mutable.ArrayBuffer 15 | 16 | /** 17 | * Created by endy on 17-1-9. 
18 | */ 19 | class KNNClassifier(override val uid: String) extends 20 | ProbabilisticClassifier[Vector, KNNClassifier, KNNClassificationModel] 21 | with KNNParams { 22 | 23 | def this() = this(Identifiable.randomUID("KNNClassifier")) 24 | 25 | def setK(value: Int): this.type = set(k, value) 26 | 27 | def setTopTreeSize(value: Int): this.type = set(topTreeSize, value) 28 | 29 | def setTopTreeLeafSize(value: Int): this.type = set(topTreeLeafSize, value) 30 | 31 | def setSubTreeLeafSize(value: Int): this.type = set(subTreeLeafSize, value) 32 | 33 | def setBufferSizeSampleSizes(value: Array[Int]): this.type = set(bufferSizeSampleSizes, value) 34 | 35 | def setBalanceThreshold(value: Double): this.type = set(balanceThreshold, value) 36 | 37 | def setSeed(value: Long): this.type = set(seed, value) 38 | 39 | override protected def train(dataset: Dataset[_]): KNNClassificationModel = { 40 | // Extract columns from data. If dataset is persisted, do not persist oldDataset. 41 | val instances = extractLabeledPoints(dataset).map { 42 | case LabeledPoint(label: Double, features: Vector) => (label, features) 43 | } 44 | val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE 45 | if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) 46 | 47 | val labelSummarizer = instances.treeAggregate( 48 | new MultiClassSummarizer)( 49 | seqOp = (c, v) => (c, v) match { 50 | case (labelSummarizer: MultiClassSummarizer, (label: Double, features: Vector)) => 51 | labelSummarizer.add(label) 52 | }, 53 | combOp = (c1, c2) => (c1, c2) match { 54 | case (classSummarizer1: MultiClassSummarizer, classSummarizer2: MultiClassSummarizer) => 55 | classSummarizer1.merge(classSummarizer2) 56 | }) 57 | 58 | val histogram = labelSummarizer.histogram 59 | val numInvalid = labelSummarizer.countInvalid 60 | val numClasses = histogram.length 61 | 62 | if (numInvalid != 0) { 63 | val msg = s"Classification labels should be in {0 to ${numClasses - 1}}. " + 64 | s"Found $numInvalid invalid labels." 65 | throw new IllegalArgumentException(msg) 66 | } 67 | 68 | val knnModel = copyValues(new KNN()).fit(dataset) 69 | knnModel.toNewClassificationModel(uid, numClasses) 70 | } 71 | 72 | override def fit(dataset: Dataset[_]): KNNClassificationModel = { 73 | // Need to overwrite this method because we need to manually overwrite the buffer size 74 | // because it is not supposed to stay the same as the Classifier if user sets it to -1.
75 | transformSchema(dataset.schema, logging = true) 76 | val model = train(dataset) 77 | val bufferSize = model.getBufferSize 78 | copyValues(model.setParent(this)).setBufferSize(bufferSize) 79 | } 80 | 81 | override def copy(extra: ParamMap): KNNClassifier = defaultCopy(extra) 82 | } 83 | 84 | class KNNClassificationModel(override val uid: String, val topTree: Broadcast[Tree], 85 | val subTrees: RDD[Tree], val _numClasses: Int) extends 86 | ProbabilisticClassificationModel[Vector, KNNClassificationModel] 87 | with KNNModelParams with HasWeightCol with Serializable { 88 | require(subTrees.getStorageLevel != StorageLevel.NONE, 89 | "KNNModel is not designed to work with Trees that have not been cached") 90 | 91 | /** @group setParam */ 92 | def setK(value: Int): this.type = set(k, value) 93 | 94 | /** @group setParam */ 95 | def setBufferSize(value: Double): this.type = set(bufferSize, value) 96 | 97 | override def numClasses: Int = _numClasses 98 | 99 | override def transform(dataset: Dataset[_]): DataFrame = { 100 | val getWeight: Row => Double = r => 1.0 101 | 102 | val merged = transform(dataset, topTree, subTrees).map { 103 | case (id, labels) => 104 | val vector = new Array[Double](numClasses) 105 | var i = 0 106 | while (i < labels.length) { 107 | vector(labels(i).getDouble(0).toInt) += getWeight(labels(i)) 108 | i += 1 109 | } 110 | val rawPrediction = Vectors.dense(vector) 111 | lazy val probability = raw2probability(rawPrediction) 112 | lazy val prediction = probability2prediction(probability) 113 | 114 | val values = new ArrayBuffer[Any] 115 | if ($(rawPredictionCol).nonEmpty) { 116 | values.append(rawPrediction) 117 | } 118 | if ($(probabilityCol).nonEmpty) { 119 | values.append(probability) 120 | } 121 | if ($(predictionCol).nonEmpty) { 122 | values.append(prediction) 123 | } 124 | (id, values) 125 | } 126 | 127 | dataset.sqlContext.createDataFrame( 128 | dataset.rdd.zipWithIndex().map { case (row, i) => (i, row) } 129 | .leftOuterJoin(merged) // make sure we don't lose any observations 130 | .map { 131 | case (i, (row, values)) => Row.fromSeq(row.asInstanceOf[Row].toSeq ++ values.get) 132 | }, 133 | transformSchema(dataset.schema) 134 | ) 135 | } 136 | 137 | override def transformSchema(schema: StructType): StructType = { 138 | var transformed = schema 139 | if ($(rawPredictionCol).nonEmpty) { 140 | transformed = SchemaUtils.appendColumn(transformed, $(rawPredictionCol), new VectorUDT) 141 | } 142 | if ($(probabilityCol).nonEmpty) { 143 | transformed = SchemaUtils.appendColumn(transformed, $(probabilityCol), new VectorUDT) 144 | } 145 | if ($(predictionCol).nonEmpty) { 146 | transformed = SchemaUtils.appendColumn(transformed, $(predictionCol), DoubleType) 147 | } 148 | transformed 149 | } 150 | 151 | override def copy(extra: ParamMap): KNNClassificationModel = { 152 | val copied = new KNNClassificationModel(uid, topTree, subTrees, numClasses) 153 | copyValues(copied, extra).setParent(parent) 154 | } 155 | 156 | override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { 157 | 158 | rawPrediction match { 159 | case dv: DenseVector => 160 | val size = dv.size 161 | val sum = dv.toArray.sum 162 | 163 | var i = 0 164 | while (i < size) { 165 | dv.values(i) /= sum 166 | i += 1 167 | } 168 | 169 | dv 170 | case sv: SparseVector => 171 | throw new RuntimeException("raw2probabilityInPlace does not support SparseVector raw predictions") 172 | } 173 | } 174 | 175 | override protected def predictRaw(features: Vector): Vector = { 176 | throw new UnsupportedOperationException("predictRaw is not implemented; use transform instead") 177 | } 178 | } 179 |
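A minimal end-to-end sketch for the classifier defined above; the training and test DataFrames with the usual spark.ml `label`/`features` columns, and the parameter values, are assumptions chosen for illustration:

```scala
// Sketch only: train and apply the distributed KNN classifier.
val knn = new KNNClassifier()
  .setK(5)                // neighbours used to vote on the label
  .setTopTreeSize(1000)   // points sampled to build the top-level metric tree

val model = knn.fit(trainDF)            // trainDF: DataFrame with "label" and "features"
val scored = model.transform(testDF)    // appends rawPrediction, probability and prediction columns
scored.select("prediction").show(5)
```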
-------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/models/EWMA.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.analysis.{MultivariateFunction, MultivariateVectorFunction} 4 | import org.apache.commons.math3.optim.{InitialGuess, MaxEval, MaxIter, SimpleValueChecker} 5 | import org.apache.commons.math3.optim.nonlinear.scalar.{GoalType, ObjectiveFunction, ObjectiveFunctionGradient} 6 | import org.apache.commons.math3.optim.nonlinear.scalar.gradient.NonLinearConjugateGradientOptimizer 7 | import org.apache.spark.ml.{Estimator, Model} 8 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 9 | import org.apache.spark.ml.param.{Param, ParamMap} 10 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams 11 | import org.apache.spark.ml.util.Identifiable 12 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 13 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType} 14 | 15 | /** 16 | * Fits an Exponentially Weight Moving Average model (EWMA) to a time series. 17 | */ 18 | 19 | trait EWMAParams extends TimeSeriesParams { 20 | final val maxEval = new Param[Int](this, "maxEval", "max eval") 21 | def setMaxEval(value: Int): this.type = set(maxEval, value) 22 | 23 | final val maxIter = new Param[Int](this, "maxIter", "max iteration") 24 | def setMaxIter(value: Int): this.type = set(maxIter, value) 25 | 26 | final val initPoint = new Param[Double](this, "initPoint", "init point") 27 | def setInitPoint(value: Double): this.type = set(initPoint, value) 28 | } 29 | 30 | class EWMA(override val uid: String) extends Estimator[EWMAModel] with EWMAParams{ 31 | 32 | setDefault(timeCol -> "time", 33 | timeSeriesCol -> "timeseries") 34 | 35 | def this() = this(Identifiable.randomUID("EWMA")) 36 | 37 | /** 38 | * Fits an EWMA model to a time series. Uses the first point in the time series as a starting 39 | * value. Uses sum squared error as an objective function to optimize to find smoothing parameter 40 | * The model for EWMA is recursively defined as S_t = (1 - a) * X_t + a * S_{t-1}, where 41 | * a is the smoothing parameter, X is the original series, and S is the smoothed series 42 | * Note that the optimization is performed as unbounded optimization, although in its formal 43 | * definition the smoothing parameter is <= 1, which corresponds to an inequality bounded 44 | * optimization. 
Given this, the resulting smoothing parameter should always be sanity checked 45 | * https://en.wikipedia.org/wiki/Exponential_smoothing 46 | * @param dataset the time series dataset to which we want to fit an EWMA model 47 | * @return EWMA model 48 | */ 49 | override def fit(dataset: Dataset[_]): EWMAModel = { 50 | 51 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 52 | case Row(time: String, value: Double) => (time, value) 53 | }.sortByKey().collect() 54 | .map(x => x._2) 55 | 56 | val dataVector = Vectors.dense(data) 57 | 58 | val optimizer = new NonLinearConjugateGradientOptimizer( 59 | NonLinearConjugateGradientOptimizer.Formula.FLETCHER_REEVES, 60 | new SimpleValueChecker(1e-6, 1e-6)) 61 | 62 | 63 | val gradient = new ObjectiveFunctionGradient(new MultivariateVectorFunction() { 64 | def value(params: Array[Double]): Array[Double] = { 65 | val g = new EWMAModel(params(0)).gradient(dataVector) 66 | Array(g) 67 | } 68 | }) 69 | 70 | val objectiveFunction = new ObjectiveFunction(new MultivariateFunction() { 71 | def value(params: Array[Double]): Double = { 72 | new EWMAModel(params(0)).sse(dataVector) 73 | } 74 | }) 75 | // optimization parameters 76 | val initGuess = new InitialGuess(Array(${initPoint})) 77 | val goal = GoalType.MINIMIZE 78 | // optimization step 79 | val optimal = optimizer.optimize(objectiveFunction, goal, gradient, initGuess, 80 | new MaxIter(${maxIter}), new MaxEval(${maxEval})) 81 | val params = optimal.getPoint 82 | 83 | new EWMAModel(params(0)) 84 | .setTimeCol(${timeCol}) 85 | .setTimeSeriesCol(${timeSeriesCol}) 86 | 87 | } 88 | 89 | override def copy(extra: ParamMap): Estimator[EWMAModel] = defaultCopy(extra) 90 | 91 | /** 92 | * Check transform validity and derive the output schema from the input schema. 93 | * 94 | * Typical implementation should first conduct verification on schema change and parameter 95 | * validity, including complex parameter interaction checks. 
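Unlike GARCH and HoltWinters later in this listing, EWMA only declares defaults for the two column names, so maxIter, maxEval and initPoint must be set before calling fit. A usage sketch with illustrative values and an assumed DataFrame `df`:

```scala
// Sketch only: estimate the EWMA smoothing parameter by minimizing the SSE.
val model = new EWMA()
  .setTimeCol("time")
  .setTimeSeriesCol("timeseries")
  .setInitPoint(0.5)      // starting guess for the smoothing parameter
  .setMaxIter(10000)
  .setMaxEval(10000)
  .fit(df)

println(s"fitted smoothing = ${model.smoothing}")
```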
96 | */ 97 | override def transformSchema(schema: StructType): StructType = { 98 | schema 99 | } 100 | } 101 | 102 | 103 | class EWMAModel(override val uid: String, val smoothing: Double) 104 | extends Model[EWMAModel] with EWMAParams{ 105 | 106 | def this(smoothing: Double) = this(Identifiable.randomUID("EWMAModel"), smoothing) 107 | 108 | /** 109 | * Calculates the SSE for a given timeseries ts given 110 | * the smoothing parameter of the current model 111 | * The forecast for the observation at period t + 1 is the smoothed value at time t 112 | * Source: http://people.duke.edu/~rnau/411avg.htm 113 | * @param ts the time series to fit a EWMA model to 114 | * @return Sum Squared Error 115 | */ 116 | def sse(ts: Vector): Double = { 117 | val n = ts.size 118 | 119 | val smoothed = addTimeDependentEffects(ts) 120 | var i = 0 121 | var error = 0.0 122 | var sqrErrors = 0.0 123 | while (i < n - 1) { 124 | error = ts(i + 1) - smoothed(i) 125 | sqrErrors += error * error 126 | i += 1 127 | } 128 | 129 | sqrErrors 130 | } 131 | 132 | /** 133 | * Calculates the gradient of the SSE cost function for our EWMA model 134 | * @return gradient 135 | */ 136 | def gradient(ts: Vector): Double = { 137 | val n = ts.size 138 | // val smoothed = new DenseVector(Array.fill(n)(0.0)) 139 | val smoothed = addTimeDependentEffects(ts) 140 | 141 | var error = 0.0 142 | var prevSmoothed = ts(0) 143 | var prevDSda = 0.0 // derivative of the EWMA function at time t - 1: (d S(t - 1)/ d smoothing) 144 | var dSda = 0.0 // derivative of the EWMA function at time t: (d S(t) / d smoothing) 145 | var dJda = 0.0 // derivative of our SSE cost function 146 | var i = 0 147 | 148 | while (i < n - 1) { 149 | error = ts(i + 1) - smoothed(i) 150 | dSda = ts(i) - prevSmoothed + (1 - smoothing) * prevDSda 151 | dJda += error * dSda 152 | prevDSda = dSda 153 | prevSmoothed = smoothed(i) 154 | i += 1 155 | } 156 | 2 * dJda 157 | } 158 | 159 | def addTimeDependentEffects(ts: Vector): Vector = { 160 | val arr = Array.fill(ts.size)(0.0) 161 | arr(0) = ts(0) // by definition in our model S_0 = X_0 162 | for (i <- 1 until ts.size) { 163 | arr(i) = smoothing * ts(i) + (1 - smoothing) * arr(i - 1) 164 | } 165 | new DenseVector(arr) 166 | } 167 | 168 | 169 | override def copy(extra: ParamMap): EWMAModel = defaultCopy(extra) 170 | 171 | /** 172 | * Transforms the input dataset. 173 | */ 174 | override def transform(dataset: Dataset[_]): DataFrame = { 175 | 176 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 177 | case Row(time: String, value: Double) => (time, value) 178 | }.sortByKey().collect() 179 | .map(x => x._2) 180 | 181 | val dataVector = Vectors.dense(data) 182 | 183 | val res = addTimeDependentEffects(dataVector) 184 | 185 | val resRDD = dataset.sparkSession.sparkContext.parallelize(res.toArray.map(x => Row(x))) 186 | 187 | val structType = transformSchema(dataset.schema) 188 | 189 | dataset.sparkSession.createDataFrame(resRDD, structType) 190 | } 191 | 192 | /** 193 | * Check transform validity and derive the output schema from the input schema. 194 | * 195 | * Typical implementation should first conduct verification on schema change and parameter 196 | * validity, including complex parameter interaction checks. 
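As a quick check of the recursion implemented in addTimeDependentEffects (S_0 = X_0 and S_t = a * X_t + (1 - a) * S_{t-1}), a smoothing value of 0.5 applied to the series (1, 2, 3) yields (1.0, 1.5, 2.25):

```scala
// Illustrative check of the smoothing recursion, computed by hand:
//   S_0 = 1.0
//   S_1 = 0.5 * 2.0 + 0.5 * 1.0 = 1.5
//   S_2 = 0.5 * 3.0 + 0.5 * 1.5 = 2.25
val smoothed = new EWMAModel(0.5).addTimeDependentEffects(Vectors.dense(1.0, 2.0, 3.0))
// smoothed == DenseVector(1.0, 1.5, 2.25)
```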
197 | */ 198 | override def transformSchema(schema: StructType): StructType = { 199 | StructType(Array(StructField("EMA", DoubleType))) 200 | } 201 | } 202 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/models/GARCH.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import io.transwarp.discover.timeseries.params.TimeSeriesParams 4 | import io.transwarp.midas.constant.midas.params.timeseries.{GARCHParams, TimeSeriesParams} 5 | import org.apache.commons.math3.analysis.{MultivariateFunction, MultivariateVectorFunction} 6 | import org.apache.commons.math3.optim.{InitialGuess, MaxEval, MaxIter, SimpleValueChecker} 7 | import org.apache.commons.math3.optim.nonlinear.scalar.{ObjectiveFunction, ObjectiveFunctionGradient} 8 | import org.apache.commons.math3.optim.nonlinear.scalar.gradient.NonLinearConjugateGradientOptimizer 9 | import org.apache.commons.math3.random.RandomGenerator 10 | import org.apache.spark.ml.{Estimator, Model} 11 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 12 | import org.apache.spark.ml.param.{Param, ParamMap} 13 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams 14 | import org.apache.spark.ml.util.Identifiable 15 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 16 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType} 17 | 18 | /** 19 | * Created by endy on 16-12-22. 20 | */ 21 | 22 | trait GARCHParams extends TimeSeriesParams { 23 | final val maxEval = new Param[Int](this, "maxEval", "max eval") 24 | def setMaxEval(value: Int): this.type = set(maxEval, value) 25 | 26 | final val maxIter = new Param[Int](this, "maxIter", "max iteration") 27 | def setMaxIter(value: Int): this.type = set(maxIter, value) 28 | } 29 | 30 | class GARCH(override val uid: String) extends Estimator[GARCHModel] with GARCHParams{ 31 | 32 | setDefault(timeCol -> "time", 33 | timeSeriesCol -> "timeseries", 34 | maxEval -> 10000, 35 | maxIter -> 10000) 36 | 37 | def this() = this(Identifiable.randomUID("GARCH")) 38 | 39 | /** 40 | * Fits a model to the input data. 
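The variance recursion used below is the standard GARCH(1,1) form h_t = omega + alpha * x_{t-1}^2 + beta * h_{t-1}, initialized with h_0 = omega / (1 - alpha - beta). A usage sketch with an assumed DataFrame `df`; maxIter and maxEval default to 10000 in this estimator:

```scala
// Sketch only: fit GARCH(1,1) parameters to a (time, value) DataFrame.
val model = new GARCH()
  .setTimeCol("time")
  .setTimeSeriesCol("timeseries")
  .fit(df)

println(s"omega=${model.omega} alpha=${model.alpha} beta=${model.beta}")
```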
41 | */ 42 | override def fit(dataset: Dataset[_]): GARCHModel = { 43 | 44 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 45 | case Row(time: String, value: Double) => (time, value) 46 | }.sortByKey().collect() 47 | 48 | val dataVector = Vectors.dense(data.map(x => x._2)) 49 | 50 | val optimizer = new NonLinearConjugateGradientOptimizer( 51 | NonLinearConjugateGradientOptimizer.Formula.FLETCHER_REEVES, 52 | new SimpleValueChecker(1e-6, 1e-6)) 53 | 54 | val gradient = new ObjectiveFunctionGradient(new MultivariateVectorFunction() { 55 | def value(params: Array[Double]): Array[Double] = { 56 | new GARCHModel(params(0), params(1), params(2)).gradient(dataVector) 57 | } 58 | }) 59 | val objectiveFunction = new ObjectiveFunction(new MultivariateFunction() { 60 | def value(params: Array[Double]): Double = { 61 | new GARCHModel(params(0), params(1), params(2)).logLikelihood(dataVector) 62 | } 63 | }) 64 | 65 | val initialGuess = new InitialGuess(Array(.2, .2, .2)) // TODO: make this smarter 66 | 67 | val optimal = optimizer.optimize(objectiveFunction, gradient, initialGuess, 68 | new MaxIter(${maxIter}), new MaxEval(${maxEval})) 69 | 70 | val params = optimal.getPoint 71 | new GARCHModel(params(0), params(1), params(2)) 72 | .setTimeCol(${timeCol}).setTimeSeriesCol(${timeSeriesCol}) 73 | 74 | } 75 | 76 | override def copy(extra: ParamMap): Estimator[GARCHModel] = defaultCopy(extra) 77 | 78 | /** 79 | * :: DeveloperApi :: 80 | * 81 | * Check transform validity and derive the output schema from the input schema. 82 | * 83 | * Typical implementation should first conduct verification on schema change and parameter 84 | * validity, including complex parameter interaction checks. 85 | */ 86 | override def transformSchema(schema: StructType): StructType = schema 87 | } 88 | 89 | class GARCHModel(override val uid: String, val omega: Double, val alpha: Double, val beta: Double) 90 | extends Model[GARCHModel] with GARCHParams { 91 | 92 | def this(omega: Double, alpha: Double, beta: Double) = this(Identifiable.randomUID("GARCH"), 93 | omega, alpha, beta) 94 | 95 | override def copy(extra: ParamMap): GARCHModel = defaultCopy(extra) 96 | 97 | /** 98 | * Transforms the input dataset. 99 | */ 100 | override def transform(dataset: Dataset[_]): DataFrame = { 101 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 102 | case Row(time: String, value: Double) => (time, value) 103 | }.sortByKey().collect() 104 | 105 | val dataVector = Vectors.dense(data.map(x => x._2)) 106 | 107 | val dest = addTimeDependentEffects(dataVector) 108 | 109 | val resRDD = dataset.sparkSession.sparkContext.parallelize(dest.toArray.map(x => Row(x))) 110 | 111 | val structType = transformSchema(dataset.schema) 112 | 113 | dataset.sparkSession.createDataFrame(resRDD, structType) 114 | } 115 | 116 | /** 117 | * :: DeveloperApi :: 118 | * 119 | * Check transform validity and derive the output schema from the input schema. 120 | * 121 | * Typical implementation should first conduct verification on schema change and parameter 122 | * validity, including complex parameter interaction checks. 123 | */ 124 | override def transformSchema(schema: StructType): StructType = 125 | StructType(Array(StructField("GARCH", DoubleType))) 126 | /** 127 | * Returns the log likelihood of the parameters on the given time series. 
128 | * 129 | * Based on https://pdfs.semanticscholar.org/7da8/bfa5295375c1141d797e80065a599153c19d.pdf 130 | */ 131 | def logLikelihood(ts: Vector): Double = { 132 | var sum = 0.0 133 | iterateWithHAndEta(ts) { (i, h, eta, prevH, prevEta) => 134 | sum += -.5 * math.log(h) - .5 * eta * eta / h 135 | } 136 | sum + -.5 * math.log(2 * math.Pi) * (ts.size - 1) 137 | } 138 | 139 | private def iterateWithHAndEta(ts: Vector) 140 | (fn: (Int, Double, Double, Double, Double) => Unit): Unit = { 141 | var prevH = omega / (1 - alpha - beta) 142 | var i = 1 143 | while (i < ts.size) { 144 | val h = omega + alpha * ts(i - 1) * ts(i - 1) + beta * prevH 145 | fn(i, h, ts(i), prevH, ts(i - 1)) 146 | prevH = h 147 | i += 1 148 | } 149 | } 150 | 151 | def gradient(ts: Vector): Array[Double] = { 152 | var omegaGradient = 0.0 153 | var alphaGradient = 0.0 154 | var betaGradient = 0.0 155 | var omegaDhdtheta = 0.0 156 | var alphaDhdtheta = 0.0 157 | var betaDhdtheta = 0.0 158 | iterateWithHAndEta(ts) { (i, h, eta, prevH, prevEta) => 159 | omegaDhdtheta = 1 + beta * omegaDhdtheta 160 | alphaDhdtheta = prevEta * prevEta + beta * alphaDhdtheta 161 | betaDhdtheta = prevH + beta * betaDhdtheta 162 | 163 | val multiplier = (eta * eta / (h * h)) - (1 / h) 164 | omegaGradient += multiplier * omegaDhdtheta 165 | alphaGradient += multiplier * alphaDhdtheta 166 | betaGradient += multiplier * betaDhdtheta 167 | } 168 | Array(omegaGradient * .5, alphaGradient * .5, betaGradient * .5) 169 | } 170 | 171 | def addTimeDependentEffects(ts: Vector): Vector = { 172 | 173 | val destArr = new Array[Double](ts.size) 174 | 175 | var prevVariance = omega / (1.0 - alpha - beta) 176 | var prevEta = ts(0) * math.sqrt(prevVariance) 177 | 178 | destArr(0) = prevEta 179 | for (i <- 1 until ts.size) { 180 | val variance = omega + alpha * prevEta * prevEta + beta * prevVariance 181 | val standardizedEta = ts(i) 182 | val eta = standardizedEta * math.sqrt(variance) 183 | destArr(i) = eta 184 | 185 | prevEta = eta 186 | prevVariance = variance 187 | } 188 | new DenseVector(destArr) 189 | } 190 | 191 | private def sampleWithVariances(n: Int, rand: RandomGenerator): (Array[Double], Array[Double]) = { 192 | val ts = new Array[Double](n) 193 | val variances = new Array[Double](n) 194 | variances(0) = omega / (1 - alpha - beta) 195 | var eta = math.sqrt(variances(0)) * rand.nextGaussian() 196 | for (i <- 1 until n) { 197 | variances(i) = omega + beta * variances(i-1) + alpha * eta * eta 198 | eta = math.sqrt(variances(i)) * rand.nextGaussian() 199 | ts(i) = eta 200 | } 201 | 202 | (ts, variances) 203 | } 204 | 205 | /** 206 | * Samples a random time series of a given length with the properties of the model. 207 | * 208 | * @param n The length of the time series to sample. 209 | * @param rand The random generator used to generate the observations. 210 | * @return The samples time series. 
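A reproducible simulated series can be drawn by passing a seeded commons-math3 generator; the parameter values below are arbitrary illustrative choices (with alpha + beta < 1 so the unconditional variance stays finite):

```scala
import org.apache.commons.math3.random.MersenneTwister

// Sketch only: simulate 500 observations from a GARCH(1,1) model.
val model = new GARCHModel(0.1, 0.2, 0.7)   // omega, alpha, beta
val simulated: Array[Double] = model.sample(500, new MersenneTwister(42L))
```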
211 | */ 212 | def sample(n: Int, rand: RandomGenerator): Array[Double] = sampleWithVariances(n, rand)._1 213 | } 214 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/HoltWintersSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.spark.SparkFunSuite 4 | import org.apache.spark.ml.util.DefaultReadWriteTest 5 | import org.apache.spark.mllib.util.MLlibTestSparkContext 6 | import org.apache.spark.mllib.util.TestingUtils._ 7 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 8 | import org.apache.spark.sql.{Dataset, Row} 9 | 10 | /** 11 | * Created by endy on 16-12-21. 12 | */ 13 | class HoltWintersSuite extends SparkFunSuite with MLlibTestSparkContext 14 | with DefaultReadWriteTest { 15 | 16 | @transient var dataSet: Dataset[_] = _ 17 | @transient var dataSet2: Dataset[_] = _ 18 | 19 | val tsAirPassengers = Array( 20 | 112.0, 118.0, 132.0, 129.0, 121.0, 135.0, 148.0, 148.0, 136.0, 119.0, 104.0, 118.0, 115.0, 21 | 126.0, 141.0, 135.0, 125.0, 149.0, 170.0, 170.0, 158.0, 133.0, 114.0, 140.0, 145.0, 150.0, 22 | 178.0, 163.0, 172.0, 178.0, 199.0, 199.0, 184.0, 162.0, 146.0, 166.0, 171.0, 180.0, 193.0, 23 | 181.0, 183.0, 218.0, 230.0, 242.0, 209.0, 191.0, 172.0, 194.0, 196.0, 196.0, 236.0, 235.0, 24 | 229.0, 243.0, 264.0, 272.0, 237.0, 211.0, 180.0, 201.0, 204.0, 188.0, 235.0, 227.0, 234.0, 25 | 264.0, 302.0, 293.0, 259.0, 229.0, 203.0, 229.0, 242.0, 233.0, 267.0, 269.0, 270.0, 315.0, 26 | 364.0, 347.0, 312.0, 274.0, 237.0, 278.0, 284.0, 277.0, 317.0, 313.0, 318.0, 374.0, 413.0, 27 | 405.0, 355.0, 306.0, 271.0, 306.0, 315.0, 301.0, 356.0, 348.0, 355.0, 422.0, 465.0, 467.0, 28 | 404.0, 347.0, 305.0, 336.0, 340.0, 318.0, 362.0, 348.0, 363.0, 435.0, 491.0, 505.0, 404.0, 29 | 359.0, 310.0, 337.0, 360.0, 342.0, 406.0, 396.0, 420.0, 472.0, 548.0, 559.0, 463.0, 407.0, 30 | 362.0, 405.0, 417.0, 391.0, 419.0, 461.0, 472.0, 535.0, 622.0, 606.0, 508.0, 461.0, 390.0, 31 | 432.0) 32 | 33 | val tsCO2 = Array( 34 | 315.42, 316.31, 316.50, 317.56, 318.13, 318.00, 316.39, 314.65, 313.68, 313.18, 314.66, 315.43, 35 | 316.27, 316.81, 317.42, 318.87, 319.87, 319.43, 318.01, 315.74, 314.00, 313.68, 314.84, 316.03, 36 | 316.73, 317.54, 318.38, 319.31, 320.42, 319.61, 318.42, 316.63, 314.83, 315.16, 315.94, 316.85, 37 | 317.78, 318.40, 319.53, 320.42, 320.85, 320.45, 319.45, 317.25, 316.11, 315.27, 316.53, 317.53, 38 | 318.58, 318.92, 319.70, 321.22, 322.08, 321.31, 319.58, 317.61, 316.05, 315.83, 316.91, 318.20, 39 | 319.41, 320.07, 320.74, 321.40, 322.06, 321.73, 320.27, 318.54, 316.54, 316.71, 317.53, 318.55, 40 | 319.27, 320.28, 320.73, 321.97, 322.00, 321.71, 321.05, 318.71, 317.66, 317.14, 318.70, 319.25, 41 | 320.46, 321.43, 322.23, 323.54, 323.91, 323.59, 322.24, 320.20, 318.48, 317.94, 319.63, 320.87, 42 | 322.17, 322.34, 322.88, 324.25, 324.83, 323.93, 322.38, 320.76, 319.10, 319.24, 320.56, 321.80, 43 | 322.40, 322.99, 323.73, 324.86, 325.40, 325.20, 323.98, 321.95, 320.18, 320.09, 321.16, 322.74, 44 | 323.83, 324.26, 325.47, 326.50, 327.21, 326.54, 325.72, 323.50, 322.22, 321.62, 322.69, 323.95, 45 | 324.89, 325.82, 326.77, 327.97, 327.91, 327.50, 326.18, 324.53, 322.93, 322.90, 323.85, 324.96, 46 | 326.01, 326.51, 327.01, 327.62, 328.76, 328.40, 327.20, 325.27, 323.20, 323.40, 324.63, 325.85, 47 | 326.60, 327.47, 327.58, 329.56, 329.90, 328.92, 327.88, 326.16, 324.68, 
325.04, 326.34, 327.39, 48 | 328.37, 329.40, 330.14, 331.33, 332.31, 331.90, 330.70, 329.15, 327.35, 327.02, 327.99, 328.48, 49 | 329.18, 330.55, 331.32, 332.48, 332.92, 332.08, 331.01, 329.23, 327.27, 327.21, 328.29, 329.41, 50 | 330.23, 331.25, 331.87, 333.14, 333.80, 333.43, 331.73, 329.90, 328.40, 328.17, 329.32, 330.59, 51 | 331.58, 332.39, 333.33, 334.41, 334.71, 334.17, 332.89, 330.77, 329.14, 328.78, 330.14, 331.52, 52 | 332.75, 333.24, 334.53, 335.90, 336.57, 336.10, 334.76, 332.59, 331.42, 330.98, 332.24, 333.68, 53 | 334.80, 335.22, 336.47, 337.59, 337.84, 337.72, 336.37, 334.51, 332.60, 332.38, 333.75, 334.78, 54 | 336.05, 336.59, 337.79, 338.71, 339.30, 339.12, 337.56, 335.92, 333.75, 333.70, 335.12, 336.56, 55 | 337.84, 338.19, 339.91, 340.60, 341.29, 341.00, 339.39, 337.43, 335.72, 335.84, 336.93, 338.04, 56 | 339.06, 340.30, 341.21, 342.33, 342.74, 342.08, 340.32, 338.26, 336.52, 336.68, 338.19, 339.44, 57 | 340.57, 341.44, 342.53, 343.39, 343.96, 343.18, 341.88, 339.65, 337.81, 337.69, 339.09, 340.32, 58 | 341.20, 342.35, 342.93, 344.77, 345.58, 345.14, 343.81, 342.21, 339.69, 339.82, 340.98, 342.82, 59 | 343.52, 344.33, 345.11, 346.88, 347.25, 346.62, 345.22, 343.11, 340.90, 341.18, 342.80, 344.04, 60 | 344.79, 345.82, 347.25, 348.17, 348.74, 348.07, 346.38, 344.51, 342.92, 342.62, 344.06, 345.38, 61 | 346.11, 346.78, 347.68, 349.37, 350.03, 349.37, 347.76, 345.73, 344.68, 343.99, 345.48, 346.72, 62 | 347.84, 348.29, 349.23, 350.80, 351.66, 351.07, 349.33, 347.92, 346.27, 346.18, 347.64, 348.78, 63 | 350.25, 351.54, 352.05, 353.41, 354.04, 353.62, 352.22, 350.27, 348.55, 348.72, 349.91, 351.18, 64 | 352.60, 352.92, 353.53, 355.26, 355.52, 354.97, 353.75, 351.52, 349.64, 349.83, 351.14, 352.37, 65 | 353.50, 354.55, 355.23, 356.04, 357.00, 356.07, 354.67, 352.76, 350.82, 351.04, 352.69, 354.07, 66 | 354.59, 355.63, 357.03, 358.48, 359.22, 358.12, 356.06, 353.92, 352.05, 352.11, 353.64, 354.89, 67 | 355.88, 356.63, 357.72, 359.07, 359.58, 359.17, 356.94, 354.92, 352.94, 353.23, 354.09, 355.33, 68 | 356.63, 357.10, 358.32, 359.41, 360.23, 359.55, 357.53, 355.48, 353.67, 353.95, 355.30, 356.78, 69 | 358.34, 358.89, 359.95, 361.25, 361.67, 360.94, 359.55, 357.49, 355.84, 356.00, 357.59, 359.05, 70 | 359.98, 361.03, 361.66, 363.48, 363.82, 363.30, 361.94, 359.50, 358.11, 357.80, 359.61, 360.74, 71 | 362.09, 363.29, 364.06, 364.76, 365.45, 365.01, 363.70, 361.54, 359.51, 359.65, 360.80, 362.38, 72 | 363.23, 364.06, 364.61, 366.40, 366.84, 365.68, 364.52, 362.57, 360.24, 360.83, 362.49, 364.34 73 | ) 74 | 75 | override def beforeAll(): Unit = { 76 | super.beforeAll() 77 | 78 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 79 | DoubleType))) 80 | 81 | var data = tsAirPassengers.zipWithIndex.map(x => (x._2.formatted("%011d"), x._1)) 82 | val rdd = sc.parallelize(data.map(x => Row(x._1, x._2))) 83 | dataSet = spark.createDataFrame(rdd, schema) 84 | 85 | data = tsCO2.zipWithIndex.map(x => (x._2.formatted("%011d"), x._1)) 86 | val rdd2 = sc.parallelize(data.map(x => Row(x._1, x._2))) 87 | dataSet2 = spark.createDataFrame(rdd2, schema) 88 | } 89 | 90 | test("Optimal Paramaters alpha beta gamma - Additive Model") { 91 | val model = new HoltWinters() 92 | .setTimeCol("time") 93 | .setTimeSeriesCol("timeseries") 94 | .setModelType("additive") 95 | .setPeriod(12) 96 | .setMaxIter(30000) 97 | .setMaxEval(30000) 98 | .fit(dataSet) 99 | 100 | assert(model.alpha ~== 0.24796 absTol 0.01 ) 101 | assert(model.beta ~== 0.03453 absTol 0.01 ) 102 | 
assert(model.gamma ~== 1.0 absTol 0.01 ) 103 | } 104 | 105 | test("Forecast - Additive Model") { 106 | val model = new HoltWinters() 107 | .setTimeCol("time") 108 | .setTimeSeriesCol("timeseries") 109 | .setModelType("additive") 110 | .setPeriod(12) 111 | .setMaxIter(30000) 112 | .setMaxEval(30000) 113 | .fit(dataSet) 114 | 115 | val forecasted = model.transform(dataSet).collect().map{ 116 | case Row(x: Double) => x 117 | } 118 | 119 | val actualForecasted = new Array[Double](12) 120 | actualForecasted(0) = 453.4977 121 | actualForecasted(1) = 429.3906 122 | actualForecasted(2) = 467.0361 123 | actualForecasted(3) = 503.2574 124 | actualForecasted(4) = 512.3395 125 | actualForecasted(5) = 571.8880 126 | actualForecasted(6) = 652.6095 127 | actualForecasted(7) = 637.4623 128 | actualForecasted(8) = 539.7548 129 | actualForecasted(9) = 490.7250 130 | actualForecasted(10) = 424.4593 131 | actualForecasted(11) = 469.5315 132 | 133 | for (i <- 0 until 12) { 134 | assert(forecasted(i) ~== actualForecasted(i) absTol 10) 135 | } 136 | } 137 | 138 | 139 | test("Optimal Paramaters alpha beta gamma - Multiplicative Model") { 140 | val model = new HoltWinters() 141 | .setTimeCol("time") 142 | .setTimeSeriesCol("timeseries") 143 | .setModelType("multiplicative") 144 | .setPeriod(12) 145 | .setMaxIter(30000) 146 | .setMaxEval(30000) 147 | .fit(dataSet2) 148 | 149 | assert(model.alpha ~== 0.51265 absTol 0.01 ) 150 | assert(model.beta ~== 0.00949 absTol 0.01 ) 151 | assert(model.gamma ~== 0.47289 absTol 0.1 ) 152 | } 153 | 154 | test("Forecast - Multiplicative Model") { 155 | val model = new HoltWinters() 156 | .setTimeCol("time") 157 | .setTimeSeriesCol("timeseries") 158 | .setModelType("multiplicative") 159 | .setPeriod(12) 160 | .setMaxIter(30000) 161 | .setMaxEval(30000) 162 | .fit(dataSet2) 163 | 164 | val forecasted = model.transform(dataSet2).collect().map{ 165 | case Row(x: Double) => x 166 | } 167 | 168 | val actualForecasted = new Array[Double](12) 169 | actualForecasted(0) = 365.1079 170 | actualForecasted(1) = 365.9664 171 | actualForecasted(2) = 366.7343 172 | actualForecasted(3) = 368.1364 173 | actualForecasted(4) = 368.6674 174 | actualForecasted(5) = 367.9508 175 | actualForecasted(6) = 366.5318 176 | actualForecasted(7) = 364.3799 177 | actualForecasted(8) = 362.4731 178 | actualForecasted(9) = 362.7520 179 | actualForecasted(10) = 364.2203 180 | actualForecasted(11) = 365.6741 181 | 182 | for (i <- 0 until 12) { 183 | assert(forecasted(i) ~== actualForecasted(i) absTol 10) 184 | } 185 | } 186 | } 187 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/ml/timeseries/models/ARIMASuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | import org.apache.commons.math3.random.{MersenneTwister, RandomGenerator} 4 | import org.apache.spark.SparkFunSuite 5 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 6 | import org.apache.spark.ml.timeseries.UnivariateTimeSeries 7 | import org.apache.spark.ml.util.DefaultReadWriteTest 8 | import org.apache.spark.mllib.util.MLlibTestSparkContext 9 | import org.apache.spark.mllib.util.TestingUtils._ 10 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 11 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 12 | 13 | 14 | /** 15 | * Created by endy on 16-12-20. 
16 | */ 17 | class ARIMASuite extends SparkFunSuite with MLlibTestSparkContext 18 | with DefaultReadWriteTest { 19 | 20 | @transient var dataSet: Dataset[_] = _ 21 | test("compare with R") { 22 | // > R.Version()$version.string 23 | // [1] "R version 3.2.0 (2015-04-16)" 24 | // > set.seed(456) 25 | // y <- arima.sim(n=250,list(ar=0.3,ma=0.7),mean = 5) 26 | // write.table(y, file = "resources/R_ARIMA_DataSet1.csv", row.names = FALSE, col.names = FALSE) 27 | val dataFile = getClass.getResource("/timeseries/R_ARIMA_DataSet1.csv").toString 28 | 29 | val rawData = sc.textFile(dataFile).map(line => line.toDouble) 30 | .collect().zipWithIndex 31 | 32 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 33 | DoubleType))) 34 | 35 | val rdd = sc.parallelize(rawData.map(x => Row(x._2.formatted("%05d"), x._1))) 36 | val dataset = spark.createDataFrame(rdd, schema) 37 | 38 | val model = new ARIMA() 39 | .setP(1) 40 | .setD(0) 41 | .setQ(1) 42 | .setTimeCol("time") 43 | .setTimeSeriesCol("timeseries") 44 | .fit(dataset) 45 | 46 | val Array(c, ar, ma) = model.coefficient 47 | assert(ar ~== 0.3 absTol 0.05) 48 | assert(ma ~== 0.7 absTol 0.05) 49 | } 50 | 51 | test("Data sampled from a given model should result in similar model if fit") { 52 | val rand = new MersenneTwister(10L) 53 | val model = new ARIMAModel(2, 1, 2, Array(8.2, 0.2, 0.5, 0.3, 0.1)) 54 | val (_, sampled) = sample(1000, rand, model) 55 | 56 | val newModel = new ARIMA() 57 | .setP(2) 58 | .setD(1) 59 | .setQ(2) 60 | .setTimeCol("time") 61 | .setTimeSeriesCol("timeseries") 62 | .fit(sampled) 63 | 64 | val Array(c, ar1, ar2, ma1, ma2) = model.coefficient 65 | val Array(cTest, ar1Test, ar2Test, ma1Test, ma2Test) = newModel.coefficient 66 | 67 | // intercept is given more leeway 68 | assert(c ~== cTest absTol 1) 69 | assert(ar1Test ~== ar1 absTol 0.1) 70 | assert(ma1Test ~== ma1 absTol 0.1) 71 | assert(ar2Test ~== ar2 absTol 0.1) 72 | assert(ma2Test ~== ma2 absTol 0.1) 73 | } 74 | 75 | test("Fitting CSS with BOBYQA and conjugate gradient descent should be fairly similar") { 76 | val rand = new MersenneTwister(10L) 77 | val model = new ARIMAModel(2, 1, 2, Array(8.2, 0.2, 0.5, 0.3, 0.1)) 78 | val (_, sampled) = sample(1000, rand, model) 79 | 80 | val fitWithBOBYQA = new ARIMA() 81 | .setP(2) 82 | .setD(1) 83 | .setQ(2) 84 | .setTimeCol("time") 85 | .setTimeSeriesCol("timeseries") 86 | .setMethod("css-bobyqa") 87 | .fit(sampled) 88 | 89 | val fitWithCGD = new ARIMA() 90 | .setP(2) 91 | .setD(1) 92 | .setQ(2) 93 | .setTimeCol("time") 94 | .setTimeSeriesCol("timeseries") 95 | .setMethod("css-cgd") 96 | .fit(sampled) 97 | 98 | val Array(c, ar1, ar2, ma1, ma2) = fitWithBOBYQA.coefficient 99 | val Array(cCGD, ar1CGD, ar2CGD, ma1CGD, ma2CGD) = fitWithCGD.coefficient 100 | 101 | // give more leeway for intercept 102 | assert(cCGD ~== c absTol 1) 103 | assert(ar1CGD ~== ar1 absTol 0.1) 104 | assert(ar2CGD ~== ar2 absTol 0.1) 105 | assert(ma1CGD ~== ma1 absTol 0.1) 106 | assert(ma2CGD ~== ma2 absTol 0.1) 107 | } 108 | 109 | test("Fitting ARIMA(p, d, q) should be the same as fitting a d-order differenced ARMA(p, q)") { 110 | val rand = new MersenneTwister(10L) 111 | val model = new ARIMAModel(1, 1, 2, Array(0.3, 0.7, 0.1), hasIntercept = false) 112 | val (vec, sampled) = sample(1000, rand, model) 113 | 114 | val arimaModel = new ARIMA() 115 | .setP(1) 116 | .setD(1) 117 | .setQ(2) 118 | .setTimeCol("time") 119 | .setTimeSeriesCol("timeseries") 120 | .setIncludeIntercept(false) 121 | .fit(sampled) 122 | 123 | 124 | val 
differenceSample = UnivariateTimeSeries.differencesOfOrderD(vec, 1).toArray.drop(1) 125 | 126 | val dataFrame = genDf(differenceSample) 127 | 128 | val armaModel = new ARIMA() 129 | .setP(1) 130 | .setD(0) 131 | .setQ(2) 132 | .setTimeCol("time") 133 | .setTimeSeriesCol("timeseries") 134 | .setIncludeIntercept(false) 135 | .fit(dataFrame) 136 | 137 | val Array(refAR, refMA1, refMA2) = model.coefficient 138 | val Array(iAR, iMA1, iMA2) = arimaModel.coefficient 139 | val Array(ar, ma1, ma2) = armaModel.coefficient 140 | 141 | // ARIMA model should match parameters used to sample, to some extent 142 | assert(iAR ~== refAR absTol 0.05) 143 | assert(iMA1 ~== refMA1 absTol 0.05) 144 | assert(iMA2 ~== refMA2 absTol 0.05) 145 | 146 | // ARMA model parameters of differenced sample should be equal to ARIMA model parameters 147 | assert(ar == iAR) 148 | assert(ma1 == iMA1) 149 | assert(ma2 == iMA2) 150 | } 151 | 152 | test("Fitting ARIMA(0, 0, 0) with intercept term results in model with average as parameter") { 153 | val rand = new MersenneTwister(10L) 154 | val (vec, sampled) = sample(100, rand) 155 | 156 | val model = new ARIMA() 157 | .setP(0) 158 | .setD(0) 159 | .setQ(0) 160 | .setTimeCol("time") 161 | .setTimeSeriesCol("timeseries") 162 | .fit(sampled) 163 | 164 | val mean = vec.toArray.sum / vec.size 165 | 166 | assert(model.coefficient(0) ~== mean absTol 1e-4) 167 | } 168 | 169 | test("Fitting ARIMA(0, 0, 0) with intercept term results in model with average as the forecast") { 170 | val rand = new MersenneTwister(10L) 171 | val (vec, sampled) = sample(100, rand) 172 | val model = new ARIMA() 173 | .setP(0) 174 | .setD(0) 175 | .setQ(0) 176 | .setTimeCol("time") 177 | .setTimeSeriesCol("timeseries") 178 | .fit(sampled) 179 | 180 | val mean = vec.toArray.sum / vec.size 181 | 182 | assert(model.coefficient(0) ~== mean absTol 1e-4) 183 | val forecast = model 184 | .setNFuture(10).transform(sampled).collect() 185 | .map{case Row(s: Double) => s} 186 | 187 | for(i <- 100 until 110) { 188 | assert(forecast(i) ~== mean absTol 1e-4) 189 | } 190 | } 191 | 192 | test("Fitting an integrated time series of order 3") { 193 | // > set.seed(10) 194 | // > vals <- arima.sim(list(ma = c(0.2), order = c(0, 3, 1)), 200) 195 | // > arima(order = c(0, 3, 1), vals, method = "CSS") 196 | // 197 | // Call: 198 | // arima(x = vals, order = c(0, 3, 1), method = "CSS") 199 | // 200 | // Coefficients: 201 | // ma1 202 | // 0.2523 203 | // s.e. 0.0623 204 | // 205 | // sigma^2 estimated as 0.9218: part log likelihood = -275.65 206 | // > write.table(y, file = "resources/R_ARIMA_DataSet2.csv", row.names = FALSE, col.names = 207 | // FALSE) 208 | val dataFile = getClass.getResource("/timeseries/R_ARIMA_DataSet2.csv").toString 209 | val rawData = sc.textFile(dataFile).map(line => line.toDouble) 210 | .collect().zipWithIndex 211 | 212 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 213 | DoubleType))) 214 | 215 | val rdd = sc.parallelize(rawData.map(x => Row(x._2.formatted("%05d"), x._1))) 216 | val dataset = spark.createDataFrame(rdd, schema) 217 | val model = new ARIMA() 218 | .setP(0) 219 | .setD(3) 220 | .setQ(1) 221 | .setTimeCol("time") 222 | .setTimeSeriesCol("timeseries") 223 | .fit(dataset) 224 | 225 | val Array(c, ma) = model.coefficient 226 | assert(ma ~== 0.2 absTol 0.05) 227 | } 228 | /** 229 | * Sample a series of size n assuming an ARIMA(p, d, q) process. 
230 | * 231 | * @param n size of sample 232 | * @return series reflecting ARIMA(p, d, q) process 233 | */ 234 | def sample(n: Int, rand: RandomGenerator, model: ARIMAModel): (Vector, DataFrame) = { 235 | val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian)) 236 | val res = model.addTimeDependentEffects(vec, vec).toArray 237 | 238 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 239 | DoubleType))) 240 | 241 | val rdd = sc.parallelize(res.zipWithIndex.map(x => Row(x._2.formatted("%05d"), x._1))) 242 | 243 | (Vectors.dense(res), spark.createDataFrame(rdd, schema)) 244 | } 245 | 246 | /** 247 | * Sample a series of size n assuming an ARIMA(p, d, q) process. 248 | * 249 | * @param n size of sample 250 | * @return series reflecting ARIMA(p, d, q) process 251 | */ 252 | def sample(n: Int, rand: RandomGenerator): (Vector, DataFrame) = { 253 | val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian)).toArray 254 | 255 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 256 | DoubleType))) 257 | 258 | val rdd = sc.parallelize(vec.zipWithIndex.map(x => Row(x._2.formatted("%05d"), x._1))) 259 | 260 | (Vectors.dense(vec), spark.createDataFrame(rdd, schema)) 261 | } 262 | 263 | def genDf(array: Array[Double]): DataFrame = { 264 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", 265 | DoubleType))) 266 | 267 | val rdd = spark.sparkContext.parallelize( 268 | array.zipWithIndex.map(x => Row(x._2.formatted("%010d"), x._1))) 269 | 270 | spark.createDataFrame(rdd, schema) 271 | } 272 | 273 | } 274 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/dbscan/DBSCAN.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.dbscan 2 | 3 | import org.apache.spark.ml.dbscan.DBSCANLabeledPoint.Flag 4 | import org.apache.spark.internal.Logging 5 | import org.apache.spark.ml.linalg.Vector 6 | import org.apache.spark.rdd.RDD 7 | 8 | /** 9 | * Top level method for calling DBSCAN 10 | */ 11 | object DBSCAN { 12 | 13 | /** 14 | * Train a DBSCAN Model using the given set of parameters 15 | * 16 | * @param data training points stored as `RDD[Vector]` 17 | * only the first two points of the vector are taken into consideration 18 | * @param eps the maximum distance between two points for them to be considered as part 19 | * of the same region 20 | * @param minPoints the minimum number of points required to form a dense region 21 | * @param maxPointsPerPartition the largest number of points in a single partition 22 | */ 23 | def train( 24 | data: RDD[Vector], 25 | eps: Double, 26 | minPoints: Int, 27 | maxPointsPerPartition: Int): DBSCAN = { 28 | 29 | new DBSCAN(eps, minPoints, maxPointsPerPartition, null, null).train(data) 30 | 31 | } 32 | 33 | } 34 | 35 | /** 36 | * A parallel implementation of DBSCAN clustering. The implementation will split the data space 37 | * into a number of partitions, making a best effort to keep the number of points in each 38 | * partition under `maxPointsPerPartition`. After partitioning, traditional DBSCAN 39 | * clustering will be run in parallel for each partition and finally the results 40 | * of each partition will be merged to identify global clusters. 41 | * 42 | * This is an iterative algorithm that will make multiple passes over the data, 43 | * any given RDDs should be cached by the user. 
44 | */ 45 | class DBSCAN private ( val eps: Double, 46 | val minPoints: Int, 47 | val maxPointsPerPartition: Int, 48 | @transient val partitions: List[(Int, DBSCANRectangle)], 49 | @transient private val labeledPartitionedPoints: 50 | RDD[(Int, DBSCANLabeledPoint)]) 51 | 52 | extends Serializable with Logging { 53 | 54 | type Margins = (DBSCANRectangle, DBSCANRectangle, DBSCANRectangle) 55 | type ClusterId = (Int, Int) 56 | 57 | def minimumRectangleSize: Double = 2 * eps 58 | 59 | def labeledPoints: RDD[DBSCANLabeledPoint] = { 60 | labeledPartitionedPoints.values 61 | } 62 | 63 | private def train(vectors: RDD[Vector]): DBSCAN = { 64 | // generate the smallest rectangles that split the space 65 | // and count how many points are contained in each one of them 66 | val minimumRectanglesWithCount = 67 | vectors 68 | .map(toMinimumBoundingRectangle) 69 | .map((_, 1)) 70 | .aggregateByKey(0)(_ + _, _ + _) 71 | .collect() 72 | .toSet 73 | 74 | // find the best partitions for the data space 75 | val localPartitions = EvenSplitPartitioner 76 | .partition(minimumRectanglesWithCount, maxPointsPerPartition, minimumRectangleSize) 77 | 78 | logDebug("Found partitions: ") 79 | localPartitions.foreach(p => logDebug(p.toString)) 80 | 81 | // grow partitions to include eps 82 | val localMargins = 83 | localPartitions 84 | .map({ case (p, _) => (p.shrink(eps), p, p.shrink(-eps)) }) 85 | .zipWithIndex 86 | 87 | val margins = vectors.context.broadcast(localMargins) 88 | 89 | // assign each point to its proper partition 90 | val duplicated = for { 91 | point <- vectors.map(DBSCANPoint) 92 | ((inner, main, outer), id) <- margins.value 93 | if outer.contains(point) 94 | } yield (id, point) 95 | 96 | val numOfPartitions = localPartitions.size 97 | 98 | // perform local dbscan 99 | val clustered = 100 | duplicated 101 | .groupByKey(numOfPartitions) 102 | .flatMapValues(points => 103 | new LocalDBSCANNaive(eps, minPoints).fit(points)) 104 | .cache() 105 | 106 | // find all candidate points for merging clusters and group them 107 | val mergePoints = 108 | clustered 109 | .flatMap({ 110 | case (partition, point) => 111 | margins.value 112 | .filter({ 113 | case ((inner, main, _), _) => main.contains(point) && !inner.almostContains(point) 114 | }) 115 | .map({ 116 | case (_, newPartition) => (newPartition, (partition, point)) 117 | }) 118 | }) 119 | .groupByKey() 120 | 121 | logDebug("About to find adjacencies") 122 | // find all clusters with aliases from merging candidates 123 | val adjacencies = 124 | mergePoints 125 | .flatMapValues(findAdjacencies) 126 | .values 127 | .collect() 128 | 129 | // generated adjacency graph 130 | val adjacencyGraph = adjacencies.foldLeft(DBSCANGraph[ClusterId]()) { 131 | case (graph, (from, to)) => graph.connect(from, to) 132 | } 133 | 134 | logDebug("About to find all cluster ids") 135 | // find all cluster ids 136 | val localClusterIds = 137 | clustered 138 | .filter({ case (_, point) => point.flag != Flag.Noise }) 139 | .mapValues(_.cluster) 140 | .distinct() 141 | .collect() 142 | .toList 143 | 144 | // assign a global Id to all clusters, where connected clusters get the same id 145 | val (total, clusterIdToGlobalId) = localClusterIds.foldLeft((0, Map[ClusterId, Int]())) { 146 | case ((id, map), clusterId) => { 147 | 148 | map.get(clusterId) match { 149 | case None => { 150 | val nextId = id + 1 151 | val connectedClusters = adjacencyGraph.getConnected(clusterId) + clusterId 152 | logDebug(s"Connected clusters $connectedClusters") 153 | val toadd = connectedClusters.map((_, 
nextId)).toMap 154 | (nextId, map ++ toadd) 155 | } 156 | case Some(x) => 157 | (id, map) 158 | } 159 | 160 | } 161 | } 162 | 163 | logDebug("Global Clusters") 164 | clusterIdToGlobalId.foreach(e => logDebug(e.toString)) 165 | logInfo(s"Total Clusters: ${localClusterIds.size}, Unique: $total") 166 | 167 | val clusterIds = vectors.context.broadcast(clusterIdToGlobalId) 168 | 169 | logDebug("About to relabel inner points") 170 | // relabel non-duplicated points 171 | val labeledInner = 172 | clustered 173 | .filter(isInnerPoint(_, margins.value)) 174 | .map { 175 | case (partition, point) => { 176 | 177 | if (point.flag != Flag.Noise) { 178 | point.cluster = clusterIds.value((partition, point.cluster)) 179 | } 180 | 181 | (partition, point) 182 | } 183 | } 184 | 185 | logDebug("About to relabel outer points") 186 | // de-duplicate and label merge points 187 | val labeledOuter = 188 | mergePoints.flatMapValues(partition => { 189 | partition.foldLeft(Map[DBSCANPoint, DBSCANLabeledPoint]())({ 190 | case (all, (partition, point)) => 191 | 192 | if (point.flag != Flag.Noise) { 193 | point.cluster = clusterIds.value((partition, point.cluster)) 194 | } 195 | 196 | all.get(point) match { 197 | case None => all + (point -> point) 198 | case Some(prev) => { 199 | // override previous entry unless new entry is noise 200 | if (point.flag != Flag.Noise) { 201 | prev.flag = point.flag 202 | prev.cluster = point.cluster 203 | } 204 | all 205 | } 206 | } 207 | 208 | }).values 209 | }) 210 | 211 | val finalPartitions = localMargins.map { 212 | case ((_, p, _), index) => (index, p) 213 | } 214 | logDebug("Done") 215 | new DBSCAN( 216 | eps, 217 | minPoints, 218 | maxPointsPerPartition, 219 | finalPartitions, 220 | labeledInner.union(labeledOuter)) 221 | 222 | } 223 | 224 | /** 225 | * Find the appropriate label to the given `vector` 226 | * 227 | * This method is not yet implemented 228 | */ 229 | def predict(vector: Vector): Double = { 230 | var centerid = 0 231 | partitions.foreach{x => 232 | if (x._2.contains(DBSCANPoint(vector))){ 233 | centerid = x._1 234 | } 235 | } 236 | centerid.toDouble 237 | } 238 | 239 | private def isInnerPoint( 240 | entry: (Int, DBSCANLabeledPoint), 241 | margins: List[(Margins, Int)]): Boolean = { 242 | entry match { 243 | case (partition, point) => 244 | val ((inner, _, _), _) = margins.filter({ 245 | case (_, id) => id == partition 246 | }).head 247 | 248 | inner.almostContains(point) 249 | } 250 | } 251 | 252 | private def findAdjacencies(partition: Iterable[(Int, DBSCANLabeledPoint)]): 253 | Set[((Int, Int), (Int, Int))] = { 254 | 255 | val zero = (Map[DBSCANPoint, ClusterId](), Set[(ClusterId, ClusterId)]()) 256 | 257 | val (seen, adjacencies) = partition.foldLeft(zero)({ 258 | case ((seen, adjacencies), (partition, point)) => 259 | // noise points are not relevant for adjacencies 260 | if (point.flag == Flag.Noise) { 261 | (seen, adjacencies) 262 | } else { 263 | val clusterId = (partition, point.cluster) 264 | seen.get(point) match { 265 | case None => (seen + (point -> clusterId), adjacencies) 266 | case Some(prevClusterId) => (seen, adjacencies + ((prevClusterId, clusterId))) 267 | } 268 | 269 | } 270 | }) 271 | 272 | adjacencies 273 | } 274 | 275 | private def toMinimumBoundingRectangle(vector: Vector): DBSCANRectangle = { 276 | val point = DBSCANPoint(vector) 277 | val x = corner(point.x) 278 | val y = corner(point.y) 279 | DBSCANRectangle(x, y, x + minimumRectangleSize, y + minimumRectangleSize) 280 | } 281 | 282 | private def corner(p: Double): Double = 283 | 
(shiftIfNegative(p) / minimumRectangleSize).intValue * minimumRectangleSize 284 | 285 | private def shiftIfNegative(p: Double): Double = 286 | if (p < 0) p - minimumRectangleSize else p 287 | 288 | } 289 | 290 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/models/HoltWinters.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries.models 2 | 3 | 4 | 5 | import org.apache.commons.math3.analysis.MultivariateFunction 6 | import org.apache.commons.math3.optim.{InitialGuess, MaxEval, MaxIter, SimpleBounds} 7 | import org.apache.commons.math3.optim.nonlinear.scalar.{GoalType, ObjectiveFunction} 8 | import org.apache.commons.math3.optim.nonlinear.scalar.noderiv.BOBYQAOptimizer 9 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} 10 | import org.apache.spark.ml.{Estimator, Model} 11 | import org.apache.spark.ml.param.{Param, ParamMap} 12 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams 13 | import org.apache.spark.ml.util.Identifiable 14 | import org.apache.spark.sql.{DataFrame, Dataset, Row} 15 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType} 16 | 17 | /** 18 | * Triple exponential smoothing takes into account seasonal changes as well as trends. 19 | * Seasonality is defined to be the tendency of time-series data to exhibit behavior that repeats 20 | * itself every L periods, much like any harmonic function. 21 | * 22 | * The Holt-Winters method is a popular and effective approach to forecasting seasonal time series. 23 | * 24 | * See https://en.wikipedia.org/wiki/Exponential_smoothing#Triple_exponential_smoothing 25 | * for more information on triple exponential smoothing. 26 | * See https://www.otexts.org/fpp/7/5 and 27 | * https://stat.ethz.ch/R-manual/R-devel/library/stats/html/HoltWinters.html 28 | * for more information on the Holt-Winters method. 29 | */ 30 | 31 | trait HoltWintersParams extends TimeSeriesParams { 32 | final val maxEval = new Param[Int](this, "maxEval", "maximum number of function evaluations allowed to the optimizer") 33 | def setMaxEval(value: Int): this.type = set(maxEval, value) 34 | 35 | final val maxIter = new Param[Int](this, "maxIter", "maximum number of optimizer iterations") 36 | def setMaxIter(value: Int): this.type = set(maxIter, value) 37 | 38 | final val period = new Param[Int](this, "period", "seasonality of the data, i.e. the number of observations per seasonal cycle") 39 | def setPeriod(value: Int): this.type = set(period, value) 40 | 41 | final val modelType = new Param[String](this, "modelType", "The two variations " + 42 | "differ in the nature of the seasonal component. The additive method is preferred when seasonal " + 43 | "variations are roughly constant through the series; the multiplicative method is preferred when " + 44 | "the seasonal variations change proportionally to the level of the series") 45 | def setModelType(value: String): this.type = set(modelType, value) 46 | } 47 | 48 | class HoltWinters(override val uid: String) extends Estimator[HoltWintersModel] with 49 | HoltWintersParams { 50 | 51 | setDefault(timeCol -> "time", 52 | timeSeriesCol -> "timeseries", 53 | maxEval -> 10000, 54 | maxIter -> 10000) 55 | 56 | def this() = this(Identifiable.randomUID("HoltWinters")) 57 | /** 58 | * Fits a model to the input data.
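 *
 * A minimal usage sketch, assuming a DataFrame `trainDF` (hypothetical name) that has the
 * default columns configured above: a string "time" column and a double "timeseries" column.
 * The period and model type have no default and must be set; the values shown are only
 * illustrative:
 * {{{
 *   val hw = new HoltWinters()
 *     .setPeriod(12)
 *     .setModelType("additive")
 *   val model    = hw.fit(trainDF)
 *   val forecast = model.transform(trainDF)   // one seasonal period of forecast values
 * }}}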
59 | */ 60 | override def fit(dataset: Dataset[_]): HoltWintersModel = { 61 | 62 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 63 | case Row(time: String, value: Double) => (time, value) 64 | }.sortByKey().collect() 65 | 66 | val dataVector = Vectors.dense(data.map(x => x._2)) 67 | val optimizer = new BOBYQAOptimizer(7) 68 | 69 | val objectiveFunction = new ObjectiveFunction(new MultivariateFunction() { 70 | def value(params: Array[Double]): Double = { 71 | new HoltWintersModel(params(0), params(1), params(2)) 72 | .setModelType(${modelType}) 73 | .setPeriod(${period}) 74 | .sse(dataVector) 75 | } 76 | }) 77 | 78 | // The starting guesses in R's stats:HoltWinters 79 | val initGuess = new InitialGuess(Array(0.3, 0.1, 0.1)) 80 | val goal = GoalType.MINIMIZE 81 | val bounds = new SimpleBounds(Array(0.0, 0.0, 0.0), Array(1.0, 1.0, 1.0)) 82 | val optimal = optimizer.optimize(objectiveFunction, goal, bounds, initGuess, 83 | new MaxIter(${maxIter}), new MaxEval(${maxEval})) 84 | val params = optimal.getPoint 85 | new HoltWintersModel(params(0), params(1), params(2)) 86 | .setModelType(${modelType}) 87 | .setPeriod (${period}) 88 | .setTimeCol(${timeCol}) 89 | .setTimeSeriesCol(${timeSeriesCol}) 90 | } 91 | 92 | override def copy(extra: ParamMap): Estimator[HoltWintersModel] = defaultCopy(extra) 93 | 94 | /** 95 | * :: DeveloperApi :: 96 | * 97 | * Check transform validity and derive the output schema from the input schema. 98 | * 99 | * Typical implementation should first conduct verification on schema change and parameter 100 | * validity, including complex parameter interaction checks. 101 | */ 102 | override def transformSchema(schema: StructType): StructType = schema 103 | } 104 | 105 | class HoltWintersModel(override val uid: String, 106 | val alpha: Double, val beta: Double, val gamma: Double) 107 | extends Model[HoltWintersModel] with HoltWintersParams { 108 | 109 | def this(alpha: Double, beta: Double, gamma: Double) = this(Identifiable.randomUID 110 | ("HoltWintersModel"), alpha, beta, gamma) 111 | 112 | override def copy(extra: ParamMap): HoltWintersModel = defaultCopy(extra) 113 | 114 | /** 115 | * Transforms the input dataset. 116 | */ 117 | override def transform(dataset: Dataset[_]): DataFrame = { 118 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map { 119 | case Row(time: String, value: Double) => (time, value) 120 | }.sortByKey().collect() 121 | 122 | val dataVector = Vectors.dense(data.map(x => x._2)) 123 | 124 | val destArr = new Array[Double](${period}) 125 | val (_, level, trend, season) = getHoltWintersComponents(dataVector) 126 | val n = dataVector.size 127 | 128 | val finalLevel = level(n - ${period}) 129 | val finalTrend = trend(n - ${period}) 130 | val finalSeason = new Array[Double](${period}) 131 | 132 | for (i <- 0 until ${period}) { 133 | finalSeason(i) = season(i + n - ${period}) 134 | } 135 | 136 | for (i <- 0 until ${period}) { 137 | destArr(i) = if (${modelType}.equalsIgnoreCase("additive")) { 138 | (finalLevel + (i + 1) * finalTrend) + finalSeason(i % ${period}) 139 | } else { 140 | (finalLevel + (i + 1) * finalTrend) * finalSeason(i % ${period}) 141 | } 142 | } 143 | 144 | val resRDD = dataset.sparkSession.sparkContext.parallelize(destArr.map(x => Row(x))) 145 | 146 | val structType = transformSchema(dataset.schema) 147 | 148 | dataset.sparkSession.createDataFrame(resRDD, structType) 149 | } 150 | 151 | /** 152 | * :: DeveloperApi :: 153 | * 154 | * Check transform validity and derive the output schema from the input schema. 
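 * For this model the output schema is a single double column named "HoltWinters" holding the
 * `period` forecast values produced by `transform`.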
155 | * 156 | * Typical implementation should first conduct verification on schema change and parameter 157 | * validity, including complex parameter interaction checks. 158 | */ 159 | override def transformSchema(schema: StructType): StructType = { 160 | StructType(Array(StructField("HoltWinters", DoubleType))) 161 | } 162 | 163 | /** 164 | * Calculates sum of squared errors, used to estimate the alpha and beta parameters 165 | * 166 | * @param ts A time series for which we want to calculate the SSE, given the current parameters 167 | * @return SSE 168 | */ 169 | def sse(ts: Vector): Double = { 170 | val n = ts.size 171 | val smoothed = addTimeDependentEffects(ts) 172 | 173 | var error = 0.0 174 | var sqrErrors = 0.0 175 | 176 | // We predict only from period by using the first period - 1 elements. 177 | for(i <- ${period} until n) { 178 | error = ts(i) - smoothed(i) 179 | sqrErrors += error * error 180 | } 181 | 182 | sqrErrors 183 | } 184 | 185 | def addTimeDependentEffects(ts: Vector): Vector = { 186 | val destArr = Array.fill(ts.size)(0.0) 187 | val fitted = getHoltWintersComponents(ts)._1 188 | for (i <- 0 until ts.size) { 189 | destArr(i) = fitted(i) 190 | } 191 | Vectors.dense(destArr) 192 | } 193 | 194 | /** 195 | * Start from the intial parameters and then iterate to find the final parameters 196 | * using the equations of HoltWinter Method. 197 | * See https://www.otexts.org/fpp/7/5 and 198 | * https://stat.ethz.ch/R-manual/R-devel/library/stats/html/HoltWinters.html 199 | * for more information on Holt Winter Method equations. 200 | * 201 | * @param ts A time series for which we want the HoltWinter parameters level,trend and season. 202 | * @return (level trend season). Final vectors of level trend and season are returned. 203 | */ 204 | def getHoltWintersComponents(ts: Vector): (Vector, Vector, Vector, Vector) = { 205 | val n = ts.size 206 | require(n >= 2, "Requires length of at least 2") 207 | 208 | val dest = new Array[Double](n) 209 | 210 | val level = new Array[Double](n) 211 | val trend = new Array[Double](n) 212 | val season = new Array[Double](n) 213 | 214 | val (initLevel, initTrend, initSeason) = initHoltWinters(ts) 215 | level(0) = initLevel 216 | trend(0) = initTrend 217 | for (i <- 0 until initSeason.size){ 218 | season(i) = initSeason(i) 219 | } 220 | 221 | for (i <- 0 until (n - ${period})) { 222 | dest(i + ${period}) = level(i) + trend(i) 223 | 224 | // Add the seasonal factor for additive and multiply for multiplicative model. 
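// The fitted value and the state updates below follow the standard Holt-Winters recurrences.
// Additive form shown; in the multiplicative form each subtraction becomes a division and the
// seasonal factor multiplies the fitted value:
//   fitted(i+period) = level(i) + trend(i) + season(i)
//   level(i+1)       = alpha * (y(i+period) - season(i))  + (1 - alpha) * (level(i) + trend(i))
//   trend(i+1)       = beta  * (level(i+1) - level(i))    + (1 - beta)  * trend(i)
//   season(i+period) = gamma * (y(i+period) - level(i+1)) + (1 - gamma) * season(i)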
225 | if (${modelType}.equalsIgnoreCase("additive")) { 226 | dest(i + ${period}) += season(i) 227 | } else { 228 | dest(i + ${period}) *= season(i) 229 | } 230 | 231 | val levelWeight = if (${modelType}.equalsIgnoreCase("additive")) { 232 | ts(i + ${period}) - season(i) 233 | } else { 234 | ts(i + ${period}) / season(i) 235 | } 236 | 237 | level(i + 1) = alpha * levelWeight + (1 - alpha) * (level(i) + trend(i)) 238 | 239 | trend(i + 1) = beta * (level(i + 1) - level(i)) + (1 - beta) * trend(i) 240 | 241 | val seasonWeight = if (${modelType}.equalsIgnoreCase("additive")) { 242 | ts(i + ${period}) - level(i + 1) 243 | } else { 244 | ts(i + ${period}) / level(i + 1) 245 | } 246 | season(i + ${period}) = gamma * seasonWeight + (1 - gamma) * season(i) 247 | } 248 | 249 | (Vectors.dense(dest), Vectors.dense(level), Vectors.dense(trend), Vectors.dense(season)) 250 | } 251 | 252 | def getKernel: (Array[Double]) = { 253 | if (${period} % 2 == 0){ 254 | val kernel = Array.fill(${period} + 1)(1.0 / ${period}) 255 | kernel(0) = 0.5 / ${period} 256 | kernel(${period}) = 0.5 / ${period} 257 | kernel 258 | } else { 259 | Array.fill(${period})(1.0 / ${period}) 260 | } 261 | } 262 | 263 | /** 264 | * Function to calculate the Weighted moving average/convolution using above kernel/weights 265 | * for input data. 266 | * See http://robjhyndman.com/papers/movingaverage.pdf for more information 267 | * @param inData Series on which you want to do moving average 268 | * @param kernel Weight vector for weighted moving average 269 | */ 270 | def convolve(inData: Array[Double], kernel: Array[Double]): (Array[Double]) = { 271 | val kernelSize = kernel.length 272 | val dataSize = inData.length 273 | 274 | val outData = new Array[Double](dataSize - kernelSize + 1) 275 | 276 | var end = 0 277 | while (end <= (dataSize - kernelSize)) { 278 | var sum = 0.0 279 | for (i <- 0 until kernelSize) { 280 | sum += kernel(i) * inData(end + i) 281 | } 282 | outData(end) = sum 283 | end += 1 284 | } 285 | outData 286 | } 287 | 288 | /** 289 | * Function to get the initial level, trend and season using method suggested in 290 | * http://robjhyndman.com/hyndsight/hw-initialization/ 291 | * @param ts 292 | */ 293 | def initHoltWinters(ts: Vector): (Double, Double, Array[Double]) = { 294 | val arrTs = ts.toArray 295 | 296 | // Decompose a window of time series into level trend and seasonal using convolution 297 | val kernel = getKernel 298 | val kernelSize = kernel.size 299 | val trend = convolve(arrTs.take(${period} * 2), kernel) 300 | 301 | // Remove the trend from time series. Subtract for additive and divide for multiplicative 302 | val n = (kernelSize -1) / 2 303 | val removeTrend = arrTs.take(${period} * 2).zip( 304 | Array.fill(n)(0.0) ++ trend ++ Array.fill(n)(0.0)).map{ 305 | case (a, t) => 306 | if (t != 0){ 307 | if (${modelType}.equalsIgnoreCase("additive")) { 308 | a - t 309 | } else { 310 | a / t 311 | } 312 | } else { 313 | 0 314 | } 315 | } 316 | 317 | // seasonal mean is sum of mean of all season values of that period 318 | val seasonalMean = removeTrend.splitAt(${period}).zipped.map { case (prevx, x) => 319 | if (prevx == 0 || x == 0) x + prevx else (x + prevx) / 2 320 | } 321 | 322 | val meanOfFigures = seasonalMean.sum / ${period} 323 | 324 | // The seasonal mean is then centered and removed to get season. 325 | // Subtract for additive and divide for multiplicative. 
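// Centering makes the initial seasonal factors sum to zero in the additive case and average
// to one in the multiplicative case.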
326 | val initSeason = if (${modelType}.equalsIgnoreCase("additive")) { 327 | seasonalMean.map(_ - meanOfFigures ) 328 | } else { 329 | seasonalMean.map(_ / meanOfFigures ) 330 | } 331 | 332 | // Do Simple Linear Regression to find the initial level and trend 333 | val indices = 1 to trend.length 334 | val xbar = (indices.sum: Double) / indices.size 335 | val ybar = trend.sum / trend.length 336 | 337 | val xxbar = indices.map( x => (x - xbar) * (x - xbar) ).sum 338 | val xybar = indices.zip(trend).map { 339 | case (x, y) => (x - xbar) * (y - ybar) 340 | }.sum 341 | 342 | val initTrend = xybar / xxbar 343 | val initLevel = ybar - (initTrend * xbar) 344 | 345 | (initLevel, initTrend, initSeason) 346 | } 347 | } 348 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/ml/timeseries/UnivariateTimeSeries.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.ml.timeseries 2 | 3 | import java.util.Arrays 4 | 5 | import breeze.stats._ 6 | import org.apache.commons.math3.analysis.interpolation.SplineInterpolator 7 | import org.apache.spark.ml.linalg.{DenseVector, Matrix, Vector, Vectors} 8 | 9 | /** 10 | * Created by endy on 16-12-20. 11 | */ 12 | object UnivariateTimeSeries { 13 | 14 | /** 15 | * Lags the univariate time series 16 | * 17 | * Example input vector: (1.0, 2.0, 3.0, 4.0, 5.0) 18 | * 19 | * With lag 2 and includeOriginal = true should give output matrix: 20 | * 21 | * 3.0 2.0 1.0 22 | * 4.0 3.0 2.0 23 | * 5.0 4.0 3.0 24 | */ 25 | def lag(ts: Vector, maxLag: Int, includeOriginal: Boolean): Matrix = { 26 | Lag.lagMatTrimBoth(ts, maxLag, includeOriginal) 27 | } 28 | 29 | def autocorr(ts: Array[Double], numLags: Int): Array[Double] = { 30 | autocorr(new DenseVector(ts), numLags).toArray 31 | } 32 | 33 | /** 34 | * Computes the sample autocorrelation of the given series. 35 | */ 36 | def autocorr(ts: Vector, numLags: Int): Vector = { 37 | val corrs = new Array[Double](numLags) 38 | var i = 1 39 | val breezeTs = MatrixUtil.toBreeze(ts) 40 | while (i <= numLags) { 41 | val slice1 = breezeTs(i until ts.size) 42 | val slice2 = breezeTs(0 until ts.size - i) 43 | val mean1 = mean(slice1) 44 | val mean2 = mean(slice2) 45 | var variance1 = 0.0 46 | var variance2 = 0.0 47 | var covariance = 0.0 48 | var j = 0 49 | while (j < ts.size - i) { 50 | val diff1 = slice1(j) - mean1 51 | val diff2 = slice2(j) - mean2 52 | variance1 += diff1 * diff1 53 | variance2 += diff2 * diff2 54 | covariance += diff1 * diff2 55 | j += 1 56 | } 57 | 58 | corrs(i - 1) = covariance / (math.sqrt(variance1) * math.sqrt(variance2)) 59 | i += 1 60 | } 61 | new DenseVector(corrs) 62 | } 63 | 64 | def quotients(ts: Vector, lag: Int): Vector = { 65 | val ret = new Array[Double](ts.size - lag) 66 | var i = 0 67 | while (i < ret.length) { 68 | ret(i) = ts(i + lag) / ts(i) 69 | i += 1 70 | } 71 | new DenseVector(ret) 72 | } 73 | 74 | def price2ret(ts: Vector, lag: Int): Vector = { 75 | val ret = new Array[Double](ts.size - lag) 76 | var i = 0 77 | while (i < ret.length) { 78 | ret(i) = ts(i + lag) / ts(i) - 1.0 79 | i += 1 80 | } 81 | new DenseVector(ret) 82 | } 83 | 84 | /** 85 | * Trim leading NaNs from a series. 86 | */ 87 | def trimLeading(ts: Vector): Vector = { 88 | val start = firstNotNaN(ts) 89 | if (start < ts.size) { 90 | Vectors.dense(Arrays.copyOfRange(ts.toArray, start, ts.size)) 91 | } else { 92 | Vectors.zeros(0) 93 | } 94 | } 95 | 96 | /** 97 | * Trim trailing NaNs from a series. 
98 | */ 99 | def trimTrailing(ts: Vector): Vector = { 100 | val end = lastNotNaN(ts) 101 | if (end > 0) { 102 | Vectors.dense(Arrays.copyOfRange(ts.toArray, 0, end)) 103 | } else { 104 | Vectors.zeros(0) 105 | } 106 | } 107 | 108 | def firstNotNaN(ts: Vector): Int = { 109 | var i = 0 110 | while (i < ts.size) { 111 | if (!java.lang.Double.isNaN(ts(i))) { 112 | return i 113 | } 114 | i += 1 115 | } 116 | i 117 | } 118 | 119 | def lastNotNaN(ts: Vector): Int = { 120 | var i = ts.size - 1 121 | while (i >= 0) { 122 | if (!java.lang.Double.isNaN(ts(i))) { 123 | return i 124 | } 125 | i -= 1 126 | } 127 | i 128 | } 129 | 130 | def fillts(ts: Vector, fillMethod: String): Vector = { 131 | fillMethod match { 132 | case "linear" => fillLinear(ts) 133 | case "nearest" => fillNearest(ts) 134 | case "next" => fillNext(ts) 135 | case "previous" => fillPrevious(ts) 136 | case "spline" => fillSpline(ts) 137 | case "zero" => fillValue(ts, 0) 138 | case _ => throw new UnsupportedOperationException() 139 | } 140 | } 141 | 142 | /** 143 | * Replace all NaNs with a specific value 144 | */ 145 | def fillValue(values: Array[Double], filler: Double): Array[Double] = { 146 | fillValue(new DenseVector(values), filler).toArray 147 | } 148 | 149 | /** 150 | * Replace all NaNs with a specific value 151 | */ 152 | def fillValue(values: Vector, filler: Double): DenseVector = { 153 | val result = values.copy.toArray 154 | var i = 0 155 | while (i < result.size) { 156 | if (result(i).isNaN) result(i) = filler 157 | i += 1 158 | } 159 | new DenseVector(result) 160 | } 161 | 162 | def fillNearest(values: Array[Double]): Array[Double] = { 163 | fillNearest(new DenseVector(values)).toArray 164 | } 165 | 166 | def fillNearest(values: Vector): DenseVector = { 167 | val result = values.copy.toArray 168 | var lastExisting = -1 169 | var nextExisting = -1 170 | var i = 1 171 | while (i < result.length) { 172 | if (result(i).isNaN) { 173 | if (nextExisting < i) { 174 | nextExisting = i + 1 175 | while (nextExisting < result.length && result(nextExisting).isNaN) { 176 | nextExisting += 1 177 | } 178 | } 179 | 180 | if (lastExisting < 0 && nextExisting >= result.size) { 181 | throw new IllegalArgumentException("Input is all NaNs!") 182 | } else if (nextExisting >= result.size || // TODO: check this 183 | (lastExisting >= 0 && i - lastExisting < nextExisting - i)) { 184 | result(i) = result(lastExisting) 185 | } else { 186 | result(i) = result(nextExisting) 187 | } 188 | } else { 189 | lastExisting = i 190 | } 191 | i += 1 192 | } 193 | new DenseVector(result) 194 | } 195 | 196 | def fillPrevious(values: Array[Double]): Array[Double] = { 197 | fillPrevious(new DenseVector(values)).toArray 198 | } 199 | 200 | /** 201 | * fills in NaN with the previously available not NaN, scanning from left to right. 202 | * 1 NaN NaN 2 Nan -> 1 1 1 2 2 203 | */ 204 | def fillPrevious(values: Vector): DenseVector = { 205 | val result = values.copy.toArray 206 | var filler = Double.NaN // initial value, maintains invariant 207 | var i = 0 208 | while (i < result.length) { 209 | filler = if (result(i).isNaN) filler else result(i) 210 | result(i) = filler 211 | i += 1 212 | } 213 | new DenseVector(result) 214 | } 215 | 216 | def fillNext(values: Array[Double]): Array[Double] = { 217 | fillNext(new DenseVector(values)).toArray 218 | } 219 | 220 | /** 221 | * fills in NaN with the next available not NaN, scanning from right to left. 
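 * Trailing NaNs that have no later non-NaN value are left unchanged, as in the example below: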
222 | * 1 NaN NaN 2 Nan -> 1 2 2 2 NaN 223 | */ 224 | def fillNext(values: Vector): DenseVector = { 225 | val result = values.copy.toArray 226 | var filler = Double.NaN // initial value, maintains invariant 227 | var i = result.length - 1 228 | while (i >= 0) { 229 | filler = if (result(i).isNaN) filler else result(i) 230 | result(i) = filler 231 | i -= 1 232 | } 233 | new DenseVector(result) 234 | } 235 | 236 | def fillWithDefault(values: Array[Double], filler: Double): Array[Double] = { 237 | fillWithDefault(new DenseVector(values), filler).toArray 238 | } 239 | 240 | /** 241 | * fills in NaN with a default value 242 | */ 243 | def fillWithDefault(values: Vector, filler: Double): DenseVector = { 244 | val result = values.copy.toArray 245 | var i = 0 246 | while (i < result.length) { 247 | result(i) = if (result(i).isNaN) filler else result(i) 248 | i += 1 249 | } 250 | new DenseVector(result) 251 | } 252 | 253 | def fillLinear(values: Array[Double]): Array[Double] = { 254 | fillLinear(new DenseVector(values)).toArray 255 | } 256 | 257 | def fillLinear(values: Vector): DenseVector = { 258 | val result = values.copy.toArray 259 | var i = 1 260 | while (i < result.length - 1) { 261 | val rangeStart = i 262 | while (i < result.length - 1 && result(i).isNaN) { 263 | i += 1 264 | } 265 | val before = result(rangeStart - 1) 266 | val after = result(i) 267 | if (i != rangeStart && !before.isNaN && !after.isNaN) { 268 | val increment = (after - before) / (i - (rangeStart - 1)) 269 | for (j <- rangeStart until i) { 270 | result(j) = result(j - 1) + increment 271 | } 272 | } 273 | i += 1 274 | } 275 | new DenseVector(result) 276 | } 277 | 278 | def fillSpline(values: Array[Double]): Array[Double] = { 279 | fillSpline(new DenseVector(values)).toArray 280 | } 281 | 282 | /** 283 | * Fill in NaN values using a natural cubic spline. 
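 * Only NaNs lying between the first and the last non-NaN value are interpolated; leading and
 * trailing NaNs are left unchanged and should be handled with one of the other fill methods.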
284 | * @param values Vector to interpolate 285 | * @return Interpolated vector 286 | */ 287 | def fillSpline(values: Vector): DenseVector = { 288 | val result = values.copy.toArray 289 | val interp = new SplineInterpolator() 290 | val knotsAndValues = values.toArray.zipWithIndex.filter(!_._1.isNaN) 291 | // Note that the type of unzip is missed up in scala 10.4 as per 292 | // https://issues.scala-lang.org/browse/SI-8081 293 | // given that this project is using scala 10.4, we cannot use unzip, so unpack manually 294 | val knotsX = knotsAndValues.map(_._2.toDouble) 295 | val knotsY = knotsAndValues.map(_._1) 296 | val filler = interp.interpolate(knotsX, knotsY) 297 | 298 | // values that we can interpolate between, others need to be filled w/ other function 299 | var i = knotsX(0).toInt 300 | val end = knotsX.last.toInt 301 | 302 | while (i < end) { 303 | result(i) = filler.value(i.toDouble) 304 | i += 1 305 | } 306 | new DenseVector(result) 307 | } 308 | 309 | 310 | /** 311 | * Down sample by taking every nth element starting from offset phase 312 | * @param values Vector to down sample 313 | * @param n take every nth element 314 | * @param phase offset from starting index 315 | * @return downsampled vector with appropriate length 316 | */ 317 | def downsample(values: Vector, n: Int, phase: Int = 0): DenseVector = { 318 | val origLen = values.size 319 | val newLen = Math.ceil((values.size - phase) / n.toDouble).toInt 320 | val sampledValues = Array.fill(newLen)(0.0) 321 | var i = phase 322 | var j = 0 323 | 324 | while (j < newLen) { 325 | sampledValues(j) = values(i) 326 | i += n 327 | j += 1 328 | } 329 | new DenseVector(sampledValues) 330 | } 331 | 332 | /** 333 | * Up sample by inserting n - 1 elements into the original values vector, starting at index phase 334 | * @param values the original data vector 335 | * @param n the number of insertions between elements 336 | * @param phase the offset to begin 337 | * @param useZero fill with zeros rather than NaN 338 | * @return upsampled vector filled with zeros or NaN, as specified by user 339 | */ 340 | def upsample(values: Vector, n: Int, phase: Int = 0, useZero: Boolean = false): DenseVector = { 341 | val filler = if (useZero) 0 else Double.NaN 342 | val origLen = values.size 343 | val newLen = origLen * n 344 | val sampledValues = Array.fill(newLen)(filler) 345 | var i = phase 346 | var j = 0 347 | 348 | while (j < origLen) { 349 | sampledValues(i) = values(j) 350 | i += n 351 | j += 1 352 | } 353 | new DenseVector(sampledValues) 354 | } 355 | 356 | /** 357 | * Difference a vector with respect to the m-th prior element. Size-preserving by leaving first 358 | * `m` elements intact. This is the inverse of the `inverseDifferences` function. 359 | * @param ts Series to difference 360 | * @param destTs Series to store the differenced values (and return for convenience) 361 | * @param lag The difference lag (e.g. x means destTs(i) = ts(i) - ts(i - x), etc) 362 | * @param startIndex the starting index for the differencing. 
Must be at least equal to lag 363 | * @return the differenced vector, for convenience 364 | */ 365 | def differencesAtLag(ts: Vector, destTs: Vector, lag: Int, startIndex: Int): Vector = { 366 | require(startIndex >= lag, "starting index cannot be less than lag") 367 | val diffedTs = if (destTs == null) ts.copy else destTs 368 | if (lag == 0) { 369 | diffedTs 370 | } else { 371 | val arr = diffedTs.toArray 372 | val n = ts.size 373 | var i = 0 374 | 375 | while (i < n) { 376 | // elements prior to starting point are copied over without modification 377 | arr(i) = if (i < startIndex) ts(i) else ts(i) - ts(i - lag) 378 | i += 1 379 | } 380 | diffedTs 381 | } 382 | } 383 | 384 | /** 385 | * Convenience wrapper around `differencesAtLag[Vector[Double], Vector[Double], Int, Int]` 386 | * @param ts vector to difference 387 | * @param lag the difference lag (e.g. x means destTs(i) = ts(i) - ts(i - x), etc) 388 | * @return the differenced vector, for convenience 389 | */ 390 | def differencesAtLag(ts: Vector, lag: Int): Vector = { 391 | differencesAtLag(ts, null, lag, lag) 392 | } 393 | 394 | /** 395 | * Calculate an "inverse-differenced" vector of a given lag. Size-preserving by leaving first 396 | * `startIndex` elements intact. This is the inverse of the `differences` function. 397 | * @param diffedTs differenced vector that we want to inverse 398 | * @param destTs Series to store the added up values (and return for convenience) 399 | * @param lag The difference lag (e.g. x means destTs(i) = diffedTs(i) + destTs(i - x), etc) 400 | * @param startIndex the starting index for the differencing. Must be at least equal to lag 401 | * @return the inverse differenced vector, for convenience 402 | */ 403 | def inverseDifferencesAtLag(diffedTs: Vector, destTs: Vector, lag: Int, 404 | startIndex: Int): Vector = { 405 | require(startIndex >= lag, "starting index cannot be less than lag") 406 | val addedTs = if (destTs == null) diffedTs.copy else destTs 407 | if (lag == 0) { 408 | addedTs 409 | } else { 410 | val n = diffedTs.size 411 | var i = 0 412 | 413 | val arr = addedTs.toArray 414 | while (i < n) { 415 | // elements prior to starting point are copied over without modification 416 | arr(i) = if (i < startIndex) diffedTs(i) else diffedTs(i) + addedTs(i - lag) 417 | i += 1 418 | } 419 | addedTs 420 | } 421 | } 422 | 423 | /** 424 | * Convenience wrapper around `inverseDifferencesAtLag[Vector[Double], Vector[Double], Int, Int]` 425 | * @param diffedTs differenced vector that we want to inverse 426 | * @param lag the difference lag (e.g. x means destTs(i) = ts(i) - ts(i - x), etc) 427 | * @return the inverse differenced vector, for convenience 428 | */ 429 | def inverseDifferencesAtLag(diffedTs: Vector, lag: Int): Vector = { 430 | inverseDifferencesAtLag(diffedTs, null, lag, lag) 431 | } 432 | 433 | /** 434 | * Performs differencing of order `d`. This means we recursively difference a vector a total of 435 | * d-times. So that d = 2 is a vector of the differences of differences. Note that for each 436 | * difference level, d_i, the element at ts(d_i - 1) corresponds to the value in the prior 437 | * iteration. 
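 *
 * For example, differencing `Vectors.dense(1.0, 4.0, 9.0, 16.0)` to order `d = 2` yields
 * `(1.0, 3.0, 2.0, 2.0)`: element 0 keeps the original value, element 1 holds the first-order
 * difference, and the remaining elements hold the second-order differences.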
438 | * @param ts time series to difference 439 | * @param d order of differencing 440 | * @return a vector of the same length differenced to order d 441 | */ 442 | def differencesOfOrderD(ts: Vector, d: Int): Vector = { 443 | // we create 2 copies to avoid copying with every call, and simply swap them as necessary 444 | // for higher order differencing 445 | var (diffedTs, origTs) = (ts.copy, ts.copy) 446 | var swap: Vector = null 447 | for (i <- 1 to d) { 448 | swap = origTs 449 | origTs = diffedTs 450 | diffedTs = swap 451 | differencesAtLag(origTs, diffedTs, 1, i) 452 | } 453 | diffedTs 454 | } 455 | 456 | /** 457 | * Inverses differencing of order `d`. 458 | * @param diffedTs time series to reverse differencing process 459 | * @param d order of differencing 460 | * @return a vector of the same length, which when differenced to order ts, yields the original 461 | * vector provided 462 | */ 463 | def inverseDifferencesOfOrderD(diffedTs: Vector, d: Int): Vector = { 464 | val addedTs = diffedTs.copy 465 | for (i <- d to 1 by -1) { 466 | inverseDifferencesAtLag(addedTs, addedTs, 1, i) 467 | } 468 | addedTs 469 | } 470 | 471 | def rollSum(ts: Vector, n: Int): Vector = { 472 | new DenseVector(ts.toArray.sliding(n).toList.map(_.sum).toIndexedSeq.toArray[Double]) 473 | } 474 | } 475 | --------------------------------------------------------------------------------
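Taken together, the helpers in UnivariateTimeSeries cover the usual preprocessing chain for the time-series models in this package: fill gaps, difference to the order a model requires, then invert the differencing to bring results back to the original scale. A minimal sketch using only the methods defined in this object; the sample values are illustrative:

    import org.apache.spark.ml.linalg.Vectors
    import org.apache.spark.ml.timeseries.UnivariateTimeSeries

    val raw      = Vectors.dense(1.0, Double.NaN, 3.0, 6.0, 10.0)
    val filled   = UnivariateTimeSeries.fillts(raw, "linear")                  // (1.0, 2.0, 3.0, 6.0, 10.0)
    val diffed   = UnivariateTimeSeries.differencesOfOrderD(filled, 1)         // (1.0, 1.0, 1.0, 3.0, 4.0)
    val restored = UnivariateTimeSeries.inverseDifferencesOfOrderD(diffed, 1)  // recovers the filled series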