├── .idea
│   └── vcs.xml
├── src
│   ├── main
│   │   └── scala
│   │       └── org
│   │           └── apache
│   │               └── spark
│   │                   └── ml
│   │                       ├── util
│   │                       │   ├── Utils.scala
│   │                       │   ├── XORShiftRandom.scala
│   │                       │   ├── DBHPartitioner.scala
│   │                       │   ├── SparkUtils.scala
│   │                       │   └── LoaderUtils.scala
│   │                       ├── dbscan
│   │                       │   ├── DBSCANPoint.scala
│   │                       │   ├── DBSCANLabeledPoint.scala
│   │                       │   ├── DBSCANRectangle.scala
│   │                       │   ├── DBSCANGraph.scala
│   │                       │   ├── LocalDBSCANArchery.scala
│   │                       │   ├── LocalDBSCANNaive.scala
│   │                       │   ├── DBSCAN2.scala
│   │                       │   ├── EvenSplitPartitioner.scala
│   │                       │   └── DBSCAN.scala
│   │                       ├── tsne
│   │                       │   ├── TSNEParam.scala
│   │                       │   ├── TSNEHelper.scala
│   │                       │   ├── impl
│   │                       │   │   ├── SimpleTSNE.scala
│   │                       │   │   ├── BHTSNE.scala
│   │                       │   │   └── LBFGSTSNE.scala
│   │                       │   ├── tree
│   │                       │   │   └── SPTree.scala
│   │                       │   ├── X2P.scala
│   │                       │   └── TSNEGradient.scala
│   │                       ├── timeseries
│   │                       │   ├── params
│   │                       │   │   └── TimeSeriesParams.scala
│   │                       │   ├── MatrixUtil.scala
│   │                       │   ├── Lag.scala
│   │                       │   ├── models
│   │                       │   │   ├── Autoregression.scala
│   │                       │   │   ├── ARGARCH.scala
│   │                       │   │   ├── EWMA.scala
│   │                       │   │   ├── GARCH.scala
│   │                       │   │   └── HoltWinters.scala
│   │                       │   └── UnivariateTimeSeries.scala
│   │                       ├── knn
│   │                       │   ├── Distance.scala
│   │                       │   └── KNNClassifier.scala
│   │                       ├── sampling
│   │                       │   ├── UnderSampling.scala
│   │                       │   └── OverSampling.scala
│   │                       ├── fm
│   │                       │   ├── FMModel.scala
│   │                       │   └── BSFMModel.scala
│   │                       └── mvm
│   │                           └── MVMModel.scala
│   └── test
│       └── scala
│           └── org
│               └── apache
│                   └── spark
│                       └── ml
│                           ├── timeseries
│                           │   ├── MatrixUtilSuite.scala
│                           │   ├── models
│                           │   │   ├── GARCHSuite.scala
│                           │   │   ├── EWMASuite.scala
│                           │   │   ├── AutoregressionSuite.scala
│                           │   │   ├── ARGARCHSuite.scala
│                           │   │   ├── HoltWintersSuite.scala
│                           │   │   └── ARIMASuite.scala
│                           │   └── UnivariateTimeSeriesSuite.scala
│                           └── knn_is
│                               └── KNN_ISSuite.scala
├── README.md
└── pom.xml
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/util/Utils.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.util
2 |
3 | import java.util.Random
4 |
5 | object Utils {
6 | val random = new Random()
7 | def log1pExp(x: Double): Double = {
8 | if (x > 0) {
9 | x + math.log1p(math.exp(-x))
10 | } else {
11 | math.log1p(math.exp(x))
12 | }
13 | }
14 | }
--------------------------------------------------------------------------------
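`log1pExp` above is the numerically stable form of log(1 + e^x): for large positive x it uses the identity log(1 + e^x) = x + log(1 + e^-x), so the intermediate exp never overflows. A minimal usage sketch (values are illustrative):

    import org.apache.spark.ml.util.Utils

    Utils.log1pExp(3.0)     // ~3.0486, same as math.log1p(math.exp(3.0))
    Utils.log1pExp(1000.0)  // 1000.0, while math.log1p(math.exp(1000.0)) overflows to Infinity
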
/src/main/scala/org/apache/spark/ml/dbscan/DBSCANPoint.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.dbscan
2 |
3 | import org.apache.spark.ml.linalg.Vector
4 |
5 | case class DBSCANPoint(vector: Vector) {
6 |
7 | def x: Double = vector(0)
8 | def y: Double = vector(1)
9 |
10 | def distanceSquared(other: DBSCANPoint): Double = {
11 | val dx = other.x - x
12 | val dy = other.y - y
13 | (dx * dx) + (dy * dy)
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/tsne/TSNEParam.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.tsne
2 |
3 | case class TSNEParam(
4 | early_exaggeration: Int = 100,
5 | exaggeration_factor: Double = 4.0,
6 | t_momentum: Int = 25,
7 | initial_momentum: Double = 0.5,
8 | final_momentum: Double = 0.8,
9 | eta: Double = 500.0,
10 | min_gain: Double = 0.01
11 | )
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/timeseries/params/TimeSeriesParams.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.params
2 |
3 | import org.apache.spark.ml.param.{Param, Params}
4 |
5 | /**
6 | * Created by endy on 16-12-22.
7 | */
8 | trait TimeSeriesParams extends Params {
9 | final val timeCol = new Param[String](this, "timeCol",
10 | "The column that stored time value")
11 | def setTimeCol(value: String): this.type = set(timeCol, value)
12 |
13 | final val timeSeriesCol = new Param[String](this, "timeSeriesCol",
14 | "The column that stored time series value")
15 | def setTimeSeriesCol(value: String): this.type = set(timeSeriesCol, value)
16 | }
17 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/dbscan/DBSCANLabeledPoint.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.dbscan
2 |
3 | import org.apache.spark.ml.linalg.Vector
4 |
5 | /**
6 | * Companion constants for labeled points
7 | */
8 | object DBSCANLabeledPoint {
9 |
10 | val Unknown = 0
11 |
12 | object Flag extends Enumeration {
13 | type Flag = Value
14 | val Border, Core, Noise, NotFlagged = Value
15 | }
16 |
17 | }
18 |
19 | class DBSCANLabeledPoint(vector: Vector) extends DBSCANPoint(vector) {
20 |
21 | def this(point: DBSCANPoint) = this(point.vector)
22 |
23 | var flag = DBSCANLabeledPoint.Flag.NotFlagged
24 | var cluster = DBSCANLabeledPoint.Unknown
25 | var visited = false
26 |
27 | override def toString(): String = {
28 | s"$vector,$cluster,$flag"
29 | }
30 |
31 | }
32 |
33 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Distributed Algorithms On Spark
2 |
3 | This project implements some popular algorithms on Spark. You can read the referenced papers to see their details.
4 |
5 | Currently it supports the following algorithms, and more will be added in the future.
6 |
7 | - Distributed KNN
8 | - Down Sampling
9 | - Over Sampling
10 | - Affinity Propagation
11 | - Distributed t-SNE
12 | - Factorization Machines
13 | - Multi-view Machines
14 | - Block Structures Factorization Machines
15 | - Timeseries models
16 | - DBSCAN
17 |
18 |
19 | This project supports Spark 2.x.
20 |
21 | ## References
22 |
23 | - https://github.com/viirya/SparkAffinityPropagation
24 | - https://github.com/saurfang/spark-tsne
25 | - https://github.com/cloudml/zen
26 | - https://github.com/sryza/spark-timeseries
27 | - https://github.com/irvingc/dbscan-on-spark
28 | - http://mlwiki.org/index.php/Metric_Trees
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/ml/timeseries/MatrixUtilSuite.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries
2 |
3 | import org.apache.spark.SparkFunSuite
4 | import org.apache.spark.ml.linalg.{Matrices, Vectors}
5 | import org.apache.spark.ml.util.DefaultReadWriteTest
6 | import org.apache.spark.mllib.util.MLlibTestSparkContext
7 |
8 | /**
9 | * Created by endy on 16-12-21.
10 | */
11 | class MatrixUtilSuite extends SparkFunSuite with MLlibTestSparkContext
12 | with DefaultReadWriteTest {
13 | test("modifying toBreeze version modifies original tensor") {
14 | val vec = Vectors.dense(1.0, 2.0, 3.0)
15 | val breezeVec = MatrixUtil.toBreeze(vec)
16 | breezeVec(1) = 4.0
17 | assert(vec(1) == 4.0)
18 |
19 | val mat = Matrices.zeros(3, 4)
20 | val breezeMat = MatrixUtil.toBreeze(mat)
21 | breezeMat(0, 1) = 2.0
22 | assert(mat(0, 1) == 2.0)
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/dbscan/DBSCANRectangle.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.dbscan
2 |
3 | /**
4 | * A rectangle with a lower-left corner of (x, y) and an upper-right corner of (x2, y2)
5 | */
6 | case class DBSCANRectangle(x: Double, y: Double, x2: Double, y2: Double) {
7 |
8 | /**
9 | * Returns whether other is contained by this box
10 | */
11 | def contains(other: DBSCANRectangle): Boolean = {
12 | x <= other.x && other.x2 <= x2 && y <= other.y && other.y2 <= y2
13 | }
14 |
15 | /**
16 | * Returns whether point is contained by this box
17 | */
18 | def contains(point: DBSCANPoint): Boolean = {
19 | x <= point.x && point.x <= x2 && y <= point.y && point.y <= y2
20 | }
21 |
22 | /**
23 | * Returns a new box from shrinking this box by the given amount
24 | */
25 | def shrink(amount: Double): DBSCANRectangle = {
26 | DBSCANRectangle(x + amount, y + amount, x2 - amount, y2 - amount)
27 | }
28 |
29 | /**
30 | * Returns whether the rectangle contains the point and the point
31 | * is not on the rectangle's border
32 | */
33 | def almostContains(point: DBSCANPoint): Boolean = {
34 | x < point.x && point.x < x2 && y < point.y && point.y < y2
35 | }
36 |
37 | }
38 |
--------------------------------------------------------------------------------
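To illustrate the containment semantics above: `contains` accepts points on the border, while `almostContains` requires a strictly interior point. A small sketch with arbitrary values:

    import org.apache.spark.ml.dbscan.{DBSCANPoint, DBSCANRectangle}
    import org.apache.spark.ml.linalg.Vectors

    val box = DBSCANRectangle(0.0, 0.0, 2.0, 2.0)
    val onBorder = DBSCANPoint(Vectors.dense(2.0, 1.0))

    box.contains(onBorder)        // true: border points count
    box.almostContains(onBorder)  // false: strict inequalities exclude the border
    box.shrink(0.5)               // DBSCANRectangle(0.5, 0.5, 1.5, 1.5)
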
/src/main/scala/org/apache/spark/ml/knn/Distance.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.classification
2 |
3 | import org.apache.spark.ml.linalg.Vector
4 |
5 | object Distance extends Enumeration {
6 |
7 | val Euclidean, Manhattan = Value
8 |
9 | /**
10 | * Computes the (Manhattan or Euclidean) distance between instance x and instance y.
11 | * The type of the distance used is determined by the value of distanceType.
12 | *
13 | * @param x instance x
14 | * @param y instance y
15 | * @param distanceType type of the distance used (Distance.Euclidean or Distance.Manhattan)
16 | * @return Distance
17 | */
18 | def apply(x: Vector, y: Vector, distanceType: Distance.Value): Double = {
19 | distanceType match {
20 | case Euclidean => euclidean(x, y)
21 | case Manhattan => manhattan(x, y)
22 | case _ => euclidean(x, y)
23 | }
24 | }
25 |
26 | /**
27 | * Computes the Euclidean distance between instance x and instance y.
28 | * (used when distanceType is Distance.Euclidean or unrecognized)
29 | *
30 | * @param x instance x
31 | * @param y instance y
32 | * @return Euclidean distance
33 | */
34 | private def euclidean(x: Vector, y: Vector): Double = {
35 | var sum = 0.0
36 | val size = x.size
37 |
38 | for (i <- 0 until size) sum += (x(i) - y(i)) * (x(i) - y(i))
39 |
40 | Math.sqrt(sum)
41 | }
42 |
43 | /**
44 | * Computes the Manhattan distance between instance x and instance y.
45 | * (used when distanceType is Distance.Manhattan)
46 | *
47 | * @param x instance x
48 | * @param y instance y
49 | * @return Manhattan distance
50 | */
51 | private def manhattan(x: Vector, y: Vector): Double = {
52 | var sum = 0.0
53 | val size = x.size
54 |
55 | for (i <- 0 until size) sum += Math.abs(x(i) - y(i))
56 |
57 | sum
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
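A minimal usage sketch of the `Distance` helper above (note that the file sits under `knn/` but declares package `org.apache.spark.ml.classification`):

    import org.apache.spark.ml.classification.Distance
    import org.apache.spark.ml.linalg.Vectors

    val a = Vectors.dense(0.0, 0.0)
    val b = Vectors.dense(3.0, 4.0)

    Distance(a, b, Distance.Euclidean)  // 5.0
    Distance(a, b, Distance.Manhattan)  // 7.0
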
/src/main/scala/org/apache/spark/ml/util/XORShiftRandom.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.util
2 |
3 | import java.nio.ByteBuffer
4 | import java.util.{Random => JavaRandom}
5 |
6 | import scala.util.hashing.MurmurHash3
7 |
8 | /**
9 | * This class implements a XORShift random number generator algorithm
10 | * Source:
11 | * Marsaglia, G. (2003). Xorshift RNGs. Journal of Statistical Software, Vol. 8, Issue 14.
12 | * @see Paper
13 | * This implementation is approximately 3.5 times faster than
14 | * {@link java.util.Random java.util.Random}, partly because of the algorithm, but also due
15 | * to renouncing thread safety. JDK's implementation uses an AtomicLong seed, this class
16 | * uses a regular Long. We can forgo thread safety since we use a new instance of the RNG
17 | * for each thread.
18 | */
19 | class XORShiftRandom(init: Long) extends JavaRandom(init) {
20 |
21 | def this() = this(System.nanoTime)
22 |
23 | private var seed = XORShiftRandom.hashSeed(init)
24 |
25 | // we need to just override next - this will be called by nextInt, nextDouble,
26 | // nextGaussian, nextLong, etc.
27 | override protected def next(bits: Int): Int = {
28 | var nextSeed = seed ^ (seed << 21)
29 | nextSeed ^= (nextSeed >>> 35)
30 | nextSeed ^= (nextSeed << 4)
31 | seed = nextSeed
32 | (nextSeed & ((1L << bits) - 1)).asInstanceOf[Int]
33 | }
34 |
35 | override def setSeed(s: Long) {
36 | seed = XORShiftRandom.hashSeed(s)
37 | }
38 | }
39 |
40 | /** Contains benchmark method and main method to run benchmark of the RNG */
41 | object XORShiftRandom {
42 |
43 | /** Hash seeds to have 0/1 bits throughout. */
44 | private def hashSeed(seed: Long): Long = {
45 | val bytes = ByteBuffer.allocate(java.lang.Long.SIZE).putLong(seed).array()
46 | MurmurHash3.bytesHash(bytes)
47 | }
48 |
49 | }
--------------------------------------------------------------------------------
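Because `XORShiftRandom` extends `java.util.Random`, it is a drop-in replacement wherever a non-thread-safe generator is acceptable. A short sketch:

    import org.apache.spark.ml.util.XORShiftRandom

    val rng = new XORShiftRandom(42L)  // fixed seed for reproducibility
    rng.nextDouble()                   // uniform in [0, 1), driven by the overridden next(bits)
    rng.nextGaussian()                 // standard normal, via java.util.Random's machinery
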
/src/test/scala/org/apache/spark/ml/timeseries/models/GARCHSuite.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.models
2 |
3 | import org.apache.commons.math3.random.MersenneTwister
4 | import org.apache.spark.SparkFunSuite
5 | import org.apache.spark.ml.linalg.DenseVector
6 | import org.apache.spark.ml.util.DefaultReadWriteTest
7 | import org.apache.spark.mllib.util.MLlibTestSparkContext
8 |
9 | /**
10 | * Created by endy on 16-12-22.
11 | */
12 | class GARCHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest{
13 |
14 | test("GARCH log likelihood") {
15 | val model = new GARCHModel(.2, .3, .4)
16 | val rand = new MersenneTwister(5L)
17 | val n = 10000
18 |
19 | val ts = new DenseVector(model.sample(n, rand))
20 | val logLikelihoodWithRightModel = model.logLikelihood(ts)
21 |
22 | val logLikelihoodWithWrongModel1 = new GARCHModel(.3, .4, .5).logLikelihood(ts)
23 | val logLikelihoodWithWrongModel2 = new GARCHModel(.25, .35, .45).logLikelihood(ts)
24 | val logLikelihoodWithWrongModel3 = new GARCHModel(.1, .2, .3).logLikelihood(ts)
25 |
26 | assert(logLikelihoodWithRightModel > logLikelihoodWithWrongModel1)
27 | assert(logLikelihoodWithRightModel > logLikelihoodWithWrongModel2)
28 | assert(logLikelihoodWithRightModel > logLikelihoodWithWrongModel3)
29 | assert(logLikelihoodWithWrongModel2 > logLikelihoodWithWrongModel1)
30 | }
31 |
32 | test("gradient") {
33 | val alpha = 0.3
34 | val beta = 0.4
35 | val omega = 0.2
36 | val genModel = new GARCHModel(omega, alpha, beta)
37 | val rand = new MersenneTwister(5L)
38 | val n = 10000
39 |
40 | val ts = new DenseVector(genModel.sample(n, rand))
41 |
42 | val gradient1 = new GARCHModel(omega + .1, alpha + .05, beta + .1).gradient(ts)
43 | assert(gradient1.forall(_ < 0.0))
44 | val gradient2 = new GARCHModel(omega - .1, alpha - .05, beta - .1).gradient(ts)
45 | assert(gradient2.forall(_ > 0.0))
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/tsne/TSNEHelper.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.tsne
2 |
3 | import breeze.linalg._
4 | import breeze.stats._
5 | import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
6 | import org.apache.spark.rdd.RDD
7 |
8 | object TSNEHelper {
9 | // p_ij = (p_{i|j} + p_{j|i}) / 2n
10 | def computeP(p_ji: CoordinateMatrix, n: Int): RDD[(Int, Iterable[(Int, Double)])] = {
11 | p_ji.entries
12 | .flatMap(e => Seq(
13 | ((e.i.toInt, e.j.toInt), e.value),
14 | ((e.j.toInt, e.i.toInt), e.value)
15 | ))
16 | .reduceByKey(_ + _) // p + p'
17 | .map{case ((i, j), v) => (i, (j, math.max(v / 2 / n, 1e-12))) } // p / 2n
18 | .groupByKey()
19 | }
20 |
21 | /**
22 | * Update Y via gradient dY
23 | * @param Y current Y
24 | * @param dY gradient dY
25 | * @param iY stored y_i - y_{i-1}
26 | * @param gains adaptive learning rates
27 | * @param iteration current iteration number
28 | * @param param [[TSNEParam]]
29 | * @return the updated Y (also modified in place)
30 | */
31 | def update(Y: DenseMatrix[Double],
32 | dY: DenseMatrix[Double],
33 | iY: DenseMatrix[Double],
34 | gains: DenseMatrix[Double],
35 | iteration: Int,
36 | param: TSNEParam): DenseMatrix[Double] = {
37 | import param._
38 | val momentum = if (iteration <= t_momentum) initial_momentum else final_momentum
39 | gains.foreachPair {
40 | case ((i, j), old_gain) =>
41 | val new_gain = math.max(min_gain,
42 | if ((dY.unsafeValueAt(i, j) > 0.0) != (iY.unsafeValueAt(i, j) > 0.0))
43 | old_gain + 0.2
44 | else
45 | old_gain * 0.8
46 | )
47 | gains.unsafeUpdate(i, j, new_gain)
48 |
49 | val new_iY = momentum * iY.unsafeValueAt(i, j) - eta * new_gain * dY.unsafeValueAt(i, j)
50 | iY.unsafeUpdate(i, j, new_iY)
51 |
52 | Y.unsafeUpdate(i, j, Y.unsafeValueAt(i, j) + new_iY) // Y += iY
53 | }
54 | Y := Y(*, ::) - (mean(Y(::, *)): DenseMatrix[Double]).toDenseVector
55 | }
56 | }
--------------------------------------------------------------------------------
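The adaptive-gain rule inside `update` grows a coordinate's gain when the current gradient and the previous step disagree in sign, and shrinks it when they agree. A scalar sketch of that rule (the helper name is illustrative, not part of the API):

    // illustrative scalar version of the gain update performed in TSNEHelper.update
    def newGain(oldGain: Double, dY: Double, iY: Double, minGain: Double = 0.01): Double =
      math.max(minGain, if ((dY > 0.0) != (iY > 0.0)) oldGain + 0.2 else oldGain * 0.8)

    newGain(1.0, dY = 0.3, iY = -0.1)  // 1.2: signs differ, so the gain grows
    newGain(1.0, dY = 0.3, iY = 0.2)   // 0.8: signs agree, so the gain shrinks
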
/src/main/scala/org/apache/spark/ml/dbscan/DBSCANGraph.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.dbscan
2 |
3 | import scala.annotation.tailrec
4 |
5 | /**
6 | * Top level method for creating a DBSCANGraph
7 | */
8 | object DBSCANGraph {
9 |
10 | /**
11 | * Create an empty graph
12 | */
13 | def apply[T](): DBSCANGraph[T] = new DBSCANGraph(Map[T, Set[T]]())
14 |
15 | }
16 |
17 | /**
18 | * An immutable unweighted graph with vertices and edges
19 | */
20 | class DBSCANGraph[T] private (nodes: Map[T, Set[T]]) extends Serializable {
21 |
22 | /**
23 | * Add the given vertex `v` to the graph
24 | *
25 | */
26 | def addVertex(v: T): DBSCANGraph[T] = {
27 | nodes.get(v) match {
28 | case None => new DBSCANGraph(nodes + (v -> Set()))
29 | case Some(_) => this
30 | }
31 | }
32 |
33 | /**
34 | * Insert an edge from `from` to `to`
35 | */
36 | def insertEdge(from: T, to: T): DBSCANGraph[T] = {
37 | nodes.get(from) match {
38 | case None => new DBSCANGraph(nodes + (from -> Set(to)))
39 | case Some(edge) => new DBSCANGraph(nodes + (from -> (edge + to)))
40 | }
41 | }
42 |
43 | /**
44 | * Insert an edge from `one` to `another`, and from `another` to `one`
45 | *
46 | */
47 | def connect(one: T, another: T): DBSCANGraph[T] = {
48 | insertEdge(one, another).insertEdge(another, one)
49 | }
50 |
51 | /**
52 | * Find all vertices that are reachable from `from`
53 | */
54 | def getConnected(from: T): Set[T] = {
55 | getAdjacent(Set(from), Set[T](), Set[T]()) - from
56 | }
57 |
58 | @tailrec
59 | private def getAdjacent(tovisit: Set[T], visited: Set[T], adjacent: Set[T]): Set[T] = {
60 |
61 | tovisit.headOption match {
62 | case Some(current) =>
63 | nodes.get(current) match {
64 | case Some(edges) =>
65 | getAdjacent(edges.diff(visited) ++ tovisit.tail, visited + current, adjacent ++ edges)
66 | case None => getAdjacent(tovisit.tail, visited, adjacent)
67 | }
68 | case None => adjacent
69 | }
70 |
71 | }
72 |
73 | }
74 |
75 |
--------------------------------------------------------------------------------
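A small usage sketch of the graph API above: `connect` inserts edges in both directions, and `getConnected` returns every vertex reachable from the argument, excluding the argument itself:

    import org.apache.spark.ml.dbscan.DBSCANGraph

    val g = DBSCANGraph[Int]()
      .addVertex(1)
      .connect(1, 2)
      .connect(2, 3)

    g.getConnected(1)  // Set(2, 3)
    g.getConnected(3)  // Set(1, 2)
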
/src/main/scala/org/apache/spark/ml/util/DBHPartitioner.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.util
2 |
3 | import scala.reflect.ClassTag
4 |
5 | import org.apache.spark.HashPartitioner
6 | import org.apache.spark.graphx._
7 | import org.apache.spark.graphx.impl.GraphImpl
8 | import org.apache.spark.storage.StorageLevel
9 |
10 | /**
11 | * Degree-Based Hashing (DBH), from the paper:
12 | * Distributed Power-law Graph Computing: Theoretical and Empirical Analysis
13 | */
14 | class DBHPartitioner(val partitions: Int, val threshold: Int = 0)
15 | extends HashPartitioner(partitions) {
16 | /**
17 | * Default DBH doesn't consider the situation where the degrees of both the src and
18 | * dst vertices are smaller than a given threshold value
19 | */
20 | def getKey(et: EdgeTriplet[Int, _]): Long = {
21 | val srcId = et.srcId
22 | val dstId = et.dstId
23 | val srcDeg = et.srcAttr
24 | val dstDeg = et.dstAttr
25 | val maxDeg = math.max(srcDeg, dstDeg)
26 | val minDegId = if (maxDeg == srcDeg) dstId else srcId
27 | val maxDegId = if (maxDeg == srcDeg) srcId else dstId
28 | if (maxDeg < threshold) {
29 | maxDegId
30 | } else {
31 | minDegId
32 | }
33 | }
34 |
35 | override def equals(other: Any): Boolean = other match {
36 | case dbh: DBHPartitioner =>
37 | dbh.numPartitions == numPartitions
38 | case _ =>
39 | false
40 | }
41 | }
42 |
43 | object DBHPartitioner {
44 | def partitionByDBH[VD: ClassTag, ED: ClassTag](input: Graph[VD, ED],
45 | storageLevel: StorageLevel): Graph[VD, ED] = {
46 | val edges = input.edges
47 | val conf = edges.context.getConf
48 | val numPartitions = conf.getInt("", edges.partitions.length)
49 | val dbh = new DBHPartitioner(numPartitions, 0)
50 | val degGraph = GraphImpl(input.degrees, edges)
51 | val newEdges = degGraph.triplets.mapPartitions(_.map(et =>
52 | (dbh.getKey(et), Edge(et.srcId, et.dstId, et.attr))
53 | )).partitionBy(dbh).map(_._2)
54 | GraphImpl(input.vertices, newEdges, null.asInstanceOf[VD], storageLevel, storageLevel)
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/ml/timeseries/models/EWMASuite.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.models
2 |
3 | import org.apache.spark.SparkFunSuite
4 | import org.apache.spark.ml.util.DefaultReadWriteTest
5 | import org.apache.spark.mllib.util.MLlibTestSparkContext
6 | import org.apache.spark.mllib.util.TestingUtils._
7 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
8 | import org.apache.spark.sql.{Dataset, Row}
9 |
10 | class EWMASuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest{
11 | @transient var dataSet: Dataset[_] = _
12 | @transient var dataSet1: Dataset[_] = _
13 |
14 | override def beforeAll(): Unit = {
15 | super.beforeAll()
16 |
17 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries",
18 | DoubleType)))
19 |
20 | val smoothed = Array(
21 | Array("201512", 7.0), Array("201601", 8.0), Array("201602", 9.0),
22 | Array("201509", 4.0), Array("201510", 5.0), Array("201511", 6.0),
23 | Array("201506", 1.0), Array("201507", 2.0), Array("201508", 3.0),
24 | Array("201603", 10.0))
25 |
26 | val orig1 = sc.parallelize(smoothed.map(x => Row(x: _*)))
27 | dataSet = spark.createDataFrame(orig1, schema)
28 |
29 | val oil = Array(
30 | Array("201506", 446.7), Array("201507", 454.5), Array("201508", 455.7),
31 | Array("201512", 425.3), Array("201601", 485.1), Array("201602", 506.0),
32 | Array("201509", 423.6), Array("201510", 456.3), Array("201511", 440.6),
33 | Array("201603", 526.8), Array("201604", 514.3), Array("201605", 494.2))
34 |
35 | val orig2 = sc.parallelize(oil.map(x => Row(x: _*)))
36 | dataSet1 = spark.createDataFrame(orig2, schema)
37 | }
38 |
39 |
40 | test("add time dependent effects") {
41 |
42 | val m1 = new EWMAModel(0.2).setTimeCol("time").setTimeSeriesCol("timeseries")
43 | val res = m1.transform(dataSet).collect().map{case Row(x: Double) => x}
44 |
45 | assert(res(0) == 1.0)
46 | assert(res(1) ~== 1.2 absTol 10E-5)
47 | }
48 |
49 | test("fitting EWMA model") {
50 | val model = new EWMA()
51 | .setTimeCol("time")
52 | .setTimeSeriesCol("timeseries")
53 | .setMaxIter(10000)
54 | .setMaxEval(10000)
55 | .setInitPoint(.94)
56 | .fit(dataSet1)
57 |
58 | assert(model.smoothing ~== 0.89 absTol 0.01) // approximately 0.89
59 | }
60 |
61 | }
62 |
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/ml/timeseries/models/AutoregressionSuite.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.models
2 |
3 | import org.apache.commons.math3.random.{MersenneTwister, RandomGenerator}
4 | import org.apache.spark.SparkFunSuite
5 | import org.apache.spark.ml.linalg.DenseVector
6 | import org.apache.spark.ml.util.DefaultReadWriteTest
7 | import org.apache.spark.mllib.util.MLlibTestSparkContext
8 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
9 | import org.apache.spark.sql.{Dataset, Row}
10 |
11 | /**
12 | * Created by endy on 16-12-19.
13 | */
14 | class AutoregressionSuite extends SparkFunSuite with MLlibTestSparkContext
15 | with DefaultReadWriteTest {
16 |
17 | @transient var dataSet: Dataset[_] = _
18 |
19 | override def beforeAll(): Unit = {
20 | super.beforeAll()
21 | }
22 |
23 | test("fit AR(1) model") {
24 | val ts = sample(5000, new MersenneTwister(10L), 1.5, Array(.2))
25 |
26 | val fittedModel = new Autoregression()
27 | .setTimeCol("time")
28 | .setTimeSeriesCol("timeseries")
29 | .setMaxLag(1)
30 | .setNoIntercept(false)
31 | .fit(ts)
32 |
33 | assert(fittedModel.coefficients.length == 1)
34 | assert(math.abs(fittedModel.c - 1.5) < .07)
35 | assert(math.abs(fittedModel.coefficients(0) - .2) < .03)
36 | }
37 |
38 | test("fit AR(2) model") {
39 |
40 | val ts = sample(5000, new MersenneTwister(10L), 1.5, Array(.2, .3))
41 | val fittedModel = new Autoregression()
42 | .setTimeCol("time")
43 | .setTimeSeriesCol("timeseries")
44 | .setMaxLag(2)
45 | .setNoIntercept(false)
46 | .fit(ts)
47 |
48 | assert(fittedModel.coefficients.length == 2)
49 | assert(math.abs(fittedModel.c - 1.5) < .15)
50 | assert(math.abs(fittedModel.coefficients(0) - .2) < .03)
51 | assert(math.abs(fittedModel.coefficients(1) - .3) < .03)
52 | }
53 |
54 | def sample(n: Int, rand: RandomGenerator, c: Double, coefficients: Array[Double]): Dataset[_] = {
55 | val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian()))
56 | val res = new ARModel(c, coefficients).addTimeDependentEffects(vec).toArray
57 | .zipWithIndex
58 |
59 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries",
60 | DoubleType)))
61 |
62 | val rdd = sc.parallelize(res.map(x => Row(x._2.formatted("%05d"), x._1)))
63 |
64 | spark.createDataFrame(rdd, schema)
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/tsne/impl/SimpleTSNE.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.tsne.impl
2 |
3 | import breeze.linalg._
4 | import breeze.stats.distributions.Rand
5 | import org.apache.spark.ml.tsne.{TSNEGradient, TSNEHelper, TSNEParam, X2P}
6 | import org.apache.spark.mllib.linalg.distributed.RowMatrix
7 | import org.apache.spark.storage.StorageLevel
8 | import org.slf4j.LoggerFactory
9 |
10 | import scala.util.Random
11 |
12 | object SimpleTSNE {
13 | private def logger = LoggerFactory.getLogger(SimpleTSNE.getClass)
14 |
15 | def tsne(
16 | input: RowMatrix,
17 | noDims: Int = 2,
18 | maxIterations: Int = 1000,
19 | perplexity: Double = 30,
20 | callback: (Int, DenseMatrix[Double], Option[Double]) => Unit = {case _ => },
21 | seed: Long = Random.nextLong()): DenseMatrix[Double] = {
22 | if(input.rows.getStorageLevel == StorageLevel.NONE) {
23 | logger.warn("Input is not persisted and performance could be bad")
24 | }
25 |
26 | Rand.generator.setSeed(seed)
27 |
28 | val tsneParam = TSNEParam()
29 | import tsneParam._
30 |
31 | val n = input.numRows().toInt
32 | val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian(0, 1))
33 | val iY = DenseMatrix.zeros[Double](n, noDims)
34 | val gains = DenseMatrix.ones[Double](n, noDims)
35 |
36 | // approximate p_{j|i}
37 | val p_ji = X2P(input, 1e-5, perplexity)
38 | val P = TSNEHelper.computeP(p_ji, n).glom().cache()
39 |
40 | var iteration = 1
41 | while(iteration <= maxIterations) {
42 | val bcY = P.context.broadcast(Y)
43 |
44 | val numerator = P.map{ arr => TSNEGradient.computeNumerator(bcY.value, arr.map(_._1): _*) }.cache()
45 | val bcNumerator = P.context.broadcast({
46 | numerator.treeAggregate(0.0)(seqOp = (x, v) => x + sum(v), combOp = _ + _)
47 | })
48 |
49 | val (dY, loss) = P.zip(numerator).treeAggregate((DenseMatrix.zeros[Double](n, noDims), 0.0))(
50 | seqOp = (c, v) => {
51 | // c: (grad, loss), v: (Array[(i, Iterable(j, Distance))], numerator)
52 | val l = TSNEGradient.compute(v._1, bcY.value, v._2, bcNumerator.value, c._1, iteration <= early_exaggeration)
53 | (c._1, c._2 + l)
54 | },
55 | combOp = (c1, c2) => {
56 | // c: (grad, loss)
57 | (c1._1 + c2._1, c1._2 + c2._2)
58 | })
59 |
60 | bcY.destroy()
61 | bcNumerator.destroy()
62 | numerator.unpersist()
63 |
64 | TSNEHelper.update(Y, dY, iY, gains, iteration, tsneParam)
65 |
66 | logger.debug(s"Iteration $iteration finished with $loss")
67 | callback(iteration, Y.copy, Some(loss))
68 | iteration += 1
69 | }
70 | Y
71 | }
72 | }
--------------------------------------------------------------------------------
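A hedged sketch of driving `SimpleTSNE.tsne`. It assumes an existing `SparkContext` named `sc` and a tiny in-memory dataset, and caches the input rows as the storage-level warning above suggests:

    import org.apache.spark.ml.tsne.impl.SimpleTSNE
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.linalg.distributed.RowMatrix

    // assumption: `sc` is an existing SparkContext
    val rows = sc.parallelize(Seq(
      Vectors.dense(1.0, 0.0, 0.0),
      Vectors.dense(0.9, 0.1, 0.0),
      Vectors.dense(0.0, 0.0, 1.0)
    )).cache()

    val embedding = SimpleTSNE.tsne(new RowMatrix(rows), noDims = 2,
      maxIterations = 200, perplexity = 2.0)
    // embedding: breeze DenseMatrix[Double] with one 2-D row per input row
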
/src/main/scala/org/apache/spark/ml/tsne/tree/SPTree.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.tsne.tree
2 |
3 | import breeze.linalg._
4 | import breeze.numerics._
5 |
6 | import scala.annotation.tailrec
7 |
8 |
9 | class SPTree private[tree](val dimension: Int,
10 | val corner: DenseVector[Double],
11 | val width: DenseVector[Double]) extends Serializable {
12 | private[this] val childWidth: DenseVector[Double] = width :/ 2.0
13 | lazy val radiusSq: Double = sum(pow(width, 2))
14 | private[tree] val totalMass: DenseVector[Double] = DenseVector.zeros(dimension)
15 | private var count: Int = 0
16 | private var leaf: Boolean = true
17 | val center: DenseVector[Double] = DenseVector.zeros(dimension)
18 |
19 | lazy val children: Array[SPTree] = {
20 | (0 until pow(2, dimension)).toArray.map {
21 | i =>
22 | val bits = DenseVector(s"%0${dimension}d".format(i.toBinaryString.toInt).toArray.map(_.toDouble - '0'.toDouble))
23 | val childCorner: DenseVector[Double] = corner + (bits :* childWidth)
24 | new SPTree(dimension, childCorner, childWidth)
25 | }
26 | }
27 |
28 | final def insert(vector: DenseVector[Double], finalize: Boolean = false): SPTree = {
29 | totalMass += vector
30 | count += 1
31 |
32 | if(leaf) {
33 | if(count == 1) { // first to leaf
34 | center := vector
35 | } else if(!vector.equals(center)) {
36 | (1 until count).foreach(_ => getCell(center).insert(center, finalize)) //subdivide
37 | leaf = false
38 | }
39 | }
40 |
41 | if(finalize) computeCenter(false)
42 |
43 | if(leaf) this else getCell(vector).insert(vector, finalize)
44 | }
45 |
46 | def computeCenter(recursive: Boolean = true): Unit = {
47 | if(count > 0) {
48 | center := totalMass / count.toDouble
49 | if(recursive) children.foreach(_.computeCenter())
50 | }
51 | }
52 |
53 | def getCell(vector: DenseVector[Double]): SPTree = {
54 | val idx = ((vector - corner) :/ childWidth).data
55 | children(idx.foldLeft(0)((acc, i) => acc * 2 + min(max(i.ceil.toInt - 1, 0), 1)))
56 | }
57 |
58 | def getCount: Int = count
59 |
60 | def isLeaf: Boolean = leaf
61 | }
62 |
63 | object SPTree {
64 | def apply(Y: DenseMatrix[Double]): SPTree = {
65 | val d = Y.cols
66 | val minMaxs = minMax(Y(::, *)).toDenseVector
67 | val mins = minMaxs.mapValues(_._1)
68 | val maxs = minMaxs.mapValues(_._2)
69 |
70 | val tree = new SPTree(Y.cols, mins, maxs - mins)
71 |
72 | // insert points but wait till end to compute all centers
73 | //Y(*, ::).foreach(tree.insert(_, finalize = false))
74 | (0 until Y.rows).foreach(i => tree.insert(Y(i, ::).t, finalize = false))
75 | // compute all center of mass
76 | tree.computeCenter()
77 |
78 | tree
79 | }
80 | }
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/tsne/X2P.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.tsne
2 |
3 | import breeze.linalg.DenseVector
4 | import org.apache.spark.mllib.X2PHelper._
5 | import org.apache.spark.mllib.linalg.Vectors
6 | import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry, RowMatrix}
7 | import org.apache.spark.mllib.rdd.MLPairRDDFunctions._
8 | import org.slf4j.LoggerFactory
9 |
10 | object X2P {
11 |
12 | private def logger = LoggerFactory.getLogger(X2P.getClass)
13 |
14 | def apply(x: RowMatrix, tol: Double = 1e-5, perplexity: Double = 30.0): CoordinateMatrix = {
15 | require(tol >= 0, "Tolerance must be non-negative")
16 | require(perplexity > 0, "Perplexity must be positive")
17 |
18 | val mu = (3 * perplexity).toInt //TODO: Expose this as parameter
19 | val logU = Math.log(perplexity)
20 | val norms = x.rows.map(Vectors.norm(_, 2.0))
21 | norms.persist()
22 | val rowsWithNorm = x.rows.zip(norms).map{ case (v, norm) => VectorWithNorm(v, norm) }
23 | val neighbors = rowsWithNorm.zipWithIndex()
24 | .cartesian(rowsWithNorm.zipWithIndex())
25 | .flatMap {
26 | case ((u, i), (v, j)) =>
27 | if(i < j) {
28 | val dist = fastSquaredDistance(u, v)
29 | Seq((i, (j, dist)), (j, (i, dist)))
30 | } else Seq.empty
31 | }
32 | .topByKey(mu)(Ordering.by(e => -e._2))
33 |
34 | val p_betas =
35 | neighbors.map {
36 | case (i, arr) =>
37 | var betamin = Double.NegativeInfinity
38 | var betamax = Double.PositiveInfinity
39 | var beta = 1.0
40 |
41 | val d = DenseVector(arr.map(_._2))
42 | var (h, p) = Hbeta(d, beta)
43 |
44 | //logInfo("data was " + d.toArray.toList)
45 | //logInfo("array P was " + p.toList)
46 |
47 | // Evaluate whether the perplexity is within tolerance
48 | def Hdiff = h - logU
49 | var tries = 0
50 | while (Math.abs(Hdiff) > tol && tries < 50) {
51 | //If not, increase or decrease precision
52 | if (Hdiff > 0) {
53 | betamin = beta
54 | beta = if (betamax.isInfinite) beta * 2 else (beta + betamax) / 2
55 | } else {
56 | betamax = beta
57 | beta = if (betamin.isInfinite) beta / 2 else (beta + betamin) / 2
58 | }
59 |
60 | // Recompute the values
61 | val HP = Hbeta(d, beta)
62 | h = HP._1
63 | p = HP._2
64 | tries = tries + 1
65 | }
66 |
67 | //logInfo("array P is " + p.toList)
68 |
69 | (arr.map(_._1).zip(p.toArray).map { case (j, v) => MatrixEntry(i, j, v) }, beta)
70 | }
71 |
72 | logger.info("Mean value of sigma: " + p_betas.map(x => math.sqrt(1 / x._2)).mean)
73 | new CoordinateMatrix(p_betas.flatMap(_._1))
74 | }
75 | }
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/dbscan/LocalDBSCANArchery.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.dbscan
2 |
3 | import scala.collection.mutable.Queue
4 | import org.apache.spark.internal.Logging
5 | import archery.Box
6 | import archery.Entry
7 | import archery.Point
8 | import archery.RTree
9 | import org.apache.spark.ml.dbscan.DBSCANLabeledPoint.Flag
10 |
11 | /**
12 | * An implementation of DBSCAN using an R-Tree to improve its running time
13 | */
14 | class LocalDBSCANArchery(eps: Double, minPoints: Int) extends Logging {
15 |
16 | val minDistanceSquared = eps * eps
17 |
18 | def fit(points: Iterable[DBSCANPoint]): Iterable[DBSCANLabeledPoint] = {
19 |
20 | val tree = points.foldLeft(RTree[DBSCANLabeledPoint]())(
21 | (tempTree, p) =>
22 | tempTree.insert(
23 | Entry(Point(p.x.toFloat, p.y.toFloat), new DBSCANLabeledPoint(p))))
24 |
25 | var cluster = DBSCANLabeledPoint.Unknown
26 |
27 | tree.entries.foreach(entry => {
28 |
29 | val point = entry.value
30 |
31 | if (!point.visited) {
32 | point.visited = true
33 |
34 | val neighbors = tree.search(toBoundingBox(point), inRange(point))
35 |
36 | if (neighbors.size < minPoints) {
37 | point.flag = Flag.Noise
38 | } else {
39 | cluster += 1
40 | expandCluster(point, neighbors, tree, cluster)
41 | }
42 |
43 | }
44 |
45 | })
46 |
47 | logDebug(s"total: $cluster")
48 |
49 | tree.entries.map(_.value).toIterable
50 |
51 | }
52 |
53 | private def expandCluster(
54 | point: DBSCANLabeledPoint,
55 | neighbors: Seq[Entry[DBSCANLabeledPoint]],
56 | tree: RTree[DBSCANLabeledPoint],
57 | cluster: Int): Unit = {
58 |
59 | point.flag = Flag.Core
60 | point.cluster = cluster
61 |
62 | val left = Queue(neighbors)
63 |
64 | while (left.nonEmpty) {
65 |
66 | left.dequeue().foreach(neighborEntry => {
67 |
68 | val neighbor = neighborEntry.value
69 |
70 | if (!neighbor.visited) {
71 |
72 | neighbor.visited = true
73 | neighbor.cluster = cluster
74 |
75 | val neighborNeighbors = tree.search(toBoundingBox(neighbor), inRange(neighbor))
76 |
77 | if (neighborNeighbors.size >= minPoints) {
78 | neighbor.flag = Flag.Core
79 | left.enqueue(neighborNeighbors)
80 | } else {
81 | neighbor.flag = Flag.Border
82 | }
83 | }
84 |
85 | if (neighbor.cluster == DBSCANLabeledPoint.Unknown) {
86 | neighbor.cluster = cluster
87 | neighbor.flag = Flag.Border
88 | }
89 |
90 | })
91 |
92 | }
93 |
94 | }
95 |
96 | private def inRange(point: DBSCANPoint)(entry: Entry[DBSCANLabeledPoint]): Boolean = {
97 | entry.value.distanceSquared(point) <= minDistanceSquared
98 | }
99 |
100 | private def toBoundingBox(point: DBSCANPoint): Box = {
101 | Box(
102 | (point.x - eps).toFloat,
103 | (point.y - eps).toFloat,
104 | (point.x + eps).toFloat,
105 | (point.y + eps).toFloat)
106 | }
107 |
108 | }
109 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/dbscan/LocalDBSCANNaive.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.dbscan
2 |
3 |
4 | import scala.collection.mutable.Queue
5 | import org.apache.spark.internal.Logging
6 | import org.apache.spark.ml.dbscan.DBSCANLabeledPoint.Flag
7 | import org.apache.spark.ml.linalg.Vectors
8 |
9 | /**
10 | * A naive implementation of DBSCAN. It has O(n^2) complexity
11 | * but uses no extra memory. This implementation is not used
12 | * by the parallel version of DBSCAN.
13 | *
14 | */
15 | class LocalDBSCANNaive(eps: Double, minPoints: Int) extends Logging {
16 |
17 | val minDistanceSquared = eps * eps
18 |
19 | def samplePoint: Array[DBSCANLabeledPoint] =
20 | Array(new DBSCANLabeledPoint(Vectors.dense(Array(0D, 0D))))
21 |
22 | def fit(points: Iterable[DBSCANPoint]): Iterable[DBSCANLabeledPoint] = {
23 |
24 | logInfo(s"About to start fitting")
25 |
26 | val labeledPoints = points.map { new DBSCANLabeledPoint(_) }.toArray
27 |
28 | val totalClusters =
29 | labeledPoints
30 | .foldLeft(DBSCANLabeledPoint.Unknown)(
31 | (cluster, point) => {
32 | if (!point.visited) {
33 | point.visited = true
34 |
35 | val neighbors = findNeighbors(point, labeledPoints)
36 |
37 | if (neighbors.size < minPoints) {
38 | point.flag = Flag.Noise
39 | cluster
40 | } else {
41 | expandCluster(point, neighbors, labeledPoints, cluster + 1)
42 | cluster + 1
43 | }
44 | } else {
45 | cluster
46 | }
47 | })
48 |
49 | logInfo(s"found: $totalClusters clusters")
50 |
51 | labeledPoints
52 |
53 | }
54 |
55 | private def findNeighbors(
56 | point: DBSCANPoint,
57 | all: Array[DBSCANLabeledPoint]): Iterable[DBSCANLabeledPoint] =
58 | all.view.filter(other => {
59 | point.distanceSquared(other) <= minDistanceSquared
60 | })
61 |
62 | def expandCluster(
63 | point: DBSCANLabeledPoint,
64 | neighbors: Iterable[DBSCANLabeledPoint],
65 | all: Array[DBSCANLabeledPoint],
66 | cluster: Int): Unit = {
67 |
68 | point.flag = Flag.Core
69 | point.cluster = cluster
70 |
71 | var allNeighbors = Queue(neighbors)
72 |
73 | while (allNeighbors.nonEmpty) {
74 | allNeighbors.dequeue().foreach(neighbor => {
75 | if (!neighbor.visited) {
76 |
77 | neighbor.visited = true
78 | neighbor.cluster = cluster
79 |
80 | val neighborNeighbors = findNeighbors(neighbor, all)
81 |
82 | if (neighborNeighbors.size >= minPoints) {
83 | neighbor.flag = Flag.Core
84 | allNeighbors.enqueue(neighborNeighbors)
85 | } else {
86 | neighbor.flag = Flag.Border
87 | }
88 |
89 | if (neighbor.cluster == DBSCANLabeledPoint.Unknown) {
90 | neighbor.cluster = cluster
91 | neighbor.flag = Flag.Border
92 | }
93 | }
94 |
95 | })
96 |
97 | }
98 |
99 | }
100 |
101 | }
102 |
103 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/sampling/UnderSampling.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.sampling
2 |
3 | import org.apache.spark.ml.Transformer
4 | import org.apache.spark.ml.param._
5 | import org.apache.spark.ml.util.Identifiable
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.types.StructType
8 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
9 |
10 | /**
11 | * Created by endy on 16-12-8.
12 | */
13 |
14 | trait UnderSamplingParams extends Params{
15 | final val threshold = new DoubleParam(this, "threshold", "The threshold that determines " +
16 | "whether to under-sample a class", (x: Double) => x > 1)
17 | def setThreshold(value: Double): this.type = set(threshold, value)
18 |
19 | final val dependentColName = new Param[String](this, "dependentColName", "The column that " +
20 | "provide label values")
21 | def setDependentColName(value: String): this.type = set(dependentColName, value)
22 |
23 | final val withReplacement = new BooleanParam(this, "withReplacement", "")
24 | def setWithReplacement(value: Boolean): this.type = set(withReplacement, value)
25 |
26 | final val primaryClass = new DoubleParam(this, "primaryClass", "the primary class used as " +
27 | "the reference for under-sampling")
28 | def setPrimaryClass(value: Double): this.type = set(primaryClass, value)
29 | }
30 |
31 |
32 | class UnderSampling(override val uid: String) extends Transformer with UnderSamplingParams{
33 |
34 | def this() = this(Identifiable.randomUID("UnderSampling"))
35 | /**
36 | * Transforms the input dataset.
37 | */
38 | override def transform(dataset: Dataset[_]): DataFrame = {
39 |
40 | val labelCountPair = dataset.groupBy($(dependentColName)).count().collect()
41 |
42 | val primaryClassCount = labelCountPair
43 | .filter{ case Row(label: Double, count: Long) => label == ${primaryClass}}
44 | .map(x => x.get(1)).headOption.getOrElse(-1L).asInstanceOf[Long]
45 |
46 | if (primaryClassCount == -1) throw new Exception("The label does not exist")
47 |
48 | val res = labelCountPair.zipWithIndex.map {
49 | case (Row(label: Double, count: Long), index: Int) =>
50 | val ratio = count / primaryClassCount.toDouble
51 |
52 | /**
53 | * if ratio < threshold, only return samples of this label,
54 | * otherwise we sample the data from the samples of this label.
55 | *
56 | * The desired number of samples is : num = primaryClassCount * threshold
57 | * so the fraction of sample method is: num / count = threshold / ratio
58 | */
59 | val df = if (ratio < ${threshold}) dataset.filter(col($(dependentColName)) === label)
60 | else dataset.filter(col($(dependentColName)) === label)
61 | .sample(${withReplacement}, ${threshold} / ratio)
62 |
63 | df.toDF()
64 | }.reduce(_ union _)
65 |
66 | res
67 | }
68 |
69 | override def copy(extra: ParamMap): Transformer = defaultCopy(extra)
70 |
71 | /**
72 | * :: DeveloperApi ::
73 | *
74 | * Check transform validity and derive the output schema from the input schema.
75 | *
76 | * Typical implementation should first conduct verification on schema change and parameter
77 | * validity, including complex parameter interaction checks.
78 | */
79 | override def transformSchema(schema: StructType): StructType = {
80 | schema
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
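A hedged usage sketch of the transformer above, assuming an existing DataFrame `df` whose label column is a Double named "label" and where class 1.0 is the primary (reference) class; the `OverSampling` transformer later in this listing exposes the analogous setters, minus `withReplacement`:

    import org.apache.spark.ml.sampling.UnderSampling

    // assumption: `df` is an existing DataFrame with a Double column "label"
    val balanced = new UnderSampling()
      .setDependentColName("label")
      .setPrimaryClass(1.0)
      .setThreshold(2.0)          // classes larger than 2x the primary count get down-sampled
      .setWithReplacement(false)
      .transform(df)
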
/src/main/scala/org/apache/spark/ml/timeseries/MatrixUtil.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries
2 |
3 | import breeze.linalg.{CSCMatrix, DenseMatrix, DenseVector, Matrix, SliceVector, SparseVector, Vector}
4 | import io.transwarp.hubble.error.HubbleErrors
5 | import org.apache.spark.ml.linalg.{DenseMatrix => SDM, Matrix => SM, SparseMatrix => SSM}
6 | import org.apache.spark.ml.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV}
7 | /**
8 | * Created by endy on 16-12-16.
9 | */
10 | object MatrixUtil {
11 |
12 | def matToRowArrs(mat: SM): Array[Array[Double]] = {
13 | val arrs = new Array[Array[Double]](mat.numRows)
14 | for (r <- 0 until mat.numRows) {
15 | arrs(r) = toBreeze(mat)(r to r, 0 until mat.numCols).toDenseMatrix.toArray
16 | }
17 | arrs
18 | }
19 |
20 | def toBreeze(sparkMatrix: SM): Matrix[Double] = {
21 | sparkMatrix match {
22 | case dm: SDM =>
23 | if (!dm.isTransposed) {
24 | new DenseMatrix[Double](dm.numRows, dm.numCols, dm.values)
25 | } else {
26 | val breezeMatrix = new DenseMatrix[Double](dm.numCols, dm.numRows, dm.values)
27 | breezeMatrix.t
28 | }
29 | case sm: SSM =>
30 | if (!sm.isTransposed) {
31 | new CSCMatrix[Double](sm.values, sm.numRows, sm.numCols, sm.colPtrs, sm.rowIndices)
32 | } else {
33 | val breezeMatrix =
34 | new CSCMatrix[Double](sm.values, sm.numCols, sm.numRows, sm.colPtrs, sm.rowIndices)
35 | breezeMatrix.t
36 | }
37 | case _ =>
38 | throw HubbleErrors.typeNotSupported(
39 | s"Do not support conversion from type ${sparkMatrix.getClass.getName}.")
40 | }
41 | }
42 |
43 | def toBreeze(sparkVector: SV): Vector[Double] = {
44 | sparkVector match {
45 | case v: SDV =>
46 | new DenseVector[Double](v.values)
47 | case v: SSV =>
48 | new SparseVector[Double](v.indices, v.values, v.size)
49 | }
50 | }
51 |
52 |
53 | def fromBreeze(breeze: Matrix[Double]): SM = {
54 | breeze match {
55 | case dm: DenseMatrix[Double] =>
56 | new SDM(dm.rows, dm.cols, dm.data, dm.isTranspose)
57 | case sm: CSCMatrix[Double] =>
58 | // There is no isTranspose flag for sparse matrices in Breeze
59 | new SSM(sm.rows, sm.cols, sm.colPtrs, sm.rowIndices, sm.data)
60 | case _ =>
61 | throw HubbleErrors.typeNotSupported(
62 | s"Do not support conversion from type ${breeze.getClass.getName}.")
63 | }
64 | }
65 |
66 | def fromBreeze(breezeVector: Vector[Double]): SV = {
67 | breezeVector match {
68 | case v: DenseVector[Double] =>
69 | if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) {
70 | new SDV(v.data)
71 | } else {
72 | new SDV(v.toArray) // Can't use underlying array directly, so make a new one
73 | }
74 | case v: SparseVector[Double] =>
75 | if (v.index.length == v.used) {
76 | new SSV(v.length, v.index, v.data)
77 | } else {
78 | new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used))
79 | }
80 | case v: SliceVector[_, Double] =>
81 | new SDV(v.toArray)
82 | case v: Vector[_] =>
83 | throw HubbleErrors.typeNotSupported("Unsupported Breeze vector type: " + v.getClass.getName)
84 | }
85 | }
86 |
87 | }
88 |
--------------------------------------------------------------------------------
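A quick round-trip sketch with the converters above; note that `toBreeze` wraps the underlying arrays rather than copying them, which is exactly what `MatrixUtilSuite` checks:

    import org.apache.spark.ml.linalg.{Matrices, Vectors}
    import org.apache.spark.ml.timeseries.MatrixUtil

    val m = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    val bm = MatrixUtil.toBreeze(m)     // breeze.linalg.Matrix[Double], backed by the same values
    val m2 = MatrixUtil.fromBreeze(bm)  // back to an org.apache.spark.ml.linalg.Matrix

    val v = Vectors.dense(1.0, 2.0, 3.0)
    val bv = MatrixUtil.toBreeze(v)     // breeze.linalg.Vector[Double], sharing v's storage
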
/src/main/scala/org/apache/spark/ml/dbscan/DBSCAN2.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.dbscan
2 |
3 | import org.apache.spark.ml.{Estimator, Model}
4 | import org.apache.spark.ml.param.{DoubleParam, IntParam, ParamMap, Params}
5 | import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol}
6 | import org.apache.spark.rdd.RDD
7 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
8 | import org.apache.spark.sql.types.{IntegerType, StructType}
9 | import org.apache.spark.ml.linalg.{Vector, VectorUDT}
10 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
11 | import org.apache.spark.sql.functions.{col, udf}
12 |
13 | /**
14 | * Created by endy on 17-12-5.
15 | */
16 |
17 | trait DBSCANParams extends Params with HasFeaturesCol with HasPredictionCol{
18 | final val eps = new DoubleParam(this, "eps", "the maximum distance between two points" +
19 | " for them to be considered as part of the same region")
20 | def getEps: Double = ${eps}
21 |
22 | final val minPoints = new IntParam(this, "minPoints", "the minimum number of" +
23 | " points required to form a dense region")
24 | def getMinPoints: Int = ${minPoints}
25 |
26 | final val maxPointsPerPartition = new IntParam(this, "maxPointsPerPartition",
27 | "the largest number of points in a single partition")
28 |
29 | def getMaxPointsPerPartition: Int = ${maxPointsPerPartition}
30 |
31 | protected def validateAndTransformSchema(schema: StructType): StructType = {
32 | SchemaUtils.checkColumnType(schema, ${featuresCol}, new VectorUDT)
33 | SchemaUtils.appendColumn(schema, ${predictionCol}, IntegerType)
34 | }
35 | }
36 |
37 | class DBSCAN2(override val uid: String) extends Estimator[DBSCAN2Model] with DBSCANParams{
38 |
39 | setDefault(eps -> 0.3, minPoints -> 10, maxPointsPerPartition -> 250)
40 |
41 | def this() = this(Identifiable.randomUID("dbscan"))
42 |
43 | def setEps(value: Double): this.type = set(eps, value)
44 |
45 | def setMinPoints(value: Int): this.type = set(minPoints, value)
46 |
47 | def setMaxPointsPerPartition(value: Int): this.type = set(maxPointsPerPartition, value)
48 |
49 | override def fit(dataset: Dataset[_]): DBSCAN2Model = {
50 | val instances: RDD[Vector] = dataset.select(col(${featuresCol})).rdd.map {
51 | case Row(point: Vector) => point
52 | }
53 |
54 | val dbscan = DBSCAN.train(instances, ${eps}, ${minPoints}, ${maxPointsPerPartition})
55 |
56 | new DBSCAN2Model(uid, dbscan)
57 | }
58 |
59 | override def copy(extra: ParamMap): Estimator[DBSCAN2Model] = defaultCopy(extra)
60 |
61 | override def transformSchema(schema: StructType): StructType = {
62 | validateAndTransformSchema(schema)
63 | }
64 | }
65 |
66 | class DBSCAN2Model(override val uid: String, val model: DBSCAN) extends
67 | Model[DBSCAN2Model] with DBSCANParams{
68 |
69 | override def copy(extra: ParamMap): DBSCAN2Model = defaultCopy(extra)
70 |
71 | override def transform(dataset: Dataset[_]): DataFrame = {
72 | val clustered = model.labeledPoints
73 | .map(p => (p.vector(0), p.vector(1), p.vector, p.cluster))
74 |
75 | dataset.sparkSession.createDataFrame(clustered)
76 | .toDF(dataset.schema.fieldNames(0),
77 | dataset.schema.fieldNames(1),
78 | ${featuresCol}, ${predictionCol})
79 | }
80 |
81 | override def transformSchema(schema: StructType): StructType = {
82 | validateAndTransformSchema(schema)
83 | }
84 | }
85 |
86 |
--------------------------------------------------------------------------------
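A hedged sketch of using the Estimator/Model pair above. It assumes an existing DataFrame `df` whose first two columns hold the raw 2-D coordinates and whose "features" column (the default `featuresCol`) is a 2-dimensional Vector, since `DBSCAN2Model.transform` reuses the first two field names of the input schema when naming its output:

    import org.apache.spark.ml.dbscan.DBSCAN2

    // assumption: `df` has columns ("x", "y", "features"), features being a 2-D Vector
    val model = new DBSCAN2()
      .setEps(0.3)
      .setMinPoints(10)
      .setMaxPointsPerPartition(250)
      .fit(df)

    val clustered = model.transform(df)  // columns: x, y, features, prediction
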
/src/main/scala/org/apache/spark/ml/sampling/OverSampling.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.sampling
2 |
3 | import org.apache.spark.ml.Transformer
4 | import org.apache.spark.ml.param._
5 | import org.apache.spark.ml.util.Identifiable
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
8 | import org.apache.spark.sql.types.StructType
9 |
10 | /**
11 | * Created by endy on 16-12-8.
12 | */
13 |
14 | trait OverSamplingParams extends Params{
15 | final val threshold = new DoubleParam(this, "threshold", "The threshold that determines " +
16 | "whether to over-sample a class", (x: Double) => x > 1)
17 | def setThreshold(value: Double): this.type = set(threshold, value)
18 |
19 | final val dependentColName = new Param[String](this, "dependentColName", "The column that " +
20 | "provide label values")
21 | def setDependentColName(value: String): this.type = set(dependentColName, value)
22 |
23 | final val primaryClass = new DoubleParam(this, "primaryClass", "the primary class used as " +
24 | "the reference for over-sampling")
25 | def setPrimaryClass(value: Double): this.type = set(primaryClass, value)
26 | }
27 |
28 |
29 | class OverSampling(override val uid: String) extends Transformer with OverSamplingParams {
30 | def this() = this(Identifiable.randomUID("OverSampling"))
31 |
32 | /**
33 | * Transforms the input dataset.
34 | */
35 | override def transform(dataset: Dataset[_]): DataFrame = {
36 | val labelCountPair = dataset.groupBy($(dependentColName)).count().collect()
37 |
38 | val primaryClassCount = labelCountPair
39 | .filter{ case Row(label: Double, count: Long) => label == ${primaryClass}}
40 | .map(x => x.get(1)).headOption.getOrElse(-1L).asInstanceOf[Long]
41 |
42 | if (primaryClassCount == -1) throw new Exception("The label does not exist")
43 |
44 | val res = labelCountPair.zipWithIndex
45 | .map {
46 | case (Row(label: Double, count: Long), index: Int) =>
47 | val ratio = primaryClassCount / count.toDouble
48 |
49 | /**
50 | * if ratio < threshold, only return samples of this label,
51 | * otherwise we sample the data from the samples of this label.
52 | *
53 | * The desired number of samples is: num = primaryClassCount / threshold,
54 | * so the fraction of sample method is: num / count = ratio / threshold.
55 | * Because fraction > 1, the value of 'withReplacement' parameter must be true
56 | */
57 | val df = if (ratio < ${threshold}) {
58 | dataset.filter(col($(dependentColName)) === label)
59 | } else {
60 | val desiredFraction = ratio / ${threshold}
61 | dataset.filter(col($(dependentColName)) === label)
62 | .sample(withReplacement = true, desiredFraction)
63 | }
64 | df.toDF()
65 | }.reduce(_ union _)
66 |
67 | res
68 | }
69 |
70 | override def copy(extra: ParamMap): Transformer = defaultCopy(extra)
71 |
72 | /**
73 | * :: DeveloperApi ::
74 | *
75 | * Check transform validity and derive the output schema from the input schema.
76 | *
77 | * Typical implementation should first conduct verification on schema change and parameter
78 | * validity, including complex parameter interaction checks.
79 | */
80 | override def transformSchema(schema: StructType): StructType = {
81 | schema
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/ml/knn_is/KNN_ISSuite.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.knn_is
2 |
3 | import org.apache.spark.SparkFunSuite
4 | import org.apache.spark.ml.feature.LabeledPoint
5 | import org.apache.spark.ml.linalg.Vectors
6 | import org.apache.spark.ml.util.DefaultReadWriteTest
7 | import org.apache.spark.mllib.evaluation.MulticlassMetrics
8 | import org.apache.spark.mllib.util.MLlibTestSparkContext
9 | import org.apache.spark.rdd.RDD
10 | import org.apache.spark.sql.{Dataset, Row}
11 |
12 | import scala.util.Random
13 |
14 | class KNN_ISSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {
15 | @transient var dataset: Dataset[_] = _
16 |
17 | override def beforeAll(): Unit = {
18 | super.beforeAll()
19 | dataset = spark.createDataFrame(KNN_ISSuite.generateKnnInput(1.0, 1.0,
20 | nPoints = 1000, seed = 42))
21 | }
22 |
23 | test("knn: default params") {
24 | val knn_is = new KNN_ISClassifier()
25 | assert(knn_is.getLabelCol === "label")
26 | assert(knn_is.getFeaturesCol === "features")
27 | assert(knn_is.getPredictionCol === "prediction")
28 | assert(knn_is.getK == 1)
29 | assert(knn_is.getDistanceType == 1)
30 | assert(knn_is.getNumSamplesTest == 1)
31 | assert(knn_is.getNumClass == 1)
32 | assert(knn_is.getNumIter == 1)
33 | assert(knn_is.getInc == 0)
34 | assert(knn_is.getSubdel == 0)
35 | assert(knn_is.getTopdel == 0)
36 | }
37 |
38 | test("train"){
39 | val knn_is = new KNN_ISClassifier()
40 | knn_is.fit(dataset)
41 | }
42 |
43 | test("transform: one iterationNum"){
44 | val knn_is = new KNN_ISClassifier()
45 | .setNumClass(2)
46 | .setNumSamplesTest(dataset.count().toInt)
47 | .setK(5)
48 |
49 | val model = knn_is.fit(dataset)
50 |
51 | val results = model.transform(dataset)
52 | assert(results.count() == dataset.count())
53 |
54 | val source = dataset.select("label").rdd.map{case Row(x: Double) => x}
55 | val res = results.select("prediction").rdd.map{case Row(x: Double) => x}
56 |
57 | val predictions = source.zip(res.asInstanceOf[RDD[Double]])
58 | val metrics = new MulticlassMetrics(predictions)
59 | val precision = metrics.accuracy
60 | assert(precision == 0.64)
61 | }
62 |
63 | test("transform: more than one iterationNum"){
64 | val knn_is = new KNN_ISClassifier()
65 | .setNumClass(2)
66 | .setNumSamplesTest(dataset.count().toInt)
67 | .setNumIter(3)
68 | .setK(5)
69 |
70 | val model = knn_is.fit(dataset)
71 |
72 | val results = model.transform(dataset)
73 | assert(results.count() == dataset.count())
74 |
75 | val source = dataset.select("label")
76 | .rdd.map{case Row(x: Double) => x}.repartition(1)
77 | val res = results.select("prediction")
78 | .rdd.map{case Row(x: Double) => x}.repartition(1)
79 |
80 | val predictions = source.zip(res.asInstanceOf[RDD[Double]])
81 | val metrics = new MulticlassMetrics(predictions)
82 | val precision = metrics.accuracy
83 | assert(precision == 0.648)
84 | }
85 | }
86 |
87 | object KNN_ISSuite {
88 | def generateKnnInput(offset: Double,
89 | scale: Double,
90 | nPoints: Int,
91 | seed: Int): Seq[LabeledPoint] = {
92 | val rnd = new Random(seed)
93 | val x1 = Array.fill[Double](nPoints)(rnd.nextGaussian())
94 |
95 | val y = (0 until nPoints).map { i =>
96 | val p = 1.0 / (1.0 + math.exp(-(offset + scale * x1(i))))
97 | if (rnd.nextDouble() < p) 1.0 else 0.0
98 | }
99 |
100 | val testData = (0 until nPoints).map(i => LabeledPoint(y(i), Vectors.dense(Array(x1(i)))))
101 | testData
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/tsne/impl/BHTSNE.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.tsne.impl
2 |
3 | import breeze.linalg._
4 | import breeze.stats.distributions.Rand
5 | import org.apache.spark.ml.tsne.tree.SPTree
6 | import org.apache.spark.ml.tsne.{TSNEGradient, TSNEHelper, TSNEParam, X2P}
7 | import org.apache.spark.mllib.linalg.distributed.RowMatrix
8 | import org.apache.spark.storage.StorageLevel
9 | import org.slf4j.LoggerFactory
10 |
11 | import scala.util.Random
12 |
13 | object BHTSNE {
14 | private def logger = LoggerFactory.getLogger(BHTSNE.getClass)
15 |
16 | def tsne(
17 | input: RowMatrix,
18 | noDims: Int = 2,
19 | maxIterations: Int = 1000,
20 | perplexity: Double = 30,
21 | theta: Double = 0.5,
22 | reportLoss: Int => Boolean = {i => i % 10 == 0},
23 | callback: (Int, DenseMatrix[Double], Option[Double]) => Unit = {case _ => },
24 | seed: Long = Random.nextLong()
25 | ): DenseMatrix[Double] = {
26 | if(input.rows.getStorageLevel == StorageLevel.NONE) {
27 | logger.warn("Input is not persisted and performance could be bad")
28 | }
29 |
30 | Rand.generator.setSeed(seed)
31 |
32 | val tsneParam = TSNEParam()
33 | import tsneParam._
34 |
35 | val n = input.numRows().toInt
36 | val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian(0, 1)) :/ 1e4
37 | val iY = DenseMatrix.zeros[Double](n, noDims)
38 | val gains = DenseMatrix.ones[Double](n, noDims)
39 |
40 | // approximate p_{j|i}
41 | val p_ji = X2P(input, 1e-5, perplexity)
42 | val P = TSNEHelper.computeP(p_ji, n).glom()
43 | .map(rows => rows.map {
44 | case (i, data) =>
45 | (i, data.map(_._1).toSeq, DenseVector(data.map(_._2 * exaggeration_factor).toArray))
46 | })
47 | .cache()
48 |
49 | var iteration = 1
50 | while(iteration <= maxIterations) {
51 | val bcY = P.context.broadcast(Y)
52 | val bcTree = P.context.broadcast(SPTree(Y))
53 |
54 | val initialValue = (DenseMatrix.zeros[Double](n, noDims), DenseMatrix.zeros[Double](n, noDims), 0.0)
55 | val (posF, negF, sumQ) = P.treeAggregate(initialValue)(
56 | seqOp = (c, v) => {
57 | // c: (pos, neg, sumQ), v: Array[(i, Seq(j), vec(Distance))]
58 | TSNEGradient.computeEdgeForces(v, bcY.value, c._1)
59 | val q = TSNEGradient.computeNonEdgeForces(bcTree.value, bcY.value, theta, c._2, v.map(_._1): _*)
60 | (c._1, c._2, c._3 + q)
61 | },
62 | combOp = (c1, c2) => {
63 |         // c1, c2: (pos, neg, sumQ) partial aggregates
64 | (c1._1 + c2._1, c1._2 + c2._2, c1._3 + c2._3)
65 | })
66 | val dY: DenseMatrix[Double] = posF :- (negF :/ sumQ)
67 |
68 | TSNEHelper.update(Y, dY, iY, gains, iteration, tsneParam)
69 |
70 | if(reportLoss(iteration)) {
71 | val loss = P.treeAggregate(0.0)(
72 | seqOp = (c, v) => {
73 | TSNEGradient.computeLoss(v, bcY.value, sumQ)
74 | },
75 | combOp = _ + _
76 | )
77 | logger.debug(s"Iteration $iteration finished with $loss")
78 | callback(iteration, Y.copy, Some(loss))
79 | } else {
80 | logger.debug(s"Iteration $iteration finished")
81 | callback(iteration, Y.copy, None)
82 | }
83 |
84 | bcY.destroy()
85 | bcTree.destroy()
86 |
87 | //undo early exaggeration
88 | if(iteration == early_exaggeration) {
89 | P.foreach {
90 | rows => rows.foreach {
91 | case (_, _, vec) => vec.foreachPair { case (i, v) => vec.update(i, v / exaggeration_factor) }
92 | }
93 | }
94 | }
95 |
96 | iteration += 1
97 | }
98 |
99 | Y
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
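BHTSNE.tsne takes a persisted RowMatrix of high-dimensional rows and returns a Breeze DenseMatrix with the low-dimensional embedding; progress can be observed through the callback. A minimal sketch of driving it, assuming a local SparkSession and purely synthetic input.

import org.apache.spark.ml.tsne.impl.BHTSNE
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel

object BHTSNEExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("bh-tsne-example").getOrCreate()
    val sc = spark.sparkContext

    // Small random 50-dimensional dataset; persist it, since tsne() warns otherwise.
    val rnd = new scala.util.Random(0)
    val rows = sc.parallelize(Seq.fill(200)(Vectors.dense(Array.fill(50)(rnd.nextGaussian()))))
      .persist(StorageLevel.MEMORY_ONLY)
    val input = new RowMatrix(rows)

    // Barnes-Hut t-SNE down to 2 dimensions; the loss is reported every 10 iterations by default.
    val embedding = BHTSNE.tsne(
      input,
      noDims = 2,
      maxIterations = 200,
      perplexity = 30,
      theta = 0.5,
      callback = (iter, y, loss) => loss.foreach(l => println(s"iter=$iter loss=$l")))

    println(s"Embedding size: ${embedding.rows} x ${embedding.cols}")
    spark.stop()
  }
}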
/src/main/scala/org/apache/spark/ml/util/SparkUtils.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.util
2 |
3 | import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
4 | import breeze.storage.Zero
5 | import org.apache.hadoop.fs.{FileSystem, Path}
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.deploy.SparkHadoopUtil
8 | import org.apache.spark.mllib.linalg.{DenseVector => SDV, SparseVector => SSV, Vector => SV}
9 |
10 | import scala.language.implicitConversions
11 | import scala.reflect.ClassTag
12 |
13 |
14 | object SparkUtils {
15 | implicit def toBreeze(sv: SV): BV[Double] = {
16 | sv match {
17 | case SDV(data) =>
18 | new BDV(data)
19 | case SSV(size, indices, values) =>
20 | new BSV(indices, values, size)
21 | }
22 | }
23 |
24 | implicit def fromBreeze(breezeVector: BV[Double]): SV = {
25 | breezeVector match {
26 | case v: BDV[Double] =>
27 | if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) {
28 | new SDV(v.data)
29 | } else {
30 | new SDV(v.toArray) // Can't use underlying array directly, so make a new one
31 | }
32 | case v: BSV[Double] =>
33 | if (v.index.length == v.used) {
34 | new SSV(v.length, v.index, v.data)
35 | } else {
36 | new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used))
37 | }
38 | case v: BV[_] =>
39 | sys.error("Unsupported Breeze vector type: " + v.getClass.getName)
40 | }
41 | }
42 |
43 | def toBreezeConv[T: ClassTag](sv: SV)(implicit num: Numeric[T]): BV[T] = {
44 | val zero = num.zero
45 | implicit val conv: Array[Double] => Array[T] = (data) => {
46 | data.map(ele => (zero match {
47 | case zero: Double => ele
48 | case zero: Float => ele.toFloat
49 | case zero: Int => ele.toInt
50 | case zero: Long => ele.toLong
51 | }).asInstanceOf[T]).array
52 | }
53 | sv match {
54 | case SDV(data) =>
55 | new BDV[T](data)
56 | case SSV(size, indices, values) =>
57 | new BSV[T](indices, values, size)(Zero[T](zero))
58 | }
59 | }
60 |
61 | def fromBreezeConv[T: ClassTag](breezeVector: BV[T])(implicit num: Numeric[T]): SV = {
62 | implicit val conv: Array[T] => Array[Double] = (data) => {
63 | data.map(num.toDouble).array
64 | }
65 | breezeVector match {
66 | case v: BDV[T] =>
67 | if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) {
68 | new SDV(v.data)
69 | } else {
70 | new SDV(v.toArray) // Can't use underlying array directly, so make a new one
71 | }
72 | case v: BSV[T] =>
73 | if (v.index.length == v.used) {
74 | new SSV(v.length, v.index, v.data)
75 | } else {
76 | new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used))
77 | }
78 | case v: BV[T] =>
79 | sys.error("Unsupported Breeze vector type: " + v.getClass.getName)
80 | }
81 | }
82 |
83 | def getFileSystem(conf: SparkConf, path: Path): FileSystem = {
84 | val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
85 | if (sys.env.contains("HADOOP_CONF_DIR") || sys.env.contains("YARN_CONF_DIR")) {
86 | val hdfsConfPath = if (sys.env.get("HADOOP_CONF_DIR").isDefined) {
87 | sys.env.get("HADOOP_CONF_DIR").get + "/core-site.xml"
88 | } else {
89 | sys.env.get("YARN_CONF_DIR").get + "/core-site.xml"
90 | }
91 | hadoopConf.addResource(new Path(hdfsConfPath))
92 | }
93 | path.getFileSystem(hadoopConf)
94 | }
95 |
96 | def deleteChkptDirs(conf: SparkConf, dirs: Array[String]): Unit = {
97 | val fs = getFileSystem(conf, new Path(dirs(0)))
98 | dirs.foreach(dir => {
99 | fs.delete(new Path(dir), true)
100 | })
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
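SparkUtils bridges MLlib and Breeze vector types in both directions; the conversions are implicit but can also be called explicitly. A small round-trip sketch, assuming only this utility and Breeze on the classpath.

import breeze.linalg.{DenseVector => BDV}
import org.apache.spark.ml.util.SparkUtils._
import org.apache.spark.mllib.linalg.{Vector => SV, Vectors}

object SparkUtilsExample {
  def main(args: Array[String]): Unit = {
    // Convert an MLlib vector to Breeze, do some arithmetic, convert back.
    val sv: SV = Vectors.dense(1.0, 2.0, 3.0)
    val bv: BDV[Double] = toBreeze(sv).toDenseVector
    val doubled: SV = fromBreeze(bv * 2.0)
    println(doubled) // [2.0,4.0,6.0]

    // Sparse vectors survive the round trip as well.
    val sparse = Vectors.sparse(5, Array(1, 3), Array(10.0, 20.0))
    println(fromBreeze(toBreeze(sparse))) // (5,[1,3],[10.0,20.0])
  }
}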
/src/test/scala/org/apache/spark/ml/timeseries/models/ARGARCHSuite.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.models
2 |
3 | import org.apache.commons.math3.random.MersenneTwister
4 | import org.apache.spark.SparkFunSuite
5 | import org.apache.spark.ml.linalg.DenseVector
6 | import org.apache.spark.ml.timeseries.MatrixUtil
7 | import org.apache.spark.ml.util.DefaultReadWriteTest
8 | import org.apache.spark.mllib.util.MLlibTestSparkContext
9 | import org.apache.spark.mllib.util.TestingUtils._
10 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
11 | import org.apache.spark.sql.{Row, _}
12 |
13 | /**
14 | * Created by endy on 16-12-22.
15 | */
16 | class ARGARCHSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest{
17 | test("fit model") {
18 | val omega = 0.2
19 | val alpha = 0.3
20 | val beta = 0.5
21 | val genModel = new ARGARCHModel(0.0, 0.0, alpha, beta, omega)
22 | val rand = new MersenneTwister(5L)
23 | val n = 10000
24 |
25 | val ts = genModel.sample(n, rand)
26 | val data = genDf(ts)
27 |
28 | val model = new GARCH().fit(data)
29 | assert(model.omega - omega < .1) // TODO: we should be able to be more accurate
30 | assert(model.alpha - alpha < .02)
31 | assert(model.beta - beta < .02)
32 | }
33 |
34 |
35 | test("fit model 2") {
36 | val arr = Array[Double](0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
37 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
38 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
39 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
40 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
41 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
42 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
43 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
44 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
45 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
46 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
47 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
48 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
49 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
50 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
51 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
52 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
53 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
54 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1,
55 | 0.0, -0.01, 0.00, -0.1, 0.1, -0.2, -0.1, 0.1, 0.0, -0.01, 0.00, -0.1)
56 | val ts = genDf(arr)
57 |
58 | val model = new ARGARCH().fit(ts)
59 |
60 | assert(model.alpha ~== -0.106 absTol 0.001)
61 | assert(model.beta ~== -1.012 absTol 0.001)
62 | assert(model.omega ~== 0.190 absTol 0.01)
63 | assert(model.c ~== -0.0355 absTol 0.01)
64 | assert(model.phi ~== -0.339 absTol 0.01)
65 | }
66 |
67 | test("standardize and filter") {
68 | val model = new ARGARCHModel(40.0, .4, .2, .3, .4)
69 | val rand = new MersenneTwister(5L)
70 | val n = 10000
71 |
72 | val ts = new DenseVector(model.sample(n, rand))
73 |
74 | // de-heteroskedasticize
75 | val standardized = model.removeTimeDependentEffects(ts)
76 | // heteroskedasticize
77 | val filtered = model.addTimeDependentEffects(standardized)
78 |
79 | assert((MatrixUtil.toBreeze(filtered) - MatrixUtil.toBreeze(ts)).toArray.forall(math.abs(_) <
80 | .001))
81 | }
82 |
83 | def genDf(array: Array[Double]): DataFrame = {
84 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries",
85 | DoubleType)))
86 |
87 | val rdd = spark.sparkContext.parallelize(
88 | array.zipWithIndex.map(x => Row(x._2.formatted("%010d"), x._1)))
89 |
90 | spark.createDataFrame(rdd, schema)
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
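The suite builds its input as a two-column DataFrame (a sortable string time index plus a double value), which is the layout the time-series estimators expect by default. A standalone sketch of fitting ARGARCH on a simulated series, assuming a local SparkSession; the simulation parameters simply mirror the ones used in the tests.

import org.apache.commons.math3.random.MersenneTwister
import org.apache.spark.ml.timeseries.models.{ARGARCH, ARGARCHModel}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

object ARGARCHExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("argarch-example").getOrCreate()

    // Simulate a series as the suite does, then lay it out as (time: String, timeseries: Double).
    val series = new ARGARCHModel(0.0, 0.0, 0.3, 0.5, 0.2).sample(5000, new MersenneTwister(5L))
    val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", DoubleType)))
    val rows = spark.sparkContext.parallelize(
      series.zipWithIndex.map { case (v, i) => Row(i.formatted("%010d"), v) })
    val df = spark.createDataFrame(rows, schema)

    val model = new ARGARCH().fit(df)
    println(s"c=${model.c} phi=${model.phi} omega=${model.omega} alpha=${model.alpha} beta=${model.beta}")

    spark.stop()
  }
}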
/src/main/scala/org/apache/spark/ml/util/LoaderUtils.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.util
2 |
3 | import org.apache.hadoop.fs._
4 | import org.apache.spark.SparkContext
5 | import org.apache.spark.rdd.RDD
6 | import org.apache.spark.sql.catalyst.ScalaReflection
7 | import org.apache.spark.sql.types.{DataType, StructField, StructType}
8 | import org.json4s._
9 | import org.json4s.jackson.JsonMethods._
10 |
11 | import scala.reflect.ClassTag
12 | import scala.reflect.runtime.universe.TypeTag
13 |
14 | // copied from Spark MLlib
15 | /**
16 | * Helper methods for loading models from files.
17 | */
18 | private[ml] object LoaderUtils {
19 |
20 | /** Returns URI for path/data using the Hadoop filesystem */
21 | def dataPath(path: String): String = new Path(path, "data").toUri.toString
22 |
23 | /** Returns URI for path/metadata using the Hadoop filesystem */
24 | def metadataPath(path: String): String = new Path(path, "metadata").toUri.toString
25 |
26 | /**
27 | * Check the schema of loaded model data.
28 | *
29 | * This checks every field in the expected schema to make sure that a field with the same
30 | * name and DataType appears in the loaded schema. Note that this does NOT check metadata
31 | * or containsNull.
32 | *
33 | * @param loadedSchema Schema for model data loaded from file.
34 | * @tparam Data Expected data type from which an expected schema can be derived.
35 | */
36 | def checkSchema[Data: TypeTag](loadedSchema: StructType): Unit = {
37 | // Check schema explicitly since erasure makes it hard to use match-case for checking.
38 | val expectedFields: Array[StructField] =
39 | ScalaReflection.schemaFor[Data].dataType.asInstanceOf[StructType].fields
40 | val loadedFields: Map[String, DataType] =
41 | loadedSchema.map(field => field.name -> field.dataType).toMap
42 | expectedFields.foreach { field =>
43 | assert(loadedFields.contains(field.name), s"Unable to parse model data." +
44 | s" Expected field with name ${field.name} was missing in loaded schema:" +
45 | s" ${loadedFields.mkString(", ")}")
46 | }
47 | }
48 |
49 | /**
50 | * Load metadata from the given path.
51 | * @return (class name, version, metadata)
52 | */
53 | def loadMetadata(sc: SparkContext, path: String): (String, String, JValue) = {
54 | implicit val formats = DefaultFormats
55 | val metadata = parse(sc.textFile(metadataPath(path)).first())
56 | val clazz = (metadata \ "class").extract[String]
57 | val version = (metadata \ "version").extract[String]
58 | (clazz, version, metadata)
59 | }
60 |
61 | /**
62 | * Save an RDD to one HDFS file
63 | * @param sc SparkContext
64 | * @param rdd The RDD to save
65 | * @param outPathStr The HDFS file path of String
66 | * @param header Header line of HDFS file, used for storing some metadata
67 | * @param mapEle The function mapping each element of RDD to a line of String
68 | */
69 | def RDD2HDFSFile[T: ClassTag](sc: SparkContext,
70 | rdd: RDD[T],
71 | outPathStr: String,
72 | header: => String,
73 | mapEle: T => String): Unit = {
74 | val hdpconf = sc.hadoopConfiguration
75 | val fs = FileSystem.get(hdpconf)
76 | val outPath = new Path(outPathStr)
77 | if (fs.exists(outPath)) {
78 | throw new InvalidPathException(s"Output path $outPathStr already exists.")
79 | }
80 | val fout = fs.create(outPath)
81 | fout.write(header.getBytes)
82 | fout.write("\n".getBytes)
83 | rdd.toLocalIterator.foreach(e => {
84 | fout.write(mapEle(e).getBytes)
85 | fout.write("\n".getBytes)
86 | })
87 | fout.close()
88 | }
89 |
90 | /**
91 | * Load an RDD from one HDFS file
92 | * @param sc SparkContext
93 | * @param inPathStr The HDFS file path of String
94 | * @param init_f The function used for initialization after reading header
95 | * @param lineParser The function parses each line in HDFS file to an element of RDD
96 | */
97 | def HDFSFile2RDD[T: ClassTag, M: ClassTag](sc: SparkContext,
98 | inPathStr: String,
99 | init_f: String => M,
100 | lineParser: (M, String) => T): (M, RDD[T]) = {
101 | val rawrdd = sc.textFile(inPathStr)
102 | val header = rawrdd.first()
103 | val meta = init_f(header)
104 | val rdd: RDD[T] = rawrdd.mapPartitions(iter => {
105 | val first = iter.next()
106 | if (first == header) {
107 | iter
108 | } else {
109 | Iterator.single(first) ++ iter
110 | }
111 | }.map(lineParser(meta, _)))
112 | (meta, rdd)
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
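RDD2HDFSFile and HDFSFile2RDD form a matched pair: the first writes a header line followed by one line per element, the second parses the header with init_f and every other line with lineParser. A rough round-trip sketch; the package and output path are placeholders, and the object sits under org.apache.spark.ml only because LoaderUtils is private[ml].

package org.apache.spark.ml.examples

import org.apache.spark.ml.util.LoaderUtils
import org.apache.spark.sql.SparkSession

object LoaderUtilsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("loader-utils-example").getOrCreate()
    val sc = spark.sparkContext

    // Write (id, value) pairs to a single file; the path must not exist yet, or an exception is thrown.
    val rdd = sc.parallelize(Seq((1L, 0.5), (2L, 1.5), (3L, 2.5)))
    LoaderUtils.RDD2HDFSFile[(Long, Double)](
      sc, rdd, "/tmp/loader-utils-demo",
      header = s"count=${rdd.count()}",
      mapEle = { case (id, v) => s"$id,$v" })

    // Read it back: init_f parses the header, lineParser rebuilds each element.
    val (count, loaded) = LoaderUtils.HDFSFile2RDD[(Long, Double), Long](
      sc, "/tmp/loader-utils-demo",
      init_f = header => header.stripPrefix("count=").toLong,
      lineParser = (_, line) => {
        val Array(id, v) = line.split(",")
        (id.toLong, v.toDouble)
      })
    println(s"header count = $count, loaded ${loaded.count()} elements")

    spark.stop()
  }
}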
/src/main/scala/org/apache/spark/ml/timeseries/Lag.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries
2 |
3 | import org.apache.spark.ml.linalg.{DenseMatrix, Matrix, Vector}
4 |
5 | /**
6 | * Created by endy on 16-12-16.
7 | */
8 | object Lag {
9 | /**
10 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and
11 | * columns so that every element in the matrix is full.
12 | */
13 | def lagMatTrimBoth(x: Array[Double], maxLag: Int): Array[Array[Double]] = {
14 | lagMatTrimBoth(x, maxLag, false)
15 | }
16 |
17 | /**
18 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and
19 | * columns so that every element in the matrix is full.
20 | */
21 | def lagMatTrimBoth(x: Array[Double], maxLag: Int, includeOriginal: Boolean)
22 | : Array[Array[Double]] = {
23 | val numObservations = x.length
24 | val numRows = numObservations - maxLag
25 | val numCols = maxLag + (if (includeOriginal) 1 else 0)
26 | val lagMat = Array.ofDim[Double](numRows, numCols)
27 |
28 | val initialLag = if (includeOriginal) 0 else 1
29 |
30 | for (r <- 0 until numRows) {
31 | for (c <- initialLag to maxLag) {
32 | lagMat(r)(c - initialLag) = x(r + maxLag - c)
33 | }
34 | }
35 | lagMat
36 | }
37 |
38 | /**
39 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and
40 | * columns so that every element in the matrix is full.
41 | */
42 | def lagMatTrimBoth(x: Vector, maxLag: Int): Matrix = {
43 | lagMatTrimBoth(x, maxLag, false)
44 | }
45 |
46 | /**
47 | * Makes a lag matrix from the given time series with the given lag, trimming both rows and
48 | * columns so that every element in the matrix is full.
49 | */
50 | def lagMatTrimBoth(x: Vector, maxLag: Int, includeOriginal: Boolean): Matrix = {
51 | val numObservations = x.size
52 | val numRows = numObservations - maxLag
53 | val numCols = maxLag + (if (includeOriginal) 1 else 0)
54 | val lagMat = new DenseMatrix(numRows, numCols, new Array[Double](numRows * numCols))
55 |
56 | lagMatTrimBoth(x, lagMat, maxLag, includeOriginal, 0)
57 | lagMat
58 | }
59 |
60 | /**
61 | * @param x Vector to be lagged.
62 | * @param outputMat Matrix to place the lagged vector into, as a column.
63 | * @param numLags The number of times to lag the vector. E.g. if this is 2, the output matrix
64 | * will include one column that is the vector lagged by 1, and another column to
65 | * the right that is the vector lagged by 2.
66 | * @param includeOriginal Whether to place the original time series into the matrix as well.
67 | * @param colOffset The offset to start placing columns in the output mat.
68 | */
69 | def lagMatTrimBoth(
70 | x: Vector,
71 | outputMat: DenseMatrix,
72 | numLags: Int,
73 | includeOriginal: Boolean,
74 | colOffset: Int): Unit = {
75 | val numRows = outputMat.numRows
76 | val numTruncatedRows = x.size - numRows
77 |
78 | val initialLag = if (includeOriginal) 0 else 1
79 |
80 | val breezeOutputMat = MatrixUtil.toBreeze(outputMat)
81 | for (r <- 0 until numRows) {
82 | for (lag <- initialLag to numLags) {
83 | val c = colOffset + lag - initialLag
84 | breezeOutputMat(r, c) = x(r + numTruncatedRows - lag)
85 | }
86 | }
87 | }
88 |
89 | /**
90 | * Creates a lagged matrix from a current matrix (represented in row-array form).
91 | * Lags each column the appropriate amount of times and then concatenates the columns.
92 | * So given a matrix [a b c], where a/b/c are column vectors, and calling with lag of 2,
93 | * becomes a matrix of the form [a_-1 a_-2 b_-1 b_-2 c_-1 c_-2]
94 | */
95 | def lagMatTrimBoth(
96 | x: Array[Array[Double]],
97 | maxLag: Int,
98 | includeOriginal: Boolean): Array[Array[Double]] = {
99 | val xt = x.transpose
100 | // one matrix per column, consisting of all its lags
101 | val matrices = for (col <- xt) yield {
102 | Lag.lagMatTrimBoth(col, maxLag, includeOriginal)
103 | }
104 | // merge the matrices into 1 matrix by concatenating col-wise
105 | matrices.transpose.map(_.reduceLeft(_ ++ _))
106 | }
107 |
108 | /**
109 | * Creates a lagged matrix from a current matrix (represented in row-array form).
110 | * Lags each column the appropriate amount of times and then concatenates the columns.
111 | * So given a matrix [a b c], where a/b/c are column vectors, and calling with lag of 2,
112 | * becomes a matrix of the form [a_-1 a_-2 b_-1 b_-2 c_-1 c_-2]
113 | * The original time series is not included in the matrix.
114 | */
115 | def lagMatTrimBoth(x: Array[Array[Double]], maxLag: Int): Array[Array[Double]] = {
116 | lagMatTrimBoth(x, maxLag, false)
117 | }
118 | }
119 |
--------------------------------------------------------------------------------
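A worked example makes the trimming behaviour concrete: with maxLag = 2 the first two observations only ever appear as lags, so the output has numObservations - maxLag rows. This small sketch just calls the Array overloads above; the printed rows follow from the loop in lagMatTrimBoth and agree with the lag expectations in UnivariateTimeSeriesSuite.

import org.apache.spark.ml.timeseries.Lag

object LagExample {
  def main(args: Array[String]): Unit = {
    val x = Array(1.0, 2.0, 3.0, 4.0, 5.0)

    // Lags only, originals dropped: each row r holds (x lagged by 1, x lagged by 2).
    val lags = Lag.lagMatTrimBoth(x, 2)
    println(lags.map(_.mkString(", ")).mkString("\n"))
    // 2.0, 1.0
    // 3.0, 2.0
    // 4.0, 3.0

    // With the original series included as the first column.
    val withOrig = Lag.lagMatTrimBoth(x, 2, includeOriginal = true)
    println(withOrig.map(_.mkString(", ")).mkString("\n"))
    // 3.0, 2.0, 1.0
    // 4.0, 3.0, 2.0
    // 5.0, 4.0, 3.0
  }
}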
/src/main/scala/org/apache/spark/ml/tsne/impl/LBFGSTSNE.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.tsne.impl
2 |
3 | import breeze.linalg._
4 | import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS}
5 | import breeze.stats.distributions.Rand
6 | import org.apache.spark.ml.tsne.{TSNEGradient, X2P}
7 | import org.apache.spark.mllib.linalg.distributed.RowMatrix
8 | import org.apache.spark.rdd.RDD
9 | import org.apache.spark.storage.StorageLevel
10 | import org.slf4j.LoggerFactory
11 |
12 | import scala.util.Random
13 |
14 | /**
15 | * TODO: This doesn't work at all (yet or ever).
16 | */
17 | object LBFGSTSNE {
18 | private def logger = LoggerFactory.getLogger(LBFGSTSNE.getClass)
19 |
20 | def tsne(
21 | input: RowMatrix,
22 | noDims: Int = 2,
23 | maxNumIterations: Int = 1000,
24 | numCorrections: Int = 10,
25 | convergenceTol: Double = 1e-4,
26 | perplexity: Double = 30,
27 | seed: Long = Random.nextLong()): DenseMatrix[Double] = {
28 | if(input.rows.getStorageLevel == StorageLevel.NONE) {
29 | logger.warn("Input is not persisted and performance could be bad")
30 | }
31 |
32 | Rand.generator.setSeed(seed)
33 |
34 | val n = input.numRows().toInt
35 | val early_exaggeration = 100
36 | val t_momentum = 250
37 | val initial_momentum = 0.5
38 | val final_momentum = 0.8
39 | val eta = 500.0
40 | val min_gain = 0.01
41 |
42 | val Y: DenseMatrix[Double] = DenseMatrix.rand(n, noDims, Rand.gaussian) //:* .0001
43 | val iY = DenseMatrix.zeros[Double](n, noDims)
44 | val gains = DenseMatrix.ones[Double](n, noDims)
45 |
46 | // approximate p_{j|i}
47 | val p_ji = X2P(input, 1e-5, perplexity)
48 | //logInfo(p_ji.toRowMatrix().rows.collect().toList.toString)
49 | // p_ij = (p_{i|j} + p_{j|i}) / 2n
50 | val P = p_ji.transpose().entries.union(p_ji.entries)
51 | .map(e => ((e.i.toInt, e.j.toInt), e.value))
52 | .reduceByKey(_ + _)
53 | .map{case ((i, j), v) => (i, (j, v / 2 / n)) }
54 | .groupByKey()
55 | .glom()
56 | .cache()
57 |
58 | var iteration = 1
59 |
60 | {
61 | val costFun = new CostFun(P, n, noDims, true)
62 | val lbfgs = new LBFGS[DenseVector[Double]](maxNumIterations, numCorrections, convergenceTol)
63 | val states = lbfgs.iterations(new CachedDiffFunction(costFun), new DenseVector(Y.data))
64 |
65 | while (states.hasNext) {
66 | val state = states.next()
67 | val loss = state.value
68 | //logInfo(state.convergedReason.get.toString)
69 | logger.debug(s"Iteration $iteration finished with $loss")
70 |
71 | Y := asDenseMatrix(state.x, n, noDims)
72 | //subscriber.onNext((iteration, Y.copy, Some(loss)))
73 | iteration += 1
74 | }
75 | }
76 |
77 | {
78 | val costFun = new CostFun(P, n, noDims, false)
79 | val lbfgs = new LBFGS[DenseVector[Double]](maxNumIterations, numCorrections, convergenceTol)
80 | val states = lbfgs.iterations(new CachedDiffFunction(costFun), new DenseVector(Y.data))
81 |
82 | while (states.hasNext) {
83 | val state = states.next()
84 | val loss = state.value
85 | //logInfo(state.convergedReason.get.toString)
86 | logger.debug(s"Iteration $iteration finished with $loss")
87 |
88 | Y := asDenseMatrix(state.x, n, noDims)
89 | //subscriber.onNext((iteration, Y.copy, Some(loss)))
90 | iteration += 1
91 | }
92 | }
93 |
94 | Y
95 | }
96 |
97 | private[this] def asDenseMatrix(v: DenseVector[Double], n: Int, noDims: Int) = {
98 | v.asDenseMatrix.reshape(n, noDims)
99 | }
100 |
101 | private class CostFun(
102 | P: RDD[Array[(Int, Iterable[(Int, Double)])]],
103 | n: Int,
104 | noDims: Int,
105 | exaggeration: Boolean) extends DiffFunction[DenseVector[Double]] {
106 |
107 | override def calculate(weights: DenseVector[Double]): (Double, DenseVector[Double]) = {
108 | val bcY = P.context.broadcast(asDenseMatrix(weights, n, noDims))
109 | val bcExaggeration = P.context.broadcast(exaggeration)
110 |
111 | val numerator = P.map{ arr => TSNEGradient.computeNumerator(bcY.value, arr.map(_._1): _*) }.cache()
112 | val bcNumerator = P.context.broadcast({
113 | numerator.treeAggregate(0.0)(seqOp = (x, v) => x + sum(v), combOp = _ + _)
114 | })
115 |
116 | val (dY, loss) = P.zip(numerator).treeAggregate((DenseMatrix.zeros[Double](n, noDims), 0.0))(
117 | seqOp = (c, v) => {
118 | // c: (grad, loss), v: (Array[(i, Iterable(j, Distance))], numerator)
119 | // TODO: See if we can include early_exaggeration
120 | val l = TSNEGradient.compute(v._1, bcY.value, v._2, bcNumerator.value, c._1, bcExaggeration.value)
121 | (c._1, c._2 + l)
122 | },
123 | combOp = (c1, c2) => {
124 | // c: (grad, loss)
125 | (c1._1 += c2._1, c1._2 + c2._2)
126 | })
127 |
128 | numerator.unpersist()
129 |
130 | (loss, new DenseVector(dY.data))
131 | }
132 | }
133 | }
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |   <modelVersion>4.0.0</modelVersion>
6 |   <groupId>io.enme</groupId>
7 |   <artifactId>enme</artifactId>
8 |   <version>1.0</version>
9 | 
10 |   <properties>
11 |     <scala.version>2.11.8</scala.version>
12 |     <spark.version>2.2.0</spark.version>
13 |     <scala.binary.version>2.11</scala.binary.version>
14 |     <archery.version>0.4.0</archery.version>
15 |   </properties>
16 | 
17 |   <dependencies>
18 |     <dependency>
19 |       <groupId>org.scala-lang</groupId>
20 |       <artifactId>scala-library</artifactId>
21 |       <version>${scala.version}</version>
22 |     </dependency>
23 |     <dependency>
24 |       <groupId>org.apache.spark</groupId>
25 |       <artifactId>spark-mllib_${scala.binary.version}</artifactId>
26 |       <version>${spark.version}</version>
27 |     </dependency>
28 |     <dependency>
29 |       <groupId>org.apache.spark</groupId>
30 |       <artifactId>spark-mllib_${scala.binary.version}</artifactId>
31 |       <version>${spark.version}</version>
32 |       <type>test-jar</type>
33 |       <scope>test</scope>
34 |     </dependency>
35 |     <dependency>
36 |       <groupId>org.apache.spark</groupId>
37 |       <artifactId>spark-core_${scala.binary.version}</artifactId>
38 |       <version>${spark.version}</version>
39 |       <type>test-jar</type>
40 |       <scope>test</scope>
41 |     </dependency>
42 |     <dependency>
43 |       <groupId>com.meetup</groupId>
44 |       <artifactId>archery_${scala.binary.version}</artifactId>
45 |       <version>${archery.version}</version>
46 |     </dependency>
47 |   </dependencies>
48 | 
49 |   <build>
50 |     <pluginManagement>
51 |       <plugins>
52 |         <plugin>
53 |           <groupId>net.alchim31.maven</groupId>
54 |           <artifactId>scala-maven-plugin</artifactId>
55 |           <version>3.2.1</version>
56 |         </plugin>
57 |         <plugin>
58 |           <groupId>org.apache.maven.plugins</groupId>
59 |           <artifactId>maven-compiler-plugin</artifactId>
60 |           <version>2.0.2</version>
61 |           <configuration>
62 |             <source>1.7</source>
63 |             <target>1.7</target>
64 |             <encoding>utf8</encoding>
65 |           </configuration>
66 |         </plugin>
67 |       </plugins>
68 |     </pluginManagement>
69 |     <plugins>
70 |       <plugin>
71 |         <groupId>org.codehaus.mojo</groupId>
72 |         <artifactId>build-helper-maven-plugin</artifactId>
73 |         <version>1.7</version>
74 |         <executions>
75 |           <execution>
76 |             <id>add-source</id>
77 |             <phase>generate-sources</phase>
78 |             <goals>
79 |               <goal>add-source</goal>
80 |             </goals>
81 |             <configuration>
82 |               <sources>
83 |                 <source>src/main/java</source>
84 |               </sources>
85 |             </configuration>
86 |           </execution>
87 |         </executions>
88 |       </plugin>
89 |       <plugin>
90 |         <groupId>net.alchim31.maven</groupId>
91 |         <artifactId>scala-maven-plugin</artifactId>
92 |         <version>3.2.0</version>
93 |         <executions>
94 |           <execution>
95 |             <id>compile-scala-first</id>
96 |             <phase>process-resources</phase>
97 |             <goals>
98 |               <goal>add-source</goal>
99 |               <goal>compile</goal>
100 |             </goals>
101 |           </execution>
102 |           <execution>
103 |             <id>test-compile-scala</id>
104 |             <phase>process-test-resources</phase>
105 |             <goals>
106 |               <goal>add-source</goal>
107 |               <goal>testCompile</goal>
108 |             </goals>
109 |           </execution>
110 |         </executions>
111 |         <configuration>
112 |           <scalaVersion>${scala.version}</scalaVersion>
113 |         </configuration>
114 |       </plugin>
115 |     </plugins>
116 |   </build>
117 | 
118 |   <reporting>
119 |     <plugins>
120 |       <plugin>
121 |         <groupId>org.scala-tools</groupId>
122 |         <artifactId>maven-scala-plugin</artifactId>
123 |         <configuration>
124 |           <scalaVersion>${scala.version}</scalaVersion>
125 |         </configuration>
126 |       </plugin>
127 |     </plugins>
128 |   </reporting>
129 | </project>
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/timeseries/models/Autoregression.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.models
2 |
3 | import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression
4 | import org.apache.spark.ml.{Estimator, Model}
5 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}
6 | import org.apache.spark.ml.param.{Param, ParamMap, Params}
7 | import org.apache.spark.ml.timeseries.{Lag, MatrixUtil}
8 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams
9 | import org.apache.spark.ml.util.Identifiable
10 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
11 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
12 |
13 | /**
14 | * Created by endy on 16-12-16.
15 | */
16 |
17 | trait AutoregressionParams extends TimeSeriesParams {
18 |
19 | final val maxLag = new Param[Int](this, "maxLag", "max lag")
20 | def setMaxLag(value: Int): this.type = set(maxLag, value)
21 |
22 | final val noIntercept = new Param[Boolean](this, "noIntercept", "no intercept")
23 | def setNoIntercept(value: Boolean): this.type = set(noIntercept, value)
24 | }
25 |
26 |
27 | class Autoregression(override val uid: String)
28 | extends Estimator[ARModel] with AutoregressionParams{
29 |
30 | def this() = this(Identifiable.randomUID("Autoregression"))
31 |
32 | setDefault(noIntercept -> false, maxLag -> 1, timeCol -> "time",
33 | timeSeriesCol -> "timeseries")
34 | /**
35 | * Fits a model to the input data.
36 | */
37 | override def fit(dataset: Dataset[_]): ARModel = {
38 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map {
39 | case Row(time: String, value: Double) => (time, value)
40 | }.sortByKey().collect()
41 |
42 | val dataVector = Vectors.dense(data.map(x => x._2))
43 |
44 | // Make left hand side
45 | val Y = MatrixUtil.toBreeze(dataVector)(${maxLag} until dataVector.size)
46 | // Make lagged right hand side
47 | val X = Lag.lagMatTrimBoth(dataVector, ${maxLag})
48 |
49 | val regression = new OLSMultipleLinearRegression()
50 | regression.setNoIntercept(${noIntercept}) // drop intercept in regression
51 | regression.newSampleData(Y.toArray, MatrixUtil.matToRowArrs(X))
52 | val params = regression.estimateRegressionParameters()
53 | val (c, coeffs) = if (${noIntercept}) (0.0, params) else (params.head, params.tail)
54 |
55 | new ARModel(c, coeffs)
56 | .setTimeCol(${timeCol})
57 | .setTimeSeriesCol(${timeSeriesCol})
58 | }
59 |
60 | override def copy(extra: ParamMap): Estimator[ARModel] = defaultCopy(extra)
61 |
62 | /**
63 | * :: DeveloperApi ::
64 | *
65 | * Check transform validity and derive the output schema from the input schema.
66 | *
67 | * Typical implementation should first conduct verification on schema change and parameter
68 | * validity, including complex parameter interaction checks.
69 | */
70 | override def transformSchema(schema: StructType): StructType = {
71 | schema
72 | }
73 | }
74 |
75 | class ARModel(override val uid: String, val c: Double, val coefficients: Array[Double]) extends
76 | Model[ARModel] with AutoregressionParams {
77 |
78 | def this(c: Double, coefficients: Array[Double]) = this(Identifiable.randomUID("ARModel"), c,
79 | coefficients)
80 |
81 | /**
82 | * Transforms the input dataset.
83 | */
84 | override def transform(dataset: Dataset[_]): DataFrame = {
85 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map {
86 | case Row(time: String, value: Double) => (time, value)
87 | }.sortByKey().collect()
88 | .map(x => x._2)
89 |
90 | val dataVector = Vectors.dense(data)
91 |
92 | val dest = addTimeDependentEffects(dataVector)
93 |
94 | val resRDD = dataset.sparkSession.sparkContext.parallelize(dest.toArray.map(x => Row(x)))
95 |
96 | val structType = transformSchema(dataset.schema)
97 |
98 | dataset.sparkSession.createDataFrame(resRDD, structType)
99 | }
100 |
101 | def removeTimeDependentEffects(ts: Vector): Vector = {
102 | val dest = new Array[Double](ts.size)
103 | var i = 0
104 | while (i < ts.size) {
105 | dest(i) = ts(i) - c
106 | var j = 0
107 | while (j < coefficients.length && i - j - 1 >= 0) {
108 | dest(i) -= ts(i - j - 1) * coefficients(j)
109 | j += 1
110 | }
111 | i += 1
112 | }
113 | new DenseVector(dest)
114 | }
115 |
116 | def addTimeDependentEffects(ts: Vector): Vector = {
117 | val dest = new Array[Double](ts.size)
118 | var i = 0
119 | while (i < ts.size) {
120 | dest(i) = c + ts(i)
121 | var j = 0
122 | while (j < coefficients.length && i - j - 1 >= 0) {
123 | dest(i) += dest(i - j - 1) * coefficients(j)
124 | j += 1
125 | }
126 | i += 1
127 | }
128 | new DenseVector(dest)
129 | }
130 |
131 | /**
132 | * :: DeveloperApi ::
133 | *
134 | * Check transform validity and derive the output schema from the input schema.
135 | *
136 | * Typical implementation should first conduct verification on schema change and parameter
137 | * validity, including complex parameter interaction checks.
138 | */
139 | override def transformSchema(schema: StructType): StructType = {
140 | StructType(Array(StructField("Autoregression", DoubleType)))
141 |
142 | }
143 |
144 | override def copy(extra: ParamMap): ARModel = defaultCopy(extra)
145 |
146 | }
147 |
--------------------------------------------------------------------------------
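Autoregression.fit selects the (time, timeseries) columns, sorts by the string time key, and regresses each value on its lags with OLS, returning an ARModel holding the intercept c and the AR coefficients. A minimal sketch on a simulated AR(1) series, assuming a local SparkSession; the column names rely on the defaults set above.

import org.apache.spark.ml.timeseries.models.Autoregression
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

object AutoregressionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ar-example").getOrCreate()

    // An AR(1) process x_t = 0.5 + 0.6 * x_{t-1} + noise, laid out in the
    // (time: String, timeseries: Double) shape the estimator selects by default.
    val rnd = new scala.util.Random(7)
    val values = new Array[Double](500)
    for (i <- 1 until values.length) {
      values(i) = 0.5 + 0.6 * values(i - 1) + 0.1 * rnd.nextGaussian()
    }
    val schema = StructType(Array(StructField("time", StringType), StructField("timeseries", DoubleType)))
    val rows = spark.sparkContext.parallelize(
      values.zipWithIndex.map { case (v, i) => Row(i.formatted("%010d"), v) })
    val df = spark.createDataFrame(rows, schema)

    val model = new Autoregression().setMaxLag(1).fit(df)
    println(s"intercept c = ${model.c}, coefficients = ${model.coefficients.mkString(", ")}")

    spark.stop()
  }
}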
/src/main/scala/org/apache/spark/ml/fm/FMModel.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.fm
2 |
3 | import org.apache.spark.ml.fm.FM._
4 | import org.apache.spark.ml.util.LoaderUtils
5 | import org.apache.spark.ml.util.SparkUtils._
6 | import org.apache.spark.SparkContext
7 | import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, RegressionMetrics}
8 | import org.apache.spark.mllib.linalg.{Vector => SV}
9 | import org.apache.spark.mllib.regression.LabeledPoint
10 | import org.apache.spark.mllib.util.{Loader, Saveable}
11 | import org.apache.spark.rdd.RDD
12 | import org.apache.spark.sql.{Row, SQLContext}
13 | import org.apache.spark.storage.StorageLevel
14 | import org.json4s.DefaultFormats
15 | import org.json4s.JsonDSL._
16 | import org.json4s.jackson.JsonMethods._
17 |
18 | class FMModel(
19 | val k: Int,
20 | val intercept: ED,
21 | val classification: Boolean,
22 | val factors: RDD[(Long, VD)]) extends Serializable with Saveable {
23 | def predict(data: RDD[(Long, SV)]): RDD[(Long, ED)] = {
24 | data.flatMap { case (sampleId, features) =>
25 | features.activeIterator.filter(_._2 != 0.0).map {
26 | case (featureId, value) =>
27 | (featureId.toLong, (sampleId, value))
28 | }
29 | }.join(factors).map { case (featureId, ((sampleId, x), w)) =>
30 | (sampleId, forwardInterval(k, x, w))
31 | }.reduceByKey(reduceInterval).map { case (sampleId, arr) =>
32 | var result = predictInterval(k, intercept, arr)
33 | if (classification) {
34 | result = 1.0 / (1.0 + math.exp(-result))
35 | }
36 | (sampleId, result)
37 | }
38 | }
39 |
40 | def loss(data: RDD[(Long, LabeledPoint)]): Double = {
41 | // val minTarget = data.map(_._2.label).min()
42 | // val maxTarget = data.map(_._2.label).max()
43 | val perd = predict(data.map(t => (t._1, t._2.features)))
44 | val label = data.map(t => (t._1, t._2.label))
45 | val scoreAndLabels = label.join(perd).map { case (_, (label, score)) =>
46 | // var r = Math.max(score, minTarget)
47 | // r = Math.min(r, maxTarget)
48 | // pow(l - r, 2)
49 | (score, label)
50 | }
51 | scoreAndLabels.persist(StorageLevel.MEMORY_AND_DISK)
52 | val ret = if (classification) auc(scoreAndLabels) else rmse(scoreAndLabels)
53 | scoreAndLabels.unpersist(blocking = false)
54 | ret
55 | }
56 |
57 | def rmse(scoreAndLabels: RDD[(Double, Double)]): Double = {
58 | val metrics = new RegressionMetrics(scoreAndLabels)
59 | metrics.rootMeanSquaredError
60 | }
61 |
62 | def auc(scoreAndLabels: RDD[(Double, Double)]): Double = {
63 | val metrics = new BinaryClassificationMetrics(scoreAndLabels)
64 | metrics.areaUnderROC()
65 | }
66 |
67 | override def save(sc: SparkContext, path: String): Unit = {
68 | FMModel.SaveLoadV1_0.save(sc, path, k, intercept, classification, factors)
69 | }
70 |
71 | override protected def formatVersion: String = FMModel.SaveLoadV1_0.formatVersionV1_0
72 | }
73 |
74 | object FMModel extends Loader[FMModel] {
75 |
76 | override def load(sc: SparkContext, path: String): FMModel = {
77 | val (loadedClassName, version, metadata) = LoaderUtils.loadMetadata(sc, path)
78 | val versionV1_0 = SaveLoadV1_0.formatVersionV1_0
79 | val classNameV1_0 = SaveLoadV1_0.classNameV1_0
80 | if (loadedClassName == classNameV1_0 && version == versionV1_0) {
81 | implicit val formats = DefaultFormats
82 | val classification = (metadata \ "classification").extract[Boolean]
83 | val intercept = (metadata \ "intercept").extract[Double]
84 | val k = (metadata \ "k").extract[Int]
85 | val dataPath = LoaderUtils.dataPath(path)
86 | val sqlContext = new SQLContext(sc)
87 | val dataRDD = sqlContext.read.parquet(dataPath)
88 | val dataArray = dataRDD.select("featureId", "factors").take(1)
89 | assert(dataArray.length == 1, s"Unable to load $loadedClassName data from: $dataPath")
90 | val data = dataArray(0)
91 | assert(data.size == 2, s"Unable to load $loadedClassName data from: $dataPath")
92 | val factors = dataRDD.rdd.map {
93 | case Row(featureId: Long, factors: Seq[Double]) =>
94 | (featureId, factors.toArray)
95 | }
96 | new FMModel(k, intercept, classification, factors)
97 | } else {
98 | throw new Exception(
99 |         s"FMModel.load did not recognize model with (className, format version): " +
100 | s"($loadedClassName, $version). Supported:\n" +
101 | s" ($classNameV1_0, 1.0)")
102 | }
103 |
104 | }
105 |
106 | private object SaveLoadV1_0 {
107 | val formatVersionV1_0 = "1.0"
108 | val classNameV1_0 = "com.github.cloudml.zen.ml.recommendation.FMModel"
109 |
110 | def save(
111 | sc: SparkContext,
112 | path: String,
113 | k: Int,
114 | intercept: Double,
115 | classification: Boolean,
116 | factors: RDD[(Long, Array[Double])]): Unit = {
117 | val metadata = compact(render
118 | (("class" -> classNameV1_0) ~ ("version" -> formatVersionV1_0) ~
119 | ("k" -> k) ~ ("intercept" -> intercept) ~ ("classification" -> classification)))
120 | sc.parallelize(Seq(metadata), 1).saveAsTextFile(LoaderUtils.metadataPath(path))
121 |
122 | val sqlContext = new SQLContext(sc)
123 | import sqlContext.implicits._
124 | // Create Parquet data.
125 | factors.toDF("featureId", "factors").write.parquet(LoaderUtils.dataPath(path))
126 | }
127 | }
128 |
129 | }
130 |
--------------------------------------------------------------------------------
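FMModel.load reverses save: it reads the JSON metadata line, checks the class name and format version, and rebuilds the factors RDD from the Parquet data directory. A short sketch of loading a previously saved model and scoring a few samples; the path is a placeholder and the feature layout is illustrative only.

import org.apache.spark.ml.fm.FMModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SparkSession

object FMModelExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("fm-model-example").getOrCreate()
    val sc = spark.sparkContext

    // Load a model previously written with model.save(sc, path); the path is a placeholder.
    val model = FMModel.load(sc, "/tmp/fm-model")

    // Score a few (sampleId, feature vector) pairs.
    val samples = sc.parallelize(Seq(
      (0L, Vectors.dense(1.0, 0.0, 2.0)),
      (1L, Vectors.dense(0.0, 3.0, 1.0))))
    model.predict(samples).collect().foreach { case (id, score) =>
      println(s"sample $id -> $score")
    }

    spark.stop()
  }
}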
/src/test/scala/org/apache/spark/ml/timeseries/UnivariateTimeSeriesSuite.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries
2 |
3 | import org.apache.commons.math3.random.MersenneTwister
4 | import org.apache.spark.SparkFunSuite
5 | import org.apache.spark.ml.linalg.{DenseVector, Matrices, Vectors}
6 | import org.apache.spark.ml.util.DefaultReadWriteTest
7 | import org.apache.spark.mllib.util.MLlibTestSparkContext
8 | import org.apache.spark.mllib.util.TestingUtils._
9 |
10 |
11 | /**
12 | * Created by endy on 16-12-21.
13 | */
14 | class UnivariateTimeSeriesSuite extends SparkFunSuite with MLlibTestSparkContext
15 | with DefaultReadWriteTest {
16 |
17 | test("lagIncludeOriginalsTrue") {
18 | val lagMatrix = UnivariateTimeSeries.lag(Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0), 2, true)
19 | assert(lagMatrix === Matrices.dense(3, 3, Array(3.0, 4.0, 5.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0)))
20 | }
21 |
22 | test("lagIncludeOriginalsFalse") {
23 | val lagMatrix = UnivariateTimeSeries.lag(Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0), 2, false)
24 | assert(lagMatrix == Matrices.dense(3, 2, Array(2.0, 3.0, 4.0, 1.0, 2.0, 3.0)))
25 | }
26 |
27 | test("autocorr") {
28 | val rand = new MersenneTwister(5L)
29 | val iidAutocorr = UnivariateTimeSeries.autocorr(Array.fill(10000)(rand.nextDouble * 5.0), 3)
30 | iidAutocorr.foreach(x => assert(math.abs(x) < .03))
31 | }
32 |
33 | test("upsampling") {
34 | // replicating upsampling examples
35 | // from http://www.mathworks.com/help/signal/ref/upsample.html?searchHighlight=upsample
36 | val y = new DenseVector(Array(1.0, 2.0, 3.0, 4.0))
37 | val yUp1 = UnivariateTimeSeries.upsample(y, 3, useZero = true).toArray
38 | assert(yUp1 === Array(1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0, 0.0, 0.0))
39 |
40 | val yUp2 = UnivariateTimeSeries.upsample(y, 3, useZero = true, phase = 2).toArray
41 | assert(yUp2 === Array(0.0, 0.0, 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0))
42 | }
43 |
44 | test("downsampling") {
45 | // replicating downsampling examples
46 | // from http://www.mathworks.com/help/signal/ref/downsample.html?searchHighlight=downsample
47 | val y = new DenseVector((1 to 10).toArray.map(_.toDouble))
48 | val yDown1 = UnivariateTimeSeries.downsample(y, 3).toArray
49 | assert(yDown1 === Array(1.0, 4.0, 7.0, 10.0))
50 |
51 | val yDown2 = UnivariateTimeSeries.downsample(y, 3, phase = 2).toArray
52 | assert(yDown2 === Array(3.0, 6.0, 9.0))
53 | }
54 |
55 | test("signal reconstruction with spline") {
56 |     // If we take a high-frequency signal, downsample it (at a rate that doesn't cause aliasing),
57 |     // then upsample and apply an interpolation filter, the result should be fairly
58 |     // close to the original signal. In our case, we drop NaNs that are not filled by interpolation
59 |     // (i.e. no extrapolation)
60 |
61 | val y = (1 to 1000).toArray.map(_.toDouble / 100.0).map(Math.sin)
62 | val vy = new DenseVector(y)
63 | val lessFreq = UnivariateTimeSeries.downsample(vy, 100)
64 | val moreFreq = UnivariateTimeSeries.upsample(lessFreq, 100)
65 |
66 | // work on copies
67 | val splineY = UnivariateTimeSeries.fillSpline(new DenseVector(moreFreq.toArray)).toArray
68 | val lineY = UnivariateTimeSeries.fillLinear(new DenseVector(moreFreq.toArray)).toArray
69 |
70 | val MSE = (est: Array[Double], obs: Array[Double]) => {
71 | val errs = est.zip(obs).filter(!_._1.isNaN).map { case (yhat, yi) =>
72 | (yhat - yi) * (yhat - yi)
73 | }
74 | errs.sum / errs.length
75 | }
76 |
77 | val sE = MSE(splineY, y)
78 | val lE = MSE(lineY, y)
79 |
80 | // a cubic spline should be better than linear interpolation
81 | assert(sE < lE)
82 | }
83 |
84 | test("differencing at lag") {
85 | val rand = new MersenneTwister(10L)
86 | val n = 100
87 | val sampled = new DenseVector(Array.fill(n)(rand.nextGaussian))
88 | val lag = 5
89 | val diffed = UnivariateTimeSeries.differencesAtLag(sampled, lag)
90 | val invDiffed = UnivariateTimeSeries.inverseDifferencesAtLag(diffed, lag)
91 |
92 | for (i <- 0 until n) {
93 | assert(sampled(i) ~== invDiffed(i) absTol 1e-6)
94 | }
95 |
96 | assert(diffed(10) == (sampled(10) - sampled(5)))
97 | assert(diffed(99) == (sampled(99) - sampled(94)))
98 | }
99 |
100 | test("differencing of order d") {
101 | val rand = new MersenneTwister(10L)
102 | val n = 100
103 | val sampled = new DenseVector(Array.fill(n)(rand.nextGaussian))
104 | // differencing at order 1 and lag 1 should be the same
105 | val diffedOfOrder1 = UnivariateTimeSeries.differencesOfOrderD(sampled, 1)
106 | val diffedAtLag1 = UnivariateTimeSeries.differencesAtLag(sampled, 1)
107 |
108 | for (i <- 0 until n) {
109 | assert(diffedAtLag1(i) ~== diffedOfOrder1(i) absTol 1e-6)
110 | }
111 |
112 | // differencing at order and inversing should return the original series
113 | val diffedOfOrder5 = UnivariateTimeSeries.differencesOfOrderD(sampled, 5)
114 | val invDiffedOfOrder5 = UnivariateTimeSeries.inverseDifferencesOfOrderD(diffedOfOrder5, 5)
115 |
116 | for (i <- 0 until n) {
117 | assert(invDiffedOfOrder5(i) ~== sampled(i) absTol 1e-6)
118 | }
119 |
120 |     // Differencing of order n + 1 should be the same as applying one more differencing
121 |     // step to a vector that has already been differenced to order n
122 | val diffedOfOrder6 = UnivariateTimeSeries.differencesOfOrderD(sampled, 6)
123 | val diffedOneMore = UnivariateTimeSeries.differencesOfOrderD(diffedOfOrder5, 1)
124 | // compare start at index = 6
125 | for (i <- 6 until n) {
126 | assert(diffedOfOrder6(i) ~== diffedOneMore(i) absTol 1e-6)
127 | }
128 | }
129 | }
130 |
--------------------------------------------------------------------------------
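Outside the test harness the same helpers compose directly: upsampling inserts zeros (or NaNs) between samples, downsampling keeps every n-th sample, and differencing at a lag is invertible. A small sketch using only calls exercised by the suite above; the printed values follow from those tests.

import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.ml.timeseries.UnivariateTimeSeries

object UnivariateTimeSeriesExample {
  def main(args: Array[String]): Unit = {
    val y = new DenseVector(Array(1.0, 2.0, 3.0, 4.0))

    // Insert two zeros after every sample, then thin the result back out.
    val up = UnivariateTimeSeries.upsample(y, 3, useZero = true)
    println(up.toArray.mkString(", ")) // 1.0, 0.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0, 0.0, 0.0
    val down = UnivariateTimeSeries.downsample(new DenseVector(up.toArray), 3)
    println(down.toArray.mkString(", ")) // 1.0, 2.0, 3.0, 4.0

    // Difference at lag 1 and invert it to recover the original series.
    val diffed = UnivariateTimeSeries.differencesAtLag(y, 1)
    val restored = UnivariateTimeSeries.inverseDifferencesAtLag(diffed, 1)
    println(restored.toArray.mkString(", ")) // 1.0, 2.0, 3.0, 4.0
  }
}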
/src/main/scala/org/apache/spark/ml/tsne/TSNEGradient.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.tsne
2 |
3 | import breeze.linalg._
4 | import breeze.numerics._
5 | import org.apache.spark.ml.tsne.tree.SPTree
6 | import org.slf4j.LoggerFactory
7 |
8 | object TSNEGradient {
9 | def logger = LoggerFactory.getLogger(TSNEGradient.getClass)
10 |
11 | /**
12 | * Compute the numerator from the matrix Y
13 | *
14 | * @param idx the index in the matrix to use.
15 | * @param Y the matrix to analyze
16 | * @return the numerator
17 | */
18 | def computeNumerator(Y: DenseMatrix[Double], idx: Int *): DenseMatrix[Double] = {
19 | // Y_sum = ||Y_i||^2
20 | val sumY = sum(pow(Y, 2).apply(*, ::)) // n * 1
21 |     val subY = Y(idx, ::).toDenseMatrix // k * d
22 | val y1: DenseMatrix[Double] = Y * (-2.0 :* subY.t) // n * k
23 | val num: DenseMatrix[Double] = (y1(::, *) + sumY).t // k * n
24 | num := 1.0 :/ (1.0 :+ (num(::, *) + sumY(idx).toDenseVector)) // k * n
25 |
26 | idx.indices.foreach(i => num.update(i, idx(i), 0.0)) // num(i, i) = 0
27 |
28 | num
29 | }
30 |
31 | /**
32 | * Compute the TSNE Gradient at i. Update the gradient through dY then return costs attributed at i.
33 | *
34 | * @param data data point for row i by list of pair of (j, p_ij) and 0 <= j < n
35 | * @param Y current Y [n * 2]
36 | * @param totalNum the common numerator that captures the t-distribution of Y
37 | * @param dY gradient of Y
38 | * @return loss attributed to row i
39 | */
40 | def compute(
41 | data: Array[(Int, Iterable[(Int, Double)])],
42 | Y: DenseMatrix[Double],
43 | num: DenseMatrix[Double],
44 | totalNum: Double,
45 | dY: DenseMatrix[Double],
46 | exaggeration: Boolean): Double = {
47 | // q = (1 + ||Y_i - Y_j||^2)^-1 / sum(1 + ||Y_k - Y_l||^2)^-1
48 | val q: DenseMatrix[Double] = num / totalNum
49 | q.foreachPair{case ((i, j), v) => q.update(i, j, math.max(v, 1e-12))}
50 |
51 | // q = q - p
52 | val loss = data.zipWithIndex.flatMap {
53 | case ((_, itr), i) =>
54 | itr.map{
55 | case (j, p) =>
56 | val exaggeratedP = if(exaggeration) p * 4 else p
57 | val qij = q(i, j)
58 | val l = exaggeratedP * math.log(exaggeratedP / qij)
59 | q.update(i, j, qij - exaggeratedP)
60 | if(l.isNaN) 0.0 else l
61 | }
62 | }.sum
63 |
64 | // l = [ (p_ij - q_ij) * (1 + ||Y_i - Y_j||^2)^-1 ]
65 | q :*= -num
66 | // l_sum = [0 0 ... sum(l) ... 0]
67 | sum(q(*, ::)).foreachPair{ case (i, v) => q.update(i, data(i)._1, q(i, data(i)._1) - v) }
68 |
69 | // dY_i = -4 * (l - l_sum) * Y
70 | val dYi: DenseMatrix[Double] = -4.0 :* (q * Y)
71 | data.map(_._1).zipWithIndex.foreach{
72 | case (i, idx) => dY(i, ::) := dYi(idx, ::)
73 | }
74 |
75 | loss
76 | }
77 |
78 | /** BH Tree related functions **/
79 |
80 | /**
81 | *
82 | * @param data array of (row_id, Seq(col_id), Vector(P_ij))
83 | * @param Y matrix
84 | * @param posF positive forces
85 | */
86 | def computeEdgeForces(data: Array[(Int, Seq[Int], DenseVector[Double])],
87 | Y: DenseMatrix[Double],
88 | posF: DenseMatrix[Double]): Unit = {
89 | data.foreach {
90 | case (i, cols, vec) =>
91 | // k x D - 1 x D => k x D
92 | val diff = Y(cols, ::).toDenseMatrix.apply(*, ::) - Y(i, ::).t
93 | // k x D => k x 1
94 | val qZ = 1.0 :+ sum(pow(diff, 2).apply(*, ::))
95 | posF(i, ::) := (vec :/ qZ).t * (-diff)
96 | }
97 | }
98 |
99 | def computeNonEdgeForces(tree: SPTree,
100 | Y: DenseMatrix[Double],
101 | theta: Double,
102 | negF: DenseMatrix[Double],
103 | idx: Int *): Double = {
104 | idx.foldLeft(0.0)((acc, i) => acc + computeNonEdgeForce(tree, Y(i, ::).t, theta, negF, i))
105 | }
106 |
107 | /**
108 |    * Calculate negative forces using BH approximation
109 | *
110 | * @param tree SPTree used for approximation
111 | * @param y y_i
112 | * @param theta threshold for correctness / speed
113 | * @param negF negative forces
114 | * @param i row
115 | * @return sum of Q
116 | */
117 | private def computeNonEdgeForce(tree: SPTree,
118 | y: DenseVector[Double],
119 | theta: Double,
120 | negF: DenseMatrix[Double],
121 | i: Int): Double = {
122 | import tree._
123 | if(getCount == 0 || (isLeaf && center.equals(y))) {
124 | 0.0
125 | } else {
126 | val diff = y - center
127 | val diffSq = sum(pow(diff, 2))
128 | if(isLeaf || radiusSq / diffSq < theta) {
129 | val qZ = 1 / (1 + diffSq)
130 | val nqZ = getCount * qZ
131 | negF(i, ::) :+= (nqZ * qZ * diff).t
132 | nqZ
133 | } else {
134 | children.foldLeft(0.0)((acc, child) => acc + computeNonEdgeForce(child, y, theta, negF, i))
135 | }
136 | }
137 | }
138 |
139 | def computeLoss(data: Array[(Int, Seq[Int], DenseVector[Double])],
140 | Y: DenseMatrix[Double],
141 | sumQ: Double): Double = {
142 | data.foldLeft(0.0){
143 | case (acc, (i, cols, vec)) =>
144 | val diff = Y(cols, ::).toDenseMatrix.apply(*, ::) - Y(i, ::).t
145 | val diffSq = sum(pow(diff, 2).apply(*, ::))
146 | val Q = (1.0 :/ (1.0 :+ diffSq)) :/ sumQ
147 | sum(vec :* breeze.numerics.log(max(vec, 1e-12) :/ max(Q, 1e-12)))
148 | }
149 | }
150 | }
--------------------------------------------------------------------------------
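computeNumerator avoids forming pairwise differences explicitly: it expands ||y_i - y_j||^2 as ||y_i||^2 + ||y_j||^2 - 2 y_i . y_j, which is what the sumY / subY / y1 terms implement. A quick local check against the direct per-pair formula, assuming that is a convincing enough verification.

import breeze.linalg._
import breeze.numerics.abs
import org.apache.spark.ml.tsne.TSNEGradient

object TSNEGradientCheck {
  def main(args: Array[String]): Unit = {
    // A tiny random embedding: 5 points in 2 dimensions.
    val Y = DenseMatrix.rand(5, 2)

    // Vectorised numerator for row 0, as used by the gradient computation.
    val num = TSNEGradient.computeNumerator(Y, 0)

    // Direct computation of (1 + ||y_0 - y_j||^2)^-1, with the self entry zeroed out.
    val direct = DenseVector.tabulate(5) { j =>
      if (j == 0) 0.0
      else {
        val diff = Y(0, ::).t - Y(j, ::).t
        1.0 / (1.0 + (diff dot diff))
      }
    }

    println(max(abs(num(0, ::).t - direct))) // ~0.0
  }
}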
/src/main/scala/org/apache/spark/ml/mvm/MVMModel.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.mvm
2 |
3 | import org.apache.spark.ml.mvm.MVM._
4 | import org.apache.spark.ml.util.LoaderUtils
5 | import org.apache.spark.ml.util.SparkUtils._
6 | import org.apache.spark.SparkContext
7 | import org.apache.spark.mllib.evaluation.{RegressionMetrics, BinaryClassificationMetrics}
8 | import org.apache.spark.mllib.linalg.{Vector => SV}
9 | import org.apache.spark.mllib.regression.LabeledPoint
10 | import org.apache.spark.mllib.util.{Loader, Saveable}
11 | import org.apache.spark.rdd.RDD
12 | import org.apache.spark.sql.{Row, SQLContext}
13 | import org.apache.spark.storage.StorageLevel
14 | import org.json4s.DefaultFormats
15 | import org.json4s.JsonDSL._
16 | import org.json4s.jackson.JsonMethods._
17 |
18 | import scala.math._
19 |
20 | class MVMModel(
21 | val k: Int,
22 | val views: Array[Long],
23 | val classification: Boolean,
24 | val factors: RDD[(Long, VD)]) extends Serializable with Saveable {
25 | def predict(data: RDD[(Long, SV)]): RDD[(Long, ED)] = {
26 | val numFeatures = data.first()._2.size.toLong
27 | data.flatMap { case (sampleId, features) =>
28 | features.activeIterator.filter(_._2 != 0.0).map {
29 | case (featureId, value) =>
30 | (featureId.toLong, (sampleId, value))
31 | } ++ views.indices.map { i => (numFeatures + i, (sampleId, 1D)) }
32 | }.join(factors).map { case (featureId, ((sampleId, x), w)) =>
33 | val viewSize = views.length
34 | val viewId = featureId2viewId(featureId, views)
35 | (sampleId, forwardInterval(k, viewSize, viewId, x, w))
36 | }.reduceByKey(reduceInterval).map { case (sampleId, arr) =>
37 | var result = predictInterval(k, arr)
38 | if (classification) {
39 | result = 1.0 / (1.0 + math.exp(-result))
40 | }
41 | (sampleId, result)
42 | }
43 | }
44 |
45 | def loss(data: RDD[(Long, LabeledPoint)]): Double = {
46 | // val minTarget = data.map(_._2.label).min()
47 | // val maxTarget = data.map(_._2.label).max()
48 | val perd = predict(data.map(t => (t._1, t._2.features)))
49 | val label = data.map(t => (t._1, t._2.label))
50 | val scoreAndLabels = label.join(perd).map { case (_, (label, score)) =>
51 | // var r = Math.max(score, minTarget)
52 | // r = Math.min(r, maxTarget)
53 | // pow(l - r, 2)
54 | (score, label)
55 | }
56 | scoreAndLabels.persist(StorageLevel.MEMORY_AND_DISK)
57 | val ret = if (classification) auc(scoreAndLabels) else rmse(scoreAndLabels)
58 | scoreAndLabels.unpersist(blocking = false)
59 | ret
60 | }
61 |
62 | def rmse(scoreAndLabels: RDD[(Double, Double)]): Double = {
63 | val metrics = new RegressionMetrics(scoreAndLabels)
64 | metrics.rootMeanSquaredError
65 | }
66 |
67 | def auc(scoreAndLabels: RDD[(Double, Double)]): Double = {
68 | val metrics = new BinaryClassificationMetrics(scoreAndLabels)
69 | metrics.areaUnderROC()
70 | }
71 |
72 | override def save(sc: SparkContext, path: String): Unit = {
73 | MVMModel.SaveLoadV1_0.save(sc, path, k, views, classification, factors)
74 | }
75 |
76 | override protected def formatVersion: String = MVMModel.SaveLoadV1_0.formatVersionV1_0
77 | }
78 |
79 | object MVMModel extends Loader[MVMModel] {
80 |
81 | override def load(sc: SparkContext, path: String): MVMModel = {
82 | val (loadedClassName, version, metadata) = LoaderUtils.loadMetadata(sc, path)
83 | val versionV1_0 = SaveLoadV1_0.formatVersionV1_0
84 | val classNameV1_0 = SaveLoadV1_0.classNameV1_0
85 | if (loadedClassName == classNameV1_0 && version == versionV1_0) {
86 | implicit val formats = DefaultFormats
87 | val classification = (metadata \ "classification").extract[Boolean]
88 | val views = (metadata \ "views").extract[String].split(",").map(_.toLong)
89 | val k = (metadata \ "k").extract[Int]
90 | val dataPath = LoaderUtils.dataPath(path)
91 | val sqlContext = new SQLContext(sc)
92 | val dataRDD = sqlContext.read.parquet(dataPath)
93 | val dataArray = dataRDD.select("featureId", "factors").take(1)
94 | assert(dataArray.size == 1, s"Unable to load $loadedClassName data from: $dataPath")
95 | val data = dataArray(0)
96 | assert(data.size == 2, s"Unable to load $loadedClassName data from: $dataPath")
97 | val factors = dataRDD.rdd.map {
98 | case Row(featureId: Long, factors: Seq[Double]) =>
99 | (featureId, factors.toArray)
100 | }
101 | new MVMModel(k, views, classification, factors)
102 | } else {
103 | throw new Exception(
104 |         s"MVMModel.load did not recognize model with (className, format version): " +
105 | s"($loadedClassName, $version). Supported:\n" +
106 | s" ($classNameV1_0, 1.0)")
107 | }
108 |
109 | }
110 |
111 | private object SaveLoadV1_0 {
112 | val formatVersionV1_0 = "1.0"
113 | val classNameV1_0 = "com.github.cloudml.zen.ml.recommendation.MVMModel"
114 |
115 | def save(
116 | sc: SparkContext,
117 | path: String,
118 | k: Int,
119 | views: Array[Long],
120 | classification: Boolean,
121 | factors: RDD[(Long, Array[Double])]): Unit = {
122 | val metadata = compact(render
123 | (("class" -> classNameV1_0) ~ ("version" -> formatVersionV1_0) ~
124 | ("k" -> k) ~ ("views" -> views.mkString(",")) ~ ("classification" -> classification)))
125 | sc.parallelize(Seq(metadata), 1).saveAsTextFile(LoaderUtils.metadataPath(path))
126 |
127 | val sqlContext = new SQLContext(sc)
128 | import sqlContext.implicits._
129 | // Create Parquet data.
130 | factors.toDF("featureId", "factors").write.parquet(LoaderUtils.dataPath(path))
131 | }
132 | }
133 |
134 | }
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/fm/BSFMModel.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.fm
2 |
3 | import org.apache.spark.ml.fm.BSFM._
4 | import org.apache.spark.ml.util.LoaderUtils
5 | import org.apache.spark.ml.util.SparkUtils._
6 | import org.apache.spark.SparkContext
7 | import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, RegressionMetrics}
8 | import org.apache.spark.mllib.linalg.{Vector => SV}
9 | import org.apache.spark.mllib.regression.LabeledPoint
10 | import org.apache.spark.mllib.util.{Loader, Saveable}
11 | import org.apache.spark.rdd.RDD
12 | import org.apache.spark.sql.{Row, SQLContext}
13 | import org.apache.spark.storage.StorageLevel
14 | import org.json4s.DefaultFormats
15 | import org.json4s.JsonDSL._
16 | import org.json4s.jackson.JsonMethods._
17 |
18 | import scala.math._
19 |
20 | class BSFMModel(
21 | val k: Int,
22 | val intercept: ED,
23 | val views: Array[Long],
24 | val classification: Boolean,
25 | val factors: RDD[(Long, VD)]) extends Serializable with Saveable {
26 | def predict(data: RDD[(Long, SV)]): RDD[(Long, ED)] = {
27 | val numFeatures = data.first()._2.size.toLong
28 | data.flatMap { case (sampleId, features) =>
29 | features.activeIterator.filter(_._2 != 0.0).map {
30 | case (featureId, value) =>
31 | (featureId.toLong, (sampleId, value))
32 | } ++ views.indices.map { i => (numFeatures + i, (sampleId, 1D)) }
33 | }.join(factors).map { case (featureId, ((sampleId, x), w)) =>
34 | val viewSize = views.length
35 | val viewId = featureId2viewId(featureId, views)
36 | (sampleId, forwardInterval(k, viewSize, viewId, x, w))
37 | }.reduceByKey(forwardReduceInterval).map { case (sampleId, arr) =>
38 | var result = predictInterval(k, views.length, intercept, arr)
39 | if (classification) {
40 | result = 1.0 / (1.0 + math.exp(-result))
41 | }
42 | (sampleId, result)
43 | }
44 | }
45 |
46 | def loss(data: RDD[(Long, LabeledPoint)]): Double = {
47 | // val minTarget = data.map(_._2.label).min()
48 | // val maxTarget = data.map(_._2.label).max()
49 | val perd = predict(data.map(t => (t._1, t._2.features)))
50 | val label = data.map(t => (t._1, t._2.label))
51 | val scoreAndLabels = label.join(perd).map { case (_, (label, score)) =>
52 | // var r = Math.max(score, minTarget)
53 | // r = Math.min(r, maxTarget)
54 | // pow(l - r, 2)
55 | (score, label)
56 | }
57 | scoreAndLabels.persist(StorageLevel.MEMORY_AND_DISK)
58 | val ret = if (classification) auc(scoreAndLabels) else rmse(scoreAndLabels)
59 | scoreAndLabels.unpersist(blocking = false)
60 | ret
61 | }
62 |
63 | def rmse(scoreAndLabels: RDD[(Double, Double)]): Double = {
64 | val metrics = new RegressionMetrics(scoreAndLabels)
65 | metrics.rootMeanSquaredError
66 | }
67 |
68 | def auc(scoreAndLabels: RDD[(Double, Double)]): Double = {
69 | val metrics = new BinaryClassificationMetrics(scoreAndLabels)
70 | metrics.areaUnderROC()
71 | }
72 |
73 | override def save(sc: SparkContext, path: String): Unit = {
74 | BSFMModel.SaveLoadV1_0.save(sc, path, k, intercept, views, classification, factors)
75 | }
76 |
77 | override protected def formatVersion: String = BSFMModel.SaveLoadV1_0.formatVersionV1_0
78 | }
79 |
80 | object BSFMModel extends Loader[BSFMModel] {
81 |
82 | override def load(sc: SparkContext, path: String): BSFMModel = {
83 | val (loadedClassName, version, metadata) = LoaderUtils.loadMetadata(sc, path)
84 | val versionV1_0 = SaveLoadV1_0.formatVersionV1_0
85 | val classNameV1_0 = SaveLoadV1_0.classNameV1_0
86 | if (loadedClassName == classNameV1_0 && version == versionV1_0) {
87 | implicit val formats = DefaultFormats
88 | val classification = (metadata \ "classification").extract[Boolean]
89 | val intercept = (metadata \ "intercept").extract[Double]
90 | val views = (metadata \ "views").extract[String].split(",").map(_.toLong)
91 | val k = (metadata \ "k").extract[Int]
92 | val dataPath = LoaderUtils.dataPath(path)
93 | val sqlContext = new SQLContext(sc)
94 | val dataRDD = sqlContext.read.parquet(dataPath)
95 | val dataArray = dataRDD.select("featureId", "factors").take(1)
96 | assert(dataArray.size == 1, s"Unable to load $loadedClassName data from: $dataPath")
97 | val data = dataArray(0)
98 | assert(data.size == 2, s"Unable to load $loadedClassName data from: $dataPath")
99 | val factors = dataRDD.rdd.map {
100 | case Row(featureId: Long, factors: Seq[Double]) =>
101 | (featureId, factors.toArray)
102 | }
103 | new BSFMModel(k, intercept, views, classification, factors)
104 | } else {
105 | throw new Exception(
106 | s"FMModel.load did not recognize model with (className, format version):" +
107 | s"($loadedClassName, $version). Supported:\n" +
108 | s" ($classNameV1_0, 1.0)")
109 | }
110 |
111 | }
112 |
113 | private object SaveLoadV1_0 {
114 | val formatVersionV1_0 = "1.0"
115 | val classNameV1_0 = "com.github.cloudml.zen.ml.recommendation.BSFMModel"
116 |
117 | def save(
118 | sc: SparkContext,
119 | path: String,
120 | k: Int,
121 | intercept: Double,
122 | views: Array[Long],
123 | classification: Boolean,
124 | factors: RDD[(Long, Array[Double])]): Unit = {
125 | val metadata = compact(render
126 | (("class" -> classNameV1_0) ~ ("version" -> formatVersionV1_0) ~ ("intercept" -> intercept) ~
127 | ("k" -> k) ~ ("views" -> views.mkString(",")) ~ ("classification" -> classification)))
128 | sc.parallelize(Seq(metadata), 1).saveAsTextFile(LoaderUtils.metadataPath(path))
129 |
130 | val sqlContext = new SQLContext(sc)
131 | import sqlContext.implicits._
132 | // Create Parquet data.
133 | factors.toDF("featureId", "factors").write.parquet(LoaderUtils.dataPath(path))
134 | }
135 | }
136 |
137 | }
138 |
--------------------------------------------------------------------------------
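A minimal sketch of the persistence round trip implemented above, assuming an already trained `model: BSFMModel`, a `SparkContext` named `sc`, an input RDD `samples` of (sampleId, mllib feature vector) pairs, and a scratch path; none of these names come from the source tree.

    model.save(sc, "/tmp/bsfm")                     // JSON metadata plus a Parquet table of factors
    val restored = BSFMModel.load(sc, "/tmp/bsfm")  // checks the stored class name and format version "1.0"
    val scores = restored.predict(samples)          // RDD[(sampleId, score)]; a sigmoid is applied when classification = true
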
/src/main/scala/org/apache/spark/ml/dbscan/EvenSplitPartitioner.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.dbscan
2 |
3 | import scala.annotation.tailrec
4 |
5 | import org.apache.spark.internal.Logging
6 |
7 | /**
8 | * Helper methods for calling the partitioner
9 | */
10 | object EvenSplitPartitioner {
11 |
12 | def partition(
13 | toSplit: Set[(DBSCANRectangle, Int)],
14 | maxPointsPerPartition: Long,
15 | minimumRectangleSize: Double): List[(DBSCANRectangle, Int)] = {
16 | new EvenSplitPartitioner(maxPointsPerPartition, minimumRectangleSize)
17 | .findPartitions(toSplit)
18 | }
19 |
20 | }
21 |
22 | class EvenSplitPartitioner(
23 | maxPointsPerPartition: Long,
24 | minimumRectangleSize: Double) extends Logging {
25 |
26 | type RectangleWithCount = (DBSCANRectangle, Int)
27 |
28 | def findPartitions(toSplit: Set[RectangleWithCount]): List[RectangleWithCount] = {
29 |
30 | val boundingRectangle = findBoundingRectangle(toSplit)
31 |
32 | def pointsIn = pointsInRectangle(toSplit, _: DBSCANRectangle)
33 |
34 | val toPartition = List((boundingRectangle, pointsIn(boundingRectangle)))
35 | val partitioned = List[RectangleWithCount]()
36 |
37 | logTrace("About to start partitioning")
38 | val partitions = partition(toPartition, partitioned, pointsIn)
39 | logTrace("Done")
40 |
41 | // remove empty partitions
42 | partitions.filter({ case (partition, count) => count > 0 })
43 | }
44 |
45 | @tailrec
46 | private def partition(
47 | remaining: List[RectangleWithCount],
48 | partitioned: List[RectangleWithCount],
49 | pointsIn: (DBSCANRectangle) => Int): List[RectangleWithCount] = {
50 |
51 | remaining match {
52 | case (rectangle, count) :: rest =>
53 | if (count > maxPointsPerPartition) {
54 |
55 | if (canBeSplit(rectangle)) {
56 | logTrace(s"About to split: $rectangle")
57 | def cost = (r: DBSCANRectangle) => ((pointsIn(rectangle) / 2) - pointsIn(r)).abs
58 | val (split1, split2) = split(rectangle, cost)
59 | logTrace(s"Found split: $split1, $split2")
60 | val s1 = (split1, pointsIn(split1))
61 | val s2 = (split2, pointsIn(split2))
62 | partition(s1 :: s2 :: rest, partitioned, pointsIn)
63 |
64 | } else {
65 | logWarning(s"Can't split: ($rectangle -> $count) (maxSize: $maxPointsPerPartition)")
66 | partition(rest, (rectangle, count) :: partitioned, pointsIn)
67 | }
68 |
69 | } else {
70 | partition(rest, (rectangle, count) :: partitioned, pointsIn)
71 | }
72 |
73 | case Nil => partitioned
74 |
75 | }
76 |
77 | }
78 |
79 | def split(
80 | rectangle: DBSCANRectangle,
81 | cost: (DBSCANRectangle) => Int): (DBSCANRectangle, DBSCANRectangle) = {
82 |
83 | val smallestSplit =
84 | findPossibleSplits(rectangle)
85 | .reduceLeft {
86 | (smallest, current) =>
87 |
88 | if (cost(current) < cost(smallest)) {
89 | current
90 | } else {
91 | smallest
92 | }
93 |
94 | }
95 |
96 | (smallestSplit, (complement(smallestSplit, rectangle)))
97 |
98 | }
99 |
100 | /**
101 | * Returns the box that covers the space inside boundary that is not covered by box
102 | */
103 | private def complement(box: DBSCANRectangle, boundary: DBSCANRectangle): DBSCANRectangle =
104 | if (box.x == boundary.x && box.y == boundary.y) {
105 | if (boundary.x2 >= box.x2 && boundary.y2 >= box.y2) {
106 | if (box.y2 == boundary.y2) {
107 | DBSCANRectangle(box.x2, box.y, boundary.x2, boundary.y2)
108 | } else if (box.x2 == boundary.x2) {
109 | DBSCANRectangle(box.x, box.y2, boundary.x2, boundary.y2)
110 | } else {
111 | throw new IllegalArgumentException("rectangle is not a proper sub-rectangle")
112 | }
113 | } else {
114 | throw new IllegalArgumentException("rectangle is smaller than boundary")
115 | }
116 | } else {
117 | throw new IllegalArgumentException("unequal rectangle")
118 | }
119 |
120 | /**
121 | * Returns all the possible ways in which the given box can be split
122 | */
123 | private def findPossibleSplits(box: DBSCANRectangle): Set[DBSCANRectangle] = {
124 |
125 | val xSplits = (box.x + minimumRectangleSize) until box.x2 by minimumRectangleSize
126 |
127 | val ySplits = (box.y + minimumRectangleSize) until box.y2 by minimumRectangleSize
128 |
129 | val splits =
130 | xSplits.map(x => DBSCANRectangle(box.x, box.y, x, box.y2)) ++
131 | ySplits.map(y => DBSCANRectangle(box.x, box.y, box.x2, y))
132 |
133 | logTrace(s"Possible splits: $splits")
134 |
135 | splits.toSet
136 | }
137 |
138 | /**
139 | * Returns true if the given rectangle can be split into at least two rectangles of minimum size
140 | */
141 | private def canBeSplit(box: DBSCANRectangle): Boolean = {
142 | (box.x2 - box.x > minimumRectangleSize * 2 ||
143 | box.y2 - box.y > minimumRectangleSize * 2)
144 | }
145 |
146 | def pointsInRectangle(space: Set[RectangleWithCount], rectangle: DBSCANRectangle): Int = {
147 | space.view
148 | .filter({ case (current, _) => rectangle.contains(current) })
149 | .foldLeft(0) {
150 | case (total, (_, count)) => total + count
151 | }
152 | }
153 |
154 | def findBoundingRectangle(rectanglesWithCount: Set[RectangleWithCount]): DBSCANRectangle = {
155 |
156 | val invertedRectangle =
157 | DBSCANRectangle(Double.MaxValue, Double.MaxValue, Double.MinValue, Double.MinValue)
158 |
159 | rectanglesWithCount.foldLeft(invertedRectangle) {
160 | case (bounding, (c, _)) =>
161 | DBSCANRectangle(
162 | bounding.x.min(c.x), bounding.y.min(c.y),
163 | bounding.x2.max(c.x2), bounding.y2.max(c.y2))
164 | }
165 |
166 | }
167 |
168 | }
169 |
--------------------------------------------------------------------------------
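To make the splitting strategy above concrete, here is a small hypothetical input: four 1x1 cells of 100 points each along the x axis (in DBSCAN these (rectangle, count) pairs come from the minimum bounding rectangles of the input vectors). This is a sketch, not code from the repository.

    val cells: Set[(DBSCANRectangle, Int)] = Set(
      (DBSCANRectangle(0.0, 0.0, 1.0, 1.0), 100), (DBSCANRectangle(1.0, 0.0, 2.0, 1.0), 100),
      (DBSCANRectangle(2.0, 0.0, 3.0, 1.0), 100), (DBSCANRectangle(3.0, 0.0, 4.0, 1.0), 100))

    val parts = EvenSplitPartitioner.partition(cells,
      maxPointsPerPartition = 250L, minimumRectangleSize = 1.0)
    // The 400-point bounding box (0,0)-(4,1) exceeds the limit, so the lowest-cost cut (x = 2)
    // is chosen, yielding two partitions of 200 points each.
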
/src/main/scala/org/apache/spark/ml/timeseries/models/ARGARCH.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.models
2 |
3 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams
4 | import org.apache.commons.math3.random.RandomGenerator
5 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}
6 | import org.apache.spark.ml.param.ParamMap
7 | import org.apache.spark.ml.util.Identifiable
8 | import org.apache.spark.ml.{Estimator, Model}
9 | import org.apache.spark.sql._
10 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
11 |
12 | /**
13 | * Created by endy on 16-12-22.
14 | */
15 |
16 | class ARGARCH(override val uid: String) extends Estimator[ARGARCHModel] with TimeSeriesParams {
17 | setDefault(timeCol -> "time",
18 | timeSeriesCol -> "timeseries")
19 |
20 | def this() = this(Identifiable.randomUID("ARGARCH"))
21 | /**
22 | * Fits a model to the input data.
23 | */
24 | override def fit(dataset: Dataset[_]): ARGARCHModel = {
25 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map {
26 | case Row(time: String, value: Double) => (time, value)
27 | }.sortByKey().collect()
28 |
29 | val dataVector = Vectors.dense(data.map(x => x._2))
30 |
31 | val arModel = new Autoregression().fit(dataset)
32 | val residuals = arModel.removeTimeDependentEffects(dataVector)
33 | val dataFrame = generateDf(dataset.sparkSession, residuals.toArray)
34 | val garchModel = new GARCH().fit(dataFrame)
35 |
36 | new ARGARCHModel(arModel.c, arModel.coefficients(0), garchModel.omega, garchModel.alpha,
37 | garchModel.beta)
38 | }
39 |
40 | override def copy(extra: ParamMap): Estimator[ARGARCHModel] = defaultCopy(extra)
41 |
42 | /**
43 | * :: DeveloperApi ::
44 | *
45 | * Check transform validity and derive the output schema from the input schema.
46 | *
47 | * Typical implementation should first conduct verification on schema change and parameter
48 | * validity, including complex parameter interaction checks.
49 | */
50 | override def transformSchema(schema: StructType): StructType = schema
51 |
52 | private def generateDf(sparkSession: SparkSession, array: Array[Double]): DataFrame = {
53 | val schema = StructType(Array(StructField(${timeCol}, StringType), StructField(${timeSeriesCol},
54 | DoubleType)))
55 |
56 | val rdd = sparkSession.sparkContext.parallelize(
57 | array.zipWithIndex.map(x => Row(x._2.formatted("%010d"), x._1)))
58 |
59 | sparkSession.createDataFrame(rdd, schema)
60 | }
61 | }
62 |
63 | class ARGARCHModel(override val uid: String, val c: Double, val phi: Double, val omega: Double,
64 | val alpha: Double, val beta: Double) extends
65 | Model[ARGARCHModel] with TimeSeriesParams {
66 |
67 | def this(c: Double, phi: Double, omega: Double, alpha: Double, beta: Double) =
68 | this(Identifiable.randomUID("ARGARCHModel"), c, phi, omega, alpha, beta)
69 |
70 | override def copy(extra: ParamMap): ARGARCHModel = defaultCopy(extra)
71 |
72 | /**
73 | * Transforms the input dataset.
74 | */
75 | override def transform(dataset: Dataset[_]): DataFrame = {
76 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map {
77 | case Row(time: String, value: Double) => (time, value)
78 | }.sortByKey().collect()
79 |
80 | val dataVector = Vectors.dense(data.map(x => x._2))
81 |
82 | val dest = addTimeDependentEffects(dataVector)
83 |
84 | val resRDD = dataset.sparkSession.sparkContext.parallelize(dest.toArray.map(x => Row(x)))
85 |
86 | val structType = transformSchema(dataset.schema)
87 |
88 | dataset.sparkSession.createDataFrame(resRDD, structType)
89 | }
90 |
91 | /**
92 | * :: DeveloperApi ::
93 | *
94 | * Check transform validity and derive the output schema from the input schema.
95 | *
96 | * Typical implementation should first conduct verification on schema change and parameter
97 | * validity, including complex parameter interaction checks.
98 | */
99 | override def transformSchema(schema: StructType): StructType = {
100 | StructType(Array(StructField("ARGARCH", DoubleType)))
101 | }
102 |
103 | def removeTimeDependentEffects(ts: Vector): Vector = {
104 | val destArr = new Array[Double](ts.size)
105 | var prevEta = ts(0) - c
106 | var prevVariance = omega / (1.0 - alpha - beta)
107 | destArr(0) = prevEta / math.sqrt(prevVariance)
108 | for (i <- 1 until ts.size) {
109 | val variance = omega + alpha * prevEta * prevEta + beta * prevVariance
110 | val eta = ts(i) - c - phi * ts(i - 1)
111 | destArr(i) = eta / math.sqrt(variance)
112 |
113 | prevEta = eta
114 | prevVariance = variance
115 | }
116 | new DenseVector(destArr)
117 | }
118 |
119 | def addTimeDependentEffects(ts: Vector): Vector = {
120 | val destArr = new Array[Double](ts.size)
121 | var prevVariance = omega / (1.0 - alpha - beta)
122 | var prevEta = ts(0) * math.sqrt(prevVariance)
123 | destArr(0) = c + prevEta
124 | for (i <- 1 until ts.size) {
125 | val variance = omega + alpha * prevEta * prevEta + beta * prevVariance
126 | val standardizedEta = ts(i)
127 | val eta = standardizedEta * math.sqrt(variance)
128 | destArr(i) = c + phi * destArr(i - 1) + eta
129 |
130 | prevEta = eta
131 | prevVariance = variance
132 | }
133 | new DenseVector(destArr)
134 | }
135 |
136 | private def sampleWithVariances(n: Int, rand: RandomGenerator): (Array[Double], Array[Double]) = {
137 | val ts = new Array[Double](n)
138 | val variances = new Array[Double](n)
139 | variances(0) = omega / (1 - alpha - beta)
140 | var eta = math.sqrt(variances(0)) * rand.nextGaussian()
141 | for (i <- 1 until n) {
142 | variances(i) = omega + beta * variances(i-1) + alpha * eta * eta
143 | eta = math.sqrt(variances(i)) * rand.nextGaussian()
144 | ts(i) = c + phi * ts(i - 1) + eta
145 | }
146 |
147 | (ts, variances)
148 | }
149 |
150 | /**
151 | * Samples a random time series of a given length with the properties of the model.
152 | *
153 | * @param n The length of the time series to sample.
154 | * @param rand The random generator used to generate the observations.
155 | * @return The samples time series.
156 | */
157 | def sample(n: Int, rand: RandomGenerator): Array[Double] = sampleWithVariances(n, rand)._1
158 | }
159 |
--------------------------------------------------------------------------------
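For readability, the recursion that `addTimeDependentEffects` walks through above can be isolated into a single step; the helper below is a sketch for illustration and does not exist in the source tree.

    // One step of the AR(1) + GARCH(1, 1) recursion:
    //   h_t   = omega + alpha * eta_{t-1}^2 + beta * h_{t-1}   (conditional variance)
    //   eta_t = sqrt(h_t) * epsilon_t                          (scaled innovation, epsilon_t = ts(i))
    //   x_t   = c + phi * x_{t-1} + eta_t                      (reconstructed observation)
    def argarchStep(c: Double, phi: Double, omega: Double, alpha: Double, beta: Double)(
        prevX: Double, prevEta: Double, prevH: Double, epsilon: Double): (Double, Double, Double) = {
      val h = omega + alpha * prevEta * prevEta + beta * prevH
      val eta = math.sqrt(h) * epsilon
      (c + phi * prevX + eta, eta, h)
    }
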
/src/main/scala/org/apache/spark/ml/knn/KNNClassifier.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.classification
2 |
3 | import org.apache.spark.broadcast.Broadcast
4 | import org.apache.spark.ml.param.ParamMap
5 | import org.apache.spark.ml.param.shared.HasWeightCol
6 | import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
7 | import org.apache.spark.ml.linalg._
8 | import org.apache.spark.rdd.RDD
9 | import org.apache.spark.sql.types.{DoubleType, StructType}
10 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
11 | import org.apache.spark.storage.StorageLevel
12 | import org.apache.spark.ml.feature.LabeledPoint
13 |
14 | import scala.collection.mutable.ArrayBuffer
15 |
16 | /**
17 | * Created by endy on 17-1-9.
18 | */
19 | class KNNClassifier(override val uid: String) extends
20 | ProbabilisticClassifier[Vector, KNNClassifier, KNNClassificationModel]
21 | with KNNParams {
22 |
23 | def this() = this(Identifiable.randomUID("KNNClassifier"))
24 |
25 | def setK(value: Int): this.type = set(k, value)
26 |
27 | def setTopTreeSize(value: Int): this.type = set(topTreeSize, value)
28 |
29 | def setTopTreeLeafSize(value: Int): this.type = set(topTreeLeafSize, value)
30 |
31 | def setSubTreeLeafSize(value: Int): this.type = set(subTreeLeafSize, value)
32 |
33 | def setBufferSizeSampleSizes(value: Array[Int]): this.type = set(bufferSizeSampleSizes, value)
34 |
35 | def setBalanceThreshold(value: Double): this.type = set(balanceThreshold, value)
36 |
37 | def setSeed(value: Long): this.type = set(seed, value)
38 |
39 | override protected def train(dataset: Dataset[_]): KNNClassificationModel = {
40 | // Extract columns from data. If dataset is persisted, do not persist oldDataset.
41 | val instances = extractLabeledPoints(dataset).map {
42 | case LabeledPoint(label: Double, features: Vector) => (label, features)
43 | }
44 | val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
45 | if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)
46 |
47 | val labelSummarizer = instances.treeAggregate(
48 | new MultiClassSummarizer)(
49 | seqOp = (c, v) => (c, v) match {
50 | case (labelSummarizer: MultiClassSummarizer, (label: Double, features: Vector)) =>
51 | labelSummarizer.add(label)
52 | },
53 | combOp = (c1, c2) => (c1, c2) match {
54 | case (classSummarizer1: MultiClassSummarizer, classSummarizer2: MultiClassSummarizer) =>
55 | classSummarizer1.merge(classSummarizer2)
56 | })
57 |
58 | val histogram = labelSummarizer.histogram
59 | val numInvalid = labelSummarizer.countInvalid
60 | val numClasses = histogram.length
61 |
62 | if (numInvalid != 0) {
63 | val msg = s"Classification labels should be in {0 to ${numClasses - 1} " +
64 | s"Found $numInvalid invalid labels."
65 | throw Exception
66 | }
67 |
68 | val knnModel = copyValues(new KNN()).fit(dataset)
69 | knnModel.toNewClassificationModel(uid, numClasses)
70 | }
71 |
72 | override def fit(dataset: Dataset[_]): KNNClassificationModel = {
73 |     // Need to override fit because the buffer size estimated during training must be copied
74 |     // back onto the model; copyValues would otherwise reset it to the estimator's value (e.g. -1).
75 | transformSchema(dataset.schema, logging = true)
76 | val model = train(dataset)
77 | val bufferSize = model.getBufferSize
78 | copyValues(model.setParent(this)).setBufferSize(bufferSize)
79 | }
80 |
81 | override def copy(extra: ParamMap): KNNClassifier = defaultCopy(extra)
82 | }
83 |
84 | class KNNClassificationModel(override val uid: String, val topTree: Broadcast[Tree],
85 | val subTrees: RDD[Tree], val _numClasses: Int) extends
86 | ProbabilisticClassificationModel[Vector, KNNClassificationModel]
87 | with KNNModelParams with HasWeightCol with Serializable {
88 | require(subTrees.getStorageLevel != StorageLevel.NONE,
89 | "KNNModel is not designed to work with Trees that have not been cached")
90 |
91 | /** @group setParam */
92 | def setK(value: Int): this.type = set(k, value)
93 |
94 | /** @group setParam */
95 | def setBufferSize(value: Double): this.type = set(bufferSize, value)
96 |
97 | override def numClasses: Int = _numClasses
98 |
99 | override def transform(dataset: Dataset[_]): DataFrame = {
100 | val getWeight: Row => Double = r => 1.0
101 |
102 | val merged = transform(dataset, topTree, subTrees).map {
103 | case (id, labels) =>
104 | val vector = new Array[Double](numClasses)
105 | var i = 0
106 | while (i < labels.length) {
107 | vector(labels(i).getDouble(0).toInt) += getWeight(labels(i))
108 | i += 1
109 | }
110 | val rawPrediction = Vectors.dense(vector)
111 | lazy val probability = raw2probability(rawPrediction)
112 | lazy val prediction = probability2prediction(probability)
113 |
114 | val values = new ArrayBuffer[Any]
115 | if ($(rawPredictionCol).nonEmpty) {
116 | values.append(rawPrediction)
117 | }
118 | if ($(probabilityCol).nonEmpty) {
119 | values.append(probability)
120 | }
121 | if ($(predictionCol).nonEmpty) {
122 | values.append(prediction)
123 | }
124 | (id, values)
125 | }
126 |
127 | dataset.sqlContext.createDataFrame(
128 | dataset.rdd.zipWithIndex().map { case (row, i) => (i, row) }
129 | .leftOuterJoin(merged) // make sure we don't lose any observations
130 | .map {
131 | case (i, (row, values)) => Row.fromSeq(row.asInstanceOf[Row].toSeq ++ values.get)
132 | },
133 | transformSchema(dataset.schema)
134 | )
135 | }
136 |
137 | override def transformSchema(schema: StructType): StructType = {
138 | var transformed = schema
139 | if ($(rawPredictionCol).nonEmpty) {
140 | transformed = SchemaUtils.appendColumn(transformed, $(rawPredictionCol), new VectorUDT)
141 | }
142 | if ($(probabilityCol).nonEmpty) {
143 | transformed = SchemaUtils.appendColumn(transformed, $(probabilityCol), new VectorUDT)
144 | }
145 | if ($(predictionCol).nonEmpty) {
146 | transformed = SchemaUtils.appendColumn(transformed, $(predictionCol), DoubleType)
147 | }
148 | transformed
149 | }
150 |
151 | override def copy(extra: ParamMap): KNNClassificationModel = {
152 | val copied = new KNNClassificationModel(uid, topTree, subTrees, numClasses)
153 | copyValues(copied, extra).setParent(parent)
154 | }
155 |
156 | override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
157 |
158 | rawPrediction match {
159 | case dv: DenseVector =>
160 | val size = dv.size
161 | val sum = dv.toArray.sum
162 |
163 | var i = 0
164 | while (i < size) {
165 | dv.values(i) /= sum
166 | i += 1
167 | }
168 |
169 | dv
170 | case sv: SparseVector =>
171 |         throw new RuntimeException("raw2probabilityInPlace encountered an unexpected SparseVector")
172 | }
173 | }
174 |
175 | override protected def predictRaw(features: Vector): Vector = {
176 |     throw new UnsupportedOperationException("predictRaw is not supported; predictions are computed in transform()")
177 | }
178 | }
179 |
--------------------------------------------------------------------------------
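A hypothetical end-to-end use of the classifier above; `trainDF` and `testDF` are assumed DataFrames in the usual Spark ML layout (a `features` vector column and a `label` column) and are not defined in this repository.

    val knn = new KNNClassifier()
      .setK(5)               // neighbours used for voting
      .setTopTreeSize(1000)  // sample size for the top-level tree
      .setBalanceThreshold(0.7)

    val model = knn.fit(trainDF)               // the buffer size estimated during training is kept (see fit above)
    val predictions = model.transform(testDF)  // appends rawPrediction, probability and prediction columns
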
/src/main/scala/org/apache/spark/ml/timeseries/models/EWMA.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.models
2 |
3 | import org.apache.commons.math3.analysis.{MultivariateFunction, MultivariateVectorFunction}
4 | import org.apache.commons.math3.optim.{InitialGuess, MaxEval, MaxIter, SimpleValueChecker}
5 | import org.apache.commons.math3.optim.nonlinear.scalar.{GoalType, ObjectiveFunction, ObjectiveFunctionGradient}
6 | import org.apache.commons.math3.optim.nonlinear.scalar.gradient.NonLinearConjugateGradientOptimizer
7 | import org.apache.spark.ml.{Estimator, Model}
8 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}
9 | import org.apache.spark.ml.param.{Param, ParamMap}
10 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams
11 | import org.apache.spark.ml.util.Identifiable
12 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
13 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
14 |
15 | /**
16 |   * Fits an Exponentially Weighted Moving Average (EWMA) model to a time series.
17 | */
18 |
19 | trait EWMAParams extends TimeSeriesParams {
20 | final val maxEval = new Param[Int](this, "maxEval", "max eval")
21 | def setMaxEval(value: Int): this.type = set(maxEval, value)
22 |
23 | final val maxIter = new Param[Int](this, "maxIter", "max iteration")
24 | def setMaxIter(value: Int): this.type = set(maxIter, value)
25 |
26 | final val initPoint = new Param[Double](this, "initPoint", "init point")
27 | def setInitPoint(value: Double): this.type = set(initPoint, value)
28 | }
29 |
30 | class EWMA(override val uid: String) extends Estimator[EWMAModel] with EWMAParams{
31 |
32 | setDefault(timeCol -> "time",
33 | timeSeriesCol -> "timeseries")
34 |
35 | def this() = this(Identifiable.randomUID("EWMA"))
36 |
37 | /**
38 |     * Fits an EWMA model to a time series. Uses the first point in the time series as the
39 |     * starting value, and the sum of squared errors as the objective function for finding the
40 |     * smoothing parameter. The model is recursively defined as S_t = a * X_t + (1 - a) * S_{t-1},
41 |     * where a is the smoothing parameter, X is the original series, and S is the smoothed series.
42 |     * Note that the optimization is performed as an unbounded optimization, although in its formal
43 |     * definition the smoothing parameter is bounded by 0 <= a <= 1, which would correspond to an
44 |     * inequality-bounded optimization; the resulting smoothing parameter should therefore be sanity checked.
45 | * https://en.wikipedia.org/wiki/Exponential_smoothing
46 | * @param dataset the time series dataset to which we want to fit an EWMA model
47 | * @return EWMA model
48 | */
49 | override def fit(dataset: Dataset[_]): EWMAModel = {
50 |
51 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map {
52 | case Row(time: String, value: Double) => (time, value)
53 | }.sortByKey().collect()
54 | .map(x => x._2)
55 |
56 | val dataVector = Vectors.dense(data)
57 |
58 | val optimizer = new NonLinearConjugateGradientOptimizer(
59 | NonLinearConjugateGradientOptimizer.Formula.FLETCHER_REEVES,
60 | new SimpleValueChecker(1e-6, 1e-6))
61 |
62 |
63 | val gradient = new ObjectiveFunctionGradient(new MultivariateVectorFunction() {
64 | def value(params: Array[Double]): Array[Double] = {
65 | val g = new EWMAModel(params(0)).gradient(dataVector)
66 | Array(g)
67 | }
68 | })
69 |
70 | val objectiveFunction = new ObjectiveFunction(new MultivariateFunction() {
71 | def value(params: Array[Double]): Double = {
72 | new EWMAModel(params(0)).sse(dataVector)
73 | }
74 | })
75 | // optimization parameters
76 | val initGuess = new InitialGuess(Array(${initPoint}))
77 | val goal = GoalType.MINIMIZE
78 | // optimization step
79 | val optimal = optimizer.optimize(objectiveFunction, goal, gradient, initGuess,
80 | new MaxIter(${maxIter}), new MaxEval(${maxEval}))
81 | val params = optimal.getPoint
82 |
83 | new EWMAModel(params(0))
84 | .setTimeCol(${timeCol})
85 | .setTimeSeriesCol(${timeSeriesCol})
86 |
87 | }
88 |
89 | override def copy(extra: ParamMap): Estimator[EWMAModel] = defaultCopy(extra)
90 |
91 | /**
92 | * Check transform validity and derive the output schema from the input schema.
93 | *
94 | * Typical implementation should first conduct verification on schema change and parameter
95 | * validity, including complex parameter interaction checks.
96 | */
97 | override def transformSchema(schema: StructType): StructType = {
98 | schema
99 | }
100 | }
101 |
102 |
103 | class EWMAModel(override val uid: String, val smoothing: Double)
104 | extends Model[EWMAModel] with EWMAParams{
105 |
106 | def this(smoothing: Double) = this(Identifiable.randomUID("EWMAModel"), smoothing)
107 |
108 | /**
109 | * Calculates the SSE for a given timeseries ts given
110 | * the smoothing parameter of the current model
111 | * The forecast for the observation at period t + 1 is the smoothed value at time t
112 | * Source: http://people.duke.edu/~rnau/411avg.htm
113 |     * @param ts the time series to fit an EWMA model to
114 | * @return Sum Squared Error
115 | */
116 | def sse(ts: Vector): Double = {
117 | val n = ts.size
118 |
119 | val smoothed = addTimeDependentEffects(ts)
120 | var i = 0
121 | var error = 0.0
122 | var sqrErrors = 0.0
123 | while (i < n - 1) {
124 | error = ts(i + 1) - smoothed(i)
125 | sqrErrors += error * error
126 | i += 1
127 | }
128 |
129 | sqrErrors
130 | }
131 |
132 | /**
133 | * Calculates the gradient of the SSE cost function for our EWMA model
134 | * @return gradient
135 | */
136 | def gradient(ts: Vector): Double = {
137 | val n = ts.size
138 | // val smoothed = new DenseVector(Array.fill(n)(0.0))
139 | val smoothed = addTimeDependentEffects(ts)
140 |
141 | var error = 0.0
142 | var prevSmoothed = ts(0)
143 | var prevDSda = 0.0 // derivative of the EWMA function at time t - 1: (d S(t - 1)/ d smoothing)
144 | var dSda = 0.0 // derivative of the EWMA function at time t: (d S(t) / d smoothing)
145 | var dJda = 0.0 // derivative of our SSE cost function
146 | var i = 0
147 |
148 | while (i < n - 1) {
149 | error = ts(i + 1) - smoothed(i)
150 | dSda = ts(i) - prevSmoothed + (1 - smoothing) * prevDSda
151 | dJda += error * dSda
152 | prevDSda = dSda
153 | prevSmoothed = smoothed(i)
154 | i += 1
155 | }
156 | 2 * dJda
157 | }
158 |
159 | def addTimeDependentEffects(ts: Vector): Vector = {
160 | val arr = Array.fill(ts.size)(0.0)
161 | arr(0) = ts(0) // by definition in our model S_0 = X_0
162 | for (i <- 1 until ts.size) {
163 | arr(i) = smoothing * ts(i) + (1 - smoothing) * arr(i - 1)
164 | }
165 | new DenseVector(arr)
166 | }
167 |
168 |
169 | override def copy(extra: ParamMap): EWMAModel = defaultCopy(extra)
170 |
171 | /**
172 | * Transforms the input dataset.
173 | */
174 | override def transform(dataset: Dataset[_]): DataFrame = {
175 |
176 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map {
177 | case Row(time: String, value: Double) => (time, value)
178 | }.sortByKey().collect()
179 | .map(x => x._2)
180 |
181 | val dataVector = Vectors.dense(data)
182 |
183 | val res = addTimeDependentEffects(dataVector)
184 |
185 | val resRDD = dataset.sparkSession.sparkContext.parallelize(res.toArray.map(x => Row(x)))
186 |
187 | val structType = transformSchema(dataset.schema)
188 |
189 | dataset.sparkSession.createDataFrame(resRDD, structType)
190 | }
191 |
192 | /**
193 | * Check transform validity and derive the output schema from the input schema.
194 | *
195 | * Typical implementation should first conduct verification on schema change and parameter
196 | * validity, including complex parameter interaction checks.
197 | */
198 | override def transformSchema(schema: StructType): StructType = {
199 | StructType(Array(StructField("EMA", DoubleType)))
200 | }
201 | }
202 |
--------------------------------------------------------------------------------
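A small numerical check of the smoothing recursion documented above (values chosen purely for illustration):

    import org.apache.spark.ml.linalg.Vectors

    // With a = 0.5 and X = (2, 4, 8): S_0 = 2, S_1 = 0.5*4 + 0.5*2 = 3, S_2 = 0.5*8 + 0.5*3 = 5.5
    val smoothed = new EWMAModel(0.5).addTimeDependentEffects(Vectors.dense(2.0, 4.0, 8.0))
    // smoothed == [2.0, 3.0, 5.5]; sse would then compare each S_t against X_{t+1}:
    // (4 - 2)^2 + (8 - 3)^2 = 29
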
/src/main/scala/org/apache/spark/ml/timeseries/models/GARCH.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.models
2 |
3 |
4 |
5 | import org.apache.commons.math3.analysis.{MultivariateFunction, MultivariateVectorFunction}
6 | import org.apache.commons.math3.optim.{InitialGuess, MaxEval, MaxIter, SimpleValueChecker}
7 | import org.apache.commons.math3.optim.nonlinear.scalar.{ObjectiveFunction, ObjectiveFunctionGradient}
8 | import org.apache.commons.math3.optim.nonlinear.scalar.gradient.NonLinearConjugateGradientOptimizer
9 | import org.apache.commons.math3.random.RandomGenerator
10 | import org.apache.spark.ml.{Estimator, Model}
11 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}
12 | import org.apache.spark.ml.param.{Param, ParamMap}
13 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams
14 | import org.apache.spark.ml.util.Identifiable
15 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
16 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
17 |
18 | /**
19 | * Created by endy on 16-12-22.
20 | */
21 |
22 | trait GARCHParams extends TimeSeriesParams {
23 | final val maxEval = new Param[Int](this, "maxEval", "max eval")
24 | def setMaxEval(value: Int): this.type = set(maxEval, value)
25 |
26 | final val maxIter = new Param[Int](this, "maxIter", "max iteration")
27 | def setMaxIter(value: Int): this.type = set(maxIter, value)
28 | }
29 |
30 | class GARCH(override val uid: String) extends Estimator[GARCHModel] with GARCHParams{
31 |
32 | setDefault(timeCol -> "time",
33 | timeSeriesCol -> "timeseries",
34 | maxEval -> 10000,
35 | maxIter -> 10000)
36 |
37 | def this() = this(Identifiable.randomUID("GARCH"))
38 |
39 | /**
40 | * Fits a model to the input data.
41 | */
42 | override def fit(dataset: Dataset[_]): GARCHModel = {
43 |
44 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map {
45 | case Row(time: String, value: Double) => (time, value)
46 | }.sortByKey().collect()
47 |
48 | val dataVector = Vectors.dense(data.map(x => x._2))
49 |
50 | val optimizer = new NonLinearConjugateGradientOptimizer(
51 | NonLinearConjugateGradientOptimizer.Formula.FLETCHER_REEVES,
52 | new SimpleValueChecker(1e-6, 1e-6))
53 |
54 | val gradient = new ObjectiveFunctionGradient(new MultivariateVectorFunction() {
55 | def value(params: Array[Double]): Array[Double] = {
56 | new GARCHModel(params(0), params(1), params(2)).gradient(dataVector)
57 | }
58 | })
59 | val objectiveFunction = new ObjectiveFunction(new MultivariateFunction() {
60 | def value(params: Array[Double]): Double = {
61 | new GARCHModel(params(0), params(1), params(2)).logLikelihood(dataVector)
62 | }
63 | })
64 |
65 | val initialGuess = new InitialGuess(Array(.2, .2, .2)) // TODO: make this smarter
66 |
67 | val optimal = optimizer.optimize(objectiveFunction, gradient, initialGuess,
68 | new MaxIter(${maxIter}), new MaxEval(${maxEval}))
69 |
70 | val params = optimal.getPoint
71 | new GARCHModel(params(0), params(1), params(2))
72 | .setTimeCol(${timeCol}).setTimeSeriesCol(${timeSeriesCol})
73 |
74 | }
75 |
76 | override def copy(extra: ParamMap): Estimator[GARCHModel] = defaultCopy(extra)
77 |
78 | /**
79 | * :: DeveloperApi ::
80 | *
81 | * Check transform validity and derive the output schema from the input schema.
82 | *
83 | * Typical implementation should first conduct verification on schema change and parameter
84 | * validity, including complex parameter interaction checks.
85 | */
86 | override def transformSchema(schema: StructType): StructType = schema
87 | }
88 |
89 | class GARCHModel(override val uid: String, val omega: Double, val alpha: Double, val beta: Double)
90 | extends Model[GARCHModel] with GARCHParams {
91 |
92 | def this(omega: Double, alpha: Double, beta: Double) = this(Identifiable.randomUID("GARCH"),
93 | omega, alpha, beta)
94 |
95 | override def copy(extra: ParamMap): GARCHModel = defaultCopy(extra)
96 |
97 | /**
98 | * Transforms the input dataset.
99 | */
100 | override def transform(dataset: Dataset[_]): DataFrame = {
101 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map {
102 | case Row(time: String, value: Double) => (time, value)
103 | }.sortByKey().collect()
104 |
105 | val dataVector = Vectors.dense(data.map(x => x._2))
106 |
107 | val dest = addTimeDependentEffects(dataVector)
108 |
109 | val resRDD = dataset.sparkSession.sparkContext.parallelize(dest.toArray.map(x => Row(x)))
110 |
111 | val structType = transformSchema(dataset.schema)
112 |
113 | dataset.sparkSession.createDataFrame(resRDD, structType)
114 | }
115 |
116 | /**
117 | * :: DeveloperApi ::
118 | *
119 | * Check transform validity and derive the output schema from the input schema.
120 | *
121 | * Typical implementation should first conduct verification on schema change and parameter
122 | * validity, including complex parameter interaction checks.
123 | */
124 | override def transformSchema(schema: StructType): StructType =
125 | StructType(Array(StructField("GARCH", DoubleType)))
126 | /**
127 | * Returns the log likelihood of the parameters on the given time series.
128 | *
129 | * Based on https://pdfs.semanticscholar.org/7da8/bfa5295375c1141d797e80065a599153c19d.pdf
130 | */
131 | def logLikelihood(ts: Vector): Double = {
132 | var sum = 0.0
133 | iterateWithHAndEta(ts) { (i, h, eta, prevH, prevEta) =>
134 | sum += -.5 * math.log(h) - .5 * eta * eta / h
135 | }
136 | sum + -.5 * math.log(2 * math.Pi) * (ts.size - 1)
137 | }
138 |
139 | private def iterateWithHAndEta(ts: Vector)
140 | (fn: (Int, Double, Double, Double, Double) => Unit): Unit = {
141 | var prevH = omega / (1 - alpha - beta)
142 | var i = 1
143 | while (i < ts.size) {
144 | val h = omega + alpha * ts(i - 1) * ts(i - 1) + beta * prevH
145 | fn(i, h, ts(i), prevH, ts(i - 1))
146 | prevH = h
147 | i += 1
148 | }
149 | }
150 |
151 | def gradient(ts: Vector): Array[Double] = {
152 | var omegaGradient = 0.0
153 | var alphaGradient = 0.0
154 | var betaGradient = 0.0
155 | var omegaDhdtheta = 0.0
156 | var alphaDhdtheta = 0.0
157 | var betaDhdtheta = 0.0
158 | iterateWithHAndEta(ts) { (i, h, eta, prevH, prevEta) =>
159 | omegaDhdtheta = 1 + beta * omegaDhdtheta
160 | alphaDhdtheta = prevEta * prevEta + beta * alphaDhdtheta
161 | betaDhdtheta = prevH + beta * betaDhdtheta
162 |
163 | val multiplier = (eta * eta / (h * h)) - (1 / h)
164 | omegaGradient += multiplier * omegaDhdtheta
165 | alphaGradient += multiplier * alphaDhdtheta
166 | betaGradient += multiplier * betaDhdtheta
167 | }
168 | Array(omegaGradient * .5, alphaGradient * .5, betaGradient * .5)
169 | }
170 |
171 | def addTimeDependentEffects(ts: Vector): Vector = {
172 |
173 | val destArr = new Array[Double](ts.size)
174 |
175 | var prevVariance = omega / (1.0 - alpha - beta)
176 | var prevEta = ts(0) * math.sqrt(prevVariance)
177 |
178 | destArr(0) = prevEta
179 | for (i <- 1 until ts.size) {
180 | val variance = omega + alpha * prevEta * prevEta + beta * prevVariance
181 | val standardizedEta = ts(i)
182 | val eta = standardizedEta * math.sqrt(variance)
183 | destArr(i) = eta
184 |
185 | prevEta = eta
186 | prevVariance = variance
187 | }
188 | new DenseVector(destArr)
189 | }
190 |
191 | private def sampleWithVariances(n: Int, rand: RandomGenerator): (Array[Double], Array[Double]) = {
192 | val ts = new Array[Double](n)
193 | val variances = new Array[Double](n)
194 | variances(0) = omega / (1 - alpha - beta)
195 | var eta = math.sqrt(variances(0)) * rand.nextGaussian()
196 | for (i <- 1 until n) {
197 | variances(i) = omega + beta * variances(i-1) + alpha * eta * eta
198 | eta = math.sqrt(variances(i)) * rand.nextGaussian()
199 | ts(i) = eta
200 | }
201 |
202 | (ts, variances)
203 | }
204 |
205 | /**
206 | * Samples a random time series of a given length with the properties of the model.
207 | *
208 | * @param n The length of the time series to sample.
209 | * @param rand The random generator used to generate the observations.
210 | * @return The samples time series.
211 | */
212 | def sample(n: Int, rand: RandomGenerator): Array[Double] = sampleWithVariances(n, rand)._1
213 | }
214 |
--------------------------------------------------------------------------------
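A sketch of a simulate-then-fit round trip with the estimator above; it assumes an active `SparkSession` named `spark` and is meant only to show how the pieces connect, not to reproduce exact estimates.

    import org.apache.commons.math3.random.MersenneTwister
    import org.apache.spark.sql.Row
    import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

    val series = new GARCHModel(0.1, 0.3, 0.4).sample(1000, new MersenneTwister(5L))
    val schema = StructType(Array(
      StructField("time", StringType), StructField("timeseries", DoubleType)))
    val rows = series.zipWithIndex.map { case (v, i) => Row(i.formatted("%05d"), v) }
    val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)

    val fitted = new GARCH().setMaxIter(10000).setMaxEval(10000).fit(df)
    // fitted.omega, fitted.alpha and fitted.beta should land reasonably close to 0.1, 0.3, 0.4
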
/src/test/scala/org/apache/spark/ml/timeseries/models/HoltWintersSuite.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.models
2 |
3 | import org.apache.spark.SparkFunSuite
4 | import org.apache.spark.ml.util.DefaultReadWriteTest
5 | import org.apache.spark.mllib.util.MLlibTestSparkContext
6 | import org.apache.spark.mllib.util.TestingUtils._
7 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
8 | import org.apache.spark.sql.{Dataset, Row}
9 |
10 | /**
11 | * Created by endy on 16-12-21.
12 | */
13 | class HoltWintersSuite extends SparkFunSuite with MLlibTestSparkContext
14 | with DefaultReadWriteTest {
15 |
16 | @transient var dataSet: Dataset[_] = _
17 | @transient var dataSet2: Dataset[_] = _
18 |
19 | val tsAirPassengers = Array(
20 | 112.0, 118.0, 132.0, 129.0, 121.0, 135.0, 148.0, 148.0, 136.0, 119.0, 104.0, 118.0, 115.0,
21 | 126.0, 141.0, 135.0, 125.0, 149.0, 170.0, 170.0, 158.0, 133.0, 114.0, 140.0, 145.0, 150.0,
22 | 178.0, 163.0, 172.0, 178.0, 199.0, 199.0, 184.0, 162.0, 146.0, 166.0, 171.0, 180.0, 193.0,
23 | 181.0, 183.0, 218.0, 230.0, 242.0, 209.0, 191.0, 172.0, 194.0, 196.0, 196.0, 236.0, 235.0,
24 | 229.0, 243.0, 264.0, 272.0, 237.0, 211.0, 180.0, 201.0, 204.0, 188.0, 235.0, 227.0, 234.0,
25 | 264.0, 302.0, 293.0, 259.0, 229.0, 203.0, 229.0, 242.0, 233.0, 267.0, 269.0, 270.0, 315.0,
26 | 364.0, 347.0, 312.0, 274.0, 237.0, 278.0, 284.0, 277.0, 317.0, 313.0, 318.0, 374.0, 413.0,
27 | 405.0, 355.0, 306.0, 271.0, 306.0, 315.0, 301.0, 356.0, 348.0, 355.0, 422.0, 465.0, 467.0,
28 | 404.0, 347.0, 305.0, 336.0, 340.0, 318.0, 362.0, 348.0, 363.0, 435.0, 491.0, 505.0, 404.0,
29 | 359.0, 310.0, 337.0, 360.0, 342.0, 406.0, 396.0, 420.0, 472.0, 548.0, 559.0, 463.0, 407.0,
30 | 362.0, 405.0, 417.0, 391.0, 419.0, 461.0, 472.0, 535.0, 622.0, 606.0, 508.0, 461.0, 390.0,
31 | 432.0)
32 |
33 | val tsCO2 = Array(
34 | 315.42, 316.31, 316.50, 317.56, 318.13, 318.00, 316.39, 314.65, 313.68, 313.18, 314.66, 315.43,
35 | 316.27, 316.81, 317.42, 318.87, 319.87, 319.43, 318.01, 315.74, 314.00, 313.68, 314.84, 316.03,
36 | 316.73, 317.54, 318.38, 319.31, 320.42, 319.61, 318.42, 316.63, 314.83, 315.16, 315.94, 316.85,
37 | 317.78, 318.40, 319.53, 320.42, 320.85, 320.45, 319.45, 317.25, 316.11, 315.27, 316.53, 317.53,
38 | 318.58, 318.92, 319.70, 321.22, 322.08, 321.31, 319.58, 317.61, 316.05, 315.83, 316.91, 318.20,
39 | 319.41, 320.07, 320.74, 321.40, 322.06, 321.73, 320.27, 318.54, 316.54, 316.71, 317.53, 318.55,
40 | 319.27, 320.28, 320.73, 321.97, 322.00, 321.71, 321.05, 318.71, 317.66, 317.14, 318.70, 319.25,
41 | 320.46, 321.43, 322.23, 323.54, 323.91, 323.59, 322.24, 320.20, 318.48, 317.94, 319.63, 320.87,
42 | 322.17, 322.34, 322.88, 324.25, 324.83, 323.93, 322.38, 320.76, 319.10, 319.24, 320.56, 321.80,
43 | 322.40, 322.99, 323.73, 324.86, 325.40, 325.20, 323.98, 321.95, 320.18, 320.09, 321.16, 322.74,
44 | 323.83, 324.26, 325.47, 326.50, 327.21, 326.54, 325.72, 323.50, 322.22, 321.62, 322.69, 323.95,
45 | 324.89, 325.82, 326.77, 327.97, 327.91, 327.50, 326.18, 324.53, 322.93, 322.90, 323.85, 324.96,
46 | 326.01, 326.51, 327.01, 327.62, 328.76, 328.40, 327.20, 325.27, 323.20, 323.40, 324.63, 325.85,
47 | 326.60, 327.47, 327.58, 329.56, 329.90, 328.92, 327.88, 326.16, 324.68, 325.04, 326.34, 327.39,
48 | 328.37, 329.40, 330.14, 331.33, 332.31, 331.90, 330.70, 329.15, 327.35, 327.02, 327.99, 328.48,
49 | 329.18, 330.55, 331.32, 332.48, 332.92, 332.08, 331.01, 329.23, 327.27, 327.21, 328.29, 329.41,
50 | 330.23, 331.25, 331.87, 333.14, 333.80, 333.43, 331.73, 329.90, 328.40, 328.17, 329.32, 330.59,
51 | 331.58, 332.39, 333.33, 334.41, 334.71, 334.17, 332.89, 330.77, 329.14, 328.78, 330.14, 331.52,
52 | 332.75, 333.24, 334.53, 335.90, 336.57, 336.10, 334.76, 332.59, 331.42, 330.98, 332.24, 333.68,
53 | 334.80, 335.22, 336.47, 337.59, 337.84, 337.72, 336.37, 334.51, 332.60, 332.38, 333.75, 334.78,
54 | 336.05, 336.59, 337.79, 338.71, 339.30, 339.12, 337.56, 335.92, 333.75, 333.70, 335.12, 336.56,
55 | 337.84, 338.19, 339.91, 340.60, 341.29, 341.00, 339.39, 337.43, 335.72, 335.84, 336.93, 338.04,
56 | 339.06, 340.30, 341.21, 342.33, 342.74, 342.08, 340.32, 338.26, 336.52, 336.68, 338.19, 339.44,
57 | 340.57, 341.44, 342.53, 343.39, 343.96, 343.18, 341.88, 339.65, 337.81, 337.69, 339.09, 340.32,
58 | 341.20, 342.35, 342.93, 344.77, 345.58, 345.14, 343.81, 342.21, 339.69, 339.82, 340.98, 342.82,
59 | 343.52, 344.33, 345.11, 346.88, 347.25, 346.62, 345.22, 343.11, 340.90, 341.18, 342.80, 344.04,
60 | 344.79, 345.82, 347.25, 348.17, 348.74, 348.07, 346.38, 344.51, 342.92, 342.62, 344.06, 345.38,
61 | 346.11, 346.78, 347.68, 349.37, 350.03, 349.37, 347.76, 345.73, 344.68, 343.99, 345.48, 346.72,
62 | 347.84, 348.29, 349.23, 350.80, 351.66, 351.07, 349.33, 347.92, 346.27, 346.18, 347.64, 348.78,
63 | 350.25, 351.54, 352.05, 353.41, 354.04, 353.62, 352.22, 350.27, 348.55, 348.72, 349.91, 351.18,
64 | 352.60, 352.92, 353.53, 355.26, 355.52, 354.97, 353.75, 351.52, 349.64, 349.83, 351.14, 352.37,
65 | 353.50, 354.55, 355.23, 356.04, 357.00, 356.07, 354.67, 352.76, 350.82, 351.04, 352.69, 354.07,
66 | 354.59, 355.63, 357.03, 358.48, 359.22, 358.12, 356.06, 353.92, 352.05, 352.11, 353.64, 354.89,
67 | 355.88, 356.63, 357.72, 359.07, 359.58, 359.17, 356.94, 354.92, 352.94, 353.23, 354.09, 355.33,
68 | 356.63, 357.10, 358.32, 359.41, 360.23, 359.55, 357.53, 355.48, 353.67, 353.95, 355.30, 356.78,
69 | 358.34, 358.89, 359.95, 361.25, 361.67, 360.94, 359.55, 357.49, 355.84, 356.00, 357.59, 359.05,
70 | 359.98, 361.03, 361.66, 363.48, 363.82, 363.30, 361.94, 359.50, 358.11, 357.80, 359.61, 360.74,
71 | 362.09, 363.29, 364.06, 364.76, 365.45, 365.01, 363.70, 361.54, 359.51, 359.65, 360.80, 362.38,
72 | 363.23, 364.06, 364.61, 366.40, 366.84, 365.68, 364.52, 362.57, 360.24, 360.83, 362.49, 364.34
73 | )
74 |
75 | override def beforeAll(): Unit = {
76 | super.beforeAll()
77 |
78 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries",
79 | DoubleType)))
80 |
81 | var data = tsAirPassengers.zipWithIndex.map(x => (x._2.formatted("%011d"), x._1))
82 | val rdd = sc.parallelize(data.map(x => Row(x._1, x._2)))
83 | dataSet = spark.createDataFrame(rdd, schema)
84 |
85 | data = tsCO2.zipWithIndex.map(x => (x._2.formatted("%011d"), x._1))
86 | val rdd2 = sc.parallelize(data.map(x => Row(x._1, x._2)))
87 | dataSet2 = spark.createDataFrame(rdd2, schema)
88 | }
89 |
90 | test("Optimal Paramaters alpha beta gamma - Additive Model") {
91 | val model = new HoltWinters()
92 | .setTimeCol("time")
93 | .setTimeSeriesCol("timeseries")
94 | .setModelType("additive")
95 | .setPeriod(12)
96 | .setMaxIter(30000)
97 | .setMaxEval(30000)
98 | .fit(dataSet)
99 |
100 | assert(model.alpha ~== 0.24796 absTol 0.01 )
101 | assert(model.beta ~== 0.03453 absTol 0.01 )
102 | assert(model.gamma ~== 1.0 absTol 0.01 )
103 | }
104 |
105 | test("Forecast - Additive Model") {
106 | val model = new HoltWinters()
107 | .setTimeCol("time")
108 | .setTimeSeriesCol("timeseries")
109 | .setModelType("additive")
110 | .setPeriod(12)
111 | .setMaxIter(30000)
112 | .setMaxEval(30000)
113 | .fit(dataSet)
114 |
115 | val forecasted = model.transform(dataSet).collect().map{
116 | case Row(x: Double) => x
117 | }
118 |
119 | val actualForecasted = new Array[Double](12)
120 | actualForecasted(0) = 453.4977
121 | actualForecasted(1) = 429.3906
122 | actualForecasted(2) = 467.0361
123 | actualForecasted(3) = 503.2574
124 | actualForecasted(4) = 512.3395
125 | actualForecasted(5) = 571.8880
126 | actualForecasted(6) = 652.6095
127 | actualForecasted(7) = 637.4623
128 | actualForecasted(8) = 539.7548
129 | actualForecasted(9) = 490.7250
130 | actualForecasted(10) = 424.4593
131 | actualForecasted(11) = 469.5315
132 |
133 | for (i <- 0 until 12) {
134 | assert(forecasted(i) ~== actualForecasted(i) absTol 10)
135 | }
136 | }
137 |
138 |
139 | test("Optimal Paramaters alpha beta gamma - Multiplicative Model") {
140 | val model = new HoltWinters()
141 | .setTimeCol("time")
142 | .setTimeSeriesCol("timeseries")
143 | .setModelType("multiplicative")
144 | .setPeriod(12)
145 | .setMaxIter(30000)
146 | .setMaxEval(30000)
147 | .fit(dataSet2)
148 |
149 | assert(model.alpha ~== 0.51265 absTol 0.01 )
150 | assert(model.beta ~== 0.00949 absTol 0.01 )
151 | assert(model.gamma ~== 0.47289 absTol 0.1 )
152 | }
153 |
154 | test("Forecast - Multiplicative Model") {
155 | val model = new HoltWinters()
156 | .setTimeCol("time")
157 | .setTimeSeriesCol("timeseries")
158 | .setModelType("multiplicative")
159 | .setPeriod(12)
160 | .setMaxIter(30000)
161 | .setMaxEval(30000)
162 | .fit(dataSet2)
163 |
164 | val forecasted = model.transform(dataSet2).collect().map{
165 | case Row(x: Double) => x
166 | }
167 |
168 | val actualForecasted = new Array[Double](12)
169 | actualForecasted(0) = 365.1079
170 | actualForecasted(1) = 365.9664
171 | actualForecasted(2) = 366.7343
172 | actualForecasted(3) = 368.1364
173 | actualForecasted(4) = 368.6674
174 | actualForecasted(5) = 367.9508
175 | actualForecasted(6) = 366.5318
176 | actualForecasted(7) = 364.3799
177 | actualForecasted(8) = 362.4731
178 | actualForecasted(9) = 362.7520
179 | actualForecasted(10) = 364.2203
180 | actualForecasted(11) = 365.6741
181 |
182 | for (i <- 0 until 12) {
183 | assert(forecasted(i) ~== actualForecasted(i) absTol 10)
184 | }
185 | }
186 | }
187 |
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/ml/timeseries/models/ARIMASuite.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.models
2 |
3 | import org.apache.commons.math3.random.{MersenneTwister, RandomGenerator}
4 | import org.apache.spark.SparkFunSuite
5 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}
6 | import org.apache.spark.ml.timeseries.UnivariateTimeSeries
7 | import org.apache.spark.ml.util.DefaultReadWriteTest
8 | import org.apache.spark.mllib.util.MLlibTestSparkContext
9 | import org.apache.spark.mllib.util.TestingUtils._
10 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
11 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
12 |
13 |
14 | /**
15 | * Created by endy on 16-12-20.
16 | */
17 | class ARIMASuite extends SparkFunSuite with MLlibTestSparkContext
18 | with DefaultReadWriteTest {
19 |
20 | @transient var dataSet: Dataset[_] = _
21 | test("compare with R") {
22 | // > R.Version()$version.string
23 | // [1] "R version 3.2.0 (2015-04-16)"
24 | // > set.seed(456)
25 | // y <- arima.sim(n=250,list(ar=0.3,ma=0.7),mean = 5)
26 | // write.table(y, file = "resources/R_ARIMA_DataSet1.csv", row.names = FALSE, col.names = FALSE)
27 | val dataFile = getClass.getResource("/timeseries/R_ARIMA_DataSet1.csv").toString
28 |
29 | val rawData = sc.textFile(dataFile).map(line => line.toDouble)
30 | .collect().zipWithIndex
31 |
32 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries",
33 | DoubleType)))
34 |
35 | val rdd = sc.parallelize(rawData.map(x => Row(x._2.formatted("%05d"), x._1)))
36 | val dataset = spark.createDataFrame(rdd, schema)
37 |
38 | val model = new ARIMA()
39 | .setP(1)
40 | .setD(0)
41 | .setQ(1)
42 | .setTimeCol("time")
43 | .setTimeSeriesCol("timeseries")
44 | .fit(dataset)
45 |
46 | val Array(c, ar, ma) = model.coefficient
47 | assert(ar ~== 0.3 absTol 0.05)
48 | assert(ma ~== 0.7 absTol 0.05)
49 | }
50 |
51 | test("Data sampled from a given model should result in similar model if fit") {
52 | val rand = new MersenneTwister(10L)
53 | val model = new ARIMAModel(2, 1, 2, Array(8.2, 0.2, 0.5, 0.3, 0.1))
54 | val (_, sampled) = sample(1000, rand, model)
55 |
56 | val newModel = new ARIMA()
57 | .setP(2)
58 | .setD(1)
59 | .setQ(2)
60 | .setTimeCol("time")
61 | .setTimeSeriesCol("timeseries")
62 | .fit(sampled)
63 |
64 | val Array(c, ar1, ar2, ma1, ma2) = model.coefficient
65 | val Array(cTest, ar1Test, ar2Test, ma1Test, ma2Test) = newModel.coefficient
66 |
67 | // intercept is given more leeway
68 | assert(c ~== cTest absTol 1)
69 | assert(ar1Test ~== ar1 absTol 0.1)
70 | assert(ma1Test ~== ma1 absTol 0.1)
71 | assert(ar2Test ~== ar2 absTol 0.1)
72 | assert(ma2Test ~== ma2 absTol 0.1)
73 | }
74 |
75 | test("Fitting CSS with BOBYQA and conjugate gradient descent should be fairly similar") {
76 | val rand = new MersenneTwister(10L)
77 | val model = new ARIMAModel(2, 1, 2, Array(8.2, 0.2, 0.5, 0.3, 0.1))
78 | val (_, sampled) = sample(1000, rand, model)
79 |
80 | val fitWithBOBYQA = new ARIMA()
81 | .setP(2)
82 | .setD(1)
83 | .setQ(2)
84 | .setTimeCol("time")
85 | .setTimeSeriesCol("timeseries")
86 | .setMethod("css-bobyqa")
87 | .fit(sampled)
88 |
89 | val fitWithCGD = new ARIMA()
90 | .setP(2)
91 | .setD(1)
92 | .setQ(2)
93 | .setTimeCol("time")
94 | .setTimeSeriesCol("timeseries")
95 | .setMethod("css-cgd")
96 | .fit(sampled)
97 |
98 | val Array(c, ar1, ar2, ma1, ma2) = fitWithBOBYQA.coefficient
99 | val Array(cCGD, ar1CGD, ar2CGD, ma1CGD, ma2CGD) = fitWithCGD.coefficient
100 |
101 | // give more leeway for intercept
102 | assert(cCGD ~== c absTol 1)
103 | assert(ar1CGD ~== ar1 absTol 0.1)
104 | assert(ar2CGD ~== ar2 absTol 0.1)
105 | assert(ma1CGD ~== ma1 absTol 0.1)
106 | assert(ma2CGD ~== ma2 absTol 0.1)
107 | }
108 |
109 | test("Fitting ARIMA(p, d, q) should be the same as fitting a d-order differenced ARMA(p, q)") {
110 | val rand = new MersenneTwister(10L)
111 | val model = new ARIMAModel(1, 1, 2, Array(0.3, 0.7, 0.1), hasIntercept = false)
112 | val (vec, sampled) = sample(1000, rand, model)
113 |
114 | val arimaModel = new ARIMA()
115 | .setP(1)
116 | .setD(1)
117 | .setQ(2)
118 | .setTimeCol("time")
119 | .setTimeSeriesCol("timeseries")
120 | .setIncludeIntercept(false)
121 | .fit(sampled)
122 |
123 |
124 | val differenceSample = UnivariateTimeSeries.differencesOfOrderD(vec, 1).toArray.drop(1)
125 |
126 | val dataFrame = genDf(differenceSample)
127 |
128 | val armaModel = new ARIMA()
129 | .setP(1)
130 | .setD(0)
131 | .setQ(2)
132 | .setTimeCol("time")
133 | .setTimeSeriesCol("timeseries")
134 | .setIncludeIntercept(false)
135 | .fit(dataFrame)
136 |
137 | val Array(refAR, refMA1, refMA2) = model.coefficient
138 | val Array(iAR, iMA1, iMA2) = arimaModel.coefficient
139 | val Array(ar, ma1, ma2) = armaModel.coefficient
140 |
141 | // ARIMA model should match parameters used to sample, to some extent
142 | assert(iAR ~== refAR absTol 0.05)
143 | assert(iMA1 ~== refMA1 absTol 0.05)
144 | assert(iMA2 ~== refMA2 absTol 0.05)
145 |
146 | // ARMA model parameters of differenced sample should be equal to ARIMA model parameters
147 | assert(ar == iAR)
148 | assert(ma1 == iMA1)
149 | assert(ma2 == iMA2)
150 | }
151 |
152 | test("Fitting ARIMA(0, 0, 0) with intercept term results in model with average as parameter") {
153 | val rand = new MersenneTwister(10L)
154 | val (vec, sampled) = sample(100, rand)
155 |
156 | val model = new ARIMA()
157 | .setP(0)
158 | .setD(0)
159 | .setQ(0)
160 | .setTimeCol("time")
161 | .setTimeSeriesCol("timeseries")
162 | .fit(sampled)
163 |
164 | val mean = vec.toArray.sum / vec.size
165 |
166 | assert(model.coefficient(0) ~== mean absTol 1e-4)
167 | }
168 |
169 | test("Fitting ARIMA(0, 0, 0) with intercept term results in model with average as the forecast") {
170 | val rand = new MersenneTwister(10L)
171 | val (vec, sampled) = sample(100, rand)
172 | val model = new ARIMA()
173 | .setP(0)
174 | .setD(0)
175 | .setQ(0)
176 | .setTimeCol("time")
177 | .setTimeSeriesCol("timeseries")
178 | .fit(sampled)
179 |
180 | val mean = vec.toArray.sum / vec.size
181 |
182 | assert(model.coefficient(0) ~== mean absTol 1e-4)
183 | val forecast = model
184 | .setNFuture(10).transform(sampled).collect()
185 | .map{case Row(s: Double) => s}
186 |
187 | for(i <- 100 until 110) {
188 | assert(forecast(i) ~== mean absTol 1e-4)
189 | }
190 | }
191 |
192 | test("Fitting an integrated time series of order 3") {
193 | // > set.seed(10)
194 | // > vals <- arima.sim(list(ma = c(0.2), order = c(0, 3, 1)), 200)
195 | // > arima(order = c(0, 3, 1), vals, method = "CSS")
196 | //
197 | // Call:
198 | // arima(x = vals, order = c(0, 3, 1), method = "CSS")
199 | //
200 | // Coefficients:
201 | // ma1
202 | // 0.2523
203 | // s.e. 0.0623
204 | //
205 | // sigma^2 estimated as 0.9218: part log likelihood = -275.65
206 | // > write.table(y, file = "resources/R_ARIMA_DataSet2.csv", row.names = FALSE, col.names =
207 | // FALSE)
208 | val dataFile = getClass.getResource("/timeseries/R_ARIMA_DataSet2.csv").toString
209 | val rawData = sc.textFile(dataFile).map(line => line.toDouble)
210 | .collect().zipWithIndex
211 |
212 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries",
213 | DoubleType)))
214 |
215 | val rdd = sc.parallelize(rawData.map(x => Row(x._2.formatted("%05d"), x._1)))
216 | val dataset = spark.createDataFrame(rdd, schema)
217 | val model = new ARIMA()
218 | .setP(0)
219 | .setD(3)
220 | .setQ(1)
221 | .setTimeCol("time")
222 | .setTimeSeriesCol("timeseries")
223 | .fit(dataset)
224 |
225 | val Array(c, ma) = model.coefficient
226 | assert(ma ~== 0.2 absTol 0.05)
227 | }
228 | /**
229 | * Sample a series of size n assuming an ARIMA(p, d, q) process.
230 | *
231 | * @param n size of sample
232 | * @return series reflecting ARIMA(p, d, q) process
233 | */
234 | def sample(n: Int, rand: RandomGenerator, model: ARIMAModel): (Vector, DataFrame) = {
235 | val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian))
236 | val res = model.addTimeDependentEffects(vec, vec).toArray
237 |
238 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries",
239 | DoubleType)))
240 |
241 | val rdd = sc.parallelize(res.zipWithIndex.map(x => Row(x._2.formatted("%05d"), x._1)))
242 |
243 | (Vectors.dense(res), spark.createDataFrame(rdd, schema))
244 | }
245 |
246 | /**
247 | * Sample a series of size n assuming an ARIMA(p, d, q) process.
248 | *
249 | * @param n size of sample
250 | * @return series reflecting ARIMA(p, d, q) process
251 | */
252 | def sample(n: Int, rand: RandomGenerator): (Vector, DataFrame) = {
253 | val vec = new DenseVector(Array.fill[Double](n)(rand.nextGaussian)).toArray
254 |
255 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries",
256 | DoubleType)))
257 |
258 | val rdd = sc.parallelize(vec.zipWithIndex.map(x => Row(x._2.formatted("%05d"), x._1)))
259 |
260 | (Vectors.dense(vec), spark.createDataFrame(rdd, schema))
261 | }
262 |
263 | def genDf(array: Array[Double]): DataFrame = {
264 | val schema = StructType(Array(StructField("time", StringType), StructField("timeseries",
265 | DoubleType)))
266 |
267 | val rdd = spark.sparkContext.parallelize(
268 | array.zipWithIndex.map(x => Row(x._2.formatted("%010d"), x._1)))
269 |
270 | spark.createDataFrame(rdd, schema)
271 | }
272 |
273 | }
274 |
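// A minimal end-to-end sketch of the ARIMA estimator API exercised by the tests above.
// Assumptions: ARIMA lives alongside the other timeseries models, so the import path below is
// a guess; the input DataFrame has a sortable string "time" column and a double "timeseries"
// column; and, as in the intercept test above, transform appends the nFuture forecast rows
// after the fitted in-sample values.
object ARIMAUsageSketch {
  import org.apache.spark.ml.timeseries.models.ARIMA
  import org.apache.spark.sql.DataFrame

  def forecast(df: DataFrame, nFuture: Int): Array[Double] = {
    val model = new ARIMA()
      .setP(1).setD(0).setQ(1)             // any (p, d, q) accepted by the estimator
      .setTimeCol("time")
      .setTimeSeriesCol("timeseries")
      .fit(df)
    model.setNFuture(nFuture)
      .transform(df)
      .collect()
      .map(_.getDouble(0))                 // one double per row; the last nFuture rows are the forecast
  }
}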
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/dbscan/DBSCAN.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.dbscan
2 |
3 | import org.apache.spark.ml.dbscan.DBSCANLabeledPoint.Flag
4 | import org.apache.spark.internal.Logging
5 | import org.apache.spark.ml.linalg.Vector
6 | import org.apache.spark.rdd.RDD
7 |
8 | /**
9 | * Top level method for calling DBSCAN
10 | */
11 | object DBSCAN {
12 |
13 | /**
14 | * Train a DBSCAN Model using the given set of parameters
15 | *
16 | * @param data training points stored as `RDD[Vector]`
17 | * only the first two points of the vector are taken into consideration
18 | * @param eps the maximum distance between two points for them to be considered as part
19 | * of the same region
20 | * @param minPoints the minimum number of points required to form a dense region
21 | * @param maxPointsPerPartition the largest number of points in a single partition
22 | */
23 | def train(
24 | data: RDD[Vector],
25 | eps: Double,
26 | minPoints: Int,
27 | maxPointsPerPartition: Int): DBSCAN = {
28 |
29 | new DBSCAN(eps, minPoints, maxPointsPerPartition, null, null).train(data)
30 |
31 | }
32 |
33 | }
34 |
35 | /**
36 | * A parallel implementation of DBSCAN clustering. The implementation will split the data space
37 | * into a number of partitions, making a best effort to keep the number of points in each
38 | * partition under `maxPointsPerPartition`. After partitioning, traditional DBSCAN
39 | * clustering will be run in parallel for each partition and finally the results
40 | * of each partition will be merged to identify global clusters.
41 | *
42 |  * This is an iterative algorithm that will make multiple passes over the data, so any
43 |  * RDDs passed in should be cached by the user.
44 | */
45 | class DBSCAN private ( val eps: Double,
46 | val minPoints: Int,
47 | val maxPointsPerPartition: Int,
48 | @transient val partitions: List[(Int, DBSCANRectangle)],
49 | @transient private val labeledPartitionedPoints:
50 | RDD[(Int, DBSCANLabeledPoint)])
51 |
52 | extends Serializable with Logging {
53 |
54 | type Margins = (DBSCANRectangle, DBSCANRectangle, DBSCANRectangle)
55 | type ClusterId = (Int, Int)
56 |
57 | def minimumRectangleSize: Double = 2 * eps
58 |
59 | def labeledPoints: RDD[DBSCANLabeledPoint] = {
60 | labeledPartitionedPoints.values
61 | }
62 |
63 | private def train(vectors: RDD[Vector]): DBSCAN = {
64 | // generate the smallest rectangles that split the space
65 | // and count how many points are contained in each one of them
66 | val minimumRectanglesWithCount =
67 | vectors
68 | .map(toMinimumBoundingRectangle)
69 | .map((_, 1))
70 | .aggregateByKey(0)(_ + _, _ + _)
71 | .collect()
72 | .toSet
73 |
74 | // find the best partitions for the data space
75 | val localPartitions = EvenSplitPartitioner
76 | .partition(minimumRectanglesWithCount, maxPointsPerPartition, minimumRectangleSize)
77 |
78 | logDebug("Found partitions: ")
79 | localPartitions.foreach(p => logDebug(p.toString))
80 |
81 | // grow partitions to include eps
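// each entry is (inner, main, outer): the partition shrunk by eps, the partition itself,
// and the partition grown by eps, so points within eps of a border get duplicated into
// every neighbouring partition that can reach them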
82 | val localMargins =
83 | localPartitions
84 | .map({ case (p, _) => (p.shrink(eps), p, p.shrink(-eps)) })
85 | .zipWithIndex
86 |
87 | val margins = vectors.context.broadcast(localMargins)
88 |
89 | // assign each point to its proper partition
90 | val duplicated = for {
91 | point <- vectors.map(DBSCANPoint)
92 | ((inner, main, outer), id) <- margins.value
93 | if outer.contains(point)
94 | } yield (id, point)
95 |
96 | val numOfPartitions = localPartitions.size
97 |
98 | // perform local dbscan
99 | val clustered =
100 | duplicated
101 | .groupByKey(numOfPartitions)
102 | .flatMapValues(points =>
103 | new LocalDBSCANNaive(eps, minPoints).fit(points))
104 | .cache()
105 |
106 | // find all candidate points for merging clusters and group them
107 | val mergePoints =
108 | clustered
109 | .flatMap({
110 | case (partition, point) =>
111 | margins.value
112 | .filter({
113 | case ((inner, main, _), _) => main.contains(point) && !inner.almostContains(point)
114 | })
115 | .map({
116 | case (_, newPartition) => (newPartition, (partition, point))
117 | })
118 | })
119 | .groupByKey()
120 |
121 | logDebug("About to find adjacencies")
122 | // find all clusters with aliases from merging candidates
123 | val adjacencies =
124 | mergePoints
125 | .flatMapValues(findAdjacencies)
126 | .values
127 | .collect()
128 |
129 |     // generate the adjacency graph
130 | val adjacencyGraph = adjacencies.foldLeft(DBSCANGraph[ClusterId]()) {
131 | case (graph, (from, to)) => graph.connect(from, to)
132 | }
133 |
134 | logDebug("About to find all cluster ids")
135 | // find all cluster ids
136 | val localClusterIds =
137 | clustered
138 | .filter({ case (_, point) => point.flag != Flag.Noise })
139 | .mapValues(_.cluster)
140 | .distinct()
141 | .collect()
142 | .toList
143 |
144 | // assign a global Id to all clusters, where connected clusters get the same id
145 | val (total, clusterIdToGlobalId) = localClusterIds.foldLeft((0, Map[ClusterId, Int]())) {
146 | case ((id, map), clusterId) => {
147 |
148 | map.get(clusterId) match {
149 | case None => {
150 | val nextId = id + 1
151 | val connectedClusters = adjacencyGraph.getConnected(clusterId) + clusterId
152 | logDebug(s"Connected clusters $connectedClusters")
153 | val toadd = connectedClusters.map((_, nextId)).toMap
154 | (nextId, map ++ toadd)
155 | }
156 | case Some(x) =>
157 | (id, map)
158 | }
159 |
160 | }
161 | }
162 |
163 | logDebug("Global Clusters")
164 | clusterIdToGlobalId.foreach(e => logDebug(e.toString))
165 | logInfo(s"Total Clusters: ${localClusterIds.size}, Unique: $total")
166 |
167 | val clusterIds = vectors.context.broadcast(clusterIdToGlobalId)
168 |
169 | logDebug("About to relabel inner points")
170 | // relabel non-duplicated points
171 | val labeledInner =
172 | clustered
173 | .filter(isInnerPoint(_, margins.value))
174 | .map {
175 | case (partition, point) => {
176 |
177 | if (point.flag != Flag.Noise) {
178 | point.cluster = clusterIds.value((partition, point.cluster))
179 | }
180 |
181 | (partition, point)
182 | }
183 | }
184 |
185 | logDebug("About to relabel outer points")
186 | // de-duplicate and label merge points
187 | val labeledOuter =
188 | mergePoints.flatMapValues(partition => {
189 | partition.foldLeft(Map[DBSCANPoint, DBSCANLabeledPoint]())({
190 | case (all, (partition, point)) =>
191 |
192 | if (point.flag != Flag.Noise) {
193 | point.cluster = clusterIds.value((partition, point.cluster))
194 | }
195 |
196 | all.get(point) match {
197 | case None => all + (point -> point)
198 | case Some(prev) => {
199 | // override previous entry unless new entry is noise
200 | if (point.flag != Flag.Noise) {
201 | prev.flag = point.flag
202 | prev.cluster = point.cluster
203 | }
204 | all
205 | }
206 | }
207 |
208 | }).values
209 | })
210 |
211 | val finalPartitions = localMargins.map {
212 | case ((_, p, _), index) => (index, p)
213 | }
214 | logDebug("Done")
215 | new DBSCAN(
216 | eps,
217 | minPoints,
218 | maxPointsPerPartition,
219 | finalPartitions,
220 | labeledInner.union(labeledOuter))
221 |
222 | }
223 |
224 |   /**
225 |    * Find a label for the given `vector`.
226 |    *
227 |    * Note: returns the id of the containing partition as a placeholder, not a true cluster label.
228 |    */
229 | def predict(vector: Vector): Double = {
230 | var centerid = 0
231 | partitions.foreach{x =>
232 | if (x._2.contains(DBSCANPoint(vector))){
233 | centerid = x._1
234 | }
235 | }
236 | centerid.toDouble
237 | }
238 |
239 | private def isInnerPoint(
240 | entry: (Int, DBSCANLabeledPoint),
241 | margins: List[(Margins, Int)]): Boolean = {
242 | entry match {
243 | case (partition, point) =>
244 | val ((inner, _, _), _) = margins.filter({
245 | case (_, id) => id == partition
246 | }).head
247 |
248 | inner.almostContains(point)
249 | }
250 | }
251 |
252 | private def findAdjacencies(partition: Iterable[(Int, DBSCANLabeledPoint)]):
253 | Set[((Int, Int), (Int, Int))] = {
254 |
255 | val zero = (Map[DBSCANPoint, ClusterId](), Set[(ClusterId, ClusterId)]())
256 |
257 | val (seen, adjacencies) = partition.foldLeft(zero)({
258 | case ((seen, adjacencies), (partition, point)) =>
259 | // noise points are not relevant for adjacencies
260 | if (point.flag == Flag.Noise) {
261 | (seen, adjacencies)
262 | } else {
263 | val clusterId = (partition, point.cluster)
264 | seen.get(point) match {
265 | case None => (seen + (point -> clusterId), adjacencies)
266 | case Some(prevClusterId) => (seen, adjacencies + ((prevClusterId, clusterId)))
267 | }
268 |
269 | }
270 | })
271 |
272 | adjacencies
273 | }
274 |
275 | private def toMinimumBoundingRectangle(vector: Vector): DBSCANRectangle = {
276 | val point = DBSCANPoint(vector)
277 | val x = corner(point.x)
278 | val y = corner(point.y)
279 | DBSCANRectangle(x, y, x + minimumRectangleSize, y + minimumRectangleSize)
280 | }
281 |
282 | private def corner(p: Double): Double =
283 | (shiftIfNegative(p) / minimumRectangleSize).intValue * minimumRectangleSize
284 |
285 | private def shiftIfNegative(p: Double): Double =
286 | if (p < 0) p - minimumRectangleSize else p
287 |
288 | }
289 |
290 |
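// A minimal usage sketch built only from members visible in this file; the sample points and
// the eps / minPoints / maxPointsPerPartition values are illustrative, and DBSCANLabeledPoint
// is assumed to expose the `cluster` and `flag` fields referenced in train() above.
object DBSCANUsageSketch {
  import org.apache.spark.ml.linalg.Vectors
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("dbscan-sketch").master("local[*]").getOrCreate()

    // only the first two coordinates of each vector are taken into consideration
    val points = spark.sparkContext
      .parallelize(Seq(Vectors.dense(1.0, 1.1), Vectors.dense(1.2, 0.9), Vectors.dense(9.0, 9.0)))
      .cache() // the algorithm makes several passes over the data

    val model = DBSCAN.train(points, eps = 0.5, minPoints = 2, maxPointsPerPartition = 1000)

    // every labeled point carries the global cluster id assigned after merging partitions
    model.labeledPoints
      .map(p => (p.cluster, p.flag))
      .collect()
      .foreach(println)

    spark.stop()
  }
}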
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/timeseries/models/HoltWinters.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries.models
2 |
5 | import org.apache.commons.math3.analysis.MultivariateFunction
6 | import org.apache.commons.math3.optim.{InitialGuess, MaxEval, MaxIter, SimpleBounds}
7 | import org.apache.commons.math3.optim.nonlinear.scalar.{GoalType, ObjectiveFunction}
8 | import org.apache.commons.math3.optim.nonlinear.scalar.noderiv.BOBYQAOptimizer
9 | import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}
10 | import org.apache.spark.ml.{Estimator, Model}
11 | import org.apache.spark.ml.param.{Param, ParamMap}
12 | import org.apache.spark.ml.timeseries.params.TimeSeriesParams
13 | import org.apache.spark.ml.util.Identifiable
14 | import org.apache.spark.sql.{DataFrame, Dataset, Row}
15 | import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
16 |
17 | /**
18 | * Triple exponential smoothing takes into account seasonal changes as well as trends.
19 | * Seasonality is defined to be the tendency of time-series data to exhibit behavior that repeats
20 | * itself every L periods, much like any harmonic function.
21 | *
22 |  * The Holt-Winters method is a popular and effective approach to forecasting seasonal time series.
23 | *
24 | * See https://en.wikipedia.org/wiki/Exponential_smoothing#Triple_exponential_smoothing
25 | * for more information on Triple Exponential Smoothing
26 | * See https://www.otexts.org/fpp/7/5 and
27 | * https://stat.ethz.ch/R-manual/R-devel/library/stats/html/HoltWinters.html
28 | * for more information on Holt Winter Method.
29 | */
30 |
31 | trait HoltWintersParams extends TimeSeriesParams {
32 | final val maxEval = new Param[Int](this, "maxEval", "max eval")
33 | def setMaxEval(value: Int): this.type = set(maxEval, value)
34 |
35 | final val maxIter = new Param[Int](this, "maxIter", "max iteration")
36 | def setMaxIter(value: Int): this.type = set(maxIter, value)
37 |
38 | final val period = new Param[Int](this, "period", "Seasonality of data")
39 | def setPeriod(value: Int): this.type = set(period, value)
40 |
41 |   final val modelType = new Param[String](this, "modelType", "The two variations " +
42 |     "differ in the nature of the seasonal component. The additive method is preferred when seasonal " +
43 |     "variations are roughly constant through the series; the multiplicative method is preferred when " +
44 |     "the seasonal variations change in proportion to the level of the series")
45 | def setModelType(value: String): this.type = set(modelType, value)
46 | }
47 |
48 | class HoltWinters(override val uid: String) extends Estimator[HoltWintersModel] with
49 | HoltWintersParams {
50 |
51 | setDefault(timeCol -> "time",
52 | timeSeriesCol -> "timeseries",
53 | maxEval -> 10000,
54 | maxIter -> 10000)
55 |
56 | def this() = this(Identifiable.randomUID("HoltWinters"))
57 | /**
58 | * Fits a model to the input data.
59 | */
60 | override def fit(dataset: Dataset[_]): HoltWintersModel = {
61 |
62 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map {
63 | case Row(time: String, value: Double) => (time, value)
64 | }.sortByKey().collect()
65 |
66 | val dataVector = Vectors.dense(data.map(x => x._2))
67 | val optimizer = new BOBYQAOptimizer(7)
68 |
69 | val objectiveFunction = new ObjectiveFunction(new MultivariateFunction() {
70 | def value(params: Array[Double]): Double = {
71 | new HoltWintersModel(params(0), params(1), params(2))
72 | .setModelType(${modelType})
73 | .setPeriod(${period})
74 | .sse(dataVector)
75 | }
76 | })
77 |
78 |     // The starting guesses used in R's stats::HoltWinters
79 | val initGuess = new InitialGuess(Array(0.3, 0.1, 0.1))
80 | val goal = GoalType.MINIMIZE
81 | val bounds = new SimpleBounds(Array(0.0, 0.0, 0.0), Array(1.0, 1.0, 1.0))
82 | val optimal = optimizer.optimize(objectiveFunction, goal, bounds, initGuess,
83 | new MaxIter(${maxIter}), new MaxEval(${maxEval}))
84 | val params = optimal.getPoint
85 | new HoltWintersModel(params(0), params(1), params(2))
86 | .setModelType(${modelType})
87 | .setPeriod (${period})
88 | .setTimeCol(${timeCol})
89 | .setTimeSeriesCol(${timeSeriesCol})
90 | }
91 |
92 | override def copy(extra: ParamMap): Estimator[HoltWintersModel] = defaultCopy(extra)
93 |
94 | /**
95 | * :: DeveloperApi ::
96 | *
97 | * Check transform validity and derive the output schema from the input schema.
98 | *
99 | * Typical implementation should first conduct verification on schema change and parameter
100 | * validity, including complex parameter interaction checks.
101 | */
102 | override def transformSchema(schema: StructType): StructType = schema
103 | }
104 |
105 | class HoltWintersModel(override val uid: String,
106 | val alpha: Double, val beta: Double, val gamma: Double)
107 | extends Model[HoltWintersModel] with HoltWintersParams {
108 |
109 | def this(alpha: Double, beta: Double, gamma: Double) = this(Identifiable.randomUID
110 | ("HoltWintersModel"), alpha, beta, gamma)
111 |
112 | override def copy(extra: ParamMap): HoltWintersModel = defaultCopy(extra)
113 |
114 | /**
115 | * Transforms the input dataset.
116 | */
117 | override def transform(dataset: Dataset[_]): DataFrame = {
118 | val data = dataset.select(${timeCol}, ${timeSeriesCol}).rdd.map {
119 | case Row(time: String, value: Double) => (time, value)
120 | }.sortByKey().collect()
121 |
122 | val dataVector = Vectors.dense(data.map(x => x._2))
123 |
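// the forecast horizon is exactly one season: `period` values beyond the observed series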
124 | val destArr = new Array[Double](${period})
125 | val (_, level, trend, season) = getHoltWintersComponents(dataVector)
126 | val n = dataVector.size
127 |
128 | val finalLevel = level(n - ${period})
129 | val finalTrend = trend(n - ${period})
130 | val finalSeason = new Array[Double](${period})
131 |
132 | for (i <- 0 until ${period}) {
133 | finalSeason(i) = season(i + n - ${period})
134 | }
135 |
136 | for (i <- 0 until ${period}) {
137 | destArr(i) = if (${modelType}.equalsIgnoreCase("additive")) {
138 | (finalLevel + (i + 1) * finalTrend) + finalSeason(i % ${period})
139 | } else {
140 | (finalLevel + (i + 1) * finalTrend) * finalSeason(i % ${period})
141 | }
142 | }
143 |
144 | val resRDD = dataset.sparkSession.sparkContext.parallelize(destArr.map(x => Row(x)))
145 |
146 | val structType = transformSchema(dataset.schema)
147 |
148 | dataset.sparkSession.createDataFrame(resRDD, structType)
149 | }
150 |
151 | /**
152 | * :: DeveloperApi ::
153 | *
154 | * Check transform validity and derive the output schema from the input schema.
155 | *
156 | * Typical implementation should first conduct verification on schema change and parameter
157 | * validity, including complex parameter interaction checks.
158 | */
159 | override def transformSchema(schema: StructType): StructType = {
160 | StructType(Array(StructField("HoltWinters", DoubleType)))
161 | }
162 |
163 | /**
164 | * Calculates sum of squared errors, used to estimate the alpha and beta parameters
165 | *
166 | * @param ts A time series for which we want to calculate the SSE, given the current parameters
167 | * @return SSE
168 | */
169 | def sse(ts: Vector): Double = {
170 | val n = ts.size
171 | val smoothed = addTimeDependentEffects(ts)
172 |
173 | var error = 0.0
174 | var sqrErrors = 0.0
175 |
176 |     // Compute the SSE only from index `period` onward; earlier indices have no meaningful fitted value.
177 | for(i <- ${period} until n) {
178 | error = ts(i) - smoothed(i)
179 | sqrErrors += error * error
180 | }
181 |
182 | sqrErrors
183 | }
184 |
185 | def addTimeDependentEffects(ts: Vector): Vector = {
186 | val destArr = Array.fill(ts.size)(0.0)
187 | val fitted = getHoltWintersComponents(ts)._1
188 | for (i <- 0 until ts.size) {
189 | destArr(i) = fitted(i)
190 | }
191 | Vectors.dense(destArr)
192 | }
193 |
194 |   /**
195 |    * Start from the initial parameters and then iterate to find the final parameters
196 |    * using the Holt-Winters update equations.
197 |    * See https://www.otexts.org/fpp/7/5 and
198 |    * https://stat.ethz.ch/R-manual/R-devel/library/stats/html/HoltWinters.html
199 |    * for more information on the Holt-Winters equations.
200 |    *
201 |    * @param ts A time series for which we want the Holt-Winters components: level, trend and season.
202 |    * @return (fitted, level, trend, season): the fitted series plus the final level, trend and season vectors.
203 |    */
204 | def getHoltWintersComponents(ts: Vector): (Vector, Vector, Vector, Vector) = {
205 | val n = ts.size
206 | require(n >= 2, "Requires length of at least 2")
207 |
208 | val dest = new Array[Double](n)
209 |
210 | val level = new Array[Double](n)
211 | val trend = new Array[Double](n)
212 | val season = new Array[Double](n)
213 |
214 | val (initLevel, initTrend, initSeason) = initHoltWinters(ts)
215 | level(0) = initLevel
216 | trend(0) = initTrend
217 | for (i <- 0 until initSeason.size){
218 | season(i) = initSeason(i)
219 | }
220 |
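// One smoothing step per observation (additive form; the multiplicative variant divides by
// season(i) and level(i + 1) where the additive one subtracts them):
//   level(i + 1)       = alpha * (ts(i + period) - season(i))    + (1 - alpha) * (level(i) + trend(i))
//   trend(i + 1)       = beta  * (level(i + 1) - level(i))       + (1 - beta)  * trend(i)
//   season(i + period) = gamma * (ts(i + period) - level(i + 1)) + (1 - gamma) * season(i)
// while dest(i + period) = level(i) + trend(i) plus (or times) season(i) is the fitted value.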
221 | for (i <- 0 until (n - ${period})) {
222 | dest(i + ${period}) = level(i) + trend(i)
223 |
224 | // Add the seasonal factor for additive and multiply for multiplicative model.
225 | if (${modelType}.equalsIgnoreCase("additive")) {
226 | dest(i + ${period}) += season(i)
227 | } else {
228 | dest(i + ${period}) *= season(i)
229 | }
230 |
231 | val levelWeight = if (${modelType}.equalsIgnoreCase("additive")) {
232 | ts(i + ${period}) - season(i)
233 | } else {
234 | ts(i + ${period}) / season(i)
235 | }
236 |
237 | level(i + 1) = alpha * levelWeight + (1 - alpha) * (level(i) + trend(i))
238 |
239 | trend(i + 1) = beta * (level(i + 1) - level(i)) + (1 - beta) * trend(i)
240 |
241 | val seasonWeight = if (${modelType}.equalsIgnoreCase("additive")) {
242 | ts(i + ${period}) - level(i + 1)
243 | } else {
244 | ts(i + ${period}) / level(i + 1)
245 | }
246 | season(i + ${period}) = gamma * seasonWeight + (1 - gamma) * season(i)
247 | }
248 |
249 | (Vectors.dense(dest), Vectors.dense(level), Vectors.dense(trend), Vectors.dense(season))
250 | }
251 |
252 | def getKernel: (Array[Double]) = {
253 | if (${period} % 2 == 0){
254 | val kernel = Array.fill(${period} + 1)(1.0 / ${period})
255 | kernel(0) = 0.5 / ${period}
256 | kernel(${period}) = 0.5 / ${period}
257 | kernel
258 | } else {
259 | Array.fill(${period})(1.0 / ${period})
260 | }
261 | }
262 |
263 | /**
264 | * Function to calculate the Weighted moving average/convolution using above kernel/weights
265 | * for input data.
266 | * See http://robjhyndman.com/papers/movingaverage.pdf for more information
267 | * @param inData Series on which you want to do moving average
268 | * @param kernel Weight vector for weighted moving average
269 | */
270 | def convolve(inData: Array[Double], kernel: Array[Double]): (Array[Double]) = {
271 | val kernelSize = kernel.length
272 | val dataSize = inData.length
273 |
274 | val outData = new Array[Double](dataSize - kernelSize + 1)
275 |
276 | var end = 0
277 | while (end <= (dataSize - kernelSize)) {
278 | var sum = 0.0
279 | for (i <- 0 until kernelSize) {
280 | sum += kernel(i) * inData(end + i)
281 | }
282 | outData(end) = sum
283 | end += 1
284 | }
285 | outData
286 | }
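// e.g. convolve(Array(1.0, 2.0, 3.0, 4.0), Array(0.5, 0.5)) yields Array(1.5, 2.5, 3.5),
// the moving average of every length-2 window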
287 |
288 | /**
289 | * Function to get the initial level, trend and season using method suggested in
290 | * http://robjhyndman.com/hyndsight/hw-initialization/
291 |    * @param ts the time series to initialize from
292 | */
293 | def initHoltWinters(ts: Vector): (Double, Double, Array[Double]) = {
294 | val arrTs = ts.toArray
295 |
296 | // Decompose a window of time series into level trend and seasonal using convolution
297 | val kernel = getKernel
298 | val kernelSize = kernel.size
299 | val trend = convolve(arrTs.take(${period} * 2), kernel)
300 |
301 | // Remove the trend from time series. Subtract for additive and divide for multiplicative
302 |     val n = (kernelSize - 1) / 2
303 | val removeTrend = arrTs.take(${period} * 2).zip(
304 | Array.fill(n)(0.0) ++ trend ++ Array.fill(n)(0.0)).map{
305 | case (a, t) =>
306 | if (t != 0){
307 | if (${modelType}.equalsIgnoreCase("additive")) {
308 | a - t
309 | } else {
310 | a / t
311 | }
312 | } else {
313 | 0
314 | }
315 | }
316 |
317 |     // the seasonal estimate is the element-wise average of the detrended values from the two seasons
318 | val seasonalMean = removeTrend.splitAt(${period}).zipped.map { case (prevx, x) =>
319 | if (prevx == 0 || x == 0) x + prevx else (x + prevx) / 2
320 | }
321 |
322 | val meanOfFigures = seasonalMean.sum / ${period}
323 |
324 | // The seasonal mean is then centered and removed to get season.
325 | // Subtract for additive and divide for multiplicative.
326 | val initSeason = if (${modelType}.equalsIgnoreCase("additive")) {
327 | seasonalMean.map(_ - meanOfFigures )
328 | } else {
329 | seasonalMean.map(_ / meanOfFigures )
330 | }
331 |
332 | // Do Simple Linear Regression to find the initial level and trend
333 | val indices = 1 to trend.length
334 | val xbar = (indices.sum: Double) / indices.size
335 | val ybar = trend.sum / trend.length
336 |
337 | val xxbar = indices.map( x => (x - xbar) * (x - xbar) ).sum
338 | val xybar = indices.zip(trend).map {
339 | case (x, y) => (x - xbar) * (y - ybar)
340 | }.sum
341 |
342 | val initTrend = xybar / xxbar
343 | val initLevel = ybar - (initTrend * xbar)
344 |
345 | (initLevel, initTrend, initSeason)
346 | }
347 | }
348 |
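// A minimal fitting sketch, assuming a DataFrame whose columns match the defaults declared on
// the estimator above ("time" as a sortable string, "timeseries" as a double) and monthly data
// with a yearly season; the period and model type values below are illustrative only.
object HoltWintersUsageSketch {
  import org.apache.spark.sql.DataFrame

  def forecastOneSeason(df: DataFrame): DataFrame = {
    val model = new HoltWinters()
      .setPeriod(12)                 // seasonality of the data
      .setModelType("additive")      // or "multiplicative"
      .setTimeCol("time")
      .setTimeSeriesCol("timeseries")
      .fit(df)
    // transform returns a single "HoltWinters" double column holding `period` forecast values
    model.transform(df)
  }
}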
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/ml/timeseries/UnivariateTimeSeries.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.ml.timeseries
2 |
3 | import java.util.Arrays
4 |
5 | import breeze.stats._
6 | import org.apache.commons.math3.analysis.interpolation.SplineInterpolator
7 | import org.apache.spark.ml.linalg.{DenseVector, Matrix, Vector, Vectors}
8 |
9 | /**
10 | * Created by endy on 16-12-20.
11 | */
12 | object UnivariateTimeSeries {
13 |
14 | /**
15 | * Lags the univariate time series
16 | *
17 | * Example input vector: (1.0, 2.0, 3.0, 4.0, 5.0)
18 | *
19 | * With lag 2 and includeOriginal = true should give output matrix:
20 | *
21 | * 3.0 2.0 1.0
22 | * 4.0 3.0 2.0
23 | * 5.0 4.0 3.0
24 | */
25 | def lag(ts: Vector, maxLag: Int, includeOriginal: Boolean): Matrix = {
26 | Lag.lagMatTrimBoth(ts, maxLag, includeOriginal)
27 | }
28 |
29 | def autocorr(ts: Array[Double], numLags: Int): Array[Double] = {
30 | autocorr(new DenseVector(ts), numLags).toArray
31 | }
32 |
33 | /**
34 | * Computes the sample autocorrelation of the given series.
35 | */
36 | def autocorr(ts: Vector, numLags: Int): Vector = {
37 | val corrs = new Array[Double](numLags)
38 | var i = 1
39 | val breezeTs = MatrixUtil.toBreeze(ts)
40 | while (i <= numLags) {
41 | val slice1 = breezeTs(i until ts.size)
42 | val slice2 = breezeTs(0 until ts.size - i)
43 | val mean1 = mean(slice1)
44 | val mean2 = mean(slice2)
45 | var variance1 = 0.0
46 | var variance2 = 0.0
47 | var covariance = 0.0
48 | var j = 0
49 | while (j < ts.size - i) {
50 | val diff1 = slice1(j) - mean1
51 | val diff2 = slice2(j) - mean2
52 | variance1 += diff1 * diff1
53 | variance2 += diff2 * diff2
54 | covariance += diff1 * diff2
55 | j += 1
56 | }
57 |
58 | corrs(i - 1) = covariance / (math.sqrt(variance1) * math.sqrt(variance2))
59 | i += 1
60 | }
61 | new DenseVector(corrs)
62 | }
63 |
64 | def quotients(ts: Vector, lag: Int): Vector = {
65 | val ret = new Array[Double](ts.size - lag)
66 | var i = 0
67 | while (i < ret.length) {
68 | ret(i) = ts(i + lag) / ts(i)
69 | i += 1
70 | }
71 | new DenseVector(ret)
72 | }
73 |
74 | def price2ret(ts: Vector, lag: Int): Vector = {
75 | val ret = new Array[Double](ts.size - lag)
76 | var i = 0
77 | while (i < ret.length) {
78 | ret(i) = ts(i + lag) / ts(i) - 1.0
79 | i += 1
80 | }
81 | new DenseVector(ret)
82 | }
83 |
84 | /**
85 | * Trim leading NaNs from a series.
86 | */
87 | def trimLeading(ts: Vector): Vector = {
88 | val start = firstNotNaN(ts)
89 | if (start < ts.size) {
90 | Vectors.dense(Arrays.copyOfRange(ts.toArray, start, ts.size))
91 | } else {
92 | Vectors.zeros(0)
93 | }
94 | }
95 |
96 | /**
97 | * Trim trailing NaNs from a series.
98 | */
99 | def trimTrailing(ts: Vector): Vector = {
100 | val end = lastNotNaN(ts)
101 | if (end > 0) {
102 | Vectors.dense(Arrays.copyOfRange(ts.toArray, 0, end))
103 | } else {
104 | Vectors.zeros(0)
105 | }
106 | }
107 |
108 | def firstNotNaN(ts: Vector): Int = {
109 | var i = 0
110 | while (i < ts.size) {
111 | if (!java.lang.Double.isNaN(ts(i))) {
112 | return i
113 | }
114 | i += 1
115 | }
116 | i
117 | }
118 |
119 | def lastNotNaN(ts: Vector): Int = {
120 | var i = ts.size - 1
121 | while (i >= 0) {
122 | if (!java.lang.Double.isNaN(ts(i))) {
123 | return i
124 | }
125 | i -= 1
126 | }
127 | i
128 | }
129 |
130 | def fillts(ts: Vector, fillMethod: String): Vector = {
131 | fillMethod match {
132 | case "linear" => fillLinear(ts)
133 | case "nearest" => fillNearest(ts)
134 | case "next" => fillNext(ts)
135 | case "previous" => fillPrevious(ts)
136 | case "spline" => fillSpline(ts)
137 | case "zero" => fillValue(ts, 0)
138 | case _ => throw new UnsupportedOperationException()
139 | }
140 | }
141 |
142 | /**
143 | * Replace all NaNs with a specific value
144 | */
145 | def fillValue(values: Array[Double], filler: Double): Array[Double] = {
146 | fillValue(new DenseVector(values), filler).toArray
147 | }
148 |
149 | /**
150 | * Replace all NaNs with a specific value
151 | */
152 | def fillValue(values: Vector, filler: Double): DenseVector = {
153 | val result = values.copy.toArray
154 | var i = 0
155 | while (i < result.size) {
156 | if (result(i).isNaN) result(i) = filler
157 | i += 1
158 | }
159 | new DenseVector(result)
160 | }
161 |
162 | def fillNearest(values: Array[Double]): Array[Double] = {
163 | fillNearest(new DenseVector(values)).toArray
164 | }
165 |
166 | def fillNearest(values: Vector): DenseVector = {
167 | val result = values.copy.toArray
168 | var lastExisting = -1
169 | var nextExisting = -1
170 | var i = 1
171 | while (i < result.length) {
172 | if (result(i).isNaN) {
173 | if (nextExisting < i) {
174 | nextExisting = i + 1
175 | while (nextExisting < result.length && result(nextExisting).isNaN) {
176 | nextExisting += 1
177 | }
178 | }
179 |
180 | if (lastExisting < 0 && nextExisting >= result.size) {
181 | throw new IllegalArgumentException("Input is all NaNs!")
182 | } else if (nextExisting >= result.size || // TODO: check this
183 | (lastExisting >= 0 && i - lastExisting < nextExisting - i)) {
184 | result(i) = result(lastExisting)
185 | } else {
186 | result(i) = result(nextExisting)
187 | }
188 | } else {
189 | lastExisting = i
190 | }
191 | i += 1
192 | }
193 | new DenseVector(result)
194 | }
195 |
196 | def fillPrevious(values: Array[Double]): Array[Double] = {
197 | fillPrevious(new DenseVector(values)).toArray
198 | }
199 |
200 |   /**
201 |    * Fills in each NaN with the most recent non-NaN value, scanning from left to right.
202 |    * 1 NaN NaN 2 NaN -> 1 1 1 2 2
203 |    */
204 | def fillPrevious(values: Vector): DenseVector = {
205 | val result = values.copy.toArray
206 | var filler = Double.NaN // initial value, maintains invariant
207 | var i = 0
208 | while (i < result.length) {
209 | filler = if (result(i).isNaN) filler else result(i)
210 | result(i) = filler
211 | i += 1
212 | }
213 | new DenseVector(result)
214 | }
215 |
216 | def fillNext(values: Array[Double]): Array[Double] = {
217 | fillNext(new DenseVector(values)).toArray
218 | }
219 |
220 |   /**
221 |    * Fills in each NaN with the next available non-NaN value, scanning from right to left.
222 |    * 1 NaN NaN 2 NaN -> 1 2 2 2 NaN
223 |    */
224 | def fillNext(values: Vector): DenseVector = {
225 | val result = values.copy.toArray
226 | var filler = Double.NaN // initial value, maintains invariant
227 | var i = result.length - 1
228 | while (i >= 0) {
229 | filler = if (result(i).isNaN) filler else result(i)
230 | result(i) = filler
231 | i -= 1
232 | }
233 | new DenseVector(result)
234 | }
235 |
236 | def fillWithDefault(values: Array[Double], filler: Double): Array[Double] = {
237 | fillWithDefault(new DenseVector(values), filler).toArray
238 | }
239 |
240 | /**
241 | * fills in NaN with a default value
242 | */
243 | def fillWithDefault(values: Vector, filler: Double): DenseVector = {
244 | val result = values.copy.toArray
245 | var i = 0
246 | while (i < result.length) {
247 | result(i) = if (result(i).isNaN) filler else result(i)
248 | i += 1
249 | }
250 | new DenseVector(result)
251 | }
252 |
253 | def fillLinear(values: Array[Double]): Array[Double] = {
254 | fillLinear(new DenseVector(values)).toArray
255 | }
256 |
257 | def fillLinear(values: Vector): DenseVector = {
258 | val result = values.copy.toArray
259 | var i = 1
260 | while (i < result.length - 1) {
261 | val rangeStart = i
262 | while (i < result.length - 1 && result(i).isNaN) {
263 | i += 1
264 | }
265 | val before = result(rangeStart - 1)
266 | val after = result(i)
267 | if (i != rangeStart && !before.isNaN && !after.isNaN) {
268 | val increment = (after - before) / (i - (rangeStart - 1))
269 | for (j <- rangeStart until i) {
270 | result(j) = result(j - 1) + increment
271 | }
272 | }
273 | i += 1
274 | }
275 | new DenseVector(result)
276 | }
277 |
278 | def fillSpline(values: Array[Double]): Array[Double] = {
279 | fillSpline(new DenseVector(values)).toArray
280 | }
281 |
282 | /**
283 | * Fill in NaN values using a natural cubic spline.
284 | * @param values Vector to interpolate
285 | * @return Interpolated vector
286 | */
287 | def fillSpline(values: Vector): DenseVector = {
288 | val result = values.copy.toArray
289 | val interp = new SplineInterpolator()
290 | val knotsAndValues = values.toArray.zipWithIndex.filter(!_._1.isNaN)
291 |     // Note that the return type of unzip is messed up in Scala 2.10.4 as per
292 |     // https://issues.scala-lang.org/browse/SI-8081
293 |     // given that this project uses Scala 2.10.4, we cannot use unzip, so unpack manually
294 | val knotsX = knotsAndValues.map(_._2.toDouble)
295 | val knotsY = knotsAndValues.map(_._1)
296 | val filler = interp.interpolate(knotsX, knotsY)
297 |
298 | // values that we can interpolate between, others need to be filled w/ other function
299 | var i = knotsX(0).toInt
300 | val end = knotsX.last.toInt
301 |
302 | while (i < end) {
303 | result(i) = filler.value(i.toDouble)
304 | i += 1
305 | }
306 | new DenseVector(result)
307 | }
308 |
309 |
310 | /**
311 | * Down sample by taking every nth element starting from offset phase
312 | * @param values Vector to down sample
313 | * @param n take every nth element
314 | * @param phase offset from starting index
315 | * @return downsampled vector with appropriate length
316 | */
317 | def downsample(values: Vector, n: Int, phase: Int = 0): DenseVector = {
318 | val origLen = values.size
319 | val newLen = Math.ceil((values.size - phase) / n.toDouble).toInt
320 | val sampledValues = Array.fill(newLen)(0.0)
321 | var i = phase
322 | var j = 0
323 |
324 | while (j < newLen) {
325 | sampledValues(j) = values(i)
326 | i += n
327 | j += 1
328 | }
329 | new DenseVector(sampledValues)
330 | }
331 |
332 | /**
333 | * Up sample by inserting n - 1 elements into the original values vector, starting at index phase
334 | * @param values the original data vector
335 |    * @param n the upsampling factor; n - 1 filler values are inserted between consecutive elements
336 | * @param phase the offset to begin
337 | * @param useZero fill with zeros rather than NaN
338 | * @return upsampled vector filled with zeros or NaN, as specified by user
339 | */
340 | def upsample(values: Vector, n: Int, phase: Int = 0, useZero: Boolean = false): DenseVector = {
341 | val filler = if (useZero) 0 else Double.NaN
342 | val origLen = values.size
343 | val newLen = origLen * n
344 | val sampledValues = Array.fill(newLen)(filler)
345 | var i = phase
346 | var j = 0
347 |
348 | while (j < origLen) {
349 | sampledValues(i) = values(j)
350 | i += n
351 | j += 1
352 | }
353 | new DenseVector(sampledValues)
354 | }
355 |
356 | /**
357 |    * Difference a vector with respect to the lag-th prior element. Size-preserving by leaving the
358 |    * first `startIndex` elements intact. This is the inverse of the `inverseDifferencesAtLag` function.
359 | * @param ts Series to difference
360 | * @param destTs Series to store the differenced values (and return for convenience)
361 | * @param lag The difference lag (e.g. x means destTs(i) = ts(i) - ts(i - x), etc)
362 | * @param startIndex the starting index for the differencing. Must be at least equal to lag
363 | * @return the differenced vector, for convenience
364 | */
365 | def differencesAtLag(ts: Vector, destTs: Vector, lag: Int, startIndex: Int): Vector = {
366 | require(startIndex >= lag, "starting index cannot be less than lag")
367 | val diffedTs = if (destTs == null) ts.copy else destTs
368 | if (lag == 0) {
369 | diffedTs
370 | } else {
371 | val arr = diffedTs.toArray
372 | val n = ts.size
373 | var i = 0
374 |
375 | while (i < n) {
376 | // elements prior to starting point are copied over without modification
377 | arr(i) = if (i < startIndex) ts(i) else ts(i) - ts(i - lag)
378 | i += 1
379 | }
380 | diffedTs
381 | }
382 | }
383 |
384 | /**
385 |    * Convenience wrapper around the four-argument `differencesAtLag(ts, destTs, lag, startIndex)`
386 | * @param ts vector to difference
387 | * @param lag the difference lag (e.g. x means destTs(i) = ts(i) - ts(i - x), etc)
388 | * @return the differenced vector, for convenience
389 | */
390 | def differencesAtLag(ts: Vector, lag: Int): Vector = {
391 | differencesAtLag(ts, null, lag, lag)
392 | }
393 |
394 | /**
395 | * Calculate an "inverse-differenced" vector of a given lag. Size-preserving by leaving first
396 |    * `startIndex` elements intact. This is the inverse of the `differencesAtLag` function.
397 | * @param diffedTs differenced vector that we want to inverse
398 | * @param destTs Series to store the added up values (and return for convenience)
399 | * @param lag The difference lag (e.g. x means destTs(i) = diffedTs(i) + destTs(i - x), etc)
400 | * @param startIndex the starting index for the differencing. Must be at least equal to lag
401 | * @return the inverse differenced vector, for convenience
402 | */
403 | def inverseDifferencesAtLag(diffedTs: Vector, destTs: Vector, lag: Int,
404 | startIndex: Int): Vector = {
405 | require(startIndex >= lag, "starting index cannot be less than lag")
406 | val addedTs = if (destTs == null) diffedTs.copy else destTs
407 | if (lag == 0) {
408 | addedTs
409 | } else {
410 | val n = diffedTs.size
411 | var i = 0
412 |
413 | val arr = addedTs.toArray
414 | while (i < n) {
415 | // elements prior to starting point are copied over without modification
416 | arr(i) = if (i < startIndex) diffedTs(i) else diffedTs(i) + addedTs(i - lag)
417 | i += 1
418 | }
419 | addedTs
420 | }
421 | }
422 |
423 | /**
424 |    * Convenience wrapper around the four-argument `inverseDifferencesAtLag(diffedTs, destTs, lag, startIndex)`
425 | * @param diffedTs differenced vector that we want to inverse
426 |    * @param lag the difference lag (e.g. x means destTs(i) = diffedTs(i) + destTs(i - x), etc)
427 | * @return the inverse differenced vector, for convenience
428 | */
429 | def inverseDifferencesAtLag(diffedTs: Vector, lag: Int): Vector = {
430 | inverseDifferencesAtLag(diffedTs, null, lag, lag)
431 | }
432 |
433 | /**
434 | * Performs differencing of order `d`. This means we recursively difference a vector a total of
435 | * d-times. So that d = 2 is a vector of the differences of differences. Note that for each
436 | * difference level, d_i, the element at ts(d_i - 1) corresponds to the value in the prior
437 | * iteration.
438 | * @param ts time series to difference
439 | * @param d order of differencing
440 | * @return a vector of the same length differenced to order d
441 | */
442 | def differencesOfOrderD(ts: Vector, d: Int): Vector = {
443 | // we create 2 copies to avoid copying with every call, and simply swap them as necessary
444 | // for higher order differencing
445 | var (diffedTs, origTs) = (ts.copy, ts.copy)
446 | var swap: Vector = null
447 | for (i <- 1 to d) {
448 | swap = origTs
449 | origTs = diffedTs
450 | diffedTs = swap
451 | differencesAtLag(origTs, diffedTs, 1, i)
452 | }
453 | diffedTs
454 | }
455 |
456 | /**
457 | * Inverses differencing of order `d`.
458 | * @param diffedTs time series to reverse differencing process
459 | * @param d order of differencing
460 |    * @return a vector of the same length, which when differenced to order d, yields the original
461 | * vector provided
462 | */
463 | def inverseDifferencesOfOrderD(diffedTs: Vector, d: Int): Vector = {
464 | val addedTs = diffedTs.copy
465 | for (i <- d to 1 by -1) {
466 | inverseDifferencesAtLag(addedTs, addedTs, 1, i)
467 | }
468 | addedTs
469 | }
470 |
471 | def rollSum(ts: Vector, n: Int): Vector = {
472 | new DenseVector(ts.toArray.sliding(n).toList.map(_.sum).toIndexedSeq.toArray[Double])
473 | }
474 | }
475 |
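// A small self-contained sketch of the helpers above; the expected values in the comments were
// traced by hand from fillPrevious, fillts and the order-d differencing round trip.
object UnivariateTimeSeriesSketch {
  import org.apache.spark.ml.linalg.Vectors

  def main(args: Array[String]): Unit = {
    // NaN filling: fillPrevious scans left to right, "zero" replaces every NaN with 0.0
    val withGaps = Vectors.dense(1.0, Double.NaN, Double.NaN, 2.0, Double.NaN)
    println(UnivariateTimeSeries.fillPrevious(withGaps))   // [1.0,1.0,1.0,2.0,2.0]
    println(UnivariateTimeSeries.fillts(withGaps, "zero")) // [1.0,0.0,0.0,2.0,0.0]

    // differencing of order 2 and its inverse form a round trip
    val ts = Vectors.dense(1.0, 4.0, 9.0, 16.0)
    val diffed = UnivariateTimeSeries.differencesOfOrderD(ts, 2)              // [1.0,3.0,2.0,2.0]
    val restored = UnivariateTimeSeries.inverseDifferencesOfOrderD(diffed, 2) // [1.0,4.0,9.0,16.0]
    println(diffed)
    println(restored)
  }
}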
--------------------------------------------------------------------------------