├── LICENSE ├── README.md ├── build.sbt └── src ├── main └── scala │ └── com │ └── backhoff │ └── clustream │ ├── CluStream.scala │ ├── CluStreamOnline.scala │ └── Tools.scala └── test └── scala └── com └── backhoff └── clustream ├── KmeansTest.scala ├── SimpleApp.scala ├── StreamDM.scala ├── StreamingKMeans.scala └── StreamingTests.scala /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark-CluStream 2 | By Omar Backhoff Larrazolo 3 | 4 | 5 | Adaptation of the CluStream method to Spark 6 | 7 | Includes: 8 | 9 | Online microclustering class 10 | 11 | Offline macroclustering class 12 | 13 | Reference to the actual method and original paper 14 | 15 | http://www-nishio.ist.osaka-u.ac.jp/vldb/archives/public/website/2003/papers/S04P02.pdf 16 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | resolvers ++= Seq( 2 | Resolver sonatypeRepo "public", 3 | Resolver typesafeRepo "releases", 4 | ) 5 | 6 | name := "spark-clustream" 7 | 8 | //spName := "obackhoff/spark-clustream" 9 | 10 | version := "0.1" 11 | 12 | scalaVersion := "2.11.8" 13 | 14 | //sparkVersion := "2.2.0" 15 | 16 | //sparkComponents ++= Seq("streaming", "mllib") 17 | 18 | libraryDependencies ++= Seq( 19 | "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly(), 20 | "org.apache.spark" % "spark-mllib_2.11" % "2.2.0", 21 | "org.apache.spark" % "spark-streaming_2.11" % "2.2.0" 22 | ) 23 | -------------------------------------------------------------------------------- /src/main/scala/com/backhoff/clustream/CluStream.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | /** 4 | * Created by omar on 9/25/15. 
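 *
 * Hedged usage sketch (added for illustration; it mirrors the wiring used in
 * src/test/scala/com/backhoff/clustream/StreamingTests.scala, and `ssc`, the
 * socket address and the parameter values are assumptions, not fixed choices):
 * {{{
 *   // the online phase maintains q microclusters; this class queries them offline
 *   val online    = new CluStreamOnline(q = 50, numDimensions = 34, minInitPoints = 2000)
 *   val clustream = new CluStream(online)
 *   val points: DStream[breeze.linalg.Vector[Double]] =
 *     ssc.socketTextStream("localhost", 9999)
 *        .map(_.split(" ").map(_.toDouble))
 *        .map(DenseVector(_))
 *   clustream.startOnline(points)   // feed the stream to the online model
 * }}}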
5 | */ 6 | 7 | import breeze.linalg._ 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.annotation.Experimental 11 | import java.io._ 12 | import java.nio.file.{Paths, Files} 13 | import org.apache.spark.mllib.clustering.KMeans 14 | 15 | /** 16 | * Class that contains the offline methods for the CluStream 17 | * method. It can be initialized with a CluStreamOnline model to 18 | * facilitate the use of it at the same time the online process 19 | * is running. 20 | * 21 | **/ 22 | 23 | @Experimental 24 | class CluStream ( 25 | val model:CluStreamOnline) 26 | extends Serializable{ 27 | 28 | def this() = this(null) 29 | 30 | /** 31 | * Method that samples values from a given distribution. 32 | * 33 | * @param dist: this is a map containing values and their weights in 34 | * the distributions. Weights must add to 1. 35 | * Example. {A -> 0.5, B -> 0.3, C -> 0.2 } 36 | * @return A: sample value A 37 | **/ 38 | 39 | private def sample[A](dist: Map[A, Double]): A = { 40 | val p = scala.util.Random.nextDouble 41 | val it = dist.iterator 42 | var accum = 0.0 43 | while (it.hasNext) { 44 | val (item, itemProb) = it.next 45 | accum += itemProb 46 | if (accum >= p) 47 | return item 48 | } 49 | sys.error(f"this should never happen") // needed so it will compile 50 | } 51 | 52 | /** 53 | * Method that saves a snapshot to disk using the pyramidal time 54 | * scheme to a given directory. 55 | * 56 | * @param dir: directory to save the snapshot 57 | 58 | * @param tc: time clock unit to save 59 | * @param alpha: alpha parameter of the pyramidal time scheme 60 | * @param l: l modifier of the pyramidal time scheme 61 | **/ 62 | 63 | def saveSnapShotsToDisk(dir: String = "", tc: Long, alpha: Int = 2, l: Int = 2): Unit ={ 64 | 65 | var write = false 66 | var delete = false 67 | var order = 0 68 | val mcs = model.getMicroClusters 69 | 70 | 71 | val exp = (scala.math.log(tc) / scala.math.log(alpha)).toInt 72 | 73 | for (i <- 0 to exp) { 74 | if (tc % scala.math.pow(alpha, i + 1) != 0 && tc % scala.math.pow(alpha, i) == 0) { 75 | order = i 76 | write = true 77 | } 78 | } 79 | 80 | val tcBye = tc - ((scala.math.pow(alpha, l) + 1) * scala.math.pow(alpha, order + 1)).toInt 81 | 82 | if (tcBye > 0) delete = true 83 | 84 | if (write) { 85 | val out = new ObjectOutputStream(new FileOutputStream(dir + "/" + tc)) 86 | 87 | try { 88 | out.writeObject(mcs) 89 | } 90 | catch { 91 | case ex: IOException => println("Exception while writing file " + ex) 92 | } 93 | finally { 94 | out.close() 95 | } 96 | } 97 | 98 | if (delete) { 99 | try { 100 | new File(dir + "/" + tcBye).delete() 101 | } 102 | catch { 103 | case ex: IOException => println("Exception while deleting file " + ex); 104 | } 105 | } 106 | } 107 | 108 | /** 109 | * Method that gets the snapshots to use for a given time and horizon in a 110 | * given file directory. 111 | * 112 | * @param dir: directory to save the snapshot 113 | 114 | * @param tc: time clock unit to save 115 | * @param h: time horizon 116 | * @return (Long,Long): tuple of the first and second snapshots to use. 
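 *
 * Worked example (purely illustrative; it assumes snapshots were previously
 * written at time units 4, 8 and 16 in `dir`):
 * {{{
 *   getSnapShots(dir, tc = 18L, h = 10L)
 *   // => (16, 4): 16 is the latest stored time <= 18, and 4 is the latest
 *   //    stored time <= 16 - 10 = 6
 * }}}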
117 | **/ 118 | 119 | def getSnapShots(dir: String = "", tc: Long, h: Long): (Long,Long) = { 120 | 121 | var tcReal = tc 122 | while(!Files.exists(Paths.get(dir + "/" + tcReal)) && tcReal >= 0) tcReal = tcReal - 1 123 | var tcH = tcReal - h 124 | while(!Files.exists(Paths.get(dir + "/" + tcH)) && tcH >= 0) tcH = tcH - 1 125 | if(tcH < 0) while(!Files.exists(Paths.get(dir + "/" + tcH))) tcH = tcH + 1 126 | 127 | if(tcReal == -1L) tcH = -1L 128 | (tcReal, tcH) 129 | } 130 | 131 | /** 132 | * Method that returns the microclusters from the snapshots for a given time and horizon in a 133 | * given file directory. Subtracts the features of the first one with the second one. 134 | * 135 | * @param dir: directory to save the snapshot 136 | 137 | * @param tc: time clock unit to save 138 | * @param h: time horizon 139 | * @return Array[MicroCluster]: computed array of microclusters 140 | **/ 141 | 142 | def getMCsFromSnapshots(dir: String = "", tc: Long, h: Long): Array[MicroCluster] = { 143 | val (t1,t2) = getSnapShots(dir,tc,h) 144 | 145 | try{ 146 | val in1 = new ObjectInputStream(new FileInputStream(dir + "/" + t1)) 147 | val snap1 = in1.readObject().asInstanceOf[Array[MicroCluster]] 148 | 149 | val in2 = new ObjectInputStream(new FileInputStream(dir + "/" + t2)) 150 | val snap2 = in2.readObject().asInstanceOf[Array[MicroCluster]] 151 | 152 | in2.close() 153 | in1.close() 154 | 155 | val arrs1 = snap1.map(_.getIds) 156 | val arrs2 = snap2.map(_.getIds) 157 | 158 | val relatingMCs = snap1 zip arrs1.map(a => arrs2.zipWithIndex.map(b=> if(b._1.toSet.intersect(a.toSet).nonEmpty) b._2;else -1)) 159 | relatingMCs.map{ mc => 160 | if (!mc._2.forall(_ == -1) && t1 - h >= t2) { 161 | for(id <- mc._2) if(id != -1) { 162 | mc._1.setCf2x(mc._1.getCf2x :- snap2(id).getCf2x) 163 | mc._1.setCf1x(mc._1.getCf1x :- snap2(id).getCf1x) 164 | mc._1.setCf2t(mc._1.getCf2t - snap2(id).getCf2t) 165 | mc._1.setCf1t(mc._1.getCf1t - snap2(id).getCf1t) 166 | mc._1.setN(mc._1.getN - snap2(id).getN) 167 | mc._1.setIds(mc._1.getIds.toSet.diff(snap2(id).getIds.toSet).toArray) 168 | } 169 | mc._1 170 | }else mc._1 171 | 172 | } 173 | } 174 | catch{ 175 | case ex: IOException => println("Exception while reading files " + ex) 176 | null 177 | } 178 | 179 | } 180 | 181 | /** 182 | * Method that returns the centrois of the microclusters. 183 | * 184 | * @param mcs: array of microclusters 185 | * @return Array[Vector]: computed array of centroids 186 | **/ 187 | 188 | def getCentersFromMC(mcs: Array[MicroCluster]): Array[Vector[Double]] = { 189 | mcs.filter(_.getN > 0).map(mc => mc.getCf1x :/ mc.getN.toDouble) 190 | } 191 | 192 | /** 193 | * Method that returns the weights of the microclusters from the number of points. 194 | * 195 | * @param mcs: array of microclusters 196 | * @return Array[Double]: computed array of weights 197 | **/ 198 | 199 | def getWeightsFromMC(mcs: Array[MicroCluster]): Array[Double] = { 200 | var arr: Array[Double] = mcs.map(_.getN.toDouble).filter(_ > 0) 201 | val sum: Double = arr.sum 202 | arr.map(value => value/sum) 203 | } 204 | 205 | /** 206 | * Method that returns a computed KMeansModel. It runs a modified version 207 | * of the KMeans algorithm in Spark from sampling the microclusters given 208 | * its weights. 
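 *
 * A hedged call sketch (the directory, times and sizes below are illustrative
 * assumptions, not values mandated by the API):
 * {{{
 *   val mcs   = getMCsFromSnapshots("snaps", tc = 100L, h = 10L)
 *   val model = fakeKMeans(sc, k = 5, numPoints = 2000, mcs = mcs)
 *   model.clusterCenters.foreach(println)
 * }}}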
209 | * 210 | * @param sc: spark context where KMeans will run 211 | * @param k: number of clusters 212 | * @param mcs: array of microclusters 213 | * @return org.apache.spark.mllib.clustering.KMeansModel: computed KMeansModel 214 | **/ 215 | 216 | def fakeKMeans(sc: SparkContext,k: Int, numPoints: Int, mcs: Array[MicroCluster]): org.apache.spark.mllib.clustering.KMeansModel ={ 217 | 218 | val kmeans = new KMeans() 219 | var centers = getCentersFromMC(mcs).map(v => org.apache.spark.mllib.linalg.Vectors.dense(v.toArray)) 220 | val weights = getWeightsFromMC(mcs) 221 | val map = (centers zip weights).toMap 222 | val points = Array.fill(numPoints)(sample(map)) 223 | 224 | 225 | kmeans.setMaxIterations(20) 226 | kmeans.setK(k) 227 | kmeans.setInitialModel(new org.apache.spark.mllib.clustering.KMeansModel(Array.fill(k)(sample(map)))) 228 | val trainingSet = sc.parallelize(points) 229 | val clusters = kmeans.run(trainingSet) 230 | trainingSet.unpersist(blocking = false) 231 | clusters 232 | 233 | } 234 | 235 | /** 236 | * Method that allows to run the online process from this class. 237 | * 238 | * @param data: data that comes from the stream 239 | * 240 | **/ 241 | 242 | def startOnline(data: DStream[breeze.linalg.Vector[Double]]): Unit ={ 243 | model.run(data) 244 | } 245 | 246 | 247 | } 248 | -------------------------------------------------------------------------------- /src/main/scala/com/backhoff/clustream/CluStreamOnline.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | /** 4 | * Created by omar on 9/25/15. 5 | */ 6 | 7 | import breeze.linalg._ 8 | import org.apache.spark.broadcast.Broadcast 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.rdd.RDD 11 | import org.apache.spark.annotation.Experimental 12 | import org.apache.spark.mllib.clustering.{StreamingKMeans, KMeans} 13 | import breeze.stats.distributions.Gaussian 14 | 15 | 16 | /** 17 | * CluStreamOnline is a class that contains all the necessary 18 | * procedures to initialize and maintain the microclusters 19 | * required by the CluStream method. This approach is adapted 20 | * to work with batches of data to match the way Spark Streaming 21 | * works; meaning that every batch of data is considered to have 22 | * to have the same time stamp. 23 | * 24 | * @param q : the number of microclusters to use. Normally 10 * k is a good choice, 25 | * where k is the number of macro clusters 26 | * @param numDimensions : this sets the number of attributes of the data 27 | * @param minInitPoints : minimum number of points to use for the initialization 28 | * of the microclusters. 
If set to 0 then initRand is used 29 | * insted of initKmeans 30 | **/ 31 | 32 | @Experimental 33 | class CluStreamOnline( 34 | val q: Int, 35 | val numDimensions: Int, 36 | val minInitPoints: Int) 37 | extends Serializable { 38 | 39 | @transient lazy val log = org.apache.log4j.LogManager.getLogger("myLogger") 40 | 41 | 42 | /** 43 | * Easy timer function for blocks 44 | **/ 45 | 46 | def timer[R](block: => R): R = { 47 | val t0 = System.nanoTime() 48 | val result = block // call-by-name 49 | val t1 = System.nanoTime() 50 | log.warn(s"Elapsed time: " + (t1 - t0) / 1000000 + "ms") 51 | result 52 | } 53 | 54 | private var mLastPoints = 500 55 | private var delta = 20 56 | private var tFactor = 2.0 57 | private var recursiveOutliersRMSDCheck = true 58 | 59 | private var time: Long = 0L 60 | private var N: Long = 0L 61 | private var currentN: Long = 0L 62 | 63 | private var microClusters: Array[MicroCluster] = Array.fill(q)(new MicroCluster(Vector.fill[Double](numDimensions)(0.0), Vector.fill[Double](numDimensions)(0.0), 0L, 0L, 0L)) 64 | private var mcInfo: Array[(MicroClusterInfo, Int)] = null 65 | 66 | private var broadcastQ: Broadcast[Int] = null 67 | private var broadcastMCInfo: Broadcast[Array[(MicroClusterInfo, Int)]] = null 68 | 69 | var initialized = false 70 | 71 | private var useNormalKMeans = false 72 | private var strKmeans: StreamingKMeans = null 73 | 74 | 75 | 76 | private var initArr: Array[breeze.linalg.Vector[Double]] = Array() 77 | 78 | /** 79 | * Random initialization of the q microclusters 80 | * 81 | * @param rdd : rdd in use from the incoming DStream 82 | **/ 83 | 84 | private def initRand(rdd: RDD[breeze.linalg.Vector[Double]]): Unit = { 85 | mcInfo = Array.fill(q)(new MicroClusterInfo(Vector.fill[Double](numDimensions)(rand()), 0.0, 0L)) zip (0 until q) 86 | 87 | val assignations = assignToMicroCluster(rdd, mcInfo) 88 | updateMicroClusters(assignations) 89 | var i = 0 90 | for (mc <- microClusters) { 91 | mcInfo(i) = (mcInfo(i)._1, mc.getIds(0)) 92 | if (mc.getN > 0) mcInfo(i)._1.setCentroid(mc.cf1x :/ mc.n.toDouble) 93 | mcInfo(i)._1.setN(mc.getN) 94 | if (mcInfo(i)._1.n > 1) mcInfo(i)._1.setRmsd(scala.math.sqrt(sum(mc.cf2x) / mc.n.toDouble - sum(mc.cf1x.map(a => a * a)) / (mc.n * mc.n.toDouble))) 95 | i += 1 96 | } 97 | for (mc <- mcInfo) { 98 | if (mc._1.n == 1) 99 | mc._1.setRmsd(distanceNearestMC(mc._1.centroid, mcInfo)) 100 | } 101 | 102 | broadcastMCInfo = rdd.context.broadcast(mcInfo) 103 | initialized = true 104 | } 105 | 106 | /** 107 | * Initialization of the q microclusters using the K-Means algorithm 108 | * 109 | * @param rdd : rdd in use from the incoming DStream 110 | **/ 111 | 112 | private def initKmeans(rdd: RDD[breeze.linalg.Vector[Double]]): Unit = { 113 | initArr = initArr ++ rdd.collect 114 | if (initArr.length >= minInitPoints) { 115 | val tempRDD = rdd.context.parallelize(initArr) 116 | val trainingSet = tempRDD.map(v => org.apache.spark.mllib.linalg.Vectors.dense(v.toArray)) 117 | val clusters = KMeans.train(trainingSet, q, 10) 118 | 119 | mcInfo = Array.fill(q)(new MicroClusterInfo(Vector.fill[Double](numDimensions)(0), 0.0, 0L)) zip (0 until q) 120 | for (i <- clusters.clusterCenters.indices) mcInfo(i)._1.setCentroid(DenseVector(clusters.clusterCenters(i).toArray)) 121 | 122 | val assignations = assignToMicroCluster(tempRDD, mcInfo) 123 | updateMicroClusters(assignations) 124 | 125 | var i = 0 126 | for (mc <- microClusters) { 127 | mcInfo(i) = (mcInfo(i)._1, mc.getIds(0)) 128 | if (mc.getN > 0) mcInfo(i)._1.setCentroid(mc.cf1x :/ 
mc.n.toDouble) 129 | mcInfo(i)._1.setN(mc.getN) 130 | if (mcInfo(i)._1.n > 1) mcInfo(i)._1.setRmsd(scala.math.sqrt(sum(mc.cf2x) / mc.n.toDouble - sum(mc.cf1x.map(a => a * a)) / (mc.n * mc.n.toDouble))) 131 | i += 1 132 | } 133 | for (mc <- mcInfo) { 134 | if (mc._1.n == 1) 135 | mc._1.setRmsd(distanceNearestMC(mc._1.centroid, mcInfo)) 136 | } 137 | 138 | broadcastMCInfo = rdd.context.broadcast(mcInfo) 139 | 140 | initialized = true 141 | } 142 | } 143 | 144 | private def initStreamingKmeans(rdd: RDD[breeze.linalg.Vector[Double]]): Unit = { 145 | 146 | if(strKmeans == null) strKmeans = new StreamingKMeans().setK(q).setRandomCenters(numDimensions, 0.0) 147 | val trainingSet = rdd.map(v => org.apache.spark.mllib.linalg.Vectors.dense(v.toArray)) 148 | 149 | val clusters = strKmeans.latestModel().update(trainingSet,1.0, "batches") 150 | if(getTotalPoints >= minInitPoints){ 151 | 152 | mcInfo = Array.fill(q)(new MicroClusterInfo(Vector.fill[Double](numDimensions)(0), 0.0, 0L)) zip (0 until q) 153 | for (i <- clusters.clusterCenters.indices) mcInfo(i)._1.setCentroid(DenseVector(clusters.clusterCenters(i).toArray)) 154 | 155 | val assignations = assignToMicroCluster(rdd, mcInfo) 156 | updateMicroClusters(assignations) 157 | 158 | var i = 0 159 | for (mc <- microClusters) { 160 | mcInfo(i) = (mcInfo(i)._1, mc.getIds(0)) 161 | if (mc.getN > 0) mcInfo(i)._1.setCentroid(mc.cf1x :/ mc.n.toDouble) 162 | mcInfo(i)._1.setN(mc.getN) 163 | if (mcInfo(i)._1.n > 1) mcInfo(i)._1.setRmsd(scala.math.sqrt(sum(mc.cf2x) / mc.n.toDouble - sum(mc.cf1x.map(a => a * a)) / (mc.n * mc.n.toDouble))) 164 | i += 1 165 | } 166 | for (mc <- mcInfo) { 167 | if (mc._1.n == 1) 168 | mc._1.setRmsd(distanceNearestMC(mc._1.centroid, mcInfo)) 169 | } 170 | 171 | broadcastMCInfo = rdd.context.broadcast(mcInfo) 172 | initialized = true 173 | } 174 | 175 | } 176 | 177 | /** 178 | * Main method that runs the entire algorithm. This is called every time the 179 | * Streaming context handles a batch. 180 | * 181 | * @param data : data coming from the stream. Each entry has to be parsed as 182 | * breeze.linalg.Vector[Double] 183 | **/ 184 | 185 | def run(data: DStream[breeze.linalg.Vector[Double]]): Unit = { 186 | data.foreachRDD { (rdd, timeS) => 187 | currentN = rdd.count() 188 | if (currentN != 0) { 189 | if (initialized) { 190 | 191 | val assignations = assignToMicroCluster(rdd) 192 | updateMicroClusters(assignations) 193 | 194 | var i = 0 195 | for (mc <- microClusters) { 196 | mcInfo(i) = (mcInfo(i)._1, mc.getIds(0)) 197 | if (mc.getN > 0) mcInfo(i)._1.setCentroid(mc.cf1x :/ mc.n.toDouble) 198 | mcInfo(i)._1.setN(mc.getN) 199 | if (mcInfo(i)._1.n > 1) mcInfo(i)._1.setRmsd(scala.math.sqrt(sum(mc.cf2x) / mc.n.toDouble - sum(mc.cf1x.map(a => a * a)) / (mc.n * mc.n.toDouble))) 200 | i += 1 201 | } 202 | for (mc <- mcInfo) { 203 | if (mc._1.n == 1) 204 | mc._1.setRmsd(distanceNearestMC(mc._1.centroid, mcInfo)) 205 | } 206 | 207 | broadcastMCInfo = rdd.context.broadcast(mcInfo) 208 | } else { 209 | minInitPoints match { 210 | case 0 => initRand(rdd) 211 | case _ => if(useNormalKMeans) initKmeans(rdd) else initStreamingKmeans(rdd) 212 | } 213 | } 214 | } 215 | this.time += 1 216 | this.N += currentN 217 | } 218 | } 219 | 220 | /** 221 | * Method that returns the current array of microclusters. 
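 * A possible use, sketched under the same convention as the offline
 * CluStream.getCentersFromMC helper (centroid = CF1X / N):
 * {{{
 *   val centers = getMicroClusters.filter(_.getN > 0)
 *                                 .map(mc => mc.getCf1x :/ mc.getN.toDouble)
 * }}}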
222 | * 223 | * @return Array[MicroCluster]: current array of microclusters 224 | **/ 225 | 226 | def getMicroClusters: Array[MicroCluster] = { 227 | this.microClusters 228 | } 229 | 230 | /** 231 | * Method that returns current time clock unit in the stream. 232 | * 233 | * @return Long: current time in stream 234 | **/ 235 | 236 | def getCurrentTime: Long = { 237 | this.time 238 | } 239 | 240 | /** 241 | * Method that returns the total number of points processed so far in 242 | * the stream. 243 | * 244 | * @return Long: total number of points processed 245 | **/ 246 | 247 | def getTotalPoints: Long = { 248 | this.N 249 | } 250 | 251 | /** 252 | * Method that sets if the newly created microclusters due to 253 | * outliers are able to absorb other outlier points. This is done recursively 254 | * for all new microclusters, thus disabling these increases slightly the 255 | * speed of the algorithm but also allows to create overlaping microclusters 256 | * at this stage. 257 | * 258 | * @param ans : true or false 259 | * @return Class: current class 260 | **/ 261 | 262 | def setRecursiveOutliersRMSDCheck(ans: Boolean): this.type = { 263 | this.recursiveOutliersRMSDCheck = ans 264 | this 265 | } 266 | 267 | /** 268 | * Changes the K-Means method to use from StreamingKmeans to 269 | * normal K-Means for the initialization. StreamingKMeans is much 270 | * faster but in some cases normal K-Means could deliver more 271 | * accurate initialization. 272 | * 273 | * @param ans : true or false 274 | * @return Class: current class 275 | **/ 276 | 277 | def setInitNormalKMeans(ans: Boolean): this.type = { 278 | this.useNormalKMeans = ans 279 | this 280 | } 281 | 282 | 283 | /** 284 | * Method that sets the m last number of points in a microcluster 285 | * used to approximate its timestamp (recency value). 286 | * 287 | * @param m : m last points 288 | * @return Class: current class 289 | **/ 290 | 291 | def setM(m: Int): this.type = { 292 | this.mLastPoints = m 293 | this 294 | } 295 | 296 | /** 297 | * Method that sets the threshold d, used to determine whether a 298 | * microcluster is safe to delete or not (Tc - d < recency). 299 | * 300 | * @param d : threshold 301 | * @return Class: current class 302 | **/ 303 | 304 | def setDelta(d: Int): this.type = { 305 | this.delta = d 306 | this 307 | } 308 | 309 | /** 310 | * Method that sets the factor t of RMSDs. A point whose distance to 311 | * its nearest microcluster is greater than t*RMSD is considered an 312 | * outlier. 313 | * 314 | * @param t : t factor 315 | * @return Class: current class 316 | **/ 317 | 318 | def setTFactor(t: Double): this.type = { 319 | this.tFactor = t 320 | this 321 | } 322 | 323 | /** 324 | * Computes the distance of a point to its nearest microcluster. 325 | * 326 | * @param vec : the point 327 | * @param mcs : Array of microcluster information 328 | * @return Double: the distance 329 | **/ 330 | 331 | private def distanceNearestMC(vec: breeze.linalg.Vector[Double], mcs: Array[(MicroClusterInfo, Int)]): Double = { 332 | 333 | var minDist = Double.PositiveInfinity 334 | var i = 0 335 | for (mc <- mcs) { 336 | val dist = squaredDistance(vec, mc._1.centroid) 337 | if (dist != 0.0 && dist < minDist) minDist = dist 338 | i += 1 339 | } 340 | scala.math.sqrt(minDist) 341 | } 342 | 343 | /** 344 | * Computes the squared distance of two microclusters. 
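 * The distance is measured between centroids (CF1X / N); with `mcA` and `mcB`
 * standing for the microclusters at the two indices, this is effectively
 * {{{
 *   squaredDistance(mcA.getCf1x :/ mcA.getN.toDouble, mcB.getCf1x :/ mcB.getN.toDouble)
 * }}}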
345 | * 346 | * @param idx1 : local index of one microcluster in the array 347 | * @param idx2 : local index of another microcluster in the array 348 | * @return Double: the squared distance 349 | **/ 350 | 351 | private def squaredDistTwoMCArrIdx(idx1: Int, idx2: Int): Double = { 352 | squaredDistance(microClusters(idx1).getCf1x :/ microClusters(idx1).getN.toDouble, microClusters(idx2).getCf1x :/ microClusters(idx2).getN.toDouble) 353 | } 354 | 355 | /** 356 | * Computes the squared distance of one microcluster to a point. 357 | * 358 | * @param idx1 : local index of the microcluster in the array 359 | * @param point : the point 360 | * @return Double: the squared distance 361 | **/ 362 | 363 | private def squaredDistPointToMCArrIdx(idx1: Int, point: Vector[Double]): Double = { 364 | squaredDistance(microClusters(idx1).getCf1x :/ microClusters(idx1).getN.toDouble, point) 365 | } 366 | 367 | /** 368 | * Returns the local index of a microcluster for a given ID 369 | * 370 | * @param idx0 : ID of the microcluster 371 | * @return Int: local index of the microcluster 372 | **/ 373 | 374 | private def getArrIdxMC(idx0: Int): Int = { 375 | var id = -1 376 | var i = 0 377 | for (mc <- microClusters) { 378 | if (mc.getIds(0) == idx0) id = i 379 | i += 1 380 | } 381 | id 382 | } 383 | 384 | /** 385 | * Merges two microclusters adding all its features. 386 | * 387 | * @param idx1 : local index of one microcluster in the array 388 | * @param idx2 : local index of one microcluster in the array 389 | * 390 | **/ 391 | 392 | private def mergeMicroClusters(idx1: Int, idx2: Int): Unit = { 393 | 394 | microClusters(idx1).setCf1x(microClusters(idx1).getCf1x :+ microClusters(idx2).getCf1x) 395 | microClusters(idx1).setCf2x(microClusters(idx1).getCf2x :+ microClusters(idx2).getCf2x) 396 | microClusters(idx1).setCf1t(microClusters(idx1).getCf1t + microClusters(idx2).getCf1t) 397 | microClusters(idx1).setCf2t(microClusters(idx1).getCf2t + microClusters(idx2).getCf2t) 398 | microClusters(idx1).setN(microClusters(idx1).getN + microClusters(idx2).getN) 399 | microClusters(idx1).setIds(microClusters(idx1).getIds ++ microClusters(idx2).getIds) 400 | 401 | mcInfo(idx1)._1.setCentroid(microClusters(idx1).getCf1x :/ microClusters(idx1).getN.toDouble) 402 | mcInfo(idx1)._1.setN(microClusters(idx1).getN) 403 | mcInfo(idx1)._1.setRmsd(scala.math.sqrt(sum(microClusters(idx1).cf2x) / microClusters(idx1).n.toDouble - sum(microClusters(idx1).cf1x.map(a => a * a)) / (microClusters(idx1).n * microClusters(idx1).n.toDouble))) 404 | 405 | } 406 | 407 | /** 408 | * Adds one point to a microcluster adding all its features. 
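 * The cluster feature statistics are additive, so conceptually the update
 * performed below is (with `time` the current clock unit):
 * {{{
 *   cf1x += point;   cf2x += point :* point
 *   cf1t += time;    cf2t += time * time
 *   n    += 1
 * }}}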
409 | * 410 | * @param idx1 : local index of the microcluster in the array 411 | * @param point : the point 412 | * 413 | **/ 414 | 415 | private def addPointMicroClusters(idx1: Int, point: Vector[Double]): Unit = { 416 | 417 | microClusters(idx1).setCf1x(microClusters(idx1).getCf1x :+ point) 418 | microClusters(idx1).setCf2x(microClusters(idx1).getCf2x :+ (point :* point)) 419 | microClusters(idx1).setCf1t(microClusters(idx1).getCf1t + this.time) 420 | microClusters(idx1).setCf2t(microClusters(idx1).getCf2t + (this.time * this.time)) 421 | microClusters(idx1).setN(microClusters(idx1).getN + 1) 422 | 423 | mcInfo(idx1)._1.setCentroid(microClusters(idx1).getCf1x :/ microClusters(idx1).getN.toDouble) 424 | mcInfo(idx1)._1.setN(microClusters(idx1).getN) 425 | mcInfo(idx1)._1.setRmsd(scala.math.sqrt(sum(microClusters(idx1).cf2x) / microClusters(idx1).n.toDouble - sum(microClusters(idx1).cf1x.map(a => a * a)) / (microClusters(idx1).n * microClusters(idx1).n.toDouble))) 426 | 427 | } 428 | 429 | /** 430 | * Deletes one microcluster and replaces it locally with a new point. 431 | * 432 | * @param idx : local index of the microcluster in the array 433 | * @param point : the point 434 | * 435 | **/ 436 | 437 | private def replaceMicroCluster(idx: Int, point: Vector[Double]): Unit = { 438 | microClusters(idx) = new MicroCluster(point :* point, point, this.time * this.time, this.time, 1L) 439 | mcInfo(idx)._1.setCentroid(point) 440 | mcInfo(idx)._1.setN(1L) 441 | mcInfo(idx)._1.setRmsd(distanceNearestMC(mcInfo(idx)._1.centroid, mcInfo)) 442 | } 443 | 444 | /** 445 | * Finds the nearest microcluster for all entries of an RDD. 446 | * 447 | * @param rdd : RDD with points 448 | * @param mcInfo : Array containing microclusters information 449 | * @return RDD[(Int, Vector[Double])]: RDD that contains a tuple of the ID of the 450 | * nearest microcluster and the point itself. 451 | * 452 | **/ 453 | 454 | private def assignToMicroCluster(rdd: RDD[Vector[Double]], mcInfo: Array[(MicroClusterInfo, Int)]): RDD[(Int, Vector[Double])] = { 455 | rdd.map { a => 456 | var minDist = Double.PositiveInfinity 457 | var minIndex = Int.MaxValue 458 | var i = 0 459 | for (mc <- mcInfo) { 460 | val dist = squaredDistance(a, mc._1.centroid) 461 | if (dist < minDist) { 462 | minDist = dist 463 | minIndex = mc._2 464 | } 465 | i += 1 466 | } 467 | (minIndex, a) 468 | } 469 | } 470 | 471 | /** 472 | * Finds the nearest microcluster for all entries of an RDD, uses broadcast variable. 473 | * 474 | * @param rdd : RDD with points 475 | * @return RDD[(Int, Vector[Double])]: RDD that contains a tuple of the ID of the 476 | * nearest microcluster and the point itself. 477 | * 478 | **/ 479 | private def assignToMicroCluster(rdd: RDD[Vector[Double]]) = { 480 | rdd.map { a => 481 | var minDist = Double.PositiveInfinity 482 | var minIndex = Int.MaxValue 483 | var i = 0 484 | for (mc <- broadcastMCInfo.value) { 485 | val dist = squaredDistance(a, mc._1.centroid) 486 | if (dist < minDist) { 487 | minDist = dist 488 | minIndex = mc._2 489 | } 490 | i += 1 491 | } 492 | (minIndex, a) 493 | } 494 | } 495 | 496 | /** 497 | * Performs all the operations to maintain the microclusters. Assign points that 498 | * belong to a microclusters, detects outliers and deals with them. 499 | * 500 | * @param assignations : RDD that contains a tuple of the ID of the 501 | * nearest microcluster and the point itself. 
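 * The inlier/outlier split below keeps a point with its nearest microcluster
 * only while it lies within tFactor * RMSD of that microcluster, i.e. per point:
 * {{{
 *   if (nearDistance <= tFactor * nearMCInfo.rmsd) (1, a)   // absorbed by a microcluster
 *   else (0, a)                                             // treated as an outlier
 * }}}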
502 | * 503 | **/ 504 | 505 | private def updateMicroClusters(assignations: RDD[(Int, Vector[Double])]): Unit = { 506 | 507 | var dataInAndOut: RDD[(Int, (Int, Vector[Double]))] = null 508 | var dataIn: RDD[(Int, Vector[Double])] = null 509 | var dataOut: RDD[(Int, Vector[Double])] = null 510 | 511 | // Calculate RMSD 512 | if (initialized) { 513 | dataInAndOut = assignations.map { a => 514 | val nearMCInfo = broadcastMCInfo.value.find(id => id._2 == a._1).get._1 515 | val nearDistance = scala.math.sqrt(squaredDistance(a._2, nearMCInfo.centroid)) 516 | 517 | if (nearDistance <= tFactor * nearMCInfo.rmsd) (1, a) 518 | else (0, a) 519 | } 520 | } 521 | 522 | // Separate data 523 | if (dataInAndOut != null) { 524 | dataIn = dataInAndOut.filter(_._1 == 1).map(a => a._2) 525 | dataOut = dataInAndOut.filter(_._1 == 0).map(a => a._2) 526 | } else dataIn = assignations 527 | 528 | // Compute sums, sums of squares and count points... all by key 529 | log.warn(s"Processing points") 530 | 531 | // sumsAndSumsSquares -> (key: Int, (sum: Vector[Double], sumSquares: Vector[Double], count: Long ) ) 532 | val sumsAndSumsSquares = timer { 533 | val aggregateFuntion = (aa: (Vector[Double], Vector[Double], Long), bb: (Vector[Double], Vector[Double], Long)) => (aa._1 :+ bb._1, aa._2 :+ bb._2, aa._3 + bb._3) 534 | dataIn.mapValues(a => (a, a :* a, 1L)).reduceByKey(aggregateFuntion).collect() 535 | } 536 | 537 | 538 | var totalIn = 0L 539 | 540 | for (mc <- microClusters) { 541 | for (ss <- sumsAndSumsSquares) if (mc.getIds(0) == ss._1) { 542 | mc.setCf1x(mc.cf1x :+ ss._2._1) 543 | mc.setCf2x(mc.cf2x :+ ss._2._2) 544 | mc.setN(mc.n + ss._2._3) 545 | mc.setCf1t(mc.cf1t + ss._2._3 * this.time) 546 | mc.setCf2t(mc.cf2t + ss._2._3 * (this.time * this.time)) 547 | totalIn += ss._2._3 548 | } 549 | } 550 | 551 | 552 | 553 | 554 | log.warn(s"Processing " + (currentN - totalIn) + " outliers") 555 | timer { 556 | if (dataOut != null && currentN - totalIn != 0) { 557 | var mTimeStamp: Double = 0.0 558 | val recencyThreshold = this.time - delta 559 | var safeDeleteMC: Array[Int] = Array() 560 | var keepOrMergeMC: Array[Int] = Array() 561 | var i = 0 562 | 563 | 564 | for (mc <- microClusters) { 565 | val meanTimeStamp = if (mc.getN > 0) mc.getCf1t.toDouble / mc.getN.toDouble else 0 566 | val sdTimeStamp = scala.math.sqrt(mc.getCf2t.toDouble / mc.getN.toDouble - meanTimeStamp * meanTimeStamp) 567 | 568 | if (mc.getN < 2 * mLastPoints) mTimeStamp = meanTimeStamp 569 | else mTimeStamp = Gaussian(meanTimeStamp, sdTimeStamp).inverseCdf(1 - mLastPoints / (2 * mc.getN.toDouble)) 570 | 571 | if (mTimeStamp < recencyThreshold || mc.getN == 0) safeDeleteMC = safeDeleteMC :+ i 572 | else keepOrMergeMC = keepOrMergeMC :+ i 573 | 574 | i += 1 575 | } 576 | 577 | var j = 0 578 | var newMC: Array[Int] = Array() 579 | 580 | 581 | for (point <- dataOut.collect()) { 582 | 583 | var minDist = Double.PositiveInfinity 584 | var idMinDist = 0 585 | if (recursiveOutliersRMSDCheck) for (id <- newMC) { 586 | val dist = squaredDistPointToMCArrIdx(id, point._2) 587 | if (dist < minDist) { 588 | minDist = dist 589 | idMinDist = id 590 | } 591 | 592 | } 593 | 594 | if (scala.math.sqrt(minDist) <= tFactor * mcInfo(idMinDist)._1.rmsd) addPointMicroClusters(idMinDist, point._2) 595 | else if (safeDeleteMC.lift(j).isDefined) { 596 | replaceMicroCluster(safeDeleteMC(j), point._2) 597 | newMC = newMC :+ safeDeleteMC(j) 598 | j += 1 599 | } else { 600 | var minDist = Double.PositiveInfinity 601 | var idx1 = 0 602 | var idx2 = 0 603 | 604 | for (a <- 
keepOrMergeMC.indices) 605 | for (b <- (0 + a) until keepOrMergeMC.length) { 606 | var dist = Double.PositiveInfinity 607 | if (keepOrMergeMC(a) != keepOrMergeMC(b)) dist = squaredDistance(mcInfo(keepOrMergeMC(a))._1.centroid, mcInfo(keepOrMergeMC(b))._1.centroid) 608 | if (dist < minDist) { 609 | minDist = dist 610 | idx1 = keepOrMergeMC(a) 611 | idx2 = keepOrMergeMC(b) 612 | } 613 | } 614 | mergeMicroClusters(idx1, idx2) 615 | replaceMicroCluster(idx2, point._2) 616 | newMC = newMC :+ idx2 617 | } 618 | 619 | } 620 | 621 | } 622 | } 623 | } 624 | 625 | // END OF MODEL 626 | } 627 | 628 | 629 | /** 630 | * Object complementing the MicroCluster Class to allow it to create 631 | * new IDs whenever a new instance of it is created. 632 | * 633 | **/ 634 | 635 | private object MicroCluster extends Serializable { 636 | private var current = -1 637 | 638 | private def inc = { 639 | current += 1 640 | current 641 | } 642 | } 643 | 644 | /** 645 | * Packs the microcluster object and its features in one single class 646 | * 647 | **/ 648 | 649 | protected class MicroCluster( 650 | var cf2x: breeze.linalg.Vector[Double], 651 | var cf1x: breeze.linalg.Vector[Double], 652 | var cf2t: Long, 653 | var cf1t: Long, 654 | var n: Long, 655 | var ids: Array[Int]) extends Serializable { 656 | 657 | def this(cf2x: breeze.linalg.Vector[Double], cf1x: breeze.linalg.Vector[Double], cf2t: Long, cf1t: Long, n: Long) = this(cf2x, cf1x, cf2t, cf1t, n, Array(MicroCluster.inc)) 658 | 659 | def setCf2x(cf2x: breeze.linalg.Vector[Double]): Unit = { 660 | this.cf2x = cf2x 661 | } 662 | 663 | def getCf2x: breeze.linalg.Vector[Double] = { 664 | this.cf2x 665 | } 666 | 667 | def setCf1x(cf1x: breeze.linalg.Vector[Double]): Unit = { 668 | this.cf1x = cf1x 669 | } 670 | 671 | def getCf1x: breeze.linalg.Vector[Double] = { 672 | this.cf1x 673 | } 674 | 675 | def setCf2t(cf2t: Long): Unit = { 676 | this.cf2t = cf2t 677 | } 678 | 679 | def getCf2t: Long = { 680 | this.cf2t 681 | } 682 | 683 | def setCf1t(cf1t: Long): Unit = { 684 | this.cf1t = cf1t 685 | } 686 | 687 | def getCf1t: Long = { 688 | this.cf1t 689 | } 690 | 691 | def setN(n: Long): Unit = { 692 | this.n = n 693 | } 694 | 695 | def getN: Long = { 696 | this.n 697 | } 698 | 699 | def setIds(ids: Array[Int]): Unit = { 700 | this.ids = ids 701 | } 702 | 703 | def getIds: Array[Int] = { 704 | this.ids 705 | } 706 | } 707 | 708 | 709 | /** 710 | * Packs some microcluster information to reduce the amount of data to be 711 | * broadcasted. 
712 | * 713 | **/ 714 | 715 | private class MicroClusterInfo( 716 | var centroid: breeze.linalg.Vector[Double], 717 | var rmsd: Double, 718 | var n: Long) extends Serializable { 719 | 720 | def setCentroid(centroid: Vector[Double]): Unit = { 721 | this.centroid = centroid 722 | } 723 | 724 | def setRmsd(rmsd: Double): Unit = { 725 | this.rmsd = rmsd 726 | } 727 | 728 | def setN(n: Long): Unit = { 729 | this.n = n 730 | } 731 | } 732 | 733 | -------------------------------------------------------------------------------- /src/main/scala/com/backhoff/clustream/Tools.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | import java.io._ 4 | import java.nio.file.{Paths, Files} 5 | 6 | object Tools { 7 | 8 | def convertMCsBinariesToText(dirIn: String = "", dirOut: String = "", limit: Int): Unit = { 9 | print("processing files: ") 10 | for(i <- 0 to limit) { 11 | if(Files.exists(Paths.get(dirIn + "/" + i))) 12 | try { 13 | val file = new ObjectInputStream(new FileInputStream(dirIn + "/" + i)) 14 | val mc = file.readObject().asInstanceOf[Array[MicroCluster]] 15 | var text: Array[String] = null 16 | file.close() 17 | if(mc != null) { 18 | text = mc.map { m => 19 | "=========================================================== \n" + 20 | "MicroCluster IDs = " + m.getIds.mkString("[", ",", "]") + "\n" + 21 | "CF2X = " + m.getCf2x.toArray.mkString("[", ",", "]") + "\n" + 22 | "CF1X = " + m.getCf1x.toArray.mkString("[", ",", "]") + "\n" + 23 | "CF2T = " + m.getCf2t.toString + "\n" + 24 | "CF1T = " + m.getCf1t.toString + "\n" + 25 | "N = " + m.getN.toString + "\n" 26 | } 27 | 28 | val pw = new PrintWriter(new File(dirOut + "/" + i)) 29 | pw.write(text.mkString("","","")) 30 | pw.close 31 | print(i + " ") 32 | } 33 | 34 | } 35 | catch { 36 | case ex: IOException => println("Exception while reading files " + ex) 37 | null 38 | } 39 | } 40 | println() 41 | } 42 | 43 | } 44 | 45 | -------------------------------------------------------------------------------- /src/test/scala/com/backhoff/clustream/KmeansTest.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | /** 4 | * Created by omar on 10/7/15. 
5 | */ 6 | 7 | import org.apache.spark.{SparkContext, SparkConf} 8 | import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} 9 | import org.apache.spark.mllib.linalg.Vectors 10 | 11 | object KmeansTest { 12 | def main(args: Array[String]) { 13 | val conf = new SparkConf().setAppName("K-means test").setMaster("local[*]") 14 | val sc = new SparkContext(conf) 15 | sc.setLogLevel("ERROR") 16 | 17 | val data = sc.textFile("/home/omar/stream/streamMod") 18 | val parsedData = data.map(s => Vectors.dense(s.split(' ').dropRight(1).map(_.toDouble))).cache() 19 | 20 | // Cluster the data into two classes using KMeans 21 | val numClusters = 2 22 | val numIterations = 20 23 | val clusters = KMeans.train(parsedData, numClusters, numIterations) 24 | 25 | // Evaluate clustering by computing Within Set Sum of Squared Errors 26 | val WSSSE = clusters.computeCost(parsedData) 27 | println("Within Set Sum of Squared Errors = " + WSSSE) 28 | 29 | // Save and load model 30 | // clusters.save(sc, "/home/omar/Desktop/model") 31 | // val sameModel = KMeansModel.load(sc, "/home/omar/Desktop/model") 32 | clusters.clusterCenters.foreach(println) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/test/scala/com/backhoff/clustream/SimpleApp.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | /** 4 | * Created by omar on 9/14/15. 5 | */ 6 | /* SimpleApp.scala */ 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.SparkConf 9 | 10 | object SimpleApp { 11 | def timer[R](block: => R): R = { 12 | val t0 = System.nanoTime() 13 | val result = block // call-by-name 14 | val t1 = System.nanoTime() 15 | println("Elapsed time: " + (t1 - t0) / 1000000 + "ms") 16 | result 17 | } 18 | def main(args: Array[String]) { 19 | // val logFile = "/home/omar/Libs/spark-1.5.0/README.md" // Should be some file on your system 20 | val conf = new SparkConf().setAppName("Simple Application").setMaster("local[*]") 21 | val sc = new SparkContext(conf) 22 | sc.setLogLevel("ERROR") 23 | // val logData = sc.textFile(logFile, 2).cache() 24 | // val numAs = logData.filter(line => line.contains("a")).count() 25 | // val numBs = logData.filter(line => line.contains("b")).count() 26 | // println("Lines with a: %s, Lines with b: %s".format(numAs, numBs)) 27 | 28 | val h = 1 29 | val t1 = 6 30 | val t2 = 21 31 | val t3 = 81 32 | val t4 = 161 33 | 34 | // Tools.convertMCsBinariesToText("snaps", "snaps/text", 100) 35 | val clustream = new CluStream(null) 36 | val snap1 = timer{clustream.getMCsFromSnapshots("snaps",t1,h)} 37 | val snap2 = timer{clustream.getMCsFromSnapshots("snaps",t2,h)} 38 | val snap3 = timer{clustream.getMCsFromSnapshots("snaps",t3,h)} 39 | val snap4 = timer{clustream.getMCsFromSnapshots("snaps",t4,h)} 40 | 41 | println(snap1.map(a => a.getN).mkString("[",",","]")) 42 | println("mics points = " + snap1.map(_.getN).sum) 43 | println(snap2.map(a => a.getN).mkString("[",",","]")) 44 | println("mics points = " + snap2.map(_.getN).sum) 45 | println(snap3.map(a => a.getN).mkString("[",",","]")) 46 | println("mics points = " + snap3.map(_.getN).sum) 47 | println(snap4.map(a => a.getN).mkString("[",",","]")) 48 | println("mics points = " + snap4.map(_.getN).sum) 49 | 50 | val clusters1 = timer{clustream.fakeKMeans(sc,5,5000,snap1)} 51 | if(clusters1 != null) { 52 | println("MacroClusters Ceneters") 53 | println("snapshots " + clustream.getSnapShots("snaps",t1,h)) 54 | 
clusters1.clusterCenters.foreach(println) 55 | clusters1.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/1case/results/clustream2000/centers1").createFile().appendAll(c.toArray.mkString("",",","") +"\n" )) 56 | } 57 | val clusters2 = timer{clustream.fakeKMeans(sc,5,5000,snap2)} 58 | if(clusters2 != null) { 59 | println("MacroClusters Ceneters") 60 | println("snapshots " + clustream.getSnapShots("snaps",t2,h)) 61 | clusters2.clusterCenters.foreach(println) 62 | clusters2.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/1case/results/clustream2000/centers2").createFile().appendAll(c.toArray.mkString("",",","") +"\n" )) 63 | } 64 | val clusters3 = timer{clustream.fakeKMeans(sc,5,5000,snap3)} 65 | if(clusters3 != null) { 66 | println("MacroClusters Ceneters") 67 | println("snapshots " + clustream.getSnapShots("snaps",t3,h)) 68 | clusters3.clusterCenters.foreach(println) 69 | clusters3.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/1case/results/clustream2000/centers3").createFile().appendAll(c.toArray.mkString("",",","") +"\n" )) 70 | } 71 | val clusters4 = timer{clustream.fakeKMeans(sc,5,5000,snap4)} 72 | if(clusters4 != null) { 73 | println("MacroClusters Ceneters") 74 | println("snapshots " + clustream.getSnapShots("snaps",t4,h)) 75 | clusters4.clusterCenters.foreach(println) 76 | clusters4.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/1case/results/clustream2000/centers4").createFile().appendAll(c.toArray.mkString("",",","") +"\n" )) 77 | } 78 | 79 | 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/test/scala/com/backhoff/clustream/StreamDM.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.streamdm 2 | 3 | /** 4 | * Created by omar on 9/18/15. 
5 | */ 6 | 7 | import com.github.javacliparser.ClassOption 8 | import org.apache.spark.streamdm.clusterers.{Clusterer, Clustream} 9 | import org.apache.spark.streamdm.evaluation.Evaluator 10 | import org.apache.spark.streamdm.streams.{StreamWriter, StreamReader} 11 | import org.apache.spark.streamdm.tasks.Task 12 | import org.apache.spark.streaming.scheduler.{StreamingListenerBatchCompleted, StreamingListener} 13 | import org.apache.spark.streaming.{Milliseconds, StreamingContext} 14 | import org.apache.spark.{SparkContext, SparkConf} 15 | 16 | class Clus extends Task { 17 | //Task options 18 | val evaluatorOption:ClassOption = new ClassOption("evaluator", 'e', 19 | "Evaluator to use", classOf[Evaluator], "ClusteringCohesionEvaluator") 20 | val clustererOption:ClassOption = new ClassOption("learner", 'l', 21 | "Learner to use", classOf[Clustream], "Clustream") 22 | val streamReaderOption:ClassOption = new ClassOption("streamReader", 's', 23 | "Stream reader to use", classOf[StreamReader], "SocketTextStreamReader") 24 | val resultsWriterOption:ClassOption = new ClassOption("resultsWriter", 'w', 25 | "Stream writer to use", classOf[StreamWriter], "PrintStreamWriter") 26 | 27 | //Run the task 28 | def run(ssc:StreamingContext): Unit = { 29 | //Parse options and init 30 | val reader:StreamReader = this.streamReaderOption.getValue() 31 | val clusterer:Clustream = this.clustererOption.getValue() 32 | clusterer.init(reader.getExampleSpecification()) 33 | val writer:StreamWriter = this.resultsWriterOption.getValue() 34 | val evaluator:Evaluator = this.evaluatorOption.getValue() 35 | 36 | clusterer.microclusters.horizonOption.setValue(1) 37 | clusterer.initOption.setValue(2000) 38 | clusterer.kOption.setValue(5) 39 | clusterer.mcOption.setValue(50) 40 | //clusterer.repOption.setValue(10) 41 | 42 | //Parse stream and get Examples 43 | val N = new StaticVar[Long](0L) 44 | val listener = new MyListener(clusterer, N) 45 | ssc.addStreamingListener(listener) 46 | val instances = reader.getExamples(ssc) 47 | 48 | //Predict 49 | // val predPairs = learner.predict(instances) 50 | //Train 51 | clusterer.train(instances) 52 | //Assign 53 | //val clpairs = clusterer.assign(instances) 54 | 55 | //Print statistics 56 | // writer.output(evaluator.addResult(clpairs)) 57 | } 58 | } 59 | 60 | class MyListener(model: Clustream, n: StaticVar[Long]) extends StreamingListener { 61 | override def onBatchCompleted(batchCompleted:StreamingListenerBatchCompleted) { 62 | if ( batchCompleted.batchInfo.numRecords > 0) { 63 | n.value = n.value + batchCompleted.batchInfo.numRecords 64 | println("================= CENTERS ================= N = " + n.value) 65 | model.clusters.foreach(c => println(c.toString())) 66 | println(model.microclusters.horizonOption.getValue) 67 | } 68 | } 69 | } 70 | class StaticVar[T]( var value: T ) 71 | 72 | object StreamDM { 73 | def main(args: Array[String]) { 74 | val conf = new SparkConf().setAppName("Streaming K-means test").setMaster("local[*]") 75 | val sc = new SparkContext(conf) 76 | sc.setLogLevel("ERROR") 77 | val ssc = new StreamingContext(sc, Milliseconds(1000)) 78 | 79 | 80 | val numDimensions = 34 81 | val numClusters = 5 82 | val task = new Clus() 83 | task.run(ssc) 84 | ssc.start() 85 | ssc.awaitTermination() 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/test/scala/com/backhoff/clustream/StreamingKMeans.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 
| 3 | /** 4 | * Created by omar on 9/18/15. 5 | */ 6 | 7 | import org.apache.spark.streaming.dstream.DStream 8 | import org.apache.spark.streaming.scheduler.{StreamingListenerBatchCompleted, StreamingListener} 9 | import org.apache.spark.streaming.{Milliseconds, StreamingContext} 10 | import org.apache.spark.{SparkContext, SparkConf} 11 | import org.apache.spark.mllib.linalg.Vectors 12 | import org.apache.spark.mllib.linalg.Vector 13 | import org.apache.spark.mllib.regression.LabeledPoint 14 | import org.apache.spark.mllib.clustering.StreamingKMeans 15 | 16 | object StreamingKMeans { 17 | def main(args: Array[String]) { 18 | val conf = new SparkConf().setAppName("Streaming K-means test").setMaster("local[*]") 19 | val sc = new SparkContext(conf) 20 | sc.setLogLevel("ERROR") 21 | val ssc = new StreamingContext(sc, Milliseconds(1000)) 22 | // val trainingData = ssc.textFileStream("file:///home/omar/stream/train").map(_.split(" ")).map(arr => arr.dropRight(1)).map(_.mkString("[", ",", "]")).map(Vectors.parse) 23 | // val trainingData = ssc.socketTextStream("localhost",9999).map(_.split(" ")).map(arr => arr.dropRight(1)).map(_.mkString("[",",","]")).map(Vectors.parse) 24 | val trainingData = ssc.socketTextStream("localhost",9999).map(_.split(" ")).map(_.mkString("[",",","]")).map(Vectors.parse) 25 | //val testData = ssc.textFileStream("/home/omar/stream/testing").map(LabeledPoint.parse) 26 | // val testData = ssc.socketTextStream("localhost", 9998).map(LabeledPoint.parse) 27 | val numDimensions = 34 28 | val numClusters = 5 29 | val model = new StreamingKMeans() 30 | .setK(numClusters) 31 | .setHalfLife(1000, "points") 32 | //.setDecayFactor(0.0) 33 | .setRandomCenters(numDimensions, 0.0) 34 | 35 | val N = new StaticVar[Long](0L) 36 | val listener = new MyListener(model, N) 37 | ssc.addStreamingListener(listener) 38 | 39 | model.trainOn(trainingData) 40 | //model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() 41 | 42 | ssc.start() 43 | ssc.awaitTermination() 44 | } 45 | } 46 | 47 | private[clustream] class MyListener(model: StreamingKMeans, n: StaticVar[Long]) extends StreamingListener { 48 | override def onBatchCompleted(batchCompleted:StreamingListenerBatchCompleted) { 49 | if ( batchCompleted.batchInfo.numRecords > 0) { 50 | n.value = n.value + batchCompleted.batchInfo.numRecords 51 | println("================= CENTERS ================= N = " + n.value) 52 | model.latestModel().clusterCenters.foreach(println) 53 | } 54 | } 55 | } 56 | class StaticVar[T]( var value: T ) 57 | -------------------------------------------------------------------------------- /src/test/scala/com/backhoff/clustream/StreamingTests.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | /** 4 | * Created by omar on 9/20/15. 
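 *
 * Hedged note (added for illustration): the socket stream below is expected to
 * carry space-separated numeric rows, which each batch parses into breeze
 * vectors, e.g. for a single line:
 * {{{
 *   DenseVector("0.1 0.2 0.3".split(" ").map(_.toDouble))  // -> DenseVector(0.1, 0.2, 0.3)
 * }}}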
5 | */ 6 | 7 | import org.apache.spark.streaming.scheduler.{StreamingListenerBatchCompleted, StreamingListener} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.apache.spark.streaming._ 10 | import org.apache.log4j._ 11 | 12 | 13 | import breeze.linalg._ 14 | 15 | object StreamingTests { 16 | def main(args: Array[String]) { 17 | val conf = new SparkConf().setAppName("Spark CluStream").setMaster("local[*]") 18 | // val conf = new SparkConf().setAppName("Stream Word Count").setMaster("spark://192.168.0.119:7077") 19 | val sc = new SparkContext(conf) 20 | sc.setLogLevel("ERROR") 21 | Logger.getLogger("org").setLevel(Level.OFF) 22 | Logger.getLogger("akka").setLevel(Level.OFF) 23 | val ssc = new StreamingContext(sc, Milliseconds(1000)) 24 | // ssc.checkpoint("/home/omar/stream/checkpoint") 25 | val lines = ssc.socketTextStream("localhost", 9999) 26 | // val lines = ssc.textFileStream("file:///home/omar/stream/train") 27 | 28 | // val words = lines.flatMap(_.split(" ").map(_.toInt)) 29 | // val pairs = words.map(word => (word, 1)) 30 | // val wordCounts = pairs.reduceByKey(_ + _) 31 | // 32 | // 33 | // wordCounts.print() 34 | 35 | // val words = lines.map(_.split(" ").map(_.toInt).zipWithIndex) 36 | // val pairs = words.flatMap(a => a).transform(_.map(a => (a._2,a._1))) 37 | // val wordCounts = pairs.reduceByKey(_ + _) 38 | 39 | 40 | val model = new CluStreamOnline(50, 34, 2000).setDelta(512).setM(20).setInitNormalKMeans(false) 41 | val clustream = new CluStream(model) 42 | ssc.addStreamingListener(new PrintClustersListener(clustream, sc)) 43 | // model.run(lines.map(_.split(" ").map(_.toDouble)).map(DenseVector(_))) 44 | // clustream.startOnline(lines.map(_.split(" ").map(_.toDouble)).map(arr => arr.dropRight(1)).map(DenseVector(_))) 45 | clustream.startOnline(lines.map(_.split(" ").map(_.toDouble)).map(DenseVector(_))) 46 | 47 | // wordCounts.print() 48 | ssc.start() 49 | ssc.awaitTermination() 50 | } 51 | 52 | } 53 | 54 | private[clustream] class PrintClustersListener(clustream: CluStream, sc: SparkContext) extends StreamingListener { 55 | 56 | override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) { 57 | if (batchCompleted.batchInfo.numRecords > 0) { 58 | 59 | val tc = clustream.model.getCurrentTime 60 | val n = clustream.model.getTotalPoints 61 | 62 | clustream.saveSnapShotsToDisk("snaps",tc, 2, 10) 63 | println("tc = " + tc + ", n = " + n) 64 | 65 | // if (149900 < n && n <= 150100 ) { 66 | // 67 | // val snaps = clustream.getSnapShots("snaps",tc,256) 68 | // val clusters = clustream.fakeKMeans(sc, 5, 2000, clustream.getMCsFromSnapshots("snaps", tc, 256)) 69 | // println("============= MacroClusters Centers for time = " + tc + ", n = " + n + ", snapshots = " + snaps + " ============") 70 | // clusters.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/2case/results/clustream200/centers1").createFile().appendAll(c.toArray.mkString("",",","") +"\n" )) 71 | // 72 | // 73 | //// val clusters = clustream.fakeKMeans(sc, 5, 2000, clustream.model.getMicroClusters) 74 | //// println("============= MacroClusters Centers for time = " + tc + ", n = " + n + " ============") 75 | //// clusters.clusterCenters.foreach(println) 76 | // 77 | // } 78 | // if( 249900 < n && n <= 250100){ 79 | // val snaps = clustream.getSnapShots("snaps",tc,256) 80 | // val clusters = clustream.fakeKMeans(sc, 5, 2000, clustream.getMCsFromSnapshots("snaps", tc, 256)) 81 | // println("============= MacroClusters Centers for time = " + tc + ", n = " + n + ", 
snapshots = " + snaps + " ============") 82 | // clusters.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/2case/results/clustream200/centers2").createFile().appendAll(c.toArray.mkString("",",","")+"\n")) 83 | // } 84 | // if(349900 < n && n <= 350100 ){ 85 | // val snaps = clustream.getSnapShots("snaps",tc,256) 86 | // val clusters = clustream.fakeKMeans(sc, 5, 2000, clustream.getMCsFromSnapshots("snaps", tc, 256)) 87 | // println("============= MacroClusters Centers for time = " + tc + ", n = " + n + ", snapshots = " + snaps + " ============") 88 | // clusters.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/2case/results/clustream200/centers3").createFile().appendAll(c.toArray.mkString("",",","")+"\n")) 89 | // } 90 | // if(449900 < n && n <= 450100){ 91 | // val snaps = clustream.getSnapShots("snaps",tc,256) 92 | // val clusters = clustream.fakeKMeans(sc, 5, 2000, clustream.getMCsFromSnapshots("snaps", tc, 256)) 93 | // println("============= MacroClusters Centers for time = " + tc + ", n = " + n + ", snapshots = " + snaps + " ============") 94 | // clusters.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/2case/results/clustream200/centers4").createFile().appendAll(c.toArray.mkString("",",","")+"\n")) 95 | // } 96 | 97 | } 98 | } 99 | } 100 | --------------------------------------------------------------------------------