├── LICENSE ├── README.md ├── build.sbt └── src ├── main └── scala │ └── com │ └── backhoff │ └── clustream │ ├── CluStream.scala │ ├── CluStreamOnline.scala │ └── Tools.scala └── test └── scala └── com └── backhoff └── clustream ├── KmeansTest.scala ├── SimpleApp.scala ├── StreamDM.scala ├── StreamingKMeans.scala └── StreamingTests.scala /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark-CluStream 2 | By Omar Backhoff Larrazolo 3 | 4 | 5 | Adaptation of the CluStream method to Spark 6 | 7 | Includes: 8 | 9 | Online microclustering class 10 | 11 | Offline macroclustering class 12 | 13 | Reference to the actual method and original paper 14 | 15 | http://www-nishio.ist.osaka-u.ac.jp/vldb/archives/public/website/2003/papers/S04P02.pdf 16 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | resolvers ++= Seq( 2 | Resolver sonatypeRepo "public", 3 | Resolver typesafeRepo "releases", 4 | ) 5 | 6 | name := "spark-clustream" 7 | 8 | //spName := "obackhoff/spark-clustream" 9 | 10 | version := "0.1" 11 | 12 | scalaVersion := "2.11.8" 13 | 14 | //sparkVersion := "2.2.0" 15 | 16 | //sparkComponents ++= Seq("streaming", "mllib") 17 | 18 | libraryDependencies ++= Seq( 19 | "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly(), 20 | "org.apache.spark" % "spark-mllib_2.11" % "2.2.0", 21 | "org.apache.spark" % "spark-streaming_2.11" % "2.2.0" 22 | ) 23 | -------------------------------------------------------------------------------- /src/main/scala/com/backhoff/clustream/CluStream.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | /** 4 | * Created by omar on 9/25/15. 
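 *
 * Hedged usage sketch (added for illustration; it mirrors the wiring used in
 * src/test/scala/com/backhoff/clustream/StreamingTests.scala, and `ssc`, the
 * socket address and the parameter values are assumptions, not fixed choices):
 * {{{
 *   // the online phase maintains q microclusters; this class queries them offline
 *   val online    = new CluStreamOnline(q = 50, numDimensions = 34, minInitPoints = 2000)
 *   val clustream = new CluStream(online)
 *   val points: DStream[breeze.linalg.Vector[Double]] =
 *     ssc.socketTextStream("localhost", 9999)
 *        .map(_.split(" ").map(_.toDouble))
 *        .map(DenseVector(_))
 *   clustream.startOnline(points)   // feed the stream to the online model
 * }}}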
5 | */ 6 | 7 | import breeze.linalg._ 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.annotation.Experimental 11 | import java.io._ 12 | import java.nio.file.{Paths, Files} 13 | import org.apache.spark.mllib.clustering.KMeans 14 | 15 | /** 16 | * Class that contains the offline methods for the CluStream 17 | * method. It can be initialized with a CluStreamOnline model to 18 | * facilitate the use of it at the same time the online process 19 | * is running. 20 | * 21 | **/ 22 | 23 | @Experimental 24 | class CluStream ( 25 | val model:CluStreamOnline) 26 | extends Serializable{ 27 | 28 | def this() = this(null) 29 | 30 | /** 31 | * Method that samples values from a given distribution. 32 | * 33 | * @param dist: this is a map containing values and their weights in 34 | * the distributions. Weights must add to 1. 35 | * Example. {A -> 0.5, B -> 0.3, C -> 0.2 } 36 | * @return A: sample value A 37 | **/ 38 | 39 | private def sample[A](dist: Map[A, Double]): A = { 40 | val p = scala.util.Random.nextDouble 41 | val it = dist.iterator 42 | var accum = 0.0 43 | while (it.hasNext) { 44 | val (item, itemProb) = it.next 45 | accum += itemProb 46 | if (accum >= p) 47 | return item 48 | } 49 | sys.error(f"this should never happen") // needed so it will compile 50 | } 51 | 52 | /** 53 | * Method that saves a snapshot to disk using the pyramidal time 54 | * scheme to a given directory. 55 | * 56 | * @param dir: directory to save the snapshot 57 | 58 | * @param tc: time clock unit to save 59 | * @param alpha: alpha parameter of the pyramidal time scheme 60 | * @param l: l modifier of the pyramidal time scheme 61 | **/ 62 | 63 | def saveSnapShotsToDisk(dir: String = "", tc: Long, alpha: Int = 2, l: Int = 2): Unit ={ 64 | 65 | var write = false 66 | var delete = false 67 | var order = 0 68 | val mcs = model.getMicroClusters 69 | 70 | 71 | val exp = (scala.math.log(tc) / scala.math.log(alpha)).toInt 72 | 73 | for (i <- 0 to exp) { 74 | if (tc % scala.math.pow(alpha, i + 1) != 0 && tc % scala.math.pow(alpha, i) == 0) { 75 | order = i 76 | write = true 77 | } 78 | } 79 | 80 | val tcBye = tc - ((scala.math.pow(alpha, l) + 1) * scala.math.pow(alpha, order + 1)).toInt 81 | 82 | if (tcBye > 0) delete = true 83 | 84 | if (write) { 85 | val out = new ObjectOutputStream(new FileOutputStream(dir + "/" + tc)) 86 | 87 | try { 88 | out.writeObject(mcs) 89 | } 90 | catch { 91 | case ex: IOException => println("Exception while writing file " + ex) 92 | } 93 | finally { 94 | out.close() 95 | } 96 | } 97 | 98 | if (delete) { 99 | try { 100 | new File(dir + "/" + tcBye).delete() 101 | } 102 | catch { 103 | case ex: IOException => println("Exception while deleting file " + ex); 104 | } 105 | } 106 | } 107 | 108 | /** 109 | * Method that gets the snapshots to use for a given time and horizon in a 110 | * given file directory. 111 | * 112 | * @param dir: directory to save the snapshot 113 | 114 | * @param tc: time clock unit to save 115 | * @param h: time horizon 116 | * @return (Long,Long): tuple of the first and second snapshots to use. 
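 *
 * Worked example (purely illustrative; it assumes snapshots were previously
 * written at time units 4, 8 and 16 in `dir`):
 * {{{
 *   getSnapShots(dir, tc = 18L, h = 10L)
 *   // => (16, 4): 16 is the latest stored time <= 18, and 4 is the latest
 *   //    stored time <= 16 - 10 = 6
 * }}}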
117 | **/ 118 | 119 | def getSnapShots(dir: String = "", tc: Long, h: Long): (Long,Long) = { 120 | 121 | var tcReal = tc 122 | while(!Files.exists(Paths.get(dir + "/" + tcReal)) && tcReal >= 0) tcReal = tcReal - 1 123 | var tcH = tcReal - h 124 | while(!Files.exists(Paths.get(dir + "/" + tcH)) && tcH >= 0) tcH = tcH - 1 125 | if(tcH < 0) while(!Files.exists(Paths.get(dir + "/" + tcH))) tcH = tcH + 1 126 | 127 | if(tcReal == -1L) tcH = -1L 128 | (tcReal, tcH) 129 | } 130 | 131 | /** 132 | * Method that returns the microclusters from the snapshots for a given time and horizon in a 133 | * given file directory. Subtracts the features of the first one with the second one. 134 | * 135 | * @param dir: directory to save the snapshot 136 | 137 | * @param tc: time clock unit to save 138 | * @param h: time horizon 139 | * @return Array[MicroCluster]: computed array of microclusters 140 | **/ 141 | 142 | def getMCsFromSnapshots(dir: String = "", tc: Long, h: Long): Array[MicroCluster] = { 143 | val (t1,t2) = getSnapShots(dir,tc,h) 144 | 145 | try{ 146 | val in1 = new ObjectInputStream(new FileInputStream(dir + "/" + t1)) 147 | val snap1 = in1.readObject().asInstanceOf[Array[MicroCluster]] 148 | 149 | val in2 = new ObjectInputStream(new FileInputStream(dir + "/" + t2)) 150 | val snap2 = in2.readObject().asInstanceOf[Array[MicroCluster]] 151 | 152 | in2.close() 153 | in1.close() 154 | 155 | val arrs1 = snap1.map(_.getIds) 156 | val arrs2 = snap2.map(_.getIds) 157 | 158 | val relatingMCs = snap1 zip arrs1.map(a => arrs2.zipWithIndex.map(b=> if(b._1.toSet.intersect(a.toSet).nonEmpty) b._2;else -1)) 159 | relatingMCs.map{ mc => 160 | if (!mc._2.forall(_ == -1) && t1 - h >= t2) { 161 | for(id <- mc._2) if(id != -1) { 162 | mc._1.setCf2x(mc._1.getCf2x :- snap2(id).getCf2x) 163 | mc._1.setCf1x(mc._1.getCf1x :- snap2(id).getCf1x) 164 | mc._1.setCf2t(mc._1.getCf2t - snap2(id).getCf2t) 165 | mc._1.setCf1t(mc._1.getCf1t - snap2(id).getCf1t) 166 | mc._1.setN(mc._1.getN - snap2(id).getN) 167 | mc._1.setIds(mc._1.getIds.toSet.diff(snap2(id).getIds.toSet).toArray) 168 | } 169 | mc._1 170 | }else mc._1 171 | 172 | } 173 | } 174 | catch{ 175 | case ex: IOException => println("Exception while reading files " + ex) 176 | null 177 | } 178 | 179 | } 180 | 181 | /** 182 | * Method that returns the centrois of the microclusters. 183 | * 184 | * @param mcs: array of microclusters 185 | * @return Array[Vector]: computed array of centroids 186 | **/ 187 | 188 | def getCentersFromMC(mcs: Array[MicroCluster]): Array[Vector[Double]] = { 189 | mcs.filter(_.getN > 0).map(mc => mc.getCf1x :/ mc.getN.toDouble) 190 | } 191 | 192 | /** 193 | * Method that returns the weights of the microclusters from the number of points. 194 | * 195 | * @param mcs: array of microclusters 196 | * @return Array[Double]: computed array of weights 197 | **/ 198 | 199 | def getWeightsFromMC(mcs: Array[MicroCluster]): Array[Double] = { 200 | var arr: Array[Double] = mcs.map(_.getN.toDouble).filter(_ > 0) 201 | val sum: Double = arr.sum 202 | arr.map(value => value/sum) 203 | } 204 | 205 | /** 206 | * Method that returns a computed KMeansModel. It runs a modified version 207 | * of the KMeans algorithm in Spark from sampling the microclusters given 208 | * its weights. 
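 *
 * A hedged call sketch (the directory, times and sizes below are illustrative
 * assumptions, not values mandated by the API):
 * {{{
 *   val mcs   = getMCsFromSnapshots("snaps", tc = 100L, h = 10L)
 *   val model = fakeKMeans(sc, k = 5, numPoints = 2000, mcs = mcs)
 *   model.clusterCenters.foreach(println)
 * }}}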
209 | * 210 | * @param sc: spark context where KMeans will run 211 | * @param k: number of clusters 212 | * @param mcs: array of microclusters 213 | * @return org.apache.spark.mllib.clustering.KMeansModel: computed KMeansModel 214 | **/ 215 | 216 | def fakeKMeans(sc: SparkContext,k: Int, numPoints: Int, mcs: Array[MicroCluster]): org.apache.spark.mllib.clustering.KMeansModel ={ 217 | 218 | val kmeans = new KMeans() 219 | var centers = getCentersFromMC(mcs).map(v => org.apache.spark.mllib.linalg.Vectors.dense(v.toArray)) 220 | val weights = getWeightsFromMC(mcs) 221 | val map = (centers zip weights).toMap 222 | val points = Array.fill(numPoints)(sample(map)) 223 | 224 | 225 | kmeans.setMaxIterations(20) 226 | kmeans.setK(k) 227 | kmeans.setInitialModel(new org.apache.spark.mllib.clustering.KMeansModel(Array.fill(k)(sample(map)))) 228 | val trainingSet = sc.parallelize(points) 229 | val clusters = kmeans.run(trainingSet) 230 | trainingSet.unpersist(blocking = false) 231 | clusters 232 | 233 | } 234 | 235 | /** 236 | * Method that allows to run the online process from this class. 237 | * 238 | * @param data: data that comes from the stream 239 | * 240 | **/ 241 | 242 | def startOnline(data: DStream[breeze.linalg.Vector[Double]]): Unit ={ 243 | model.run(data) 244 | } 245 | 246 | 247 | } 248 | -------------------------------------------------------------------------------- /src/main/scala/com/backhoff/clustream/CluStreamOnline.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | /** 4 | * Created by omar on 9/25/15. 5 | */ 6 | 7 | import breeze.linalg._ 8 | import org.apache.spark.broadcast.Broadcast 9 | import org.apache.spark.streaming.dstream.DStream 10 | import org.apache.spark.rdd.RDD 11 | import org.apache.spark.annotation.Experimental 12 | import org.apache.spark.mllib.clustering.{StreamingKMeans, KMeans} 13 | import breeze.stats.distributions.Gaussian 14 | 15 | 16 | /** 17 | * CluStreamOnline is a class that contains all the necessary 18 | * procedures to initialize and maintain the microclusters 19 | * required by the CluStream method. This approach is adapted 20 | * to work with batches of data to match the way Spark Streaming 21 | * works; meaning that every batch of data is considered to have 22 | * to have the same time stamp. 23 | * 24 | * @param q : the number of microclusters to use. Normally 10 * k is a good choice, 25 | * where k is the number of macro clusters 26 | * @param numDimensions : this sets the number of attributes of the data 27 | * @param minInitPoints : minimum number of points to use for the initialization 28 | * of the microclusters. 
If set to 0 then initRand is used 29 | * insted of initKmeans 30 | **/ 31 | 32 | @Experimental 33 | class CluStreamOnline( 34 | val q: Int, 35 | val numDimensions: Int, 36 | val minInitPoints: Int) 37 | extends Serializable { 38 | 39 | @transient lazy val log = org.apache.log4j.LogManager.getLogger("myLogger") 40 | 41 | 42 | /** 43 | * Easy timer function for blocks 44 | **/ 45 | 46 | def timer[R](block: => R): R = { 47 | val t0 = System.nanoTime() 48 | val result = block // call-by-name 49 | val t1 = System.nanoTime() 50 | log.warn(s"Elapsed time: " + (t1 - t0) / 1000000 + "ms") 51 | result 52 | } 53 | 54 | private var mLastPoints = 500 55 | private var delta = 20 56 | private var tFactor = 2.0 57 | private var recursiveOutliersRMSDCheck = true 58 | 59 | private var time: Long = 0L 60 | private var N: Long = 0L 61 | private var currentN: Long = 0L 62 | 63 | private var microClusters: Array[MicroCluster] = Array.fill(q)(new MicroCluster(Vector.fill[Double](numDimensions)(0.0), Vector.fill[Double](numDimensions)(0.0), 0L, 0L, 0L)) 64 | private var mcInfo: Array[(MicroClusterInfo, Int)] = null 65 | 66 | private var broadcastQ: Broadcast[Int] = null 67 | private var broadcastMCInfo: Broadcast[Array[(MicroClusterInfo, Int)]] = null 68 | 69 | var initialized = false 70 | 71 | private var useNormalKMeans = false 72 | private var strKmeans: StreamingKMeans = null 73 | 74 | 75 | 76 | private var initArr: Array[breeze.linalg.Vector[Double]] = Array() 77 | 78 | /** 79 | * Random initialization of the q microclusters 80 | * 81 | * @param rdd : rdd in use from the incoming DStream 82 | **/ 83 | 84 | private def initRand(rdd: RDD[breeze.linalg.Vector[Double]]): Unit = { 85 | mcInfo = Array.fill(q)(new MicroClusterInfo(Vector.fill[Double](numDimensions)(rand()), 0.0, 0L)) zip (0 until q) 86 | 87 | val assignations = assignToMicroCluster(rdd, mcInfo) 88 | updateMicroClusters(assignations) 89 | var i = 0 90 | for (mc <- microClusters) { 91 | mcInfo(i) = (mcInfo(i)._1, mc.getIds(0)) 92 | if (mc.getN > 0) mcInfo(i)._1.setCentroid(mc.cf1x :/ mc.n.toDouble) 93 | mcInfo(i)._1.setN(mc.getN) 94 | if (mcInfo(i)._1.n > 1) mcInfo(i)._1.setRmsd(scala.math.sqrt(sum(mc.cf2x) / mc.n.toDouble - sum(mc.cf1x.map(a => a * a)) / (mc.n * mc.n.toDouble))) 95 | i += 1 96 | } 97 | for (mc <- mcInfo) { 98 | if (mc._1.n == 1) 99 | mc._1.setRmsd(distanceNearestMC(mc._1.centroid, mcInfo)) 100 | } 101 | 102 | broadcastMCInfo = rdd.context.broadcast(mcInfo) 103 | initialized = true 104 | } 105 | 106 | /** 107 | * Initialization of the q microclusters using the K-Means algorithm 108 | * 109 | * @param rdd : rdd in use from the incoming DStream 110 | **/ 111 | 112 | private def initKmeans(rdd: RDD[breeze.linalg.Vector[Double]]): Unit = { 113 | initArr = initArr ++ rdd.collect 114 | if (initArr.length >= minInitPoints) { 115 | val tempRDD = rdd.context.parallelize(initArr) 116 | val trainingSet = tempRDD.map(v => org.apache.spark.mllib.linalg.Vectors.dense(v.toArray)) 117 | val clusters = KMeans.train(trainingSet, q, 10) 118 | 119 | mcInfo = Array.fill(q)(new MicroClusterInfo(Vector.fill[Double](numDimensions)(0), 0.0, 0L)) zip (0 until q) 120 | for (i <- clusters.clusterCenters.indices) mcInfo(i)._1.setCentroid(DenseVector(clusters.clusterCenters(i).toArray)) 121 | 122 | val assignations = assignToMicroCluster(tempRDD, mcInfo) 123 | updateMicroClusters(assignations) 124 | 125 | var i = 0 126 | for (mc <- microClusters) { 127 | mcInfo(i) = (mcInfo(i)._1, mc.getIds(0)) 128 | if (mc.getN > 0) mcInfo(i)._1.setCentroid(mc.cf1x :/ 
mc.n.toDouble) 129 | mcInfo(i)._1.setN(mc.getN) 130 | if (mcInfo(i)._1.n > 1) mcInfo(i)._1.setRmsd(scala.math.sqrt(sum(mc.cf2x) / mc.n.toDouble - sum(mc.cf1x.map(a => a * a)) / (mc.n * mc.n.toDouble))) 131 | i += 1 132 | } 133 | for (mc <- mcInfo) { 134 | if (mc._1.n == 1) 135 | mc._1.setRmsd(distanceNearestMC(mc._1.centroid, mcInfo)) 136 | } 137 | 138 | broadcastMCInfo = rdd.context.broadcast(mcInfo) 139 | 140 | initialized = true 141 | } 142 | } 143 | 144 | private def initStreamingKmeans(rdd: RDD[breeze.linalg.Vector[Double]]): Unit = { 145 | 146 | if(strKmeans == null) strKmeans = new StreamingKMeans().setK(q).setRandomCenters(numDimensions, 0.0) 147 | val trainingSet = rdd.map(v => org.apache.spark.mllib.linalg.Vectors.dense(v.toArray)) 148 | 149 | val clusters = strKmeans.latestModel().update(trainingSet,1.0, "batches") 150 | if(getTotalPoints >= minInitPoints){ 151 | 152 | mcInfo = Array.fill(q)(new MicroClusterInfo(Vector.fill[Double](numDimensions)(0), 0.0, 0L)) zip (0 until q) 153 | for (i <- clusters.clusterCenters.indices) mcInfo(i)._1.setCentroid(DenseVector(clusters.clusterCenters(i).toArray)) 154 | 155 | val assignations = assignToMicroCluster(rdd, mcInfo) 156 | updateMicroClusters(assignations) 157 | 158 | var i = 0 159 | for (mc <- microClusters) { 160 | mcInfo(i) = (mcInfo(i)._1, mc.getIds(0)) 161 | if (mc.getN > 0) mcInfo(i)._1.setCentroid(mc.cf1x :/ mc.n.toDouble) 162 | mcInfo(i)._1.setN(mc.getN) 163 | if (mcInfo(i)._1.n > 1) mcInfo(i)._1.setRmsd(scala.math.sqrt(sum(mc.cf2x) / mc.n.toDouble - sum(mc.cf1x.map(a => a * a)) / (mc.n * mc.n.toDouble))) 164 | i += 1 165 | } 166 | for (mc <- mcInfo) { 167 | if (mc._1.n == 1) 168 | mc._1.setRmsd(distanceNearestMC(mc._1.centroid, mcInfo)) 169 | } 170 | 171 | broadcastMCInfo = rdd.context.broadcast(mcInfo) 172 | initialized = true 173 | } 174 | 175 | } 176 | 177 | /** 178 | * Main method that runs the entire algorithm. This is called every time the 179 | * Streaming context handles a batch. 180 | * 181 | * @param data : data coming from the stream. Each entry has to be parsed as 182 | * breeze.linalg.Vector[Double] 183 | **/ 184 | 185 | def run(data: DStream[breeze.linalg.Vector[Double]]): Unit = { 186 | data.foreachRDD { (rdd, timeS) => 187 | currentN = rdd.count() 188 | if (currentN != 0) { 189 | if (initialized) { 190 | 191 | val assignations = assignToMicroCluster(rdd) 192 | updateMicroClusters(assignations) 193 | 194 | var i = 0 195 | for (mc <- microClusters) { 196 | mcInfo(i) = (mcInfo(i)._1, mc.getIds(0)) 197 | if (mc.getN > 0) mcInfo(i)._1.setCentroid(mc.cf1x :/ mc.n.toDouble) 198 | mcInfo(i)._1.setN(mc.getN) 199 | if (mcInfo(i)._1.n > 1) mcInfo(i)._1.setRmsd(scala.math.sqrt(sum(mc.cf2x) / mc.n.toDouble - sum(mc.cf1x.map(a => a * a)) / (mc.n * mc.n.toDouble))) 200 | i += 1 201 | } 202 | for (mc <- mcInfo) { 203 | if (mc._1.n == 1) 204 | mc._1.setRmsd(distanceNearestMC(mc._1.centroid, mcInfo)) 205 | } 206 | 207 | broadcastMCInfo = rdd.context.broadcast(mcInfo) 208 | } else { 209 | minInitPoints match { 210 | case 0 => initRand(rdd) 211 | case _ => if(useNormalKMeans) initKmeans(rdd) else initStreamingKmeans(rdd) 212 | } 213 | } 214 | } 215 | this.time += 1 216 | this.N += currentN 217 | } 218 | } 219 | 220 | /** 221 | * Method that returns the current array of microclusters. 
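 * A possible use, sketched under the same convention as the offline
 * CluStream.getCentersFromMC helper (centroid = CF1X / N):
 * {{{
 *   val centers = getMicroClusters.filter(_.getN > 0)
 *                                 .map(mc => mc.getCf1x :/ mc.getN.toDouble)
 * }}}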
222 | * 223 | * @return Array[MicroCluster]: current array of microclusters 224 | **/ 225 | 226 | def getMicroClusters: Array[MicroCluster] = { 227 | this.microClusters 228 | } 229 | 230 | /** 231 | * Method that returns current time clock unit in the stream. 232 | * 233 | * @return Long: current time in stream 234 | **/ 235 | 236 | def getCurrentTime: Long = { 237 | this.time 238 | } 239 | 240 | /** 241 | * Method that returns the total number of points processed so far in 242 | * the stream. 243 | * 244 | * @return Long: total number of points processed 245 | **/ 246 | 247 | def getTotalPoints: Long = { 248 | this.N 249 | } 250 | 251 | /** 252 | * Method that sets if the newly created microclusters due to 253 | * outliers are able to absorb other outlier points. This is done recursively 254 | * for all new microclusters, thus disabling these increases slightly the 255 | * speed of the algorithm but also allows to create overlaping microclusters 256 | * at this stage. 257 | * 258 | * @param ans : true or false 259 | * @return Class: current class 260 | **/ 261 | 262 | def setRecursiveOutliersRMSDCheck(ans: Boolean): this.type = { 263 | this.recursiveOutliersRMSDCheck = ans 264 | this 265 | } 266 | 267 | /** 268 | * Changes the K-Means method to use from StreamingKmeans to 269 | * normal K-Means for the initialization. StreamingKMeans is much 270 | * faster but in some cases normal K-Means could deliver more 271 | * accurate initialization. 272 | * 273 | * @param ans : true or false 274 | * @return Class: current class 275 | **/ 276 | 277 | def setInitNormalKMeans(ans: Boolean): this.type = { 278 | this.useNormalKMeans = ans 279 | this 280 | } 281 | 282 | 283 | /** 284 | * Method that sets the m last number of points in a microcluster 285 | * used to approximate its timestamp (recency value). 286 | * 287 | * @param m : m last points 288 | * @return Class: current class 289 | **/ 290 | 291 | def setM(m: Int): this.type = { 292 | this.mLastPoints = m 293 | this 294 | } 295 | 296 | /** 297 | * Method that sets the threshold d, used to determine whether a 298 | * microcluster is safe to delete or not (Tc - d < recency). 299 | * 300 | * @param d : threshold 301 | * @return Class: current class 302 | **/ 303 | 304 | def setDelta(d: Int): this.type = { 305 | this.delta = d 306 | this 307 | } 308 | 309 | /** 310 | * Method that sets the factor t of RMSDs. A point whose distance to 311 | * its nearest microcluster is greater than t*RMSD is considered an 312 | * outlier. 313 | * 314 | * @param t : t factor 315 | * @return Class: current class 316 | **/ 317 | 318 | def setTFactor(t: Double): this.type = { 319 | this.tFactor = t 320 | this 321 | } 322 | 323 | /** 324 | * Computes the distance of a point to its nearest microcluster. 325 | * 326 | * @param vec : the point 327 | * @param mcs : Array of microcluster information 328 | * @return Double: the distance 329 | **/ 330 | 331 | private def distanceNearestMC(vec: breeze.linalg.Vector[Double], mcs: Array[(MicroClusterInfo, Int)]): Double = { 332 | 333 | var minDist = Double.PositiveInfinity 334 | var i = 0 335 | for (mc <- mcs) { 336 | val dist = squaredDistance(vec, mc._1.centroid) 337 | if (dist != 0.0 && dist < minDist) minDist = dist 338 | i += 1 339 | } 340 | scala.math.sqrt(minDist) 341 | } 342 | 343 | /** 344 | * Computes the squared distance of two microclusters. 
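 * The distance is measured between centroids (CF1X / N); with `mcA` and `mcB`
 * standing for the microclusters at the two indices, this is effectively
 * {{{
 *   squaredDistance(mcA.getCf1x :/ mcA.getN.toDouble, mcB.getCf1x :/ mcB.getN.toDouble)
 * }}}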
345 | * 346 | * @param idx1 : local index of one microcluster in the array 347 | * @param idx2 : local index of another microcluster in the array 348 | * @return Double: the squared distance 349 | **/ 350 | 351 | private def squaredDistTwoMCArrIdx(idx1: Int, idx2: Int): Double = { 352 | squaredDistance(microClusters(idx1).getCf1x :/ microClusters(idx1).getN.toDouble, microClusters(idx2).getCf1x :/ microClusters(idx2).getN.toDouble) 353 | } 354 | 355 | /** 356 | * Computes the squared distance of one microcluster to a point. 357 | * 358 | * @param idx1 : local index of the microcluster in the array 359 | * @param point : the point 360 | * @return Double: the squared distance 361 | **/ 362 | 363 | private def squaredDistPointToMCArrIdx(idx1: Int, point: Vector[Double]): Double = { 364 | squaredDistance(microClusters(idx1).getCf1x :/ microClusters(idx1).getN.toDouble, point) 365 | } 366 | 367 | /** 368 | * Returns the local index of a microcluster for a given ID 369 | * 370 | * @param idx0 : ID of the microcluster 371 | * @return Int: local index of the microcluster 372 | **/ 373 | 374 | private def getArrIdxMC(idx0: Int): Int = { 375 | var id = -1 376 | var i = 0 377 | for (mc <- microClusters) { 378 | if (mc.getIds(0) == idx0) id = i 379 | i += 1 380 | } 381 | id 382 | } 383 | 384 | /** 385 | * Merges two microclusters adding all its features. 386 | * 387 | * @param idx1 : local index of one microcluster in the array 388 | * @param idx2 : local index of one microcluster in the array 389 | * 390 | **/ 391 | 392 | private def mergeMicroClusters(idx1: Int, idx2: Int): Unit = { 393 | 394 | microClusters(idx1).setCf1x(microClusters(idx1).getCf1x :+ microClusters(idx2).getCf1x) 395 | microClusters(idx1).setCf2x(microClusters(idx1).getCf2x :+ microClusters(idx2).getCf2x) 396 | microClusters(idx1).setCf1t(microClusters(idx1).getCf1t + microClusters(idx2).getCf1t) 397 | microClusters(idx1).setCf2t(microClusters(idx1).getCf2t + microClusters(idx2).getCf2t) 398 | microClusters(idx1).setN(microClusters(idx1).getN + microClusters(idx2).getN) 399 | microClusters(idx1).setIds(microClusters(idx1).getIds ++ microClusters(idx2).getIds) 400 | 401 | mcInfo(idx1)._1.setCentroid(microClusters(idx1).getCf1x :/ microClusters(idx1).getN.toDouble) 402 | mcInfo(idx1)._1.setN(microClusters(idx1).getN) 403 | mcInfo(idx1)._1.setRmsd(scala.math.sqrt(sum(microClusters(idx1).cf2x) / microClusters(idx1).n.toDouble - sum(microClusters(idx1).cf1x.map(a => a * a)) / (microClusters(idx1).n * microClusters(idx1).n.toDouble))) 404 | 405 | } 406 | 407 | /** 408 | * Adds one point to a microcluster adding all its features. 
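 * The cluster feature statistics are additive, so conceptually the update
 * performed below is (with `time` the current clock unit):
 * {{{
 *   cf1x += point;   cf2x += point :* point
 *   cf1t += time;    cf2t += time * time
 *   n    += 1
 * }}}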
409 | * 410 | * @param idx1 : local index of the microcluster in the array 411 | * @param point : the point 412 | * 413 | **/ 414 | 415 | private def addPointMicroClusters(idx1: Int, point: Vector[Double]): Unit = { 416 | 417 | microClusters(idx1).setCf1x(microClusters(idx1).getCf1x :+ point) 418 | microClusters(idx1).setCf2x(microClusters(idx1).getCf2x :+ (point :* point)) 419 | microClusters(idx1).setCf1t(microClusters(idx1).getCf1t + this.time) 420 | microClusters(idx1).setCf2t(microClusters(idx1).getCf2t + (this.time * this.time)) 421 | microClusters(idx1).setN(microClusters(idx1).getN + 1) 422 | 423 | mcInfo(idx1)._1.setCentroid(microClusters(idx1).getCf1x :/ microClusters(idx1).getN.toDouble) 424 | mcInfo(idx1)._1.setN(microClusters(idx1).getN) 425 | mcInfo(idx1)._1.setRmsd(scala.math.sqrt(sum(microClusters(idx1).cf2x) / microClusters(idx1).n.toDouble - sum(microClusters(idx1).cf1x.map(a => a * a)) / (microClusters(idx1).n * microClusters(idx1).n.toDouble))) 426 | 427 | } 428 | 429 | /** 430 | * Deletes one microcluster and replaces it locally with a new point. 431 | * 432 | * @param idx : local index of the microcluster in the array 433 | * @param point : the point 434 | * 435 | **/ 436 | 437 | private def replaceMicroCluster(idx: Int, point: Vector[Double]): Unit = { 438 | microClusters(idx) = new MicroCluster(point :* point, point, this.time * this.time, this.time, 1L) 439 | mcInfo(idx)._1.setCentroid(point) 440 | mcInfo(idx)._1.setN(1L) 441 | mcInfo(idx)._1.setRmsd(distanceNearestMC(mcInfo(idx)._1.centroid, mcInfo)) 442 | } 443 | 444 | /** 445 | * Finds the nearest microcluster for all entries of an RDD. 446 | * 447 | * @param rdd : RDD with points 448 | * @param mcInfo : Array containing microclusters information 449 | * @return RDD[(Int, Vector[Double])]: RDD that contains a tuple of the ID of the 450 | * nearest microcluster and the point itself. 451 | * 452 | **/ 453 | 454 | private def assignToMicroCluster(rdd: RDD[Vector[Double]], mcInfo: Array[(MicroClusterInfo, Int)]): RDD[(Int, Vector[Double])] = { 455 | rdd.map { a => 456 | var minDist = Double.PositiveInfinity 457 | var minIndex = Int.MaxValue 458 | var i = 0 459 | for (mc <- mcInfo) { 460 | val dist = squaredDistance(a, mc._1.centroid) 461 | if (dist < minDist) { 462 | minDist = dist 463 | minIndex = mc._2 464 | } 465 | i += 1 466 | } 467 | (minIndex, a) 468 | } 469 | } 470 | 471 | /** 472 | * Finds the nearest microcluster for all entries of an RDD, uses broadcast variable. 473 | * 474 | * @param rdd : RDD with points 475 | * @return RDD[(Int, Vector[Double])]: RDD that contains a tuple of the ID of the 476 | * nearest microcluster and the point itself. 477 | * 478 | **/ 479 | private def assignToMicroCluster(rdd: RDD[Vector[Double]]) = { 480 | rdd.map { a => 481 | var minDist = Double.PositiveInfinity 482 | var minIndex = Int.MaxValue 483 | var i = 0 484 | for (mc <- broadcastMCInfo.value) { 485 | val dist = squaredDistance(a, mc._1.centroid) 486 | if (dist < minDist) { 487 | minDist = dist 488 | minIndex = mc._2 489 | } 490 | i += 1 491 | } 492 | (minIndex, a) 493 | } 494 | } 495 | 496 | /** 497 | * Performs all the operations to maintain the microclusters. Assign points that 498 | * belong to a microclusters, detects outliers and deals with them. 499 | * 500 | * @param assignations : RDD that contains a tuple of the ID of the 501 | * nearest microcluster and the point itself. 
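 * The inlier/outlier split below keeps a point with its nearest microcluster
 * only while it lies within tFactor * RMSD of that microcluster, i.e. per point:
 * {{{
 *   if (nearDistance <= tFactor * nearMCInfo.rmsd) (1, a)   // absorbed by a microcluster
 *   else (0, a)                                             // treated as an outlier
 * }}}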
502 | * 503 | **/ 504 | 505 | private def updateMicroClusters(assignations: RDD[(Int, Vector[Double])]): Unit = { 506 | 507 | var dataInAndOut: RDD[(Int, (Int, Vector[Double]))] = null 508 | var dataIn: RDD[(Int, Vector[Double])] = null 509 | var dataOut: RDD[(Int, Vector[Double])] = null 510 | 511 | // Calculate RMSD 512 | if (initialized) { 513 | dataInAndOut = assignations.map { a => 514 | val nearMCInfo = broadcastMCInfo.value.find(id => id._2 == a._1).get._1 515 | val nearDistance = scala.math.sqrt(squaredDistance(a._2, nearMCInfo.centroid)) 516 | 517 | if (nearDistance <= tFactor * nearMCInfo.rmsd) (1, a) 518 | else (0, a) 519 | } 520 | } 521 | 522 | // Separate data 523 | if (dataInAndOut != null) { 524 | dataIn = dataInAndOut.filter(_._1 == 1).map(a => a._2) 525 | dataOut = dataInAndOut.filter(_._1 == 0).map(a => a._2) 526 | } else dataIn = assignations 527 | 528 | // Compute sums, sums of squares and count points... all by key 529 | log.warn(s"Processing points") 530 | 531 | // sumsAndSumsSquares -> (key: Int, (sum: Vector[Double], sumSquares: Vector[Double], count: Long ) ) 532 | val sumsAndSumsSquares = timer { 533 | val aggregateFuntion = (aa: (Vector[Double], Vector[Double], Long), bb: (Vector[Double], Vector[Double], Long)) => (aa._1 :+ bb._1, aa._2 :+ bb._2, aa._3 + bb._3) 534 | dataIn.mapValues(a => (a, a :* a, 1L)).reduceByKey(aggregateFuntion).collect() 535 | } 536 | 537 | 538 | var totalIn = 0L 539 | 540 | for (mc <- microClusters) { 541 | for (ss <- sumsAndSumsSquares) if (mc.getIds(0) == ss._1) { 542 | mc.setCf1x(mc.cf1x :+ ss._2._1) 543 | mc.setCf2x(mc.cf2x :+ ss._2._2) 544 | mc.setN(mc.n + ss._2._3) 545 | mc.setCf1t(mc.cf1t + ss._2._3 * this.time) 546 | mc.setCf2t(mc.cf2t + ss._2._3 * (this.time * this.time)) 547 | totalIn += ss._2._3 548 | } 549 | } 550 | 551 | 552 | 553 | 554 | log.warn(s"Processing " + (currentN - totalIn) + " outliers") 555 | timer { 556 | if (dataOut != null && currentN - totalIn != 0) { 557 | var mTimeStamp: Double = 0.0 558 | val recencyThreshold = this.time - delta 559 | var safeDeleteMC: Array[Int] = Array() 560 | var keepOrMergeMC: Array[Int] = Array() 561 | var i = 0 562 | 563 | 564 | for (mc <- microClusters) { 565 | val meanTimeStamp = if (mc.getN > 0) mc.getCf1t.toDouble / mc.getN.toDouble else 0 566 | val sdTimeStamp = scala.math.sqrt(mc.getCf2t.toDouble / mc.getN.toDouble - meanTimeStamp * meanTimeStamp) 567 | 568 | if (mc.getN < 2 * mLastPoints) mTimeStamp = meanTimeStamp 569 | else mTimeStamp = Gaussian(meanTimeStamp, sdTimeStamp).inverseCdf(1 - mLastPoints / (2 * mc.getN.toDouble)) 570 | 571 | if (mTimeStamp < recencyThreshold || mc.getN == 0) safeDeleteMC = safeDeleteMC :+ i 572 | else keepOrMergeMC = keepOrMergeMC :+ i 573 | 574 | i += 1 575 | } 576 | 577 | var j = 0 578 | var newMC: Array[Int] = Array() 579 | 580 | 581 | for (point <- dataOut.collect()) { 582 | 583 | var minDist = Double.PositiveInfinity 584 | var idMinDist = 0 585 | if (recursiveOutliersRMSDCheck) for (id <- newMC) { 586 | val dist = squaredDistPointToMCArrIdx(id, point._2) 587 | if (dist < minDist) { 588 | minDist = dist 589 | idMinDist = id 590 | } 591 | 592 | } 593 | 594 | if (scala.math.sqrt(minDist) <= tFactor * mcInfo(idMinDist)._1.rmsd) addPointMicroClusters(idMinDist, point._2) 595 | else if (safeDeleteMC.lift(j).isDefined) { 596 | replaceMicroCluster(safeDeleteMC(j), point._2) 597 | newMC = newMC :+ safeDeleteMC(j) 598 | j += 1 599 | } else { 600 | var minDist = Double.PositiveInfinity 601 | var idx1 = 0 602 | var idx2 = 0 603 | 604 | for (a <- 
keepOrMergeMC.indices) 605 | for (b <- (0 + a) until keepOrMergeMC.length) { 606 | var dist = Double.PositiveInfinity 607 | if (keepOrMergeMC(a) != keepOrMergeMC(b)) dist = squaredDistance(mcInfo(keepOrMergeMC(a))._1.centroid, mcInfo(keepOrMergeMC(b))._1.centroid) 608 | if (dist < minDist) { 609 | minDist = dist 610 | idx1 = keepOrMergeMC(a) 611 | idx2 = keepOrMergeMC(b) 612 | } 613 | } 614 | mergeMicroClusters(idx1, idx2) 615 | replaceMicroCluster(idx2, point._2) 616 | newMC = newMC :+ idx2 617 | } 618 | 619 | } 620 | 621 | } 622 | } 623 | } 624 | 625 | // END OF MODEL 626 | } 627 | 628 | 629 | /** 630 | * Object complementing the MicroCluster Class to allow it to create 631 | * new IDs whenever a new instance of it is created. 632 | * 633 | **/ 634 | 635 | private object MicroCluster extends Serializable { 636 | private var current = -1 637 | 638 | private def inc = { 639 | current += 1 640 | current 641 | } 642 | } 643 | 644 | /** 645 | * Packs the microcluster object and its features in one single class 646 | * 647 | **/ 648 | 649 | protected class MicroCluster( 650 | var cf2x: breeze.linalg.Vector[Double], 651 | var cf1x: breeze.linalg.Vector[Double], 652 | var cf2t: Long, 653 | var cf1t: Long, 654 | var n: Long, 655 | var ids: Array[Int]) extends Serializable { 656 | 657 | def this(cf2x: breeze.linalg.Vector[Double], cf1x: breeze.linalg.Vector[Double], cf2t: Long, cf1t: Long, n: Long) = this(cf2x, cf1x, cf2t, cf1t, n, Array(MicroCluster.inc)) 658 | 659 | def setCf2x(cf2x: breeze.linalg.Vector[Double]): Unit = { 660 | this.cf2x = cf2x 661 | } 662 | 663 | def getCf2x: breeze.linalg.Vector[Double] = { 664 | this.cf2x 665 | } 666 | 667 | def setCf1x(cf1x: breeze.linalg.Vector[Double]): Unit = { 668 | this.cf1x = cf1x 669 | } 670 | 671 | def getCf1x: breeze.linalg.Vector[Double] = { 672 | this.cf1x 673 | } 674 | 675 | def setCf2t(cf2t: Long): Unit = { 676 | this.cf2t = cf2t 677 | } 678 | 679 | def getCf2t: Long = { 680 | this.cf2t 681 | } 682 | 683 | def setCf1t(cf1t: Long): Unit = { 684 | this.cf1t = cf1t 685 | } 686 | 687 | def getCf1t: Long = { 688 | this.cf1t 689 | } 690 | 691 | def setN(n: Long): Unit = { 692 | this.n = n 693 | } 694 | 695 | def getN: Long = { 696 | this.n 697 | } 698 | 699 | def setIds(ids: Array[Int]): Unit = { 700 | this.ids = ids 701 | } 702 | 703 | def getIds: Array[Int] = { 704 | this.ids 705 | } 706 | } 707 | 708 | 709 | /** 710 | * Packs some microcluster information to reduce the amount of data to be 711 | * broadcasted. 
712 | * 713 | **/ 714 | 715 | private class MicroClusterInfo( 716 | var centroid: breeze.linalg.Vector[Double], 717 | var rmsd: Double, 718 | var n: Long) extends Serializable { 719 | 720 | def setCentroid(centroid: Vector[Double]): Unit = { 721 | this.centroid = centroid 722 | } 723 | 724 | def setRmsd(rmsd: Double): Unit = { 725 | this.rmsd = rmsd 726 | } 727 | 728 | def setN(n: Long): Unit = { 729 | this.n = n 730 | } 731 | } 732 | 733 | -------------------------------------------------------------------------------- /src/main/scala/com/backhoff/clustream/Tools.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | import java.io._ 4 | import java.nio.file.{Paths, Files} 5 | 6 | object Tools { 7 | 8 | def convertMCsBinariesToText(dirIn: String = "", dirOut: String = "", limit: Int): Unit = { 9 | print("processing files: ") 10 | for(i <- 0 to limit) { 11 | if(Files.exists(Paths.get(dirIn + "/" + i))) 12 | try { 13 | val file = new ObjectInputStream(new FileInputStream(dirIn + "/" + i)) 14 | val mc = file.readObject().asInstanceOf[Array[MicroCluster]] 15 | var text: Array[String] = null 16 | file.close() 17 | if(mc != null) { 18 | text = mc.map { m => 19 | "=========================================================== \n" + 20 | "MicroCluster IDs = " + m.getIds.mkString("[", ",", "]") + "\n" + 21 | "CF2X = " + m.getCf2x.toArray.mkString("[", ",", "]") + "\n" + 22 | "CF1X = " + m.getCf1x.toArray.mkString("[", ",", "]") + "\n" + 23 | "CF2T = " + m.getCf2t.toString + "\n" + 24 | "CF1T = " + m.getCf1t.toString + "\n" + 25 | "N = " + m.getN.toString + "\n" 26 | } 27 | 28 | val pw = new PrintWriter(new File(dirOut + "/" + i)) 29 | pw.write(text.mkString("","","")) 30 | pw.close 31 | print(i + " ") 32 | } 33 | 34 | } 35 | catch { 36 | case ex: IOException => println("Exception while reading files " + ex) 37 | null 38 | } 39 | } 40 | println() 41 | } 42 | 43 | } 44 | 45 | -------------------------------------------------------------------------------- /src/test/scala/com/backhoff/clustream/KmeansTest.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | /** 4 | * Created by omar on 10/7/15. 
5 | */ 6 | 7 | import org.apache.spark.{SparkContext, SparkConf} 8 | import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} 9 | import org.apache.spark.mllib.linalg.Vectors 10 | 11 | object KmeansTest { 12 | def main(args: Array[String]) { 13 | val conf = new SparkConf().setAppName("K-means test").setMaster("local[*]") 14 | val sc = new SparkContext(conf) 15 | sc.setLogLevel("ERROR") 16 | 17 | val data = sc.textFile("/home/omar/stream/streamMod") 18 | val parsedData = data.map(s => Vectors.dense(s.split(' ').dropRight(1).map(_.toDouble))).cache() 19 | 20 | // Cluster the data into two classes using KMeans 21 | val numClusters = 2 22 | val numIterations = 20 23 | val clusters = KMeans.train(parsedData, numClusters, numIterations) 24 | 25 | // Evaluate clustering by computing Within Set Sum of Squared Errors 26 | val WSSSE = clusters.computeCost(parsedData) 27 | println("Within Set Sum of Squared Errors = " + WSSSE) 28 | 29 | // Save and load model 30 | // clusters.save(sc, "/home/omar/Desktop/model") 31 | // val sameModel = KMeansModel.load(sc, "/home/omar/Desktop/model") 32 | clusters.clusterCenters.foreach(println) 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/test/scala/com/backhoff/clustream/SimpleApp.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | /** 4 | * Created by omar on 9/14/15. 5 | */ 6 | /* SimpleApp.scala */ 7 | import org.apache.spark.SparkContext 8 | import org.apache.spark.SparkConf 9 | 10 | object SimpleApp { 11 | def timer[R](block: => R): R = { 12 | val t0 = System.nanoTime() 13 | val result = block // call-by-name 14 | val t1 = System.nanoTime() 15 | println("Elapsed time: " + (t1 - t0) / 1000000 + "ms") 16 | result 17 | } 18 | def main(args: Array[String]) { 19 | // val logFile = "/home/omar/Libs/spark-1.5.0/README.md" // Should be some file on your system 20 | val conf = new SparkConf().setAppName("Simple Application").setMaster("local[*]") 21 | val sc = new SparkContext(conf) 22 | sc.setLogLevel("ERROR") 23 | // val logData = sc.textFile(logFile, 2).cache() 24 | // val numAs = logData.filter(line => line.contains("a")).count() 25 | // val numBs = logData.filter(line => line.contains("b")).count() 26 | // println("Lines with a: %s, Lines with b: %s".format(numAs, numBs)) 27 | 28 | val h = 1 29 | val t1 = 6 30 | val t2 = 21 31 | val t3 = 81 32 | val t4 = 161 33 | 34 | // Tools.convertMCsBinariesToText("snaps", "snaps/text", 100) 35 | val clustream = new CluStream(null) 36 | val snap1 = timer{clustream.getMCsFromSnapshots("snaps",t1,h)} 37 | val snap2 = timer{clustream.getMCsFromSnapshots("snaps",t2,h)} 38 | val snap3 = timer{clustream.getMCsFromSnapshots("snaps",t3,h)} 39 | val snap4 = timer{clustream.getMCsFromSnapshots("snaps",t4,h)} 40 | 41 | println(snap1.map(a => a.getN).mkString("[",",","]")) 42 | println("mics points = " + snap1.map(_.getN).sum) 43 | println(snap2.map(a => a.getN).mkString("[",",","]")) 44 | println("mics points = " + snap2.map(_.getN).sum) 45 | println(snap3.map(a => a.getN).mkString("[",",","]")) 46 | println("mics points = " + snap3.map(_.getN).sum) 47 | println(snap4.map(a => a.getN).mkString("[",",","]")) 48 | println("mics points = " + snap4.map(_.getN).sum) 49 | 50 | val clusters1 = timer{clustream.fakeKMeans(sc,5,5000,snap1)} 51 | if(clusters1 != null) { 52 | println("MacroClusters Ceneters") 53 | println("snapshots " + clustream.getSnapShots("snaps",t1,h)) 54 | 
clusters1.clusterCenters.foreach(println) 55 | clusters1.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/1case/results/clustream2000/centers1").createFile().appendAll(c.toArray.mkString("",",","") +"\n" )) 56 | } 57 | val clusters2 = timer{clustream.fakeKMeans(sc,5,5000,snap2)} 58 | if(clusters2 != null) { 59 | println("MacroClusters Ceneters") 60 | println("snapshots " + clustream.getSnapShots("snaps",t2,h)) 61 | clusters2.clusterCenters.foreach(println) 62 | clusters2.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/1case/results/clustream2000/centers2").createFile().appendAll(c.toArray.mkString("",",","") +"\n" )) 63 | } 64 | val clusters3 = timer{clustream.fakeKMeans(sc,5,5000,snap3)} 65 | if(clusters3 != null) { 66 | println("MacroClusters Ceneters") 67 | println("snapshots " + clustream.getSnapShots("snaps",t3,h)) 68 | clusters3.clusterCenters.foreach(println) 69 | clusters3.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/1case/results/clustream2000/centers3").createFile().appendAll(c.toArray.mkString("",",","") +"\n" )) 70 | } 71 | val clusters4 = timer{clustream.fakeKMeans(sc,5,5000,snap4)} 72 | if(clusters4 != null) { 73 | println("MacroClusters Ceneters") 74 | println("snapshots " + clustream.getSnapShots("snaps",t4,h)) 75 | clusters4.clusterCenters.foreach(println) 76 | clusters4.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/1case/results/clustream2000/centers4").createFile().appendAll(c.toArray.mkString("",",","") +"\n" )) 77 | } 78 | 79 | 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/test/scala/com/backhoff/clustream/StreamDM.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.streamdm 2 | 3 | /** 4 | * Created by omar on 9/18/15. 
5 | */ 6 | 7 | import com.github.javacliparser.ClassOption 8 | import org.apache.spark.streamdm.clusterers.{Clusterer, Clustream} 9 | import org.apache.spark.streamdm.evaluation.Evaluator 10 | import org.apache.spark.streamdm.streams.{StreamWriter, StreamReader} 11 | import org.apache.spark.streamdm.tasks.Task 12 | import org.apache.spark.streaming.scheduler.{StreamingListenerBatchCompleted, StreamingListener} 13 | import org.apache.spark.streaming.{Milliseconds, StreamingContext} 14 | import org.apache.spark.{SparkContext, SparkConf} 15 | 16 | class Clus extends Task { 17 | //Task options 18 | val evaluatorOption:ClassOption = new ClassOption("evaluator", 'e', 19 | "Evaluator to use", classOf[Evaluator], "ClusteringCohesionEvaluator") 20 | val clustererOption:ClassOption = new ClassOption("learner", 'l', 21 | "Learner to use", classOf[Clustream], "Clustream") 22 | val streamReaderOption:ClassOption = new ClassOption("streamReader", 's', 23 | "Stream reader to use", classOf[StreamReader], "SocketTextStreamReader") 24 | val resultsWriterOption:ClassOption = new ClassOption("resultsWriter", 'w', 25 | "Stream writer to use", classOf[StreamWriter], "PrintStreamWriter") 26 | 27 | //Run the task 28 | def run(ssc:StreamingContext): Unit = { 29 | //Parse options and init 30 | val reader:StreamReader = this.streamReaderOption.getValue() 31 | val clusterer:Clustream = this.clustererOption.getValue() 32 | clusterer.init(reader.getExampleSpecification()) 33 | val writer:StreamWriter = this.resultsWriterOption.getValue() 34 | val evaluator:Evaluator = this.evaluatorOption.getValue() 35 | 36 | clusterer.microclusters.horizonOption.setValue(1) 37 | clusterer.initOption.setValue(2000) 38 | clusterer.kOption.setValue(5) 39 | clusterer.mcOption.setValue(50) 40 | //clusterer.repOption.setValue(10) 41 | 42 | //Parse stream and get Examples 43 | val N = new StaticVar[Long](0L) 44 | val listener = new MyListener(clusterer, N) 45 | ssc.addStreamingListener(listener) 46 | val instances = reader.getExamples(ssc) 47 | 48 | //Predict 49 | // val predPairs = learner.predict(instances) 50 | //Train 51 | clusterer.train(instances) 52 | //Assign 53 | //val clpairs = clusterer.assign(instances) 54 | 55 | //Print statistics 56 | // writer.output(evaluator.addResult(clpairs)) 57 | } 58 | } 59 | 60 | class MyListener(model: Clustream, n: StaticVar[Long]) extends StreamingListener { 61 | override def onBatchCompleted(batchCompleted:StreamingListenerBatchCompleted) { 62 | if ( batchCompleted.batchInfo.numRecords > 0) { 63 | n.value = n.value + batchCompleted.batchInfo.numRecords 64 | println("================= CENTERS ================= N = " + n.value) 65 | model.clusters.foreach(c => println(c.toString())) 66 | println(model.microclusters.horizonOption.getValue) 67 | } 68 | } 69 | } 70 | class StaticVar[T]( var value: T ) 71 | 72 | object StreamDM { 73 | def main(args: Array[String]) { 74 | val conf = new SparkConf().setAppName("Streaming K-means test").setMaster("local[*]") 75 | val sc = new SparkContext(conf) 76 | sc.setLogLevel("ERROR") 77 | val ssc = new StreamingContext(sc, Milliseconds(1000)) 78 | 79 | 80 | val numDimensions = 34 81 | val numClusters = 5 82 | val task = new Clus() 83 | task.run(ssc) 84 | ssc.start() 85 | ssc.awaitTermination() 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/test/scala/com/backhoff/clustream/StreamingKMeans.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 
| 3 | /** 4 | * Created by omar on 9/18/15. 5 | */ 6 | 7 | import org.apache.spark.streaming.dstream.DStream 8 | import org.apache.spark.streaming.scheduler.{StreamingListenerBatchCompleted, StreamingListener} 9 | import org.apache.spark.streaming.{Milliseconds, StreamingContext} 10 | import org.apache.spark.{SparkContext, SparkConf} 11 | import org.apache.spark.mllib.linalg.Vectors 12 | import org.apache.spark.mllib.linalg.Vector 13 | import org.apache.spark.mllib.regression.LabeledPoint 14 | import org.apache.spark.mllib.clustering.StreamingKMeans 15 | 16 | object StreamingKMeans { 17 | def main(args: Array[String]) { 18 | val conf = new SparkConf().setAppName("Streaming K-means test").setMaster("local[*]") 19 | val sc = new SparkContext(conf) 20 | sc.setLogLevel("ERROR") 21 | val ssc = new StreamingContext(sc, Milliseconds(1000)) 22 | // val trainingData = ssc.textFileStream("file:///home/omar/stream/train").map(_.split(" ")).map(arr => arr.dropRight(1)).map(_.mkString("[", ",", "]")).map(Vectors.parse) 23 | // val trainingData = ssc.socketTextStream("localhost",9999).map(_.split(" ")).map(arr => arr.dropRight(1)).map(_.mkString("[",",","]")).map(Vectors.parse) 24 | val trainingData = ssc.socketTextStream("localhost",9999).map(_.split(" ")).map(_.mkString("[",",","]")).map(Vectors.parse) 25 | //val testData = ssc.textFileStream("/home/omar/stream/testing").map(LabeledPoint.parse) 26 | // val testData = ssc.socketTextStream("localhost", 9998).map(LabeledPoint.parse) 27 | val numDimensions = 34 28 | val numClusters = 5 29 | val model = new StreamingKMeans() 30 | .setK(numClusters) 31 | .setHalfLife(1000, "points") 32 | //.setDecayFactor(0.0) 33 | .setRandomCenters(numDimensions, 0.0) 34 | 35 | val N = new StaticVar[Long](0L) 36 | val listener = new MyListener(model, N) 37 | ssc.addStreamingListener(listener) 38 | 39 | model.trainOn(trainingData) 40 | //model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() 41 | 42 | ssc.start() 43 | ssc.awaitTermination() 44 | } 45 | } 46 | 47 | private[clustream] class MyListener(model: StreamingKMeans, n: StaticVar[Long]) extends StreamingListener { 48 | override def onBatchCompleted(batchCompleted:StreamingListenerBatchCompleted) { 49 | if ( batchCompleted.batchInfo.numRecords > 0) { 50 | n.value = n.value + batchCompleted.batchInfo.numRecords 51 | println("================= CENTERS ================= N = " + n.value) 52 | model.latestModel().clusterCenters.foreach(println) 53 | } 54 | } 55 | } 56 | class StaticVar[T]( var value: T ) 57 | -------------------------------------------------------------------------------- /src/test/scala/com/backhoff/clustream/StreamingTests.scala: -------------------------------------------------------------------------------- 1 | package com.backhoff.clustream 2 | 3 | /** 4 | * Created by omar on 9/20/15. 
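 *
 * Hedged note (added for illustration): the socket stream below is expected to
 * carry space-separated numeric rows, which each batch parses into breeze
 * vectors, e.g. for a single line:
 * {{{
 *   DenseVector("0.1 0.2 0.3".split(" ").map(_.toDouble))  // -> DenseVector(0.1, 0.2, 0.3)
 * }}}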
5 | */ 6 | 7 | import org.apache.spark.streaming.scheduler.{StreamingListenerBatchCompleted, StreamingListener} 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import org.apache.spark.streaming._ 10 | import org.apache.log4j._ 11 | 12 | 13 | import breeze.linalg._ 14 | 15 | object StreamingTests { 16 | def main(args: Array[String]) { 17 | val conf = new SparkConf().setAppName("Spark CluStream").setMaster("local[*]") 18 | // val conf = new SparkConf().setAppName("Stream Word Count").setMaster("spark://192.168.0.119:7077") 19 | val sc = new SparkContext(conf) 20 | sc.setLogLevel("ERROR") 21 | Logger.getLogger("org").setLevel(Level.OFF) 22 | Logger.getLogger("akka").setLevel(Level.OFF) 23 | val ssc = new StreamingContext(sc, Milliseconds(1000)) 24 | // ssc.checkpoint("/home/omar/stream/checkpoint") 25 | val lines = ssc.socketTextStream("localhost", 9999) 26 | // val lines = ssc.textFileStream("file:///home/omar/stream/train") 27 | 28 | // val words = lines.flatMap(_.split(" ").map(_.toInt)) 29 | // val pairs = words.map(word => (word, 1)) 30 | // val wordCounts = pairs.reduceByKey(_ + _) 31 | // 32 | // 33 | // wordCounts.print() 34 | 35 | // val words = lines.map(_.split(" ").map(_.toInt).zipWithIndex) 36 | // val pairs = words.flatMap(a => a).transform(_.map(a => (a._2,a._1))) 37 | // val wordCounts = pairs.reduceByKey(_ + _) 38 | 39 | 40 | val model = new CluStreamOnline(50, 34, 2000).setDelta(512).setM(20).setInitNormalKMeans(false) 41 | val clustream = new CluStream(model) 42 | ssc.addStreamingListener(new PrintClustersListener(clustream, sc)) 43 | // model.run(lines.map(_.split(" ").map(_.toDouble)).map(DenseVector(_))) 44 | // clustream.startOnline(lines.map(_.split(" ").map(_.toDouble)).map(arr => arr.dropRight(1)).map(DenseVector(_))) 45 | clustream.startOnline(lines.map(_.split(" ").map(_.toDouble)).map(DenseVector(_))) 46 | 47 | // wordCounts.print() 48 | ssc.start() 49 | ssc.awaitTermination() 50 | } 51 | 52 | } 53 | 54 | private[clustream] class PrintClustersListener(clustream: CluStream, sc: SparkContext) extends StreamingListener { 55 | 56 | override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted) { 57 | if (batchCompleted.batchInfo.numRecords > 0) { 58 | 59 | val tc = clustream.model.getCurrentTime 60 | val n = clustream.model.getTotalPoints 61 | 62 | clustream.saveSnapShotsToDisk("snaps",tc, 2, 10) 63 | println("tc = " + tc + ", n = " + n) 64 | 65 | // if (149900 < n && n <= 150100 ) { 66 | // 67 | // val snaps = clustream.getSnapShots("snaps",tc,256) 68 | // val clusters = clustream.fakeKMeans(sc, 5, 2000, clustream.getMCsFromSnapshots("snaps", tc, 256)) 69 | // println("============= MacroClusters Centers for time = " + tc + ", n = " + n + ", snapshots = " + snaps + " ============") 70 | // clusters.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/2case/results/clustream200/centers1").createFile().appendAll(c.toArray.mkString("",",","") +"\n" )) 71 | // 72 | // 73 | //// val clusters = clustream.fakeKMeans(sc, 5, 2000, clustream.model.getMicroClusters) 74 | //// println("============= MacroClusters Centers for time = " + tc + ", n = " + n + " ============") 75 | //// clusters.clusterCenters.foreach(println) 76 | // 77 | // } 78 | // if( 249900 < n && n <= 250100){ 79 | // val snaps = clustream.getSnapShots("snaps",tc,256) 80 | // val clusters = clustream.fakeKMeans(sc, 5, 2000, clustream.getMCsFromSnapshots("snaps", tc, 256)) 81 | // println("============= MacroClusters Centers for time = " + tc + ", n = " + n + ", 
snapshots = " + snaps + " ============") 82 | // clusters.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/2case/results/clustream200/centers2").createFile().appendAll(c.toArray.mkString("",",","")+"\n")) 83 | // } 84 | // if(349900 < n && n <= 350100 ){ 85 | // val snaps = clustream.getSnapShots("snaps",tc,256) 86 | // val clusters = clustream.fakeKMeans(sc, 5, 2000, clustream.getMCsFromSnapshots("snaps", tc, 256)) 87 | // println("============= MacroClusters Centers for time = " + tc + ", n = " + n + ", snapshots = " + snaps + " ============") 88 | // clusters.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/2case/results/clustream200/centers3").createFile().appendAll(c.toArray.mkString("",",","")+"\n")) 89 | // } 90 | // if(449900 < n && n <= 450100){ 91 | // val snaps = clustream.getSnapShots("snaps",tc,256) 92 | // val clusters = clustream.fakeKMeans(sc, 5, 2000, clustream.getMCsFromSnapshots("snaps", tc, 256)) 93 | // println("============= MacroClusters Centers for time = " + tc + ", n = " + n + ", snapshots = " + snaps + " ============") 94 | // clusters.clusterCenters.foreach(c=>scala.tools.nsc.io.Path("/home/omar/datasets/tests/2case/results/clustream200/centers4").createFile().appendAll(c.toArray.mkString("",",","")+"\n")) 95 | // } 96 | 97 | } 98 | } 99 | } 100 | --------------------------------------------------------------------------------