├── data ├── sample_outlier_data.txt ├── sample_knn_join_data.txt └── sample_watershed_data.txt ├── misc └── Readme.txt ├── mllib ├── src │ ├── main │ │ └── scala │ │ │ └── org │ │ │ └── sparkalgos │ │ │ └── mllib │ │ │ ├── package.scala │ │ │ ├── join │ │ │ ├── Knn.scala │ │ │ ├── zScore_Int.scala │ │ │ ├── zScore_Long.scala │ │ │ ├── KnnJoin.scala │ │ │ ├── zScore.scala │ │ │ ├── knnJoin_Int.scala │ │ │ └── knnJoin_Long.scala │ │ │ └── clustering │ │ │ ├── AVF.scala │ │ │ └── OutlierWithAVF.scala │ └── test │ │ └── scala │ │ └── org │ │ └── sparkalgos │ │ └── mllib │ │ ├── utils │ │ └── LocalSparkContext.scala │ │ ├── join │ │ └── KnnJoinSuit.scala │ │ └── clustering │ │ └── AVFSuite.scala ├── build.sbt └── README.md ├── .gitignore ├── examples ├── build.sbt └── src │ └── main │ └── scala │ └── org │ └── sparkalgos │ └── examples │ ├── mllib │ ├── KnnJoin.scala │ └── OutlierDetection.scala │ └── graphx │ ├── WaterShed.scala │ └── FeedbackVertexSet.scala ├── graphx ├── build.sbt └── src │ ├── test │ └── scala │ │ └── org │ │ └── sparkalgos │ │ └── graphx │ │ ├── application │ │ └── WaterShedSuit.scala │ │ ├── core │ │ └── FeedbackVertexSetSuit.scala │ │ └── utils │ │ └── LocalSparkContext.scala │ └── main │ └── scala │ └── org │ └── sparkalgos │ └── graphx │ ├── core │ └── FeedbackVertexSet.scala │ └── application │ └── GraphProperties.scala ├── README.md └── LICENSE /data/sample_outlier_data.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/sample_knn_join_data.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/sample_watershed_data.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /misc/Readme.txt: -------------------------------------------------------------------------------- 1 | Miscellaneous algorithms 2 | 3 | -------------------------------------------------------------------------------- /mllib/src/main/scala/org/sparkalgos/mllib/package.scala: -------------------------------------------------------------------------------- 1 | package org.sparkalgos.mllib 2 | 3 | package object mllibalgos 4 | -------------------------------------------------------------------------------- /mllib/build.sbt: -------------------------------------------------------------------------------- 1 | name := "mllib" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" 8 | 9 | libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" % "test" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *~ 3 | *.class 4 | *.log 5 | 6 | # sbt specific 7 | .cache 8 | .history 9 | .lib/ 10 | dist/* 11 | target/ 12 | lib_managed/ 13 | src_managed/ 14 | bin/ 15 | target/ 16 | project/ 17 | project/boot/ 18 | project/plugins/project/ 19 | 20 | # Scala-IDE specific 21 | .scala_dependencies 22 | .worksheet -------------------------------------------------------------------------------- /examples/build.sbt: -------------------------------------------------------------------------------- 1 | name := "examples" 2 | 3 | scalaVersion := 
"2.10.4" 4 | 5 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-graphx" % "1.0.0" 8 | 9 | libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" % "test" 10 | -------------------------------------------------------------------------------- /graphx/build.sbt: -------------------------------------------------------------------------------- 1 | name := "graphx" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" 8 | 9 | libraryDependencies += "org.apache.spark" %% "spark-graphx" % "1.0.0" 10 | 11 | libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" % "test" -------------------------------------------------------------------------------- /graphx/src/test/scala/org/sparkalgos/graphx/application/WaterShedSuit.scala: -------------------------------------------------------------------------------- 1 | package org.sparkalgos.graphx.application 2 | 3 | import org.scalatest.{BeforeAndAfterEach, FunSuite} 4 | import org.sparkalgos.graphx.utils.LocalSparkContext 5 | 6 | class WaterShedSuit extends FunSuite with BeforeAndAfterEach with LocalSparkContext { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /graphx/src/test/scala/org/sparkalgos/graphx/core/FeedbackVertexSetSuit.scala: -------------------------------------------------------------------------------- 1 | package org.sparkalgos.graphx.core 2 | 3 | import org.scalatest.{BeforeAndAfterEach, FunSuite} 4 | import org.sparkalgos.graphx.utils.LocalSparkContext 5 | 6 | class FeedbackVertexSetSuit extends FunSuite with BeforeAndAfterEach with LocalSparkContext { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SparkAlgorithms 2 | =============== 3 | 4 | Additional useful algorithms that can be used with spark. 5 | 6 | 7 | ###MLlib 8 | #####Outlier Detection 9 | Outlier detection on categorical data. By counting frequency scores 10 | 11 | 12 | #####KNN-Join 13 | Approximate KNN-Join which uses z-scores to compute nearest neigbors 14 | 15 | 16 | ###GraphX 17 | #####Feedback Vertex Set 18 | Greedy recursive solution to find feedback vertex set of a directed graph. 
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/join/Knn.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.SparkContext
2 | import org.sparkalgos.mllib.join.KnnJoin
3 | 
4 | object Knn {
5 | 
6 |   def main(args: Array[String]) {
7 | 
8 |     // signature: KnnJoin.knnJoin(dataset: RDD[Vector[Int]],
9 |     //                            datapoint: Vector[Int], len: Int, iteration: Int)
10 |     val sc = new SparkContext("local", "knn")
11 | 
12 |     val vectors = Seq(
13 |       Vector(0, 0, 0),
14 |       Vector(1, 2, 3),
15 |       Vector(1, 5, 4),
16 |       Vector(5, 5, 8),
17 |       Vector(1, 1, 2),
18 |       Vector(1, 2, 4),
19 |       Vector(3, 4, 5)
20 |     )
21 |     val data = sc.parallelize(vectors, 2)
22 | 
23 |     val point = Vector(1, 5, 3)
24 |     val len = 3
25 |     val iter = 4
26 | 
27 |     val model = KnnJoin.knnJoin(data, point, len, iter)
28 | 
29 |     model.saveAsTextFile("/home/ashu/Desktop/knn")
30 |     sc.stop()
31 | 
32 |   }
33 | 
34 | }
35 | 
--------------------------------------------------------------------------------
/examples/src/main/scala/org/sparkalgos/examples/mllib/KnnJoin.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | package org.sparkalgos.examples.mllib
18 | 
19 | object KnnJoin {
20 | 
21 | }
22 | 
--------------------------------------------------------------------------------
/examples/src/main/scala/org/sparkalgos/examples/graphx/WaterShed.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License. 
16 | */ 17 | package org.sparkalgos.examples.graphx 18 | 19 | object WaterShed { 20 | 21 | } 22 | -------------------------------------------------------------------------------- /examples/src/main/scala/org/sparkalgos/examples/mllib/OutlierDetection.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.sparkalgos.examples.mllib 18 | 19 | object OutlierDetection { 20 | 21 | } 22 | -------------------------------------------------------------------------------- /examples/src/main/scala/org/sparkalgos/examples/graphx/FeedbackVertexSet.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.sparkalgos.examples.graphx 18 | 19 | 20 | object FeedbackVertexSet { 21 | 22 | } 23 | -------------------------------------------------------------------------------- /graphx/src/test/scala/org/sparkalgos/graphx/utils/LocalSparkContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.sparkalgos.graphx.utils 18 | 19 | import org.apache.spark.{SparkConf, SparkContext} 20 | import org.scalatest.{BeforeAndAfterAll, Suite} 21 | 22 | trait LocalSparkContext extends BeforeAndAfterAll { self: Suite => 23 | @transient var sc: SparkContext = _ 24 | 25 | override def beforeAll() { 26 | val conf = new SparkConf() 27 | .setMaster("local") 28 | .setAppName("test") 29 | sc = new SparkContext(conf) 30 | super.beforeAll() 31 | } 32 | 33 | override def afterAll() { 34 | if (sc != null) { 35 | sc.stop() 36 | } 37 | super.afterAll() 38 | } 39 | } -------------------------------------------------------------------------------- /mllib/src/test/scala/org/sparkalgos/mllib/utils/LocalSparkContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.sparkalgos.mllib.utils 18 | 19 | import org.scalatest.Suite 20 | import org.scalatest.BeforeAndAfterAll 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | 24 | trait LocalSparkContext extends BeforeAndAfterAll { self: Suite => 25 | @transient var sc: SparkContext = _ 26 | 27 | override def beforeAll() { 28 | val conf = new SparkConf() 29 | .setMaster("local") 30 | .setAppName("test") 31 | sc = new SparkContext(conf) 32 | super.beforeAll() 33 | } 34 | 35 | override def afterAll() { 36 | if (sc != null) { 37 | sc.stop() 38 | } 39 | super.afterAll() 40 | } 41 | } -------------------------------------------------------------------------------- /mllib/src/main/scala/org/sparkalgos/mllib/clustering/AVF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */
17 | package org.sparkAlgos.mllib.clustering
18 | 
19 | import org.apache.spark.SparkContext
20 | 
21 | 
22 | /**
23 |  * Driver for the OutlierWithAVFModel
24 |  */
25 | object Test {
26 | 
27 |   def main(args: Array[String]) {
28 | 
29 |     val sc = new SparkContext("local", "OutlierDetection")
30 |     val dir = "/home/ashu/Desktop/abc.txt" // e.g. "hdfs://localhost:54310/train3"
31 | 
32 |     val data = sc.textFile(dir).map(word => word.split(",").toVector)
33 |     // flag the lowest-scoring 30% of the rows as outliers
34 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
35 | 
36 |     model.score.saveAsTextFile("/home/ashu/Desktop/sc")
37 |     model.trimedData.saveAsTextFile("/home/ashu/Desktop/tri")
38 |     model.outliers.saveAsTextFile("/home/ashu/Desktop/outs")
39 | 
40 |   }
41 | 
42 | }
--------------------------------------------------------------------------------
/mllib/README.md:
--------------------------------------------------------------------------------
1 | ##MLlib
2 | 
3 | This folder contains the implementation of additional machine learning algorithms which can be used
4 | with Apache Spark.
5 | 
6 | Outlier-Detection-with-AVF-Spark
7 | ================================
8 | 
9 | ##What's this?
10 | This is an outlier detection algorithm which works on categorical data. It calculates the frequency of occurrence of each attribute value of a data-point within the entire dataset. Based on these frequencies, a score is assigned to each data point; data points with the lowest scores are the designated outliers. For example, in the data-set {(A,B), (A,C), (A,D), (E,B)} the frequencies are A=3, B=2, C=D=E=1, so (E,B) gets the lowest score (1 + 2 = 3) and is flagged first.
11 | 
12 | ##How to Run
13 | You should have Spark already built as a jar file on your build library path. The implementation lives in the class 'OutlierWithAVFModel'.
14 | 
15 | From your main, call the function "outliers" of this class with the following parameters:
16 | ```
17 | val sc = new SparkContext("local", "OutlierDetection")
18 | val dir = "hdfs://localhost:54310/train3"
19 | 
20 | val data = sc.textFile(dir).map(word => word.split(",").toVector)
21 | val model = OutlierWithAVFModel.outliers(data,20,sc)
22 | 
23 | model.score.saveAsTextFile("../scores")
24 | model.trimedData.saveAsTextFile("../trimmed")
25 | 
26 | The returned model has three attributes: score, trimedData and outliers.
27 | 
28 | model.score : RDD(String, Int)
29 | It contains the hash-key representation of a datapoint and its AVF score.
30 | 
31 | model.trimedData : RDD(Vector[String])
32 | It contains the dataset minus the outliers, trimmed by the percentage provided.
33 | 
34 | model.outliers : RDD(Vector[String])
35 | It contains the data-points flagged as outliers.
36 | ```
37 | 
38 | z-KNN
39 | ================================
40 | 
41 | ##What's this?
42 | It's a modified kNN-join which translates each multi-dimensional data-point into a single dimension on which a kNN search for a data-point can be performed.
43 | Given a data-set, the algorithm computes the z-value for each entry of the data-set and selects those entries with z-values closest to the z-value of the data-point. The process is performed over multiple iterations, using a random vector in each iteration to transform the data-set. kNN is then applied to the reduced data-set formed by the selected entries.
44 | 
45 | ##How to Run
46 | You should have Spark already built as a jar file on your build library path. The implementation lives in the objects 'KnnJoin' and 'zScore'.
47 | 
48 | From your main, call the function "knnJoin" of this object with the following parameters:
49 | ```
50 | val model = KnnJoin.knnJoin(dataset : RDD[Vector[Int]], datapoint : Vector[Int], len : Int, iteration : Int)
51 | 
52 | model : RDD(Vector[Int])
53 | 
54 | It contains the kNN over the union of all the selected entries from the data-set, as described in
55 | http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5447837&tag=1
56 | ```
57 | 
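58 | ##How it works (z-values)
59 | The z-value interleaves the bits of a point's coordinates, so points that are close in space tend to be close in z-order. A worked example, following scoreOfDataPoint:
60 | ```
61 | Vector(1, 3)          // binary: 1 = 01, 3 = 11
62 | interleave the bits   // x-bit1, y-bit1, x-bit0, y-bit0 = 0 1 1 1
63 | z-value               // 0111 = 7
64 | ```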
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/join/zScore_Int.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import org.apache.spark.SparkContext._
21 | import org.apache.spark.rdd.RDD
22 | import scala.collection.immutable.Vector
23 | import scala.math.BigInt
24 | 
25 | object zScore_Int {
26 | 
27 |   /**
28 |    * Checks whether all entries of the array are 0
29 |    *
30 |    * @param vector array of Int
31 |    * @return 1 if all elements are zero, else 0
32 |    */
33 |   def checkVectors(vector : Array[Int]) : Int = {
34 |     var flag = 1
35 | 
36 |     for(i <- 0 to vector.length - 1){
37 |       if(vector(i) != 0){
38 |         flag = 0
39 |       }
40 |     }
41 | 
42 |     flag
43 |   }
44 | 
45 |   /**
46 |    * Computes the z-score for each entry of the input RDD of (Vector of Int, index),
47 |    * sorted in ascending order of score
48 |    *
49 |    * @param rdd RDD of (Vector of Int, index) pairs
50 |    * @return RDD of (index, z-score) pairs
51 |    */
52 |   def computeScore(rdd : RDD[(Vector[Int],Long)]) : RDD[(Long,BigInt)] = {
53 | 
54 |     val score = rdd.map(word => scoreOfDataPoint(word._1) -> word._2).
55 |       sortByKey(true).
56 |       map(word => word._2 -> word._1)
57 | 
58 |     score
59 |   }
60 | 
61 |   /**
62 |    * Computes the z-score of a Vector by bit-interleaving its co-ordinates
63 |    *
64 |    * @param vector Vector of Int
65 |    * @return z-score of the vector
66 |    */
67 |   def scoreOfDataPoint(vector : Vector[Int]) : BigInt = {
68 | 
69 |     var x = vector.toArray
70 | 
71 |     var temp = 0
72 |     var score : BigInt = 0
73 |     var counter = 0
74 | 
75 |     // peel off one bit per dimension per pass, until every co-ordinate is exhausted
76 |     while(checkVectors(x) == 0) {
77 |       for(i <- x.length-1 to 0 by -1){
78 |         temp = x(i) & 1          // take the least-significant bit
79 |         temp = temp << counter   // place it at the next free output position
80 |         score = score + temp
81 |         x(i) = x(i) >> 1
82 |         counter = counter + 1
83 |       }
84 |     }
85 |     score
86 |   }
87 | 
88 | }
89 | 
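90 | // Worked example: scoreOfDataPoint(Vector(1, 3)) == 7, since interleaving 01 and 11 gives 0111.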
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/join/zScore_Long.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import org.apache.spark.SparkContext._
21 | import org.apache.spark.rdd.RDD
22 | import scala.collection.immutable.Vector
23 | import scala.math.BigInt
24 | 
25 | object zScore_Long {
26 | 
27 | 
28 |   /**
29 |    * Checks whether all entries of the array are 0
30 |    *
31 |    * @param vector array of Long
32 |    * @return 1 if all elements are zero, else 0
33 |    */
34 |   def checkVectors(vector : Array[Long]) : Int = {
35 |     var flag = 1
36 | 
37 |     for(i <- 0 to vector.length - 1){
38 |       if(vector(i) != 0){
39 |         flag = 0
40 |       }
41 |     }
42 | 
43 |     flag
44 |   }
45 | 
46 |   /**
47 |    * Computes the z-score for each entry of the input RDD of (Vector of Long, index),
48 |    * sorted in ascending order of score
49 |    *
50 |    * @param rdd RDD of (Vector of Long, index) pairs
51 |    * @return RDD of (index, z-score) pairs
52 |    */
53 |   def computeScore(rdd : RDD[(Vector[Long],Long)]) : RDD[(Long,BigInt)] = {
54 | 
55 |     val score = rdd.map(word => scoreOfDataPoint(word._1) -> word._2).
56 |       sortByKey(true). 
57 | map(word => word._2 -> word._1) 58 | score 59 | } 60 | 61 | 62 | /** 63 | * Computes the z-score of a Vector 64 | * 65 | * @param Vector of Long 66 | * @return z-score of the vector 67 | */ 68 | def scoreOfDataPoint(vector : Vector[Long]) : BigInt = { 69 | 70 | var x = vector.toArray 71 | 72 | var temp = 0L 73 | var score : BigInt = 0 74 | var counter = 0 75 | 76 | while(checkVectors(x) == 0) { 77 | for(i <- x.length-1 to 0 by -1){ 78 | temp = x(i) & ((1 << 1) - 1) 79 | temp = temp << counter 80 | score = score+temp 81 | x(i) = x(i)>>1 82 | counter = counter + 1 83 | } 84 | } 85 | score 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /mllib/src/main/scala/org/sparkalgos/mllib/join/KnnJoin.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.sparkAlgos.mllib.join 18 | 19 | import scala.math._ 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.SparkContext._ 22 | import org.apache.spark.rdd.RDD 23 | import scala.collection.immutable.Vector 24 | import scala.util.Random 25 | import java.util.logging.Logger 26 | 27 | object KnnJoin { 28 | 29 | /** 30 | * Computes the nearest neighbors in the data-set for the data-point against which KNN 31 | * has to be applied 32 | * 33 | * @param dataSet : RDD of Vectors of Int/Long 34 | * @param dataPoint : Vector of Int/Long 35 | * @param len : Number of data-points of the dataSet on which knnJoin is to be done 36 | * @param randomSize : the number of iterations which has to be carried out 37 | * 38 | * @return an RDD of Vectors of Int/Long on which simple KNN needs to be applied with respect to 39 | * the data-point 40 | */ 41 | def knnJoin[A](dataSet : RDD[Vector[A]], 42 | dataPoint : Vector[A], 43 | len : Int, 44 | randomSize : Int) = { 45 | 46 | val logger = Logger.getLogger("knnJoin") 47 | val sc = dataSet.context 48 | 49 | val arg = dataPoint(0) 50 | arg match{ 51 | 52 | // if input is RDD[Vector[Int]] 53 | case _: Int => 54 | println("Calling Int") 55 | val set = dataSet.map(f => f.map(word => word.toString.toInt)) 56 | val point = dataPoint.map(f => f.toString.toInt) 57 | knnJoin_Int.knnJoin(set, point, len, randomSize, sc).coalesce(1) 58 | 59 | // if input is RDD[Vector[Long]] 60 | case _: Long => 61 | println("Calling Long") 62 | val set = dataSet.map(f => f.map(word => word.toString.toLong)) 63 | val point = dataPoint.map(f => f.toString.toLong) 64 | knnJoin_Long.knnJoin(set, point, len, randomSize, sc).coalesce(1) 65 | 66 | case _ => logger.severe("Argument_0 to knnJoin isn't of type Int/Long") 67 | exit(0) 68 | 69 | } 70 | } 71 | 72 | 73 | } 74 | 
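75 | // Example usage (a sketch; the data and parameters are illustrative):
76 | //   val data = sc.parallelize(Seq(Vector(1, 2, 3), Vector(1, 5, 4), Vector(3, 4, 5)))
77 | //   val neighbours = KnnJoin.knnJoin(data, Vector(1, 3, 5), len = 2, randomSize = 4)
78 | //   neighbours.collect().foreach(println)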
--------------------------------------------------------------------------------
/mllib/src/test/scala/org/sparkalgos/mllib/join/KnnJoinSuit.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | import org.apache.spark.rdd.RDD
18 | import org.scalatest.{BeforeAndAfterEach, FunSuite}
19 | import org.sparkalgos.mllib.utils.LocalSparkContext
20 | import org.sparkalgos.mllib.join.KnnJoin
21 | 
22 | class KnnJoinSuit extends FunSuite with BeforeAndAfterEach with LocalSparkContext {
23 | 
24 |   var vectors: Vector[Vector[Int]] = _
25 |   var data: RDD[Vector[Int]] = _
26 |   var point: Vector[Int] = _
27 |   var len: Int = _
28 |   var iter: Int = _
29 | 
30 |   override def beforeEach() {
31 | 
32 |     /*
33 |     data
34 |     0, 0, 0
35 |     1, 2, 3
36 |     1, 5, 4
37 |     5, 5, 8
38 |     1, 1, 2
39 |     1, 2, 4
40 |     3, 4, 5
41 |     */
42 |     vectors = Vector(
43 |       Vector(0, 0, 0),
44 |       Vector(1, 2, 3),
45 |       Vector(1, 5, 4),
46 |       Vector(5, 5, 8),
47 |       Vector(1, 1, 2),
48 |       Vector(1, 2, 4),
49 |       Vector(3, 4, 5)
50 |     )
51 |     data = sc.parallelize(vectors, 3)
52 |     point = Vector(1, 3, 5)
53 |     len = 4
54 |     iter = 4
55 |   }
56 |   test("four neighbors should be there when length is four") {
57 | 
58 |     val model = KnnJoin.knnJoin(data, point, len, iter)
59 |     assert(model.count() == 4)
60 | 
61 |   }
62 |   test("No neighbors should be computed when length is zero") {
63 |     len = 0
64 | 
65 |     val model = KnnJoin.knnJoin(data, point, len, iter)
66 |     assert(model.count() == len)
67 |   }
68 | 
69 |   test("All entries are from original data set") {
70 |     val model = KnnJoin.knnJoin(data, point, len, iter)
71 |     assert(model.intersection(data).count() == len)
72 |   }
73 | 
74 | 
75 |   /*
76 |   test("knnJoin method called by the companion object") {
77 |     val model = KnnJoin.knnJoin(data, point, len, iter)
78 |     assert(model.getClass.getSimpleName.toString === "knnJoin")
79 |   }
80 |   */
81 | 
82 | }
83 | 
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/join/zScore.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. 
You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import org.apache.spark.SparkContext._
21 | import org.apache.spark.rdd.RDD
22 | import scala.collection.immutable.Vector
23 | import scala.math.BigInt
24 | import java.util.logging.Logger
25 | 
26 | object zScore {
27 | 
28 |   val logger = Logger.getLogger("zScore")
29 | 
30 |   /**
31 |    * Computes the z-score of a Vector
32 |    *
33 |    * @param vector Vector of Int/Long
34 |    * @return z-score of the vector
35 |    */
36 |   def scoreOfDataPoint[A](vector : Vector[A]) : BigInt = {
37 |     val arg = vector(0)
38 |     arg match {
39 | 
40 |       // if input is Vector[Int]
41 |       case _: Int =>
42 |         val vec = vector.map(word => word.toString.toInt)
43 |         zScore_Int.scoreOfDataPoint(vec)
44 | 
45 |       // if input is Vector[Long]
46 |       case _: Long =>
47 |         val vec = vector.map(word => word.toString.toLong)
48 |         zScore_Long.scoreOfDataPoint(vec)
49 | 
50 |       case _ =>
51 |         logger.severe("Argument 0 to scoreOfDataPoint isn't of type Int/Long")
52 |         sys.exit(0)
53 |     }
54 |   }
55 | 
56 |   /**
57 |    * Computes the z-score for each entry of the input RDD of Vector of Int/Long,
58 |    * sorted in ascending order of score
59 |    *
60 |    * @param rdd RDD of (Vector of Int/Long, index) pairs
61 |    * @return RDD of (index, z-score) pairs
62 |    */
63 |   def computeScore[A](rdd : RDD[(Vector[A],Long)]) : RDD[(Long,BigInt)] = {
64 | 
65 |     val arg = rdd.first._1
66 |     arg(0) match {
67 | 
68 |       // if input is Vector[Int]
69 |       case _: Int =>
70 |         val vec = rdd.map(line => line._1.map(f => f.toString.toInt) -> line._2)
71 |         zScore_Int.computeScore(vec)
72 | 
73 |       // if input is Vector[Long]
74 |       case _: Long =>
75 |         val vec = rdd.map(line => line._1.map(f => f.toString.toLong) -> line._2)
76 |         zScore_Long.computeScore(vec)
77 | 
78 |       case _ =>
79 |         logger.severe("Argument 0 to computeScore isn't of type Int/Long")
80 |         sys.exit(0)
81 |     }
82 |   }
83 | }
84 | 
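85 | // A sketch of direct use (dispatch is driven by the type of the first element):
86 | //   zScore.scoreOfDataPoint(Vector(2, 3))     // Int path  -> zScore_Int
87 | //   zScore.scoreOfDataPoint(Vector(2L, 3L))   // Long path -> zScore_Long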
--------------------------------------------------------------------------------
/mllib/src/test/scala/org/sparkalgos/mllib/clustering/AVFSuite.scala:
--------------------------------------------------------------------------------
1 | 
2 | /*
3 |  * Licensed to the Apache Software Foundation (ASF) under one or more
4 |  * contributor license agreements. See the NOTICE file distributed with
5 |  * this work for additional information regarding copyright ownership.
6 |  * The ASF licenses this file to You under the Apache License, Version 2.0
7 |  * (the "License"); you may not use this file except in compliance with
8 |  * the License. You may obtain a copy of the License at
9 |  *
10 |  *    http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 | */
18 | import org.sparkalgos.mllib.utils.LocalSparkContext
19 | import org.sparkAlgos.mllib.clustering.OutlierWithAVFModel
20 | import org.apache.spark.rdd.RDD
21 | import org.scalatest.{BeforeAndAfterEach, FunSuite}
22 | 
23 | 
24 | class AVFSuite extends FunSuite with BeforeAndAfterEach with LocalSparkContext {
25 | 
26 |   var vectors: Vector[Vector[String]] = _
27 |   var data: RDD[Vector[String]] = _
28 | 
29 |   override def beforeEach() {
30 | 
31 |     /*
32 |     data     score
33 |     A,B      5
34 |     A,C      4
35 |     A,D      4
36 |     E,B      3
37 |     */
38 |     vectors = Vector(
39 |       Vector("A", "B"),
40 |       Vector("A", "C"),
41 |       Vector("A", "D"),
42 |       Vector("E", "B")
43 |     )
44 |     data = sc.parallelize(vectors, 2)
45 |   }
46 |   test("only one outlier should be removed at 30 percent") {
47 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
48 |     assert(model.trimedData.count() == 3)
49 |   }
50 |   test("No outlier should be removed") {
51 |     val model = OutlierWithAVFModel.outliers(data, 0, sc)
52 |     assert(model.trimedData.count() == 4)
53 |   }
54 | 
55 |   test("4 entries in score RDD") {
56 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
57 |     assert(model.score.count() == 4)
58 |   }
59 | 
60 |   test("with 30 percent, the outlier RDD has 1 entry") {
61 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
62 |     assert(model.outliers.count() === 1)
63 |   }
64 | 
65 |   test("vector (E,B) should be the outlier") {
66 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
67 |     assert(model.outliers.first().equals(Vector("E", "B")))
68 |   }
69 | 
70 |   test("outliers method called by the companion object") {
71 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
72 |     assert(model.getClass.getSimpleName.toString === "OutlierWithAVFModel")
73 |   }
74 | 
75 | }
76 | 
--------------------------------------------------------------------------------
/graphx/src/main/scala/org/sparkalgos/graphx/core/FeedbackVertexSet.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | 
18 | package org.sparkalgos.graphx.core
19 | 
20 | import org.apache.spark.SparkContext
21 | import org.apache.spark.rdd.RDD
22 | import org.apache.spark.graphx._
23 | /*
24 |  * Top-level model for feedback vertex set
25 |  */
26 | object FeedbackVertexSet {
27 | 
28 |   /**
29 |    * Calculates the optimal vertex in one SCC: the vertex whose removal generates
30 |    * the highest number of SCCs.
31 |    * @param graph of type Graph[Long,Int]
32 |    * @param sc SparkContext
33 |    * @return rdd with the optimal set of vertices. 
34 |    */
35 |   def getVertex(graph: Graph[Long, Int],
36 |                 sc: SparkContext): RDD[Long] = {
37 | 
38 |     val vertices = graph.vertices.collect()
39 | 
40 |     // only meaningful when the component has more than one vertex
41 |     if(vertices.size > 1) {
42 | 
43 |       // for each vertex, count the SCCs that appear once that vertex is removed
44 |       val z = vertices.map(id => id._1 ->
45 |         graph.subgraph(vpred = (index, scc) => index != id._1)
46 |           .stronglyConnectedComponents(2).vertices.map(word => word._2 -> word._1)
47 |           .groupBy(word => word._1)
48 |           .count)
49 |       // get the vertex whose removal yields the most SCCs
50 |       val vMax = z.reduce((a, b) => if (a._2 > b._2) a else b)
51 |       val idMax = vMax._1
52 | 
53 |       var vList = sc.parallelize(Array(vMax._1))
54 |       // remove the chosen vertex and run the algorithm again on the remaining graph
55 |       vList = vList.union(feedbackVertexSet(graph.subgraph(vpred = (index, scc) => index != idMax), sc))
56 |         .coalesce(1)
57 |       vList.persist()
58 | 
59 |       vList
60 |     }
61 |     else
62 |       sc.parallelize(Array[Long]())
63 |   }
64 | 
65 |   /**
66 |    * Calculates the strongly connected components and runs getVertex on each SCC in parallel
67 |    * @param graph of type Graph[Long,Int]
68 |    * @param sc SparkContext
69 |    * @return rdd with the optimal set of vertices.
70 |    */
71 |   def feedbackVertexSet(graph: Graph[Long,Int],
72 |                         sc: SparkContext): RDD[Long] = {
73 | 
74 |     // calculate strongly connected components
75 |     val sccGraph = graph.stronglyConnectedComponents(2)
76 | 
77 |     var res = sc.parallelize(Array[Long]())
78 |     // get the component ids (minimum id in each component) in the list l
79 |     val l = sccGraph.triplets.map(word => Array(word.srcAttr, word.dstAttr)).flatMap(f => f)
80 |       .map(w => w -> 0).groupBy(f => f._1).map(f => f._1).collect()
81 | 
82 |     // run getVertex on each component
83 |     l.foreach { id =>
84 |       res = res.union(getVertex(sccGraph.subgraph(vpred = (index, scc) => scc == id), sc))
85 |         .coalesce(1)
86 |       res.persist()
87 |     }
88 |     res
89 |   }
90 | }
91 | 
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/clustering/OutlierWithAVF.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License. 
16 | */ 17 | 18 | package org.sparkAlgos.mllib.clustering 19 | 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.SparkContext._ 22 | import org.apache.spark.rdd.RDD 23 | import com.google.common.hash._ 24 | 25 | /** 26 | * Get scores of the data-points by AVF algorithm 27 | * score gives the data-point index and its score returned as a RDD 28 | * trimmed data is the data remained after user provided percentage of removal 29 | * 30 | */ 31 | 32 | class OutlierWithAVFModel private ( 33 | val score: RDD[(String,Int)], 34 | val trimedData: RDD[Vector[String]], 35 | val outliers: RDD[Vector[String]]) 36 | 37 | /** 38 | * Top-level methods for OutlierWithAVF. 39 | */ 40 | 41 | object OutlierWithAVFModel { 42 | /** 43 | * Computes the score of each data point which is summation of the frequency of 44 | * each feature in that data-point. Low score data-points are outliers. 45 | * 46 | * @param input RDD of Vector[String] where feature values are comma separated . 47 | * @param hashSeed which is the hash-function to be used for representing data-points 48 | * @param sc is the Spark Context of the calling function 49 | * @return a RDD of hash-key and score. 50 | */ 51 | def computeScores(input: RDD[Vector[String]], 52 | hashSeed : HashFunction, 53 | sc : SparkContext) : RDD[(String,Int)] = { 54 | 55 | // key,value pairs for < (column_no,attribute value) , "frequency"> 56 | val freq = input.map(word => word.zipWithIndex) 57 | .flatMap(line => line.toSeq) 58 | .map(word => word->1) 59 | .reduceByKey(_+_) 60 | .cache() 61 | 62 | // key,value pairs for < (column_no,attribute value) , "indexedInput-point number"> 63 | val data = input.zipWithIndex().map(word => (word._2,word._1)) 64 | .map(word => word._2.zipWithIndex 65 | .map(w => w-> hashSeed.hashLong(word._1).toString)) 66 | .flatMap(line => line.toSeq) 67 | 68 | //join the two RDDs and get the frequency for each attribute in a indexedInput point 69 | val scores = data.join(freq) 70 | .flatMap(line => Seq(line.swap._1)) 71 | .reduceByKey(_+_) 72 | .map(word => (word._1,word._2)) 73 | 74 | scores 75 | } 76 | 77 | /** 78 | * On basis of the computed scores of data points and user-provided percentage of outliers 79 | * to be removed, this functions removes the outliers from the input RDD and returns the 80 | * trimmed data-set 81 | * 82 | * @param input RDD of Vector[String] 83 | * @param score of type (String, Int) having AVF score of the data-point obtained from 84 | * function compute 85 | * @param percent of type Double which is the percentage of outliers to be removed from the 86 | * data-set 87 | * @param hashSeed is the Hash-Function for uniquely identifying each data-point 88 | * @return trimmed data-set and outliers. 
89 | */ 90 | 91 | def trimScores(input : RDD[Vector[String]], 92 | score : RDD[(String,Int)], 93 | percent : Double, 94 | hashSeed : HashFunction, 95 | sc : SparkContext) : (RDD[Vector[String]],RDD[Vector[String]] )= { 96 | 97 | val nexample = score.count() 98 | val nremove = nexample * percent*0.01 99 | 100 | //sorted scores 101 | val sortedScore = score.map(word => (word._2,word._1)) 102 | .sortByKey(true) 103 | .map(word => (word._2,word._1)) 104 | 105 | //trimmed score RDD 106 | val trimmedScores = sortedScore.zipWithIndex() 107 | .filter(word=> word._2 < nremove.toLong) 108 | .map(word => word._1).collect().toMap 109 | 110 | //filtered data-set 111 | val trimmedData = input.zipWithIndex() 112 | .map(word => (word._2,word._1)) 113 | .map(word => hashSeed.hashLong(word._1).toString -> word._2) 114 | .filter(line => !trimmedScores.get(line._1).nonEmpty) 115 | .map(v => v._2) 116 | 117 | val outliers = input.zipWithIndex() 118 | .map(word => (word._2,word._1)) 119 | .map(word => hashSeed.hashLong(word._1).toString -> word._2) 120 | .filter(line => trimmedScores.get(line._1).nonEmpty) 121 | .map(v => v._2) 122 | 123 | (trimmedData,outliers) 124 | 125 | } 126 | 127 | /** 128 | * This function acts as an entry point to compute the scores of the data-points and trim the 129 | * RDD's 130 | * @param data of type RDD[Vector[String] ] 131 | * @param percent of type Double which is the percentage of outliers to be removed from the data-set 132 | * @param sc Spark context 133 | * @return main.scala.OutlierWithAVFModel which has score RDD and trimmed data-set . 134 | */ 135 | 136 | def outliers(data :RDD[Vector[String]], percent : Double, sc :SparkContext) :OutlierWithAVFModel = { 137 | 138 | // initial check to validate user provided percentage 139 | if(percent >100){ 140 | println("Error : percentage is greater than 100") 141 | System.exit(1) 142 | } 143 | 144 | // define a hash-function of type murmur3-128bits with seed_value of '5' 145 | val hashSeed = Hashing.murmur3_128(5) 146 | 147 | //compute the AVF scores for each data-point 148 | val scores = OutlierWithAVFModel.computeScores(data,hashSeed,sc) 149 | 150 | //returns an instance of main.scala.OutlierWithAVFModel 151 | val outlierData = OutlierWithAVFModel.trimScores(data, scores, percent,hashSeed,sc) 152 | val trimmed = outlierData._1 153 | val outliers = outlierData._2 154 | 155 | new OutlierWithAVFModel(scores,trimmed,outliers) 156 | 157 | } 158 | } 159 | 160 | -------------------------------------------------------------------------------- /graphx/src/main/scala/org/sparkalgos/graphx/application/GraphProperties.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.sparkalgos.graphx.application 19 | 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.SparkContext._ 22 | import scala.collection._ 23 | import scala.collection.mutable.Seq 24 | import org.apache.spark.graphx._ 25 | import org.apache.spark.rdd.RDD 26 | 27 | 28 | /** 29 | * Get the vertices of a directed Graph that can reach to a given point 30 | * This algorithm can find application in Geographic Information Systems 31 | * to compute the points in the map from where water can flow to a given co-ordinate 32 | */ 33 | class GraphProperties ( 34 | val RDDvertex: RDD[(VertexId, (Iterable[Long], Int, Int))], 35 | val RDDedge: RDD[Edge[Int]], 36 | val RDDcoOrdinate_vertexID_Mapper : RDD[((Long,Long),Long)], 37 | val srcID : Long 38 | ) 39 | 40 | 41 | /** 42 | * top level methods for watershed algorithm using pregel API 43 | */ 44 | object WaterShed { 45 | 46 | /** 47 | * Converts a DigitalElevationMap of the form x,y,z to 48 | * a RDD of Vertex , a RDD of Edge from which graphs can be created, 49 | * each co-ordinate now being represented in the form of a vertex 50 | * and the direction of flow of water from one co-ordinate to another being 51 | * represented in terms of an edge between the vertices (i.e. co-ordinates) 52 | * 53 | * @param path : location of the DigitalElevationMap(DEM) 54 | * @param sc : spark context 55 | * @param x_Point : x-Coordinate of point for which delineation needs to be done 56 | * @param y_Point : y-Coordinate of point for which delineation needs to be done 57 | * @return an object of GraphProperties comprising of 58 | * RDD of Vertices and it's properties 59 | * RDD of Edges and it's properties 60 | * RDD of mappings between co-ordinates and vertexId's 61 | * vertexID of the point for which delineation is to be performed 62 | */ 63 | private def conversion_DEMcsv_to_RDDed_RDDvd(path : String, sc : SparkContext, x_Point : Long, y_Point : Long) : GraphProperties = { 64 | 65 | //read csv file of the form < x,y,z > 66 | val temp_data = sc .textFile(path) 67 | .map(word => ((word .split(",")) 68 | .map(lit => lit.toDouble) 69 | .toList 70 | ).toVector) 71 | .map(word => word(0) -> (word(1) -> word(2))) 72 | 73 | val x_len = temp_data.groupBy(f => f._1).count 74 | val y_len = temp_data.groupBy(f => f._2._1).count 75 | 76 | //alter co-ordinate values, in case x and y values do not start from 0 77 | val data = temp_data.groupByKey.sortByKey(true) 78 | .zipWithIndex 79 | .map(word => word._1._2.map(w => w._1 -> (word._2 -> w._2)) ).flatMap(f => f) 80 | .groupByKey.sortByKey(true) 81 | .zipWithIndex 82 | .map(word => word._1._2.map(w => (w._1 -> word._2) -> w._2)).flatMap(f => f) 83 | .zipWithIndex 84 | .map(word => word._1._1 -> (word._1._2 -> word._2)) 85 | val mapper = data.map(word => word._1 -> word._2._2) 86 | 87 | //sub-routine to compute edges for each vertex 88 | val mod_data1 = data.map(word => ((word._1._1 + 1),word._1._2) -> word) 89 | val mod_data2 = data.map(word => ((word._1._1 - 1),word._1._2) -> word) 90 | val mod_data3 = data.map(word => (word._1._1,(word._1._2 - 1)) -> word) 91 | val mod_data4 = data.map(word => (word._1._1,(word._1._2 + 1)) -> word) 92 | val mod_data5 = data.map(word => ((word._1._1 + 1),(word._1._2 - 1)) -> word) 93 | val mod_data6 = data.map(word => ((word._1._1 + 1),(word._1._2 + 1)) -> word) 94 | val mod_data7 = data.map(word => ((word._1._1 - 1),(word._1._2 - 1)) -> word) 95 | val mod_data8 = data.map(word => ((word._1._1 - 1),(word._1._2 + 1)) -> word) 96 | 97 | 98 | val mod_data = 
mod_data1.union(mod_data2).union(mod_data3).union(mod_data4) 99 | .union(mod_data5).union(mod_data6).union(mod_data7) 100 | .union(mod_data8).filter(word => (word._1._1 >= 0) && (word._1._1 < x_len)) 101 | .filter(word => (word._1._2 >= 0) && (word._1._2 < y_len)) 102 | .groupByKey 103 | .sortByKey(true) 104 | .map(word => word._1 -> { 105 | 106 | /** 107 | * (NEW) 108 | * let it return all the nearest set of points 109 | */ 110 | word._2 111 | }) 112 | // creates an edge from vertex1 to all other vertices which have elevation less than or equal to vertex1 113 | val join = data.join(mod_data).map(w => w._1 -> (w._2._1 -> w._2._2.filter(f => f._2._1 <= w._2._1._1))) 114 | val edgeRDD : RDD[Edge[Int]] = join .map(word => word._2._1._2 -> word._2._2.map(f => f._2._2)) 115 | .map(f => f._2.map(d => f._1 -> d)) 116 | .flatMap(f => f) 117 | .map(f => Edge(f._1 , f._2,1)).coalesce(1) 118 | 119 | //sub-routine to compute vertices 120 | val vertexRDD : RDD[(VertexId, (Iterable[Long], Int, Int))] 121 | = join .map(word => word._2._1._2 -> (word._2._2.map(f => f._2._2) ,0,0)) 122 | .coalesce(1) 123 | 124 | val srcID = mapper.filter(f => f._1._1 == x_Point && f._1._2 == y_Point).first._2 125 | 126 | new GraphProperties(vertexRDD,edgeRDD,mapper,srcID) 127 | } 128 | 129 | /** 130 | * Computes all vertices in a directed graph which can reach to a given vertex 131 | * 132 | * @param path : location of the DigitalElevationMap(DEM) 133 | * @param sc : spark context 134 | * @param x_Point : x-Coordinate of point for which delineation needs to be done 135 | * @param y_Point : y-Coordinate of point for which delineation needs to be done 136 | * @return RDD of co-ordinates which can reach to the given vertex 137 | */ 138 | 139 | def connectedComponents_to_point(sc : SparkContext, path : String, x_Point : Long, y_Point : Long ) : RDD[(Long,Long)] = { 140 | 141 | 142 | val model = conversion_DEMcsv_to_RDDed_RDDvd(path, sc, x_Point , y_Point) 143 | 144 | //create graph from the returned RDD of vertices and edges 145 | val graph = Graph(model.RDDvertex, model.RDDedge) 146 | val sourceId: VertexId = model.srcID 147 | 148 | // Initialize the graph to identify the source vertex from which delineation needs to be performed 149 | val initialGraph = graph.mapVertices((id, word) => if (id == sourceId) (word._1,1 , 0) else word) 150 | 151 | //sub-routine for delineation computation 152 | val cctp = initialGraph.pregel(Seq(0L))( 153 | 154 | /** 155 | * Vertex Program 156 | */ 157 | (id, prop, newID) => 158 | { println("VERTEXVALUE : " + id +","+prop+","+newID) 159 | if(prop._3 == 0) { 160 | (prop._1,prop._2,1) 161 | } 162 | else { 163 | if(prop._2 == 1) { 164 | (prop._1,2,prop._3) 165 | } 166 | else if(prop._2 == 0){ 167 | if(prop._1.toSeq.map(f => if(newID.isEmpty) 0 else {if(newID.contains(f)) 1 else 0}) .filter(d => d==1).length > 0) 168 | { 169 | (prop._1,1,prop._3) 170 | } 171 | else prop 172 | } 173 | else prop 174 | 175 | } 176 | }, 177 | /** 178 | * Message computation for each triplet 179 | */ 180 | triplet => { 181 | 182 | if (triplet.dstAttr._2 == 1){ 183 | Iterator((triplet.srcId,Seq(triplet.dstId))) ++ Iterator((triplet.dstId,Seq(triplet.dstId)))//,(triplet.dstId,Seq(triplet.dstId))) 184 | } 185 | else if (triplet.dstAttr._2 == 2 && triplet.srcAttr._2 == 1) { 186 | Iterator((triplet.srcId,Seq(triplet.dstId))) 187 | } 188 | else { 189 | Iterator.empty 190 | } 191 | 192 | 193 | }, 194 | (a,b) => a++b //Merge message 195 | ) 196 | 197 | //vertices from the new graph which reach out to the vertex 198 | 
cctp.vertices.map(f => f._1.toLong -> f._2._2)
199 |       .join(model.RDDcoOrdinate_vertexID_Mapper.map(w => w._2 -> w._1))
200 |       .map(w => w._2._2 -> w._2._1).filter(word => word._2 == 2).map(f => f._1)
201 | 
202 | 
203 |   }
204 | }
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/join/knnJoin_Int.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import scala.math._
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.SparkContext._
23 | import org.apache.spark.rdd.RDD
24 | import scala.collection.immutable.Vector
25 | import scala.util.Random
26 | 
27 | object knnJoin_Int {
28 | 
29 |   /**
30 |    * Computes the nearest neighbors in the data-set for the data-point against which KNN
31 |    * has to be applied, for A SINGLE ITERATION
32 |    *
33 |    * @param rdd : RDD of Vectors of Int, which is the data-set in which knnJoin has to be
34 |    *            undertaken
35 |    * @param dataPoint : Vector of Int, which is the data-point with which knnJoin is done
36 |    *            with the data-set
37 |    * @param randPoint : Vector of Int, the random vector generated in each iteration
38 |    * @param len : the number of data-points from the data-set on which knnJoin is to be done
39 |    * @param zScore : RDD of (Long, BigInt), the (index, z-score) pair for each entry
40 |    *            of the dataset
41 |    * @param dataScore : BigInt z-score of the data-point
42 |    *
43 |    * @return an RDD of the nearest 2*len entries from the data-point on which KNN needs to be
44 |    *            undertaken for that iteration
45 |    */
46 |   def knnJoin_perIteration(rdd : RDD[(Vector[Int],Long)],
47 |                            dataPoint : Vector[Int],
48 |                            randPoint : Vector[Int],
49 |                            len : Int,
50 |                            zScore : RDD[(Long,BigInt)],
51 |                            dataScore : BigInt,
52 |                            sc : SparkContext) : RDD[(Vector[Int],Long)] = {
53 | 
54 |     // rdd with score greater than the z-score of the data-point
55 |     val greaterRDD = zScore.filter(word => word._2 > dataScore).
56 |       map(word => word._2 -> word._1).
57 |       sortByKey(true).
58 |       map(word => word._2). 
/mllib/src/main/scala/org/sparkalgos/mllib/join/knnJoin_Int.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import scala.math._
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.SparkContext._
23 | import org.apache.spark.rdd.RDD
24 | import scala.collection.immutable.Vector
25 | import scala.util.Random
26 | 
27 | object knnJoin_Int {
28 | 
29 |   /**
30 |    * Computes the nearest neighbors in the data-set for the data-point against which KNN
31 |    * has to be applied, for A SINGLE ITERATION
32 |    *
33 |    * @param rdd : RDD of Vectors of Int, which is the data-set in which knnJoin has to be
34 |    *              undertaken
35 |    * @param dataPoint : Vector of Int, which is the data-point with which knnJoin is done
36 |    *                    with the data-set
37 |    * @param randPoint : Vector of Int, the random vector generated in each iteration
38 |    * @param len : the number of data-points from the data-set on which knnJoin is to be done
39 |    * @param zScore : RDD of (Long,BigInt), which is the (line-number, z-score) pair for each
40 |    *                 entry of the dataset
41 |    * @param dataScore : BigInt z-score of the data-point
42 |    *
43 |    * @return an RDD of the nearest 2*len entries from the data-point on which KNN needs to be
44 |    *         undertaken for that iteration
45 |    */
46 |   def knnJoin_perIteration(rdd : RDD[(Vector[Int],Long)],
47 |                            dataPoint : Vector[Int],
48 |                            randPoint : Vector[Int],
49 |                            len : Int,
50 |                            zScore : RDD[(Long,BigInt)],
51 |                            dataScore : BigInt,
52 |                            sc : SparkContext) : RDD[(Vector[Int],Long)] = {
53 | 
54 |     // rdd with score greater than the z-score of the data-point
55 |     val greaterRDD = zScore.filter(word => word._2 > dataScore).
56 |       map(word => word._2 -> word._1).
57 |       sortByKey(true).
58 |       map(word => word._2).
59 |       zipWithIndex()
60 |     // rdd with score lesser than the z-score of the data-point
61 |     val lesserRDD = zScore.filter(word => word._2 < dataScore)
62 |       .map(word => word._2 -> word._1)
63 |       .sortByKey(false)
64 |       .map(word => word._2)
65 |       .zipWithIndex()
66 | 
67 | 
68 |     /**
69 |      * Need 2*len entries, hence the IF-ELSE construct to guarantee that many entries in
70 |      * the returned RDD.
71 |      * If both greaterRDD and lesserRDD hold at least len entries,
72 |      * extract len entries from each RDD.
73 |      */
74 | 
75 |     if((greaterRDD.count >= len) && (lesserRDD.count >= len)) {
76 |       val trim = greaterRDD.filter(word => word._2 < len).map(word => word._1).
77 |         union(lesserRDD.filter(word => word._2 < len).map(word => word._1))
78 | 
79 |       val join = rdd.map(word => word._2 -> word._1)
80 |         .join(trim.map(word => word -> 0))
81 |         .map(word => word._2._1 -> word._1)
82 |       join
83 |     }
84 |     /*
85 |     if greaterRDD holds fewer than len entries, extract all entries from greaterRDD
86 |     and len + (len - greaterRDD.count) entries from lesserRDD
87 |     */
88 |     else if(greaterRDD.count < len) {
89 | 
90 |       val lenMod = len + (len - greaterRDD.count)
91 |       val trim = greaterRDD.map(word => word._1)
92 |         .union(lesserRDD.filter(word => word._2 < lenMod)
93 |         .map(word => word._1))
94 | 
95 |       val join = rdd.map(word => word._2 -> word._1)
96 |         .join(trim.map(word => word -> 0))
97 |         .map(word => word._2._1 -> word._1)
98 |       join
99 |     }
100 | 
101 |     //if lesserRDD holds fewer than len entries,
102 |     //extract all entries from lesserRDD and
103 |     //len + (len - lesserRDD.count) entries from greaterRDD
104 |     else {
105 | 
106 |       val lenMod = len + (len - lesserRDD.count)
107 |       val trim = greaterRDD.filter(word => word._2 < lenMod).map(word => word._1)
108 |         .union(lesserRDD.map(word => word._1))
109 | 
110 |       val join = rdd.map(word => word._2 -> word._1)
111 |         .join(trim.map(word => word -> 0))
112 |         .map(word => word._2._1 -> word._1)
113 |       join
114 |     }
115 |   }
116 | 
117 |   /**
118 |    * Computes the nearest neighbors in the data-set for the data-point against which KNN
119 |    * has to be applied
120 |    *
121 |    * @param dataSet : RDD of Vectors of Int
122 |    * @param dataPoint : Vector of Int
123 |    * @param len : number of data-points of the dataSet on which knnJoin is to be done
124 |    * @param randomSize : the number of iterations to be carried out
125 |    *
126 |    * @return an RDD of Vectors of Int on which simple KNN needs to be applied with respect
127 |    *         to the data-point
128 |    */
129 |   def knnJoin(dataSet : RDD[Vector[Int]],
130 |               dataPoint : Vector[Int],
131 |               len : Int,
132 |               randomSize : Int,
133 |               sc : SparkContext): RDD[Vector[Int]] = {
134 | 
135 |     val size = dataSet.first().length
136 |     val rand = new Array[Int](size)
137 |     val randomValue = new Random
138 |     val rdd1 = dataSet.zipWithIndex()
139 | 
140 |     //compute z-values for the first iteration
141 |     val model = zScore.computeScore(rdd1)
142 |     val dataScore = zScore.scoreOfDataPoint(dataPoint)
143 | 
144 |     //for the first iteration the rand vector is the ZERO vector
145 |     for(count <- 0 to size-1) rand(count) = 0
146 | 
147 |     //compute nearest neighbours on the basis of z-scores
148 |     val c_i = knnJoin_perIteration(rdd1, dataPoint, rand.toVector, len, model, dataScore, sc)
149 |     c_i.persist()
150 | 
151 |     //compute -> the RDD to which the data-set generated by each iteration is appended
152 |     var compute = c_i
153 |     compute.persist()
154 | 
155 | 
156 |     //the number of iterations to be performed
157 |     for(count <- 2 to randomSize) {
158 | 
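      // Each pass shifts every vector (and the query point) by the same random
      // vector before re-computing z-scores: the shift changes the z-order of
      // the space, so each iteration can surface neighbours that the previous
      // orderings missed; the shift is subtracted again once neighbours are found.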
159 |       for(i <- 0 to size - 1) rand(i) = randomValue.nextInt(100)
160 | 
161 | 
162 |       //increment each element of the data-set with the random vector "rand"
163 |       val newRDD = rdd1.map(vector =>
164 |         vector._1.zipWithIndex.map { case (word, i) =>
165 |           word + rand(i % size)
166 |         } -> vector._2)
167 | 
168 | 
169 | 
170 |       val newData_point = dataPoint.zipWithIndex.map { case (word, i) =>
171 |         word + rand(i % size) }
172 | 
173 | 
174 |       //compute z-scores for the iteration
175 |       val modelLooped = zScore.computeScore(newRDD)
176 |       val data_scoreLooped = zScore.scoreOfDataPoint(newData_point)
177 | 
178 |       //compute nearest neighbours on the basis of z-scores
179 |       val c_iLooped = knnJoin_perIteration(newRDD, newData_point, rand.toVector,
180 |                                            len, modelLooped, data_scoreLooped, sc)
181 |       c_iLooped.persist()
182 | 
183 |       //remove the effect of the random vector "rand" from each entry of the returned RDD
184 |       //from knnJoin_perIteration
185 |       val c_iCleansedLooped = c_iLooped.map(line =>
186 |         line._1.zipWithIndex.map { case (word, i) =>
187 |           word - rand(i % size)
188 |         } -> line._2)
189 | 
190 |       compute = compute.union(c_iCleansedLooped)
191 |       compute.persist()
192 |     }
193 | 
194 |     zKNN(removeRedundantEntries(compute), dataPoint, len).coalesce(1)
195 |   }
196 | 
197 |   /**
198 |    * Removes redundant Vectors from the dataset
199 |    * @param DataSet : RDD of Vector[Int] paired with each vector's line number in the data-set
200 |    * @return : RDD of non-repetitive Vectors of Int
201 |    */
202 |   def removeRedundantEntries(DataSet : RDD[(Vector[Int],Long)]) : RDD[Vector[Int]] = {
203 |     DataSet.map(word => word._2 -> word._1).
204 |       groupByKey().
205 |       map(word => word._2.last)
206 | 
207 |   }
208 | 
209 |   /**
210 |    * Computes the euclidean distance between two vectors
211 |    *
212 |    * @param point1 : Vector of Int
213 |    * @param point2 : Vector of Int
214 |    * @return : euclidean distance between the two vectors
215 |    */
216 |   def euclideanDist(point1 : Vector[Int], point2 : Vector[Int]) : Double = {
217 |     var sum = 0.0
218 |     for(i <- 0 to point1.length-1) {
219 |       sum = sum + pow(point1(i) - point2(i), 2)
220 |     }
221 |     sqrt(sum)
222 |   }
223 | 
224 |   /**
225 |    * Performs kNN over the modified data-set and returns the k-nearest neighbors for
226 |    * the data-point
227 |    *
228 |    * @param reducedData : RDD of Vector of Int, which is the reduced data-set after the kNNJoin
229 |    *                      function has been applied to the data-set
230 |    * @param dataPoint : Vector of Int, the data-point for which kNN needs to be undertaken
231 |    * @param k : the number of neighbors to be computed
232 |    * @return : RDD of Vector of Int
233 |    */
234 |   def zKNN(reducedData : RDD[Vector[Int]],
235 |            dataPoint : Vector[Int], k : Int) : RDD[Vector[Int]] = {
236 | 
237 |     val distData = reducedData.map(word => euclideanDist(dataPoint, word) -> word)
238 |       .sortByKey(true)
239 |       .zipWithIndex()
240 |       .filter(word => word._2 < k).map(word => word._1._2)
241 |     distData
242 | 
243 |   }
244 | 
245 | }
246 | 
--------------------------------------------------------------------------------
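A minimal usage sketch for knnJoin_Int. This sketch is not part of the repository: the input format, parameter values, and example object name are assumptions for illustration, and it expects whitespace-separated integer rows such as might live in data/sample_knn_join_data.txt.

    import org.apache.spark.{SparkConf, SparkContext}
    import org.sparkAlgos.mllib.join.knnJoin_Int

    object KnnJoinExample {
      def main(args: Array[String]) {
        val sc = new SparkContext(new SparkConf().setAppName("KnnJoinExample").setMaster("local"))
        // each input line is a whitespace-separated row of integers
        val dataSet = sc.textFile("data/sample_knn_join_data.txt")
          .map(line => line.trim.split("\\s+").map(_.toInt).toVector)
        val dataPoint = Vector(3, 7, 1) // query point; must match the row dimensionality
        // the 5 nearest rows to dataPoint, using 4 random-shift iterations
        val neighbours = knnJoin_Int.knnJoin(dataSet, dataPoint, 5, 4, sc)
        neighbours.collect().foreach(println)
        sc.stop()
      }
    }

knnJoin_Long below has exactly the same shape for Vector[Long] data.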
/mllib/src/main/scala/org/sparkalgos/mllib/join/knnJoin_Long.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import scala.math._
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.SparkContext._
23 | import org.apache.spark.rdd.RDD
24 | import scala.collection.immutable.Vector
25 | import scala.util.Random
26 | 
27 | object knnJoin_Long {
28 | 
29 |   /**
30 |    * Computes the nearest neighbors in the data-set for the data-point against which KNN
31 |    * has to be applied, for A SINGLE ITERATION
32 |    *
33 |    * @param rdd : RDD of Vectors of Long, which is the data-set in which knnJoin has
34 |    *              to be undertaken
35 |    * @param dataPoint : Vector of Long, which is the data-point with which knnJoin is
36 |    *                    done with the data-set
37 |    * @param randPoint : Vector of Long, the random vector generated in each iteration
38 |    * @param len : the number of data-points from the data-set on which knnJoin is to be done
39 |    * @param zScore : RDD of (Long,BigInt), which is the (line-number, z-score) pair for each
40 |    *                 entry of the dataset
41 |    * @param dataScore : BigInt z-score of the data-point
42 |    *
43 |    * @return an RDD of the nearest 2*len entries from the data-point on which KNN needs to
44 |    *         be undertaken for that iteration
45 |    */
46 |   def knnJoin_perIteration(rdd : RDD[(Vector[Long],Long)],
47 |                            dataPoint : Vector[Long],
48 |                            randPoint : Vector[Long],
49 |                            len : Int,
50 |                            zScore : RDD[(Long,BigInt)],
51 |                            dataScore : BigInt,
52 |                            sc : SparkContext) : RDD[(Vector[Long],Long)] = {
53 | 
54 | 
55 |     // rdd with score greater than the z-score of the data-point
56 |     val greaterRDD = zScore.filter(word => word._2 > dataScore).
57 |       map(word => word._2 -> word._1).
58 |       sortByKey(true).
59 |       map(word => word._2).
60 |       zipWithIndex()
61 |     // rdd with score lesser than the z-score of the data-point
62 |     val lesserRDD = zScore.filter(word => word._2 < dataScore)
63 |       .map(word => word._2 -> word._1)
64 |       .sortByKey(false)
65 |       .map(word => word._2)
66 |       .zipWithIndex()
67 | 
68 | 
69 |     /**
70 |      * Need 2*len entries, hence the IF-ELSE construct to guarantee that many entries in
71 |      * the returned RDD.
72 |      * If both greaterRDD and lesserRDD hold at least len entries,
73 |      * extract len entries from each RDD.
74 |      */
75 | 
76 |     if((greaterRDD.count >= len) && (lesserRDD.count >= len)) {
77 |       val trim = greaterRDD.filter(word => word._2 < len).map(word => word._1).
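        // keep the len nearest ids from each side of the data-point's z-score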
78 |         union(lesserRDD.filter(word => word._2 < len).map(word => word._1))
79 | 
80 |       val join = rdd.map(word => word._2 -> word._1)
81 |         .join(trim.map(word => word -> 0))
82 |         .map(word => word._2._1 -> word._1)
83 |       join
84 |     }
85 |     /*
86 |     if greaterRDD holds fewer than len entries, extract all entries from greaterRDD
87 |     and len + (len - greaterRDD.count) entries from lesserRDD
88 |     */
89 |     else if(greaterRDD.count < len) {
90 | 
91 |       val lenMod = len + (len - greaterRDD.count)
92 |       val trim = greaterRDD.map(word => word._1)
93 |         .union(lesserRDD.filter(word => word._2 < lenMod)
94 |         .map(word => word._1))
95 | 
96 |       val join = rdd.map(word => word._2 -> word._1)
97 |         .join(trim.map(word => word -> 0))
98 |         .map(word => word._2._1 -> word._1)
99 |       join
100 |     }
101 | 
102 |     //if lesserRDD holds fewer than len entries,
103 |     //extract all entries from lesserRDD and
104 |     //len + (len - lesserRDD.count) entries from greaterRDD
105 |     else {
106 | 
107 |       val lenMod = len + (len - lesserRDD.count)
108 |       val trim = greaterRDD.filter(word => word._2 < lenMod).map(word => word._1)
109 |         .union(lesserRDD.map(word => word._1))
110 | 
111 |       val join = rdd.map(word => word._2 -> word._1)
112 |         .join(trim.map(word => word -> 0))
113 |         .map(word => word._2._1 -> word._1)
114 |       join
115 |     }
116 |   }
117 | 
118 |   /**
119 |    * Computes the nearest neighbors in the data-set for the data-point against which KNN
120 |    * has to be applied
121 |    *
122 |    * @param dataSet : RDD of Vectors of Long
123 |    * @param dataPoint : Vector of Long
124 |    * @param len : number of data-points of the dataSet on which knnJoin is to be done
125 |    * @param randomSize : the number of iterations to be carried out
126 |    *
127 |    * @return an RDD of Vectors of Long on which simple KNN needs to be applied with
128 |    *         respect to the data-point
129 |    */
130 |   def knnJoin(dataSet : RDD[Vector[Long]],
131 |               dataPoint : Vector[Long],
132 |               len : Int,
133 |               randomSize : Int,
134 |               sc : SparkContext): RDD[Vector[Long]] = {
135 | 
136 |     val size = dataSet.first().length
137 |     val rand = new Array[Long](size)
138 |     val randomValue = new Random
139 |     val rdd1 = dataSet.zipWithIndex()
140 | 
141 |     //compute z-values for the first iteration
142 |     val model = zScore.computeScore(rdd1)
143 |     val dataScore = zScore.scoreOfDataPoint(dataPoint)
144 | 
145 |     //for the first iteration the rand vector is the ZERO vector
146 |     for(count <- 0 to size-1) rand(count) = 0
147 | 
148 |     //compute nearest neighbours on the basis of z-scores
149 |     val c_i = knnJoin_perIteration(rdd1, dataPoint, rand.toVector, len, model, dataScore, sc)
150 |     c_i.persist()
151 | 
152 |     //compute -> the RDD to which the data-set generated by each iteration is appended
153 |     var compute = c_i
154 |     compute.persist()
155 | 
156 | 
157 |     //the number of iterations to be performed
158 |     for(count <- 2 to randomSize) {
159 | 
160 |       for(i <- 0 to size - 1) rand(i) = randomValue.nextInt(100).toLong
161 | 
162 | 
163 |       //increment each element of the data-set with the random vector "rand"
164 |       val newRDD = rdd1.map(vector =>
165 |         vector._1.zipWithIndex.map { case (word, i) =>
166 |           word + rand(i % size)
167 |         } -> vector._2)
168 | 
169 | 
170 | 
171 |       val newData_point = dataPoint.zipWithIndex.map { case (word, i) =>
172 |         word + rand(i % size) }
173 | 
174 | 
175 |       //compute z-scores for the iteration
176 |       val modelLooped = zScore.computeScore(newRDD)
177 |       val data_scoreLooped = zScore.scoreOfDataPoint(newData_point)
178 | 
179 |       //compute nearest neighbours on the basis of z-scores
180 |       val c_iLooped = knnJoin_perIteration(newRDD, newData_point, rand.toVector,
181 |                                            len, modelLooped, data_scoreLooped, sc)
182 |       c_iLooped.persist()
183 | 
184 |       //remove the effect of the random vector "rand" from each entry of the returned RDD from
185 |       //knnJoin_perIteration
186 |       val c_iCleansedLooped = c_iLooped.map(line =>
187 |         line._1.zipWithIndex.map { case (word, i) =>
188 |           word - rand(i % size)
189 |         } -> line._2)
190 | 
191 |       compute = compute.union(c_iCleansedLooped)
192 |       compute.persist()
193 |     }
194 | 
195 |     zKNN(removeRedundantEntries(compute), dataPoint, len).coalesce(1)
196 |   }
197 | 
198 |   /**
199 |    * Removes redundant Vectors from the dataset
200 |    * @param DataSet : RDD of Vector[Long] paired with each vector's line number in the data-set
201 |    * @return : RDD of non-repetitive Vectors of Long
202 |    */
203 |   def removeRedundantEntries(DataSet : RDD[(Vector[Long],Long)]) : RDD[Vector[Long]] = {
204 |     DataSet.map(word => word._2 -> word._1).
205 |       groupByKey().
206 |       map(word => word._2.last)
207 | 
208 |   }
209 | 
210 |   /**
211 |    * Computes the euclidean distance between two vectors
212 |    *
213 |    * @param point1 : Vector of Long
214 |    * @param point2 : Vector of Long
215 |    * @return : euclidean distance between the two vectors
216 |    */
217 |   def euclideanDist(point1 : Vector[Long], point2 : Vector[Long]) : Double = {
218 |     var sum = 0.0
219 |     for(i <- 0 to point1.length-1) {
220 |       sum = sum + pow(point1(i) - point2(i), 2)
221 |     }
222 |     sqrt(sum)
223 |   }
224 | 
225 |   /**
226 |    * Performs kNN over the modified data-set and returns the k-nearest neighbors for the data-point
227 |    *
228 |    * @param reducedData : RDD of Vector of Long, which is the reduced data-set after the kNNJoin
229 |    *                      function has been applied to the data-set
230 |    * @param dataPoint : Vector of Long, the data-point for which kNN needs to be undertaken
231 |    * @param k : the number of neighbors to be computed
232 |    * @return : RDD of Vector of Long
233 |    */
234 |   def zKNN(reducedData : RDD[Vector[Long]], dataPoint : Vector[Long], k : Int) : RDD[Vector[Long]] = {
235 |     val distData = reducedData.map(word => euclideanDist(dataPoint, word) -> word)
236 |       .sortByKey(true)
237 |       .zipWithIndex()
238 |       .filter(word => word._2 < k).map(word => word._1._2)
239 |     distData
240 | 
241 |   }
242 | 
243 | }
244 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by
13 |       the copyright owner that is granting the License.
14 | 
15 |       "Legal Entity" shall mean the union of the acting entity and all
16 |       other entities that control, are controlled by, or are under common
17 |       control with that entity. For the purposes of this definition,
18 |       "control" means (i) the power, direct or indirect, to cause the
19 |       direction or management of such entity, whether by contract or
20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 |       outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 |       "You" (or "Your") shall mean an individual or Legal Entity
24 |       exercising permissions granted by this License.
25 | 
26 |       "Source" form shall mean the preferred form for making modifications,
27 |       including but not limited to software source code, documentation
28 |       source, and configuration files.
29 | 
30 |       "Object" form shall mean any form resulting from mechanical
31 |       transformation or translation of a Source form, including but
32 |       not limited to compiled object code, generated documentation,
33 |       and conversions to other media types.
34 | 
35 |       "Work" shall mean the work of authorship, whether in Source or
36 |       Object form, made available under the License, as indicated by a
37 |       copyright notice that is included in or attached to the work
38 |       (an example is provided in the Appendix below).
39 | 
40 |       "Derivative Works" shall mean any work, whether in Source or Object
41 |       form, that is based on (or derived from) the Work and for which the
42 |       editorial revisions, annotations, elaborations, or other modifications
43 |       represent, as a whole, an original work of authorship. For the purposes
44 |       of this License, Derivative Works shall not include works that remain
45 |       separable from, or merely link (or bind by name) to the interfaces of,
46 |       the Work and Derivative Works thereof.
47 | 
48 |       "Contribution" shall mean any work of authorship, including
49 |       the original version of the Work and any modifications or additions
50 |       to that Work or Derivative Works thereof, that is intentionally
51 |       submitted to Licensor for inclusion in the Work by the copyright owner
52 |       or by an individual or Legal Entity authorized to submit on behalf of
53 |       the copyright owner. For the purposes of this definition, "submitted"
54 |       means any form of electronic, verbal, or written communication sent
55 |       to the Licensor or its representatives, including but not limited to
56 |       communication on electronic mailing lists, source code control systems,
57 |       and issue tracking systems that are managed by, or on behalf of, the
58 |       Licensor for the purpose of discussing and improving the Work, but
59 |       excluding communication that is conspicuously marked or otherwise
60 |       designated in writing by the copyright owner as "Not a Contribution."
61 | 
62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
63 |       on behalf of whom a Contribution has been received by Licensor and
64 |       subsequently incorporated within the Work.
65 | 
66 |    2. Grant of Copyright License. Subject to the terms and conditions of
67 |       this License, each Contributor hereby grants to You a perpetual,
68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 |       copyright license to reproduce, prepare Derivative Works of,
70 |       publicly display, publicly perform, sublicense, and distribute the
71 |       Work and such Derivative Works in Source or Object form.
72 | 
73 |    3. Grant of Patent License. Subject to the terms and conditions of
74 |       this License, each Contributor hereby grants to You a perpetual,
75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 |       (except as stated in this section) patent license to make, have made,
77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
78 |       where such license applies only to those patent claims licensable
79 |       by such Contributor that are necessarily infringed by their
80 |       Contribution(s) alone or by combination of their Contribution(s)
81 |       with the Work to which such Contribution(s) was submitted. If You
82 |       institute patent litigation against any entity (including a
83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
84 |       or a Contribution incorporated within the Work constitutes direct
85 |       or contributory patent infringement, then any patent licenses
86 |       granted to You under this License for that Work shall terminate
87 |       as of the date such litigation is filed.
88 | 
89 |    4. Redistribution. You may reproduce and distribute copies of the
90 |       Work or Derivative Works thereof in any medium, with or without
91 |       modifications, and in Source or Object form, provided that You
92 |       meet the following conditions:
93 | 
94 |       (a) You must give any other recipients of the Work or
95 |           Derivative Works a copy of this License; and
96 | 
97 |       (b) You must cause any modified files to carry prominent notices
98 |           stating that You changed the files; and
99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!) The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
203 | 
--------------------------------------------------------------------------------