├── data ├── sample_outlier_data.txt ├── sample_knn_join_data.txt └── sample_watershed_data.txt ├── misc └── Readme.txt ├── mllib ├── src │ ├── main │ │ └── scala │ │ │ └── org │ │ │ └── sparkalgos │ │ │ └── mllib │ │ │ ├── package.scala │ │ │ ├── join │ │ │ ├── Knn.scala │ │ │ ├── zScore_Int.scala │ │ │ ├── zScore_Long.scala │ │ │ ├── KnnJoin.scala │ │ │ ├── zScore.scala │ │ │ ├── knnJoin_Int.scala │ │ │ └── knnJoin_Long.scala │ │ │ └── clustering │ │ │ ├── AVF.scala │ │ │ └── OutlierWithAVF.scala │ └── test │ │ └── scala │ │ └── org │ │ └── sparkalgos │ │ └── mllib │ │ ├── utils │ │ └── LocalSparkContext.scala │ │ ├── join │ │ └── KnnJoinSuit.scala │ │ └── clustering │ │ └── AVFSuite.scala ├── build.sbt └── README.md ├── .gitignore ├── examples ├── build.sbt └── src │ └── main │ └── scala │ └── org │ └── sparkalgos │ └── examples │ ├── mllib │ ├── KnnJoin.scala │ └── OutlierDetection.scala │ └── graphx │ ├── WaterShed.scala │ └── FeedbackVertexSet.scala ├── graphx ├── build.sbt └── src │ ├── test │ └── scala │ │ └── org │ │ └── sparkalgos │ │ └── graphx │ │ ├── application │ │ └── WaterShedSuit.scala │ │ ├── core │ │ └── FeedbackVertexSetSuit.scala │ │ └── utils │ │ └── LocalSparkContext.scala │ └── main │ └── scala │ └── org │ └── sparkalgos │ └── graphx │ ├── core │ └── FeedbackVertexSet.scala │ └── application │ └── GraphProperties.scala ├── README.md └── LICENSE /data/sample_outlier_data.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/sample_knn_join_data.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/sample_watershed_data.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /misc/Readme.txt: -------------------------------------------------------------------------------- 1 | Miscellaneous algorithms 2 | 3 | -------------------------------------------------------------------------------- /mllib/src/main/scala/org/sparkalgos/mllib/package.scala: -------------------------------------------------------------------------------- 1 | package org.sparkalgos.mllib 2 | 3 | package object mllibalgos 4 | -------------------------------------------------------------------------------- /mllib/build.sbt: -------------------------------------------------------------------------------- 1 | name := "mllib" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" 8 | 9 | libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" % "test" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *~ 3 | *.class 4 | *.log 5 | 6 | # sbt specific 7 | .cache 8 | .history 9 | .lib/ 10 | dist/* 11 | target/ 12 | lib_managed/ 13 | src_managed/ 14 | bin/ 15 | target/ 16 | project/ 17 | project/boot/ 18 | project/plugins/project/ 19 | 20 | # Scala-IDE specific 21 | .scala_dependencies 22 | .worksheet -------------------------------------------------------------------------------- /examples/build.sbt: -------------------------------------------------------------------------------- 1 | name := "examples" 2 | 3 | scalaVersion := 
"2.10.4" 4 | 5 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-graphx" % "1.0.0" 8 | 9 | libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" % "test" 10 | -------------------------------------------------------------------------------- /graphx/build.sbt: -------------------------------------------------------------------------------- 1 | name := "graphx" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.1.0" 8 | 9 | libraryDependencies += "org.apache.spark" %% "spark-graphx" % "1.0.0" 10 | 11 | libraryDependencies += "org.scalatest" % "scalatest_2.10" % "2.0" % "test" -------------------------------------------------------------------------------- /graphx/src/test/scala/org/sparkalgos/graphx/application/WaterShedSuit.scala: -------------------------------------------------------------------------------- 1 | package org.sparkalgos.graphx.application 2 | 3 | import org.scalatest.{BeforeAndAfterEach, FunSuite} 4 | import org.sparkalgos.graphx.utils.LocalSparkContext 5 | 6 | class WaterShedSuit extends FunSuite with BeforeAndAfterEach with LocalSparkContext { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /graphx/src/test/scala/org/sparkalgos/graphx/core/FeedbackVertexSetSuit.scala: -------------------------------------------------------------------------------- 1 | package org.sparkalgos.graphx.core 2 | 3 | import org.scalatest.{BeforeAndAfterEach, FunSuite} 4 | import org.sparkalgos.graphx.utils.LocalSparkContext 5 | 6 | class FeedbackVertexSetSuit extends FunSuite with BeforeAndAfterEach with LocalSparkContext { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | SparkAlgorithms 2 | =============== 3 | 4 | Additional useful algorithms that can be used with spark. 5 | 6 | 7 | ###MLlib 8 | #####Outlier Detection 9 | Outlier detection on categorical data. By counting frequency scores 10 | 11 | 12 | #####KNN-Join 13 | Approximate KNN-Join which uses z-scores to compute nearest neigbors 14 | 15 | 16 | ###GraphX 17 | #####Feedback Vertex Set 18 | Greedy recursive solution to find feedback vertex set of a directed graph. 
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/join/Knn.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.SparkContext
2 | import org.sparkalgos.mllib.join.KnnJoin
3 | 
4 | object Knn {
5 | 
6 |   def main(args: Array[String]) {
7 | 
8 |     // signature: KnnJoin.knnJoin(dataset: RDD[Vector[Int]],
9 |     //                            datapoint: Vector[Int], len: Int, iteration: Int)
10 |     val sc = new SparkContext("local", "knn")
11 | 
12 |     val vectors = Seq(
13 |       Vector(0, 0, 0),
14 |       Vector(1, 2, 3),
15 |       Vector(1, 5, 4),
16 |       Vector(5, 5, 8),
17 |       Vector(1, 1, 2),
18 |       Vector(1, 2, 4),
19 |       Vector(3, 4, 5)
20 |     )
21 |     val data = sc.parallelize(vectors, 2)
22 | 
23 |     val point = Vector(1, 5, 3)
24 |     val len = 3
25 |     val iter = 4
26 | 
27 |     val model = KnnJoin.knnJoin(data, point, len, iter)
28 | 
29 |     model.saveAsTextFile("/home/ashu/Desktop/knn")
30 |     sc.stop()
31 | 
32 |   }
33 | 
34 | }
35 | 
--------------------------------------------------------------------------------
/examples/src/main/scala/org/sparkalgos/examples/mllib/KnnJoin.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | package org.sparkalgos.examples.mllib
18 | 
19 | object KnnJoin {
20 | 
21 | }
22 | 
--------------------------------------------------------------------------------
/examples/src/main/scala/org/sparkalgos/examples/graphx/WaterShed.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License. 
16 | */ 17 | package org.sparkalgos.examples.graphx 18 | 19 | object WaterShed { 20 | 21 | } 22 | -------------------------------------------------------------------------------- /examples/src/main/scala/org/sparkalgos/examples/mllib/OutlierDetection.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.sparkalgos.examples.mllib 18 | 19 | object OutlierDetection { 20 | 21 | } 22 | -------------------------------------------------------------------------------- /examples/src/main/scala/org/sparkalgos/examples/graphx/FeedbackVertexSet.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.sparkalgos.examples.graphx 18 | 19 | 20 | object FeedbackVertexSet { 21 | 22 | } 23 | -------------------------------------------------------------------------------- /graphx/src/test/scala/org/sparkalgos/graphx/utils/LocalSparkContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.sparkalgos.graphx.utils 18 | 19 | import org.apache.spark.{SparkConf, SparkContext} 20 | import org.scalatest.{BeforeAndAfterAll, Suite} 21 | 22 | trait LocalSparkContext extends BeforeAndAfterAll { self: Suite => 23 | @transient var sc: SparkContext = _ 24 | 25 | override def beforeAll() { 26 | val conf = new SparkConf() 27 | .setMaster("local") 28 | .setAppName("test") 29 | sc = new SparkContext(conf) 30 | super.beforeAll() 31 | } 32 | 33 | override def afterAll() { 34 | if (sc != null) { 35 | sc.stop() 36 | } 37 | super.afterAll() 38 | } 39 | } -------------------------------------------------------------------------------- /mllib/src/test/scala/org/sparkalgos/mllib/utils/LocalSparkContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.sparkalgos.mllib.utils 18 | 19 | import org.scalatest.Suite 20 | import org.scalatest.BeforeAndAfterAll 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | 24 | trait LocalSparkContext extends BeforeAndAfterAll { self: Suite => 25 | @transient var sc: SparkContext = _ 26 | 27 | override def beforeAll() { 28 | val conf = new SparkConf() 29 | .setMaster("local") 30 | .setAppName("test") 31 | sc = new SparkContext(conf) 32 | super.beforeAll() 33 | } 34 | 35 | override def afterAll() { 36 | if (sc != null) { 37 | sc.stop() 38 | } 39 | super.afterAll() 40 | } 41 | } -------------------------------------------------------------------------------- /mllib/src/main/scala/org/sparkalgos/mllib/clustering/AVF.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */
17 | package org.sparkAlgos.mllib.clustering
18 | 
19 | import org.apache.spark.SparkContext
20 | 
21 | 
22 | /**
23 |  * Driver for the OutlierWithAVFModel
24 |  */
25 | object Test {
26 | 
27 |   def main(args: Array[String]) {
28 | 
29 |     val sc = new SparkContext("local", "OutlierDetection")
30 |     val dir = "/home/ashu/Desktop/abc.txt" // e.g. "hdfs://localhost:54310/train3"
31 | 
32 |     val data = sc.textFile(dir).map(word => word.split(",").toVector)
33 |     // flag the lowest-scoring 30% of the rows as outliers
34 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
35 | 
36 |     model.score.saveAsTextFile("/home/ashu/Desktop/sc")
37 |     model.trimedData.saveAsTextFile("/home/ashu/Desktop/tri")
38 |     model.outliers.saveAsTextFile("/home/ashu/Desktop/outs")
39 | 
40 |   }
41 | 
42 | }
--------------------------------------------------------------------------------
/mllib/README.md:
--------------------------------------------------------------------------------
1 | ##MLlib
2 | 
3 | This folder contains the implementation of additional machine learning algorithms which can be used
4 | with Apache Spark.
5 | 
6 | Outlier-Detection-with-AVF-Spark
7 | ================================
8 | 
9 | ##What's this?
10 | This is an outlier detection algorithm which works on categorical data. It calculates the frequency of occurrence of each attribute value of a data-point within the entire dataset. Based on these frequencies, a score is assigned to each data point; data points with the lowest scores are the designated outliers. For example, in the data-set {(A,B), (A,C), (A,D), (E,B)} the frequencies are A=3, B=2, C=D=E=1, so (E,B) gets the lowest score (1 + 2 = 3) and is flagged first.
11 | 
12 | ##How to Run
13 | You should have Spark already built as a jar file on your build library path. The implementation lives in the class 'OutlierWithAVFModel'.
14 | 
15 | From your main, call the function "outliers" of this class with the following parameters:
16 | ```
17 | val sc = new SparkContext("local", "OutlierDetection")
18 | val dir = "hdfs://localhost:54310/train3"
19 | 
20 | val data = sc.textFile(dir).map(word => word.split(",").toVector)
21 | val model = OutlierWithAVFModel.outliers(data,20,sc)
22 | 
23 | model.score.saveAsTextFile("../scores")
24 | model.trimedData.saveAsTextFile("../trimmed")
25 | 
26 | The returned model has three attributes: score, trimedData and outliers.
27 | 
28 | model.score : RDD(String, Int)
29 | It contains the hash-key representation of a datapoint and its AVF score.
30 | 
31 | model.trimedData : RDD(Vector[String])
32 | It contains the dataset minus the outliers, trimmed by the percentage provided.
33 | 
34 | model.outliers : RDD(Vector[String])
35 | It contains the data-points flagged as outliers.
36 | ```
37 | 
38 | z-KNN
39 | ================================
40 | 
41 | ##What's this?
42 | It's a modified kNN-join which translates each multi-dimensional data-point into a single dimension on which a kNN search for a data-point can be performed.
43 | Given a data-set, the algorithm computes the z-value for each entry of the data-set and selects those entries with z-values closest to the z-value of the data-point. The process is performed over multiple iterations, using a random vector in each iteration to transform the data-set. kNN is then applied to the reduced data-set formed by the selected entries.
44 | 
45 | ##How to Run
46 | You should have Spark already built as a jar file on your build library path. The implementation lives in the objects 'KnnJoin' and 'zScore'.
47 | 
48 | From your main, call the function "knnJoin" of this object with the following parameters:
49 | ```
50 | val model = KnnJoin.knnJoin(dataset : RDD[Vector[Int]], datapoint : Vector[Int], len : Int, iteration : Int)
51 | 
52 | model : RDD(Vector[Int])
53 | 
54 | It contains the kNN over the union of all the selected entries from the data-set, as described in
55 | http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5447837&tag=1
56 | ```
57 | 
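58 | ##How it works (z-values)
59 | The z-value interleaves the bits of a point's coordinates, so points that are close in space tend to be close in z-order. A worked example, following scoreOfDataPoint:
60 | ```
61 | Vector(1, 3)          // binary: 1 = 01, 3 = 11
62 | interleave the bits   // x-bit1, y-bit1, x-bit0, y-bit0 = 0 1 1 1
63 | z-value               // 0111 = 7
64 | ```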
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/join/zScore_Int.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import org.apache.spark.SparkContext._
21 | import org.apache.spark.rdd.RDD
22 | import scala.collection.immutable.Vector
23 | import scala.math.BigInt
24 | 
25 | object zScore_Int {
26 | 
27 |   /**
28 |    * Checks whether all entries of the array are 0
29 |    *
30 |    * @param vector array of Int
31 |    * @return 1 if all elements are zero, else 0
32 |    */
33 |   def checkVectors(vector : Array[Int]) : Int = {
34 |     var flag = 1
35 | 
36 |     for(i <- 0 to vector.length - 1){
37 |       if(vector(i) != 0){
38 |         flag = 0
39 |       }
40 |     }
41 | 
42 |     flag
43 |   }
44 | 
45 |   /**
46 |    * Computes the z-score for each entry of the input RDD of (Vector of Int, index),
47 |    * sorted in ascending order of score
48 |    *
49 |    * @param rdd RDD of (Vector of Int, index) pairs
50 |    * @return RDD of (index, z-score) pairs
51 |    */
52 |   def computeScore(rdd : RDD[(Vector[Int],Long)]) : RDD[(Long,BigInt)] = {
53 | 
54 |     val score = rdd.map(word => scoreOfDataPoint(word._1) -> word._2).
55 |       sortByKey(true).
56 |       map(word => word._2 -> word._1)
57 | 
58 |     score
59 |   }
60 | 
61 |   /**
62 |    * Computes the z-score of a Vector by bit-interleaving its co-ordinates
63 |    *
64 |    * @param vector Vector of Int
65 |    * @return z-score of the vector
66 |    */
67 |   def scoreOfDataPoint(vector : Vector[Int]) : BigInt = {
68 | 
69 |     var x = vector.toArray
70 | 
71 |     var temp = 0
72 |     var score : BigInt = 0
73 |     var counter = 0
74 | 
75 |     // peel off one bit per dimension per pass, until every co-ordinate is exhausted
76 |     while(checkVectors(x) == 0) {
77 |       for(i <- x.length-1 to 0 by -1){
78 |         temp = x(i) & 1          // take the least-significant bit
79 |         temp = temp << counter   // place it at the next free output position
80 |         score = score + temp
81 |         x(i) = x(i) >> 1
82 |         counter = counter + 1
83 |       }
84 |     }
85 |     score
86 |   }
87 | 
88 | }
89 | 
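90 | // Worked example: scoreOfDataPoint(Vector(1, 3)) == 7, since interleaving 01 and 11 gives 0111.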
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/join/zScore_Long.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import org.apache.spark.SparkContext._
21 | import org.apache.spark.rdd.RDD
22 | import scala.collection.immutable.Vector
23 | import scala.math.BigInt
24 | 
25 | object zScore_Long {
26 | 
27 | 
28 |   /**
29 |    * Checks whether all entries of the array are 0
30 |    *
31 |    * @param vector array of Long
32 |    * @return 1 if all elements are zero, else 0
33 |    */
34 |   def checkVectors(vector : Array[Long]) : Int = {
35 |     var flag = 1
36 | 
37 |     for(i <- 0 to vector.length - 1){
38 |       if(vector(i) != 0){
39 |         flag = 0
40 |       }
41 |     }
42 | 
43 |     flag
44 |   }
45 | 
46 |   /**
47 |    * Computes the z-score for each entry of the input RDD of (Vector of Long, index),
48 |    * sorted in ascending order of score
49 |    *
50 |    * @param rdd RDD of (Vector of Long, index) pairs
51 |    * @return RDD of (index, z-score) pairs
52 |    */
53 |   def computeScore(rdd : RDD[(Vector[Long],Long)]) : RDD[(Long,BigInt)] = {
54 | 
55 |     val score = rdd.map(word => scoreOfDataPoint(word._1) -> word._2).
56 |       sortByKey(true). 
57 | map(word => word._2 -> word._1) 58 | score 59 | } 60 | 61 | 62 | /** 63 | * Computes the z-score of a Vector 64 | * 65 | * @param Vector of Long 66 | * @return z-score of the vector 67 | */ 68 | def scoreOfDataPoint(vector : Vector[Long]) : BigInt = { 69 | 70 | var x = vector.toArray 71 | 72 | var temp = 0L 73 | var score : BigInt = 0 74 | var counter = 0 75 | 76 | while(checkVectors(x) == 0) { 77 | for(i <- x.length-1 to 0 by -1){ 78 | temp = x(i) & ((1 << 1) - 1) 79 | temp = temp << counter 80 | score = score+temp 81 | x(i) = x(i)>>1 82 | counter = counter + 1 83 | } 84 | } 85 | score 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /mllib/src/main/scala/org/sparkalgos/mllib/join/KnnJoin.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.sparkAlgos.mllib.join 18 | 19 | import scala.math._ 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.SparkContext._ 22 | import org.apache.spark.rdd.RDD 23 | import scala.collection.immutable.Vector 24 | import scala.util.Random 25 | import java.util.logging.Logger 26 | 27 | object KnnJoin { 28 | 29 | /** 30 | * Computes the nearest neighbors in the data-set for the data-point against which KNN 31 | * has to be applied 32 | * 33 | * @param dataSet : RDD of Vectors of Int/Long 34 | * @param dataPoint : Vector of Int/Long 35 | * @param len : Number of data-points of the dataSet on which knnJoin is to be done 36 | * @param randomSize : the number of iterations which has to be carried out 37 | * 38 | * @return an RDD of Vectors of Int/Long on which simple KNN needs to be applied with respect to 39 | * the data-point 40 | */ 41 | def knnJoin[A](dataSet : RDD[Vector[A]], 42 | dataPoint : Vector[A], 43 | len : Int, 44 | randomSize : Int) = { 45 | 46 | val logger = Logger.getLogger("knnJoin") 47 | val sc = dataSet.context 48 | 49 | val arg = dataPoint(0) 50 | arg match{ 51 | 52 | // if input is RDD[Vector[Int]] 53 | case _: Int => 54 | println("Calling Int") 55 | val set = dataSet.map(f => f.map(word => word.toString.toInt)) 56 | val point = dataPoint.map(f => f.toString.toInt) 57 | knnJoin_Int.knnJoin(set, point, len, randomSize, sc).coalesce(1) 58 | 59 | // if input is RDD[Vector[Long]] 60 | case _: Long => 61 | println("Calling Long") 62 | val set = dataSet.map(f => f.map(word => word.toString.toLong)) 63 | val point = dataPoint.map(f => f.toString.toLong) 64 | knnJoin_Long.knnJoin(set, point, len, randomSize, sc).coalesce(1) 65 | 66 | case _ => logger.severe("Argument_0 to knnJoin isn't of type Int/Long") 67 | exit(0) 68 | 69 | } 70 | } 71 | 72 | 73 | } 74 | 
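75 | // Example usage (a sketch; the data and parameters are illustrative):
76 | //   val data = sc.parallelize(Seq(Vector(1, 2, 3), Vector(1, 5, 4), Vector(3, 4, 5)))
77 | //   val neighbours = KnnJoin.knnJoin(data, Vector(1, 3, 5), len = 2, randomSize = 4)
78 | //   neighbours.collect().foreach(println)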
--------------------------------------------------------------------------------
/mllib/src/test/scala/org/sparkalgos/mllib/join/KnnJoinSuit.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | import org.apache.spark.rdd.RDD
18 | import org.scalatest.{BeforeAndAfterEach, FunSuite}
19 | import org.sparkalgos.mllib.utils.LocalSparkContext
20 | import org.sparkalgos.mllib.join.KnnJoin
21 | 
22 | class KnnJoinSuit extends FunSuite with BeforeAndAfterEach with LocalSparkContext {
23 | 
24 |   var vectors: Vector[Vector[Int]] = _
25 |   var data: RDD[Vector[Int]] = _
26 |   var point: Vector[Int] = _
27 |   var len: Int = _
28 |   var iter: Int = _
29 | 
30 |   override def beforeEach() {
31 | 
32 |     /*
33 |     data
34 |     0, 0, 0
35 |     1, 2, 3
36 |     1, 5, 4
37 |     5, 5, 8
38 |     1, 1, 2
39 |     1, 2, 4
40 |     3, 4, 5
41 |     */
42 |     vectors = Vector(
43 |       Vector(0, 0, 0),
44 |       Vector(1, 2, 3),
45 |       Vector(1, 5, 4),
46 |       Vector(5, 5, 8),
47 |       Vector(1, 1, 2),
48 |       Vector(1, 2, 4),
49 |       Vector(3, 4, 5)
50 |     )
51 |     data = sc.parallelize(vectors, 3)
52 |     point = Vector(1, 3, 5)
53 |     len = 4
54 |     iter = 4
55 |   }
56 |   test("four neighbors should be there when length is four") {
57 | 
58 |     val model = KnnJoin.knnJoin(data, point, len, iter)
59 |     assert(model.count() == 4)
60 | 
61 |   }
62 |   test("No neighbors should be computed when length is zero") {
63 |     len = 0
64 | 
65 |     val model = KnnJoin.knnJoin(data, point, len, iter)
66 |     assert(model.count() == len)
67 |   }
68 | 
69 |   test("All entries are from original data set") {
70 |     val model = KnnJoin.knnJoin(data, point, len, iter)
71 |     assert(model.intersection(data).count() == len)
72 |   }
73 | 
74 | 
75 |   /*
76 |   test("knnJoin method called by the companion object") {
77 |     val model = KnnJoin.knnJoin(data, point, len, iter)
78 |     assert(model.getClass.getSimpleName.toString === "knnJoin")
79 |   }
80 |   */
81 | 
82 | }
83 | 
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/join/zScore.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. 
You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import org.apache.spark.SparkContext._
21 | import org.apache.spark.rdd.RDD
22 | import scala.collection.immutable.Vector
23 | import scala.math.BigInt
24 | import java.util.logging.Logger
25 | 
26 | object zScore {
27 | 
28 |   val logger = Logger.getLogger("zScore")
29 | 
30 |   /**
31 |    * Computes the z-score of a Vector
32 |    *
33 |    * @param vector Vector of Int/Long
34 |    * @return z-score of the vector
35 |    */
36 |   def scoreOfDataPoint[A](vector : Vector[A]) : BigInt = {
37 |     val arg = vector(0)
38 |     arg match {
39 | 
40 |       // if input is Vector[Int]
41 |       case _: Int =>
42 |         val vec = vector.map(word => word.toString.toInt)
43 |         zScore_Int.scoreOfDataPoint(vec)
44 | 
45 |       // if input is Vector[Long]
46 |       case _: Long =>
47 |         val vec = vector.map(word => word.toString.toLong)
48 |         zScore_Long.scoreOfDataPoint(vec)
49 | 
50 |       case _ =>
51 |         logger.severe("Argument 0 to scoreOfDataPoint isn't of type Int/Long")
52 |         sys.exit(0)
53 |     }
54 |   }
55 | 
56 |   /**
57 |    * Computes the z-score for each entry of the input RDD of Vector of Int/Long,
58 |    * sorted in ascending order of score
59 |    *
60 |    * @param rdd RDD of (Vector of Int/Long, index) pairs
61 |    * @return RDD of (index, z-score) pairs
62 |    */
63 |   def computeScore[A](rdd : RDD[(Vector[A],Long)]) : RDD[(Long,BigInt)] = {
64 | 
65 |     val arg = rdd.first._1
66 |     arg(0) match {
67 | 
68 |       // if input is Vector[Int]
69 |       case _: Int =>
70 |         val vec = rdd.map(line => line._1.map(f => f.toString.toInt) -> line._2)
71 |         zScore_Int.computeScore(vec)
72 | 
73 |       // if input is Vector[Long]
74 |       case _: Long =>
75 |         val vec = rdd.map(line => line._1.map(f => f.toString.toLong) -> line._2)
76 |         zScore_Long.computeScore(vec)
77 | 
78 |       case _ =>
79 |         logger.severe("Argument 0 to computeScore isn't of type Int/Long")
80 |         sys.exit(0)
81 |     }
82 |   }
83 | }
84 | 
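85 | // A sketch of direct use (dispatch is driven by the type of the first element):
86 | //   zScore.scoreOfDataPoint(Vector(2, 3))     // Int path  -> zScore_Int
87 | //   zScore.scoreOfDataPoint(Vector(2L, 3L))   // Long path -> zScore_Long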
--------------------------------------------------------------------------------
/mllib/src/test/scala/org/sparkalgos/mllib/clustering/AVFSuite.scala:
--------------------------------------------------------------------------------
1 | 
2 | /*
3 |  * Licensed to the Apache Software Foundation (ASF) under one or more
4 |  * contributor license agreements. See the NOTICE file distributed with
5 |  * this work for additional information regarding copyright ownership.
6 |  * The ASF licenses this file to You under the Apache License, Version 2.0
7 |  * (the "License"); you may not use this file except in compliance with
8 |  * the License. You may obtain a copy of the License at
9 |  *
10 |  *    http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 | */
18 | import org.sparkalgos.mllib.utils.LocalSparkContext
19 | import org.sparkAlgos.mllib.clustering.OutlierWithAVFModel
20 | import org.apache.spark.rdd.RDD
21 | import org.scalatest.{BeforeAndAfterEach, FunSuite}
22 | 
23 | 
24 | class AVFSuite extends FunSuite with BeforeAndAfterEach with LocalSparkContext {
25 | 
26 |   var vectors: Vector[Vector[String]] = _
27 |   var data: RDD[Vector[String]] = _
28 | 
29 |   override def beforeEach() {
30 | 
31 |     /*
32 |     data     score
33 |     A,B      5
34 |     A,C      4
35 |     A,D      4
36 |     E,B      3
37 |     */
38 |     vectors = Vector(
39 |       Vector("A", "B"),
40 |       Vector("A", "C"),
41 |       Vector("A", "D"),
42 |       Vector("E", "B")
43 |     )
44 |     data = sc.parallelize(vectors, 2)
45 |   }
46 |   test("only one outlier should be removed at 30 percent") {
47 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
48 |     assert(model.trimedData.count() == 3)
49 |   }
50 |   test("No outlier should be removed") {
51 |     val model = OutlierWithAVFModel.outliers(data, 0, sc)
52 |     assert(model.trimedData.count() == 4)
53 |   }
54 | 
55 |   test("4 entries in score RDD") {
56 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
57 |     assert(model.score.count() == 4)
58 |   }
59 | 
60 |   test("with 30 percent, the outlier RDD has 1 entry") {
61 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
62 |     assert(model.outliers.count() === 1)
63 |   }
64 | 
65 |   test("vector (E,B) should be the outlier") {
66 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
67 |     assert(model.outliers.first().equals(Vector("E", "B")))
68 |   }
69 | 
70 |   test("outliers method called by the companion object") {
71 |     val model = OutlierWithAVFModel.outliers(data, 30, sc)
72 |     assert(model.getClass.getSimpleName.toString === "OutlierWithAVFModel")
73 |   }
74 | 
75 | }
76 | 
--------------------------------------------------------------------------------
/graphx/src/main/scala/org/sparkalgos/graphx/core/FeedbackVertexSet.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | 
18 | package org.sparkalgos.graphx.core
19 | 
20 | import org.apache.spark.SparkContext
21 | import org.apache.spark.rdd.RDD
22 | import org.apache.spark.graphx._
23 | /*
24 |  * Top-level model for feedback vertex set
25 |  */
26 | object FeedbackVertexSet {
27 | 
28 |   /**
29 |    * Calculates the optimal vertex in one SCC: the vertex whose removal generates
30 |    * the highest number of SCCs.
31 |    * @param graph of type Graph[Long,Int]
32 |    * @param sc SparkContext
33 |    * @return rdd with the optimal set of vertices. 
34 |    */
35 |   def getVertex(graph: Graph[Long, Int],
36 |                 sc: SparkContext): RDD[Long] = {
37 | 
38 |     val vertices = graph.vertices.collect()
39 | 
40 |     // only meaningful when the component has more than one vertex
41 |     if(vertices.size > 1) {
42 | 
43 |       // for each vertex, count the SCCs that appear once that vertex is removed
44 |       val z = vertices.map(id => id._1 ->
45 |         graph.subgraph(vpred = (index, scc) => index != id._1)
46 |           .stronglyConnectedComponents(2).vertices.map(word => word._2 -> word._1)
47 |           .groupBy(word => word._1)
48 |           .count)
49 |       // get the vertex whose removal yields the most SCCs
50 |       val vMax = z.reduce((a, b) => if (a._2 > b._2) a else b)
51 |       val idMax = vMax._1
52 | 
53 |       var vList = sc.parallelize(Array(vMax._1))
54 |       // remove the chosen vertex and run the algorithm again on the remaining graph
55 |       vList = vList.union(feedbackVertexSet(graph.subgraph(vpred = (index, scc) => index != idMax), sc))
56 |         .coalesce(1)
57 |       vList.persist()
58 | 
59 |       vList
60 |     }
61 |     else
62 |       sc.parallelize(Array[Long]())
63 |   }
64 | 
65 |   /**
66 |    * Calculates the strongly connected components and runs getVertex on each SCC in parallel
67 |    * @param graph of type Graph[Long,Int]
68 |    * @param sc SparkContext
69 |    * @return rdd with the optimal set of vertices.
70 |    */
71 |   def feedbackVertexSet(graph: Graph[Long,Int],
72 |                         sc: SparkContext): RDD[Long] = {
73 | 
74 |     // calculate strongly connected components
75 |     val sccGraph = graph.stronglyConnectedComponents(2)
76 | 
77 |     var res = sc.parallelize(Array[Long]())
78 |     // get the component ids (minimum id in each component) in the list l
79 |     val l = sccGraph.triplets.map(word => Array(word.srcAttr, word.dstAttr)).flatMap(f => f)
80 |       .map(w => w -> 0).groupBy(f => f._1).map(f => f._1).collect()
81 | 
82 |     // run getVertex on each component
83 |     l.foreach { id =>
84 |       res = res.union(getVertex(sccGraph.subgraph(vpred = (index, scc) => scc == id), sc))
85 |         .coalesce(1)
86 |       res.persist()
87 |     }
88 |     res
89 |   }
90 | }
91 | 
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/clustering/OutlierWithAVF.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License. 
16 | */ 17 | 18 | package org.sparkAlgos.mllib.clustering 19 | 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.SparkContext._ 22 | import org.apache.spark.rdd.RDD 23 | import com.google.common.hash._ 24 | 25 | /** 26 | * Get scores of the data-points by AVF algorithm 27 | * score gives the data-point index and its score returned as a RDD 28 | * trimmed data is the data remained after user provided percentage of removal 29 | * 30 | */ 31 | 32 | class OutlierWithAVFModel private ( 33 | val score: RDD[(String,Int)], 34 | val trimedData: RDD[Vector[String]], 35 | val outliers: RDD[Vector[String]]) 36 | 37 | /** 38 | * Top-level methods for OutlierWithAVF. 39 | */ 40 | 41 | object OutlierWithAVFModel { 42 | /** 43 | * Computes the score of each data point which is summation of the frequency of 44 | * each feature in that data-point. Low score data-points are outliers. 45 | * 46 | * @param input RDD of Vector[String] where feature values are comma separated . 47 | * @param hashSeed which is the hash-function to be used for representing data-points 48 | * @param sc is the Spark Context of the calling function 49 | * @return a RDD of hash-key and score. 50 | */ 51 | def computeScores(input: RDD[Vector[String]], 52 | hashSeed : HashFunction, 53 | sc : SparkContext) : RDD[(String,Int)] = { 54 | 55 | // key,value pairs for < (column_no,attribute value) , "frequency"> 56 | val freq = input.map(word => word.zipWithIndex) 57 | .flatMap(line => line.toSeq) 58 | .map(word => word->1) 59 | .reduceByKey(_+_) 60 | .cache() 61 | 62 | // key,value pairs for < (column_no,attribute value) , "indexedInput-point number"> 63 | val data = input.zipWithIndex().map(word => (word._2,word._1)) 64 | .map(word => word._2.zipWithIndex 65 | .map(w => w-> hashSeed.hashLong(word._1).toString)) 66 | .flatMap(line => line.toSeq) 67 | 68 | //join the two RDDs and get the frequency for each attribute in a indexedInput point 69 | val scores = data.join(freq) 70 | .flatMap(line => Seq(line.swap._1)) 71 | .reduceByKey(_+_) 72 | .map(word => (word._1,word._2)) 73 | 74 | scores 75 | } 76 | 77 | /** 78 | * On basis of the computed scores of data points and user-provided percentage of outliers 79 | * to be removed, this functions removes the outliers from the input RDD and returns the 80 | * trimmed data-set 81 | * 82 | * @param input RDD of Vector[String] 83 | * @param score of type (String, Int) having AVF score of the data-point obtained from 84 | * function compute 85 | * @param percent of type Double which is the percentage of outliers to be removed from the 86 | * data-set 87 | * @param hashSeed is the Hash-Function for uniquely identifying each data-point 88 | * @return trimmed data-set and outliers. 
89 | */ 90 | 91 | def trimScores(input : RDD[Vector[String]], 92 | score : RDD[(String,Int)], 93 | percent : Double, 94 | hashSeed : HashFunction, 95 | sc : SparkContext) : (RDD[Vector[String]],RDD[Vector[String]] )= { 96 | 97 | val nexample = score.count() 98 | val nremove = nexample * percent*0.01 99 | 100 | //sorted scores 101 | val sortedScore = score.map(word => (word._2,word._1)) 102 | .sortByKey(true) 103 | .map(word => (word._2,word._1)) 104 | 105 | //trimmed score RDD 106 | val trimmedScores = sortedScore.zipWithIndex() 107 | .filter(word=> word._2 < nremove.toLong) 108 | .map(word => word._1).collect().toMap 109 | 110 | //filtered data-set 111 | val trimmedData = input.zipWithIndex() 112 | .map(word => (word._2,word._1)) 113 | .map(word => hashSeed.hashLong(word._1).toString -> word._2) 114 | .filter(line => !trimmedScores.get(line._1).nonEmpty) 115 | .map(v => v._2) 116 | 117 | val outliers = input.zipWithIndex() 118 | .map(word => (word._2,word._1)) 119 | .map(word => hashSeed.hashLong(word._1).toString -> word._2) 120 | .filter(line => trimmedScores.get(line._1).nonEmpty) 121 | .map(v => v._2) 122 | 123 | (trimmedData,outliers) 124 | 125 | } 126 | 127 | /** 128 | * This function acts as an entry point to compute the scores of the data-points and trim the 129 | * RDD's 130 | * @param data of type RDD[Vector[String] ] 131 | * @param percent of type Double which is the percentage of outliers to be removed from the data-set 132 | * @param sc Spark context 133 | * @return main.scala.OutlierWithAVFModel which has score RDD and trimmed data-set . 134 | */ 135 | 136 | def outliers(data :RDD[Vector[String]], percent : Double, sc :SparkContext) :OutlierWithAVFModel = { 137 | 138 | // initial check to validate user provided percentage 139 | if(percent >100){ 140 | println("Error : percentage is greater than 100") 141 | System.exit(1) 142 | } 143 | 144 | // define a hash-function of type murmur3-128bits with seed_value of '5' 145 | val hashSeed = Hashing.murmur3_128(5) 146 | 147 | //compute the AVF scores for each data-point 148 | val scores = OutlierWithAVFModel.computeScores(data,hashSeed,sc) 149 | 150 | //returns an instance of main.scala.OutlierWithAVFModel 151 | val outlierData = OutlierWithAVFModel.trimScores(data, scores, percent,hashSeed,sc) 152 | val trimmed = outlierData._1 153 | val outliers = outlierData._2 154 | 155 | new OutlierWithAVFModel(scores,trimmed,outliers) 156 | 157 | } 158 | } 159 | 160 | -------------------------------------------------------------------------------- /graphx/src/main/scala/org/sparkalgos/graphx/application/GraphProperties.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.sparkalgos.graphx.application 19 | 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.SparkContext._ 22 | import scala.collection._ 23 | import scala.collection.mutable.Seq 24 | import org.apache.spark.graphx._ 25 | import org.apache.spark.rdd.RDD 26 | 27 | 28 | /** 29 | * Get the vertices of a directed Graph that can reach to a given point 30 | * This algorithm can find application in Geographic Information Systems 31 | * to compute the points in the map from where water can flow to a given co-ordinate 32 | */ 33 | class GraphProperties ( 34 | val RDDvertex: RDD[(VertexId, (Iterable[Long], Int, Int))], 35 | val RDDedge: RDD[Edge[Int]], 36 | val RDDcoOrdinate_vertexID_Mapper : RDD[((Long,Long),Long)], 37 | val srcID : Long 38 | ) 39 | 40 | 41 | /** 42 | * top level methods for watershed algorithm using pregel API 43 | */ 44 | object WaterShed { 45 | 46 | /** 47 | * Converts a DigitalElevationMap of the form x,y,z to 48 | * a RDD of Vertex , a RDD of Edge from which graphs can be created, 49 | * each co-ordinate now being represented in the form of a vertex 50 | * and the direction of flow of water from one co-ordinate to another being 51 | * represented in terms of an edge between the vertices (i.e. co-ordinates) 52 | * 53 | * @param path : location of the DigitalElevationMap(DEM) 54 | * @param sc : spark context 55 | * @param x_Point : x-Coordinate of point for which delineation needs to be done 56 | * @param y_Point : y-Coordinate of point for which delineation needs to be done 57 | * @return an object of GraphProperties comprising of 58 | * RDD of Vertices and it's properties 59 | * RDD of Edges and it's properties 60 | * RDD of mappings between co-ordinates and vertexId's 61 | * vertexID of the point for which delineation is to be performed 62 | */ 63 | private def conversion_DEMcsv_to_RDDed_RDDvd(path : String, sc : SparkContext, x_Point : Long, y_Point : Long) : GraphProperties = { 64 | 65 | //read csv file of the form < x,y,z > 66 | val temp_data = sc .textFile(path) 67 | .map(word => ((word .split(",")) 68 | .map(lit => lit.toDouble) 69 | .toList 70 | ).toVector) 71 | .map(word => word(0) -> (word(1) -> word(2))) 72 | 73 | val x_len = temp_data.groupBy(f => f._1).count 74 | val y_len = temp_data.groupBy(f => f._2._1).count 75 | 76 | //alter co-ordinate values, in case x and y values do not start from 0 77 | val data = temp_data.groupByKey.sortByKey(true) 78 | .zipWithIndex 79 | .map(word => word._1._2.map(w => w._1 -> (word._2 -> w._2)) ).flatMap(f => f) 80 | .groupByKey.sortByKey(true) 81 | .zipWithIndex 82 | .map(word => word._1._2.map(w => (w._1 -> word._2) -> w._2)).flatMap(f => f) 83 | .zipWithIndex 84 | .map(word => word._1._1 -> (word._1._2 -> word._2)) 85 | val mapper = data.map(word => word._1 -> word._2._2) 86 | 87 | //sub-routine to compute edges for each vertex 88 | val mod_data1 = data.map(word => ((word._1._1 + 1),word._1._2) -> word) 89 | val mod_data2 = data.map(word => ((word._1._1 - 1),word._1._2) -> word) 90 | val mod_data3 = data.map(word => (word._1._1,(word._1._2 - 1)) -> word) 91 | val mod_data4 = data.map(word => (word._1._1,(word._1._2 + 1)) -> word) 92 | val mod_data5 = data.map(word => ((word._1._1 + 1),(word._1._2 - 1)) -> word) 93 | val mod_data6 = data.map(word => ((word._1._1 + 1),(word._1._2 + 1)) -> word) 94 | val mod_data7 = data.map(word => ((word._1._1 - 1),(word._1._2 - 1)) -> word) 95 | val mod_data8 = data.map(word => ((word._1._1 - 1),(word._1._2 + 1)) -> word) 96 | 97 | 98 | val mod_data = 
mod_data1.union(mod_data2).union(mod_data3).union(mod_data4) 99 | .union(mod_data5).union(mod_data6).union(mod_data7) 100 | .union(mod_data8).filter(word => (word._1._1 >= 0) && (word._1._1 < x_len)) 101 | .filter(word => (word._1._2 >= 0) && (word._1._2 < y_len)) 102 | .groupByKey 103 | .sortByKey(true) 104 | .map(word => word._1 -> { 105 | 106 | /** 107 | * (NEW) 108 | * let it return all the nearest set of points 109 | */ 110 | word._2 111 | }) 112 | // creates an edge from vertex1 to all other vertices which have elevation less than or equal to vertex1 113 | val join = data.join(mod_data).map(w => w._1 -> (w._2._1 -> w._2._2.filter(f => f._2._1 <= w._2._1._1))) 114 | val edgeRDD : RDD[Edge[Int]] = join .map(word => word._2._1._2 -> word._2._2.map(f => f._2._2)) 115 | .map(f => f._2.map(d => f._1 -> d)) 116 | .flatMap(f => f) 117 | .map(f => Edge(f._1 , f._2,1)).coalesce(1) 118 | 119 | //sub-routine to compute vertices 120 | val vertexRDD : RDD[(VertexId, (Iterable[Long], Int, Int))] 121 | = join .map(word => word._2._1._2 -> (word._2._2.map(f => f._2._2) ,0,0)) 122 | .coalesce(1) 123 | 124 | val srcID = mapper.filter(f => f._1._1 == x_Point && f._1._2 == y_Point).first._2 125 | 126 | new GraphProperties(vertexRDD,edgeRDD,mapper,srcID) 127 | } 128 | 129 | /** 130 | * Computes all vertices in a directed graph which can reach to a given vertex 131 | * 132 | * @param path : location of the DigitalElevationMap(DEM) 133 | * @param sc : spark context 134 | * @param x_Point : x-Coordinate of point for which delineation needs to be done 135 | * @param y_Point : y-Coordinate of point for which delineation needs to be done 136 | * @return RDD of co-ordinates which can reach to the given vertex 137 | */ 138 | 139 | def connectedComponents_to_point(sc : SparkContext, path : String, x_Point : Long, y_Point : Long ) : RDD[(Long,Long)] = { 140 | 141 | 142 | val model = conversion_DEMcsv_to_RDDed_RDDvd(path, sc, x_Point , y_Point) 143 | 144 | //create graph from the returned RDD of vertices and edges 145 | val graph = Graph(model.RDDvertex, model.RDDedge) 146 | val sourceId: VertexId = model.srcID 147 | 148 | // Initialize the graph to identify the source vertex from which delineation needs to be performed 149 | val initialGraph = graph.mapVertices((id, word) => if (id == sourceId) (word._1,1 , 0) else word) 150 | 151 | //sub-routine for delineation computation 152 | val cctp = initialGraph.pregel(Seq(0L))( 153 | 154 | /** 155 | * Vertex Program 156 | */ 157 | (id, prop, newID) => 158 | { println("VERTEXVALUE : " + id +","+prop+","+newID) 159 | if(prop._3 == 0) { 160 | (prop._1,prop._2,1) 161 | } 162 | else { 163 | if(prop._2 == 1) { 164 | (prop._1,2,prop._3) 165 | } 166 | else if(prop._2 == 0){ 167 | if(prop._1.toSeq.map(f => if(newID.isEmpty) 0 else {if(newID.contains(f)) 1 else 0}) .filter(d => d==1).length > 0) 168 | { 169 | (prop._1,1,prop._3) 170 | } 171 | else prop 172 | } 173 | else prop 174 | 175 | } 176 | }, 177 | /** 178 | * Message computation for each triplet 179 | */ 180 | triplet => { 181 | 182 | if (triplet.dstAttr._2 == 1){ 183 | Iterator((triplet.srcId,Seq(triplet.dstId))) ++ Iterator((triplet.dstId,Seq(triplet.dstId)))//,(triplet.dstId,Seq(triplet.dstId))) 184 | } 185 | else if (triplet.dstAttr._2 == 2 && triplet.srcAttr._2 == 1) { 186 | Iterator((triplet.srcId,Seq(triplet.dstId))) 187 | } 188 | else { 189 | Iterator.empty 190 | } 191 | 192 | 193 | }, 194 | (a,b) => a++b //Merge message 195 | ) 196 | 197 | //vertices from the new graph which reach out to the vertex 198 | 
cctp.vertices.map(f => f._1.toLong -> f._2._2)
199 |       .join(model.RDDcoOrdinate_vertexID_Mapper.map(w => w._2 -> w._1))
200 |       .map(w => w._2._2 -> w._2._1).filter(word => word._2 == 2).map(f => f._1)
201 | 
202 | 
203 |   }
204 | }
--------------------------------------------------------------------------------
/mllib/src/main/scala/org/sparkalgos/mllib/join/knnJoin_Int.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 | */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import scala.math._
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.SparkContext._
23 | import org.apache.spark.rdd.RDD
24 | import scala.collection.immutable.Vector
25 | import scala.util.Random
26 | 
27 | object knnJoin_Int {
28 | 
29 |   /**
30 |    * Computes the nearest neighbors in the data-set for the data-point against which KNN
31 |    * has to be applied, for A SINGLE ITERATION
32 |    *
33 |    * @param rdd : RDD of Vectors of Int, which is the data-set in which knnJoin has to be
34 |    *            undertaken
35 |    * @param dataPoint : Vector of Int, which is the data-point with which knnJoin is done
36 |    *            with the data-set
37 |    * @param randPoint : Vector of Int, the random vector generated in each iteration
38 |    * @param len : the number of data-points from the data-set on which knnJoin is to be done
39 |    * @param zScore : RDD of (Long, BigInt), the (index, z-score) pair for each entry
40 |    *            of the dataset
41 |    * @param dataScore : BigInt z-score of the data-point
42 |    *
43 |    * @return an RDD of the nearest 2*len entries from the data-point on which KNN needs to be
44 |    *            undertaken for that iteration
45 |    */
46 |   def knnJoin_perIteration(rdd : RDD[(Vector[Int],Long)],
47 |                            dataPoint : Vector[Int],
48 |                            randPoint : Vector[Int],
49 |                            len : Int,
50 |                            zScore : RDD[(Long,BigInt)],
51 |                            dataScore : BigInt,
52 |                            sc : SparkContext) : RDD[(Vector[Int],Long)] = {
53 | 
54 |     // rdd with score greater than the z-score of the data-point
55 |     val greaterRDD = zScore.filter(word => word._2 > dataScore).
56 |       map(word => word._2 -> word._1).
57 |       sortByKey(true).
58 |       map(word => word._2). 
/mllib/src/main/scala/org/sparkalgos/mllib/join/knnJoin_Int.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import scala.math._
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.SparkContext._
23 | import org.apache.spark.rdd.RDD
24 | import scala.collection.immutable.Vector
25 | import scala.util.Random
26 | 
27 | object knnJoin_Int {
28 | 
29 |   /**
30 |    * Computes the nearest neighbors in the data-set for the data-point against which KNN
31 |    * has to be applied, for A SINGLE ITERATION
32 |    *
33 |    * @param rdd : RDD of Vectors of Int, which is the data-set in which knnJoin has to be
34 |    *              undertaken
35 |    * @param dataPoint : Vector of Int, which is the data-point with which knnJoin is done
36 |    *                    with the data-set
37 |    * @param randPoint : Vector of Int, the random vector generated in each iteration
38 |    * @param len : the number of data-points from the data-set on which knnJoin is to be done
39 |    * @param zScore : RDD of (Long,BigInt), which is the (line-number, z-score) pair for each
40 |    *                 entry of the dataset
41 |    * @param dataScore : BigInt z-score of the data-point
42 |    *
43 |    * @return an RDD of the nearest 2*len entries from the data-point on which KNN needs to be
44 |    *         undertaken for that iteration
45 |    */
46 |   def knnJoin_perIteration(rdd : RDD[(Vector[Int],Long)],
47 |                            dataPoint : Vector[Int],
48 |                            randPoint : Vector[Int],
49 |                            len : Int,
50 |                            zScore : RDD[(Long,BigInt)],
51 |                            dataScore : BigInt,
52 |                            sc : SparkContext) : RDD[(Vector[Int],Long)] = {
53 | 
54 |     // rdd with score greater than the z-score of the data-point
55 |     val greaterRDD = zScore.filter(word => word._2 > dataScore).
56 |       map(word => word._2 -> word._1).
57 |       sortByKey(true).
58 |       map(word => word._2).
59 |       zipWithIndex()
60 |     // rdd with score lesser than the z-score of the data-point
61 |     val lesserRDD = zScore.filter(word => word._2 < dataScore)
62 |       .map(word => word._2 -> word._1)
63 |       .sortByKey(false)
64 |       .map(word => word._2)
65 |       .zipWithIndex()
66 | 
67 | 
68 |     /**
69 |      * Need 2*len entries, hence the IF-ELSE construct to guarantee that many entries in
70 |      * the returned RDD.
71 |      * If both greaterRDD and lesserRDD hold at least len entries,
72 |      * extract len entries from each RDD.
73 |      */
74 | 
75 |     if((greaterRDD.count >= len) && (lesserRDD.count >= len)) {
76 |       val trim = greaterRDD.filter(word => word._2 < len).map(word => word._1).
77 |         union(lesserRDD.filter(word => word._2 < len).map(word => word._1))
78 | 
79 |       val join = rdd.map(word => word._2 -> word._1)
80 |         .join(trim.map(word => word -> 0))
81 |         .map(word => word._2._1 -> word._1)
82 |       join
83 |     }
84 |     /*
85 |     if greaterRDD holds fewer than len entries, extract all entries from greaterRDD
86 |     and len + (len - greaterRDD.count) entries from lesserRDD
87 |     */
88 |     else if(greaterRDD.count < len) {
89 | 
90 |       val lenMod = len + (len - greaterRDD.count)
91 |       val trim = greaterRDD.map(word => word._1)
92 |         .union(lesserRDD.filter(word => word._2 < lenMod)
93 |         .map(word => word._1))
94 | 
95 |       val join = rdd.map(word => word._2 -> word._1)
96 |         .join(trim.map(word => word -> 0))
97 |         .map(word => word._2._1 -> word._1)
98 |       join
99 |     }
100 | 
101 |     //if lesserRDD holds fewer than len entries,
102 |     //extract all entries from lesserRDD and
103 |     //len + (len - lesserRDD.count) entries from greaterRDD
104 |     else {
105 | 
106 |       val lenMod = len + (len - lesserRDD.count)
107 |       val trim = greaterRDD.filter(word => word._2 < lenMod).map(word => word._1)
108 |         .union(lesserRDD.map(word => word._1))
109 | 
110 |       val join = rdd.map(word => word._2 -> word._1)
111 |         .join(trim.map(word => word -> 0))
112 |         .map(word => word._2._1 -> word._1)
113 |       join
114 |     }
115 |   }
116 | 
117 |   /**
118 |    * Computes the nearest neighbors in the data-set for the data-point against which KNN
119 |    * has to be applied
120 |    *
121 |    * @param dataSet : RDD of Vectors of Int
122 |    * @param dataPoint : Vector of Int
123 |    * @param len : number of data-points of the dataSet on which knnJoin is to be done
124 |    * @param randomSize : the number of iterations to be carried out
125 |    *
126 |    * @return an RDD of Vectors of Int on which simple KNN needs to be applied with respect
127 |    *         to the data-point
128 |    */
129 |   def knnJoin(dataSet : RDD[Vector[Int]],
130 |               dataPoint : Vector[Int],
131 |               len : Int,
132 |               randomSize : Int,
133 |               sc : SparkContext): RDD[Vector[Int]] = {
134 | 
135 |     val size = dataSet.first().length
136 |     val rand = new Array[Int](size)
137 |     val randomValue = new Random
138 |     val rdd1 = dataSet.zipWithIndex()
139 | 
140 |     //compute z-values for the first iteration
141 |     val model = zScore.computeScore(rdd1)
142 |     val dataScore = zScore.scoreOfDataPoint(dataPoint)
143 | 
144 |     //for the first iteration the rand vector is the ZERO vector
145 |     for(count <- 0 to size-1) rand(count) = 0
146 | 
147 |     //compute nearest neighbours on the basis of z-scores
148 |     val c_i = knnJoin_perIteration(rdd1, dataPoint, rand.toVector, len, model, dataScore, sc)
149 |     c_i.persist()
150 | 
151 |     //compute -> the RDD to which the data-set generated by each iteration is appended
152 |     var compute = c_i
153 |     compute.persist()
154 | 
155 | 
156 |     //the number of iterations to be performed
157 |     for(count <- 2 to randomSize) {
158 | 
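      // Each pass shifts every vector (and the query point) by the same random
      // vector before re-computing z-scores: the shift changes the z-order of
      // the space, so each iteration can surface neighbours that the previous
      // orderings missed; the shift is subtracted again once neighbours are found.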
159 |       for(i <- 0 to size - 1) rand(i) = randomValue.nextInt(100)
160 | 
161 | 
162 |       //increment each element of the data-set with the random vector "rand"
163 |       val newRDD = rdd1.map(vector =>
164 |         vector._1.zipWithIndex.map { case (word, i) =>
165 |           word + rand(i % size)
166 |         } -> vector._2)
167 | 
168 | 
169 | 
170 |       val newData_point = dataPoint.zipWithIndex.map { case (word, i) =>
171 |         word + rand(i % size) }
172 | 
173 | 
174 |       //compute z-scores for the iteration
175 |       val modelLooped = zScore.computeScore(newRDD)
176 |       val data_scoreLooped = zScore.scoreOfDataPoint(newData_point)
177 | 
178 |       //compute nearest neighbours on the basis of z-scores
179 |       val c_iLooped = knnJoin_perIteration(newRDD, newData_point, rand.toVector,
180 |                                            len, modelLooped, data_scoreLooped, sc)
181 |       c_iLooped.persist()
182 | 
183 |       //remove the effect of the random vector "rand" from each entry of the returned RDD
184 |       //from knnJoin_perIteration
185 |       val c_iCleansedLooped = c_iLooped.map(line =>
186 |         line._1.zipWithIndex.map { case (word, i) =>
187 |           word - rand(i % size)
188 |         } -> line._2)
189 | 
190 |       compute = compute.union(c_iCleansedLooped)
191 |       compute.persist()
192 |     }
193 | 
194 |     zKNN(removeRedundantEntries(compute), dataPoint, len).coalesce(1)
195 |   }
196 | 
197 |   /**
198 |    * Removes redundant Vectors from the dataset
199 |    * @param DataSet : RDD of Vector[Int] paired with each vector's line number in the data-set
200 |    * @return : RDD of non-repetitive Vectors of Int
201 |    */
202 |   def removeRedundantEntries(DataSet : RDD[(Vector[Int],Long)]) : RDD[Vector[Int]] = {
203 |     DataSet.map(word => word._2 -> word._1).
204 |       groupByKey().
205 |       map(word => word._2.last)
206 | 
207 |   }
208 | 
209 |   /**
210 |    * Computes the euclidean distance between two vectors
211 |    *
212 |    * @param point1 : Vector of Int
213 |    * @param point2 : Vector of Int
214 |    * @return : euclidean distance between the two vectors
215 |    */
216 |   def euclideanDist(point1 : Vector[Int], point2 : Vector[Int]) : Double = {
217 |     var sum = 0.0
218 |     for(i <- 0 to point1.length-1) {
219 |       sum = sum + pow(point1(i) - point2(i), 2)
220 |     }
221 |     sqrt(sum)
222 |   }
223 | 
224 |   /**
225 |    * Performs kNN over the modified data-set and returns the k-nearest neighbors for
226 |    * the data-point
227 |    *
228 |    * @param reducedData : RDD of Vector of Int, which is the reduced data-set after the kNNJoin
229 |    *                      function has been applied to the data-set
230 |    * @param dataPoint : Vector of Int, the data-point for which kNN needs to be undertaken
231 |    * @param k : the number of neighbors to be computed
232 |    * @return : RDD of Vector of Int
233 |    */
234 |   def zKNN(reducedData : RDD[Vector[Int]],
235 |            dataPoint : Vector[Int], k : Int) : RDD[Vector[Int]] = {
236 | 
237 |     val distData = reducedData.map(word => euclideanDist(dataPoint, word) -> word)
238 |       .sortByKey(true)
239 |       .zipWithIndex()
240 |       .filter(word => word._2 < k).map(word => word._1._2)
241 |     distData
242 | 
243 |   }
244 | 
245 | }
246 | 
--------------------------------------------------------------------------------
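A minimal usage sketch for knnJoin_Int. This sketch is not part of the repository: the input format, parameter values, and example object name are assumptions for illustration, and it expects whitespace-separated integer rows such as might live in data/sample_knn_join_data.txt.

    import org.apache.spark.{SparkConf, SparkContext}
    import org.sparkAlgos.mllib.join.knnJoin_Int

    object KnnJoinExample {
      def main(args: Array[String]) {
        val sc = new SparkContext(new SparkConf().setAppName("KnnJoinExample").setMaster("local"))
        // each input line is a whitespace-separated row of integers
        val dataSet = sc.textFile("data/sample_knn_join_data.txt")
          .map(line => line.trim.split("\\s+").map(_.toInt).toVector)
        val dataPoint = Vector(3, 7, 1) // query point; must match the row dimensionality
        // the 5 nearest rows to dataPoint, using 4 random-shift iterations
        val neighbours = knnJoin_Int.knnJoin(dataSet, dataPoint, 5, 4, sc)
        neighbours.collect().foreach(println)
        sc.stop()
      }
    }

knnJoin_Long below has exactly the same shape for Vector[Long] data.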
/mllib/src/main/scala/org/sparkalgos/mllib/join/knnJoin_Long.scala:
--------------------------------------------------------------------------------
1 | /*
2 |  * Licensed to the Apache Software Foundation (ASF) under one or more
3 |  * contributor license agreements. See the NOTICE file distributed with
4 |  * this work for additional information regarding copyright ownership.
5 |  * The ASF licenses this file to You under the Apache License, Version 2.0
6 |  * (the "License"); you may not use this file except in compliance with
7 |  * the License. You may obtain a copy of the License at
8 |  *
9 |  *    http://www.apache.org/licenses/LICENSE-2.0
10 |  *
11 |  * Unless required by applicable law or agreed to in writing, software
12 |  * distributed under the License is distributed on an "AS IS" BASIS,
13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 |  * See the License for the specific language governing permissions and
15 |  * limitations under the License.
16 |  */
17 | 
18 | package org.sparkAlgos.mllib.join
19 | 
20 | import scala.math._
21 | import org.apache.spark.SparkContext
22 | import org.apache.spark.SparkContext._
23 | import org.apache.spark.rdd.RDD
24 | import scala.collection.immutable.Vector
25 | import scala.util.Random
26 | 
27 | object knnJoin_Long {
28 | 
29 |   /**
30 |    * Computes the nearest neighbors in the data-set for the data-point against which KNN
31 |    * has to be applied, for A SINGLE ITERATION
32 |    *
33 |    * @param rdd : RDD of Vectors of Long, which is the data-set in which knnJoin has
34 |    *              to be undertaken
35 |    * @param dataPoint : Vector of Long, which is the data-point with which knnJoin is
36 |    *                    done with the data-set
37 |    * @param randPoint : Vector of Long, the random vector generated in each iteration
38 |    * @param len : the number of data-points from the data-set on which knnJoin is to be done
39 |    * @param zScore : RDD of (Long,BigInt), which is the (line-number, z-score) pair for each
40 |    *                 entry of the dataset
41 |    * @param dataScore : BigInt z-score of the data-point
42 |    *
43 |    * @return an RDD of the nearest 2*len entries from the data-point on which KNN needs to
44 |    *         be undertaken for that iteration
45 |    */
46 |   def knnJoin_perIteration(rdd : RDD[(Vector[Long],Long)],
47 |                            dataPoint : Vector[Long],
48 |                            randPoint : Vector[Long],
49 |                            len : Int,
50 |                            zScore : RDD[(Long,BigInt)],
51 |                            dataScore : BigInt,
52 |                            sc : SparkContext) : RDD[(Vector[Long],Long)] = {
53 | 
54 | 
55 |     // rdd with score greater than the z-score of the data-point
56 |     val greaterRDD = zScore.filter(word => word._2 > dataScore).
57 |       map(word => word._2 -> word._1).
58 |       sortByKey(true).
59 |       map(word => word._2).
60 |       zipWithIndex()
61 |     // rdd with score lesser than the z-score of the data-point
62 |     val lesserRDD = zScore.filter(word => word._2 < dataScore)
63 |       .map(word => word._2 -> word._1)
64 |       .sortByKey(false)
65 |       .map(word => word._2)
66 |       .zipWithIndex()
67 | 
68 | 
69 |     /**
70 |      * Need 2*len entries, hence the IF-ELSE construct to guarantee that many entries in
71 |      * the returned RDD.
72 |      * If both greaterRDD and lesserRDD hold at least len entries,
73 |      * extract len entries from each RDD.
74 |      */
75 | 
76 |     if((greaterRDD.count >= len) && (lesserRDD.count >= len)) {
77 |       val trim = greaterRDD.filter(word => word._2 < len).map(word => word._1).
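        // keep the len nearest ids from each side of the data-point's z-score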
78 |         union(lesserRDD.filter(word => word._2 < len).map(word => word._1))
79 | 
80 |       val join = rdd.map(word => word._2 -> word._1)
81 |         .join(trim.map(word => word -> 0))
82 |         .map(word => word._2._1 -> word._1)
83 |       join
84 |     }
85 |     /*
86 |     if greaterRDD holds fewer than len entries, extract all entries from greaterRDD
87 |     and len + (len - greaterRDD.count) entries from lesserRDD
88 |     */
89 |     else if(greaterRDD.count < len) {
90 | 
91 |       val lenMod = len + (len - greaterRDD.count)
92 |       val trim = greaterRDD.map(word => word._1)
93 |         .union(lesserRDD.filter(word => word._2 < lenMod)
94 |         .map(word => word._1))
95 | 
96 |       val join = rdd.map(word => word._2 -> word._1)
97 |         .join(trim.map(word => word -> 0))
98 |         .map(word => word._2._1 -> word._1)
99 |       join
100 |     }
101 | 
102 |     //if lesserRDD holds fewer than len entries,
103 |     //extract all entries from lesserRDD and
104 |     //len + (len - lesserRDD.count) entries from greaterRDD
105 |     else {
106 | 
107 |       val lenMod = len + (len - lesserRDD.count)
108 |       val trim = greaterRDD.filter(word => word._2 < lenMod).map(word => word._1)
109 |         .union(lesserRDD.map(word => word._1))
110 | 
111 |       val join = rdd.map(word => word._2 -> word._1)
112 |         .join(trim.map(word => word -> 0))
113 |         .map(word => word._2._1 -> word._1)
114 |       join
115 |     }
116 |   }
117 | 
118 |   /**
119 |    * Computes the nearest neighbors in the data-set for the data-point against which KNN
120 |    * has to be applied
121 |    *
122 |    * @param dataSet : RDD of Vectors of Long
123 |    * @param dataPoint : Vector of Long
124 |    * @param len : number of data-points of the dataSet on which knnJoin is to be done
125 |    * @param randomSize : the number of iterations to be carried out
126 |    *
127 |    * @return an RDD of Vectors of Long on which simple KNN needs to be applied with
128 |    *         respect to the data-point
129 |    */
130 |   def knnJoin(dataSet : RDD[Vector[Long]],
131 |               dataPoint : Vector[Long],
132 |               len : Int,
133 |               randomSize : Int,
134 |               sc : SparkContext): RDD[Vector[Long]] = {
135 | 
136 |     val size = dataSet.first().length
137 |     val rand = new Array[Long](size)
138 |     val randomValue = new Random
139 |     val rdd1 = dataSet.zipWithIndex()
140 | 
141 |     //compute z-values for the first iteration
142 |     val model = zScore.computeScore(rdd1)
143 |     val dataScore = zScore.scoreOfDataPoint(dataPoint)
144 | 
145 |     //for the first iteration the rand vector is the ZERO vector
146 |     for(count <- 0 to size-1) rand(count) = 0
147 | 
148 |     //compute nearest neighbours on the basis of z-scores
149 |     val c_i = knnJoin_perIteration(rdd1, dataPoint, rand.toVector, len, model, dataScore, sc)
150 |     c_i.persist()
151 | 
152 |     //compute -> the RDD to which the data-set generated by each iteration is appended
153 |     var compute = c_i
154 |     compute.persist()
155 | 
156 | 
157 |     //the number of iterations to be performed
158 |     for(count <- 2 to randomSize) {
159 | 
160 |       for(i <- 0 to size - 1) rand(i) = randomValue.nextInt(100).toLong
161 | 
162 | 
163 |       //increment each element of the data-set with the random vector "rand"
164 |       val newRDD = rdd1.map(vector =>
165 |         vector._1.zipWithIndex.map { case (word, i) =>
166 |           word + rand(i % size)
167 |         } -> vector._2)
168 | 
169 | 
170 | 
171 |       val newData_point = dataPoint.zipWithIndex.map { case (word, i) =>
172 |         word + rand(i % size) }
173 | 
174 | 
175 |       //compute z-scores for the iteration
176 |       val modelLooped = zScore.computeScore(newRDD)
177 |       val data_scoreLooped = zScore.scoreOfDataPoint(newData_point)
178 | 
179 |       //compute nearest neighbours on the basis of z-scores
180 |       val c_iLooped = knnJoin_perIteration(newRDD, newData_point, rand.toVector,
181 |                                            len, modelLooped, data_scoreLooped, sc)
182 |       c_iLooped.persist()
183 | 
184 |       //remove the effect of the random vector "rand" from each entry of the returned RDD from
185 |       //knnJoin_perIteration
186 |       val c_iCleansedLooped = c_iLooped.map(line =>
187 |         line._1.zipWithIndex.map { case (word, i) =>
188 |           word - rand(i % size)
189 |         } -> line._2)
190 | 
191 |       compute = compute.union(c_iCleansedLooped)
192 |       compute.persist()
193 |     }
194 | 
195 |     zKNN(removeRedundantEntries(compute), dataPoint, len).coalesce(1)
196 |   }
197 | 
198 |   /**
199 |    * Removes redundant Vectors from the dataset
200 |    * @param DataSet : RDD of Vector[Long] paired with each vector's line number in the data-set
201 |    * @return : RDD of non-repetitive Vectors of Long
202 |    */
203 |   def removeRedundantEntries(DataSet : RDD[(Vector[Long],Long)]) : RDD[Vector[Long]] = {
204 |     DataSet.map(word => word._2 -> word._1).
205 |       groupByKey().
206 |       map(word => word._2.last)
207 | 
208 |   }
209 | 
210 |   /**
211 |    * Computes the euclidean distance between two vectors
212 |    *
213 |    * @param point1 : Vector of Long
214 |    * @param point2 : Vector of Long
215 |    * @return : euclidean distance between the two vectors
216 |    */
217 |   def euclideanDist(point1 : Vector[Long], point2 : Vector[Long]) : Double = {
218 |     var sum = 0.0
219 |     for(i <- 0 to point1.length-1) {
220 |       sum = sum + pow(point1(i) - point2(i), 2)
221 |     }
222 |     sqrt(sum)
223 |   }
224 | 
225 |   /**
226 |    * Performs kNN over the modified data-set and returns the k-nearest neighbors for the data-point
227 |    *
228 |    * @param reducedData : RDD of Vector of Long, which is the reduced data-set after the kNNJoin
229 |    *                      function has been applied to the data-set
230 |    * @param dataPoint : Vector of Long, the data-point for which kNN needs to be undertaken
231 |    * @param k : the number of neighbors to be computed
232 |    * @return : RDD of Vector of Long
233 |    */
234 |   def zKNN(reducedData : RDD[Vector[Long]], dataPoint : Vector[Long], k : Int) : RDD[Vector[Long]] = {
235 |     val distData = reducedData.map(word => euclideanDist(dataPoint, word) -> word)
236 |       .sortByKey(true)
237 |       .zipWithIndex()
238 |       .filter(word => word._2 < k).map(word => word._1._2)
239 |     distData
240 | 
241 |   }
242 | 
243 | }
244 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by
13 |       the copyright owner that is granting the License.
14 | 
15 |       "Legal Entity" shall mean the union of the acting entity and all
16 |       other entities that control, are controlled by, or are under common
17 |       control with that entity. For the purposes of this definition,
18 |       "control" means (i) the power, direct or indirect, to cause the
19 |       direction or management of such entity, whether by contract or
20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 |       outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 |       "You" (or "Your") shall mean an individual or Legal Entity
24 |       exercising permissions granted by this License.
25 | 
26 |       "Source" form shall mean the preferred form for making modifications,
27 |       including but not limited to software source code, documentation
28 |       source, and configuration files.
29 | 
30 |       "Object" form shall mean any form resulting from mechanical
31 |       transformation or translation of a Source form, including but
32 |       not limited to compiled object code, generated documentation,
33 |       and conversions to other media types.
34 | 
35 |       "Work" shall mean the work of authorship, whether in Source or
36 |       Object form, made available under the License, as indicated by a
37 |       copyright notice that is included in or attached to the work
38 |       (an example is provided in the Appendix below).
39 | 
40 |       "Derivative Works" shall mean any work, whether in Source or Object
41 |       form, that is based on (or derived from) the Work and for which the
42 |       editorial revisions, annotations, elaborations, or other modifications
43 |       represent, as a whole, an original work of authorship. For the purposes
44 |       of this License, Derivative Works shall not include works that remain
45 |       separable from, or merely link (or bind by name) to the interfaces of,
46 |       the Work and Derivative Works thereof.
47 | 
48 |       "Contribution" shall mean any work of authorship, including
49 |       the original version of the Work and any modifications or additions
50 |       to that Work or Derivative Works thereof, that is intentionally
51 |       submitted to Licensor for inclusion in the Work by the copyright owner
52 |       or by an individual or Legal Entity authorized to submit on behalf of
53 |       the copyright owner. For the purposes of this definition, "submitted"
54 |       means any form of electronic, verbal, or written communication sent
55 |       to the Licensor or its representatives, including but not limited to
56 |       communication on electronic mailing lists, source code control systems,
57 |       and issue tracking systems that are managed by, or on behalf of, the
58 |       Licensor for the purpose of discussing and improving the Work, but
59 |       excluding communication that is conspicuously marked or otherwise
60 |       designated in writing by the copyright owner as "Not a Contribution."
61 | 
62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
63 |       on behalf of whom a Contribution has been received by Licensor and
64 |       subsequently incorporated within the Work.
65 | 
66 |    2. Grant of Copyright License. Subject to the terms and conditions of
67 |       this License, each Contributor hereby grants to You a perpetual,
68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 |       copyright license to reproduce, prepare Derivative Works of,
70 |       publicly display, publicly perform, sublicense, and distribute the
71 |       Work and such Derivative Works in Source or Object form.
72 | 
73 |    3. Grant of Patent License. Subject to the terms and conditions of
74 |       this License, each Contributor hereby grants to You a perpetual,
75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 |       (except as stated in this section) patent license to make, have made,
77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
78 |       where such license applies only to those patent claims licensable
79 |       by such Contributor that are necessarily infringed by their
80 |       Contribution(s) alone or by combination of their Contribution(s)
81 |       with the Work to which such Contribution(s) was submitted. If You
82 |       institute patent litigation against any entity (including a
83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
84 |       or a Contribution incorporated within the Work constitutes direct
85 |       or contributory patent infringement, then any patent licenses
86 |       granted to You under this License for that Work shall terminate
87 |       as of the date such litigation is filed.
88 | 
89 |    4. Redistribution. You may reproduce and distribute copies of the
90 |       Work or Derivative Works thereof in any medium, with or without
91 |       modifications, and in Source or Object form, provided that You
92 |       meet the following conditions:
93 | 
94 |       (a) You must give any other recipients of the Work or
95 |           Derivative Works a copy of this License; and
96 | 
97 |       (b) You must cause any modified files to carry prominent notices
98 |           stating that You changed the files; and
99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!) The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
203 | 
--------------------------------------------------------------------------------