├── project ├── build.properties └── plugins.sbt ├── images ├── MCL.png ├── Expansion.png ├── Inflation.png └── Difference.png ├── src ├── test │ ├── resources │ │ ├── MCL │ │ │ ├── clusters.tab │ │ │ ├── karateNodes.csv │ │ │ └── karateEdges.csv │ │ └── MCLUtils │ │ │ ├── OrientedNodes.txt │ │ │ ├── OrientedEdges.txt │ │ │ └── OrientedMatrixSelfLoop.txt │ └── scala │ │ ├── org │ │ └── apache │ │ │ └── spark │ │ │ └── mllib │ │ │ └── clustering │ │ │ ├── MCLFunSuite.scala │ │ │ ├── MCLModelSuite.scala │ │ │ ├── MCLUtilsSuite.scala │ │ │ └── MCLSuite.scala │ │ └── MainSuite.scala └── main │ └── scala │ ├── org │ └── apache │ │ └── spark │ │ └── mllib │ │ └── clustering │ │ ├── MCLModel.scala │ │ ├── MCLUtils.scala │ │ └── MCL.scala │ └── Main.scala ├── .gitignore ├── LICENSE.txt ├── .travis.yml └── README.md /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.11 -------------------------------------------------------------------------------- /images/MCL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joandre/MCL_spark/HEAD/images/MCL.png -------------------------------------------------------------------------------- /images/Expansion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joandre/MCL_spark/HEAD/images/Expansion.png -------------------------------------------------------------------------------- /images/Inflation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joandre/MCL_spark/HEAD/images/Inflation.png -------------------------------------------------------------------------------- /images/Difference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joandre/MCL_spark/HEAD/images/Difference.png 
-------------------------------------------------------------------------------- /src/test/resources/MCL/clusters.tab: -------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 11 12 13 14 18 20 22 17 2 | 9 31 10 28 29 33 34 15 16 19 21 23 24 30 27 3 | 32 26 25 4 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.4.0") 2 | 3 | resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/" 4 | 5 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.4") -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | # sbt specific 4 | .cache 5 | .lib/ 6 | dist/* 7 | target/ 8 | lib_managed/ 9 | src_managed/ 10 | project/boot/ 11 | project/plugins/project/ 12 | # Scala-IDE specific 13 | .idea/ 14 | .scala_dependencies 15 | .worksheet -------------------------------------------------------------------------------- /src/test/resources/MCL/karateNodes.csv: -------------------------------------------------------------------------------- 1 | 1 "1" 2 | 2 "2" 3 | 3 "3" 4 | 4 "4" 5 | 5 "5" 6 | 6 "6" 7 | 7 "7" 8 | 8 "8" 9 | 9 "9" 10 | 10 "10" 11 | 11 "11" 12 | 12 "12" 13 | 13 "13" 14 | 14 "14" 15 | 15 "15" 16 | 16 "16" 17 | 17 "17" 18 | 18 "18" 19 | 19 "19" 20 | 20 "20" 21 | 21 "21" 22 | 22 "22" 23 | 23 "23" 24 | 24 "24" 25 | 25 "25" 26 | 26 "26" 27 | 27 "27" 28 | 28 "28" 29 | 29 "29" 30 | 30 "30" 31 | 31 "31" 32 | 32 "32" 33 | 33 "33" 34 | 34 "34" 35 | -------------------------------------------------------------------------------- /src/test/resources/MCLUtils/OrientedNodes.txt: -------------------------------------------------------------------------------- 1 | 1 "0" 2 | 2 
"1" 3 | 3 "2" 4 | 4 "3" 5 | 5 "4" 6 | 6 "5" 7 | 7 "6" 8 | 8 "7" 9 | 9 "8" 10 | 10 "9" 11 | 11 "10" 12 | 12 "11" 13 | 13 "12" 14 | 14 "13" 15 | 15 "14" 16 | 16 "15" 17 | 17 "16" 18 | 18 "17" 19 | 19 "18" 20 | 20 "19" 21 | 21 "20" 22 | 22 "21" 23 | 23 "22" 24 | 24 "23" 25 | 25 "24" 26 | 26 "25" 27 | 27 "26" 28 | 28 "27" 29 | 29 "28" 30 | 30 "29" 31 | 31 "30" 32 | 32 "31" 33 | 33 "32" 34 | 34 "33" 35 | 35 "34" 36 | 36 "35" 37 | 37 "36" 38 | 38 "37" 39 | 39 "38" 40 | 40 "39" 41 | 41 "40" 42 | 42 "41" 43 | 43 "42" 44 | 44 "43" 45 | 45 "44" 46 | 46 "45" 47 | 47 "46" 48 | 48 "47" 49 | 49 "48" 50 | 50 "49" 51 | -------------------------------------------------------------------------------- /src/test/resources/MCL/karateEdges.csv: -------------------------------------------------------------------------------- 1 | 1 2 4 2 | 1 3 5 3 | 1 4 3 4 | 1 5 3 5 | 1 6 3 6 | 1 7 3 7 | 1 8 2 8 | 1 9 2 9 | 1 11 2 10 | 1 12 3 11 | 1 13 1 12 | 1 14 3 13 | 1 18 2 14 | 1 20 2 15 | 1 22 2 16 | 1 32 2 17 | 2 3 6 18 | 2 4 3 19 | 2 8 4 20 | 2 14 5 21 | 2 18 1 22 | 2 20 2 23 | 2 22 2 24 | 2 31 2 25 | 3 4 3 26 | 3 8 4 27 | 3 9 5 28 | 3 10 1 29 | 3 14 3 30 | 3 28 2 31 | 3 29 2 32 | 3 33 2 33 | 4 8 3 34 | 4 13 3 35 | 4 14 3 36 | 5 7 2 37 | 5 11 3 38 | 6 7 5 39 | 6 11 3 40 | 6 17 3 41 | 7 17 3 42 | 9 31 3 43 | 9 33 3 44 | 9 34 4 45 | 10 34 2 46 | 14 34 3 47 | 15 33 3 48 | 15 34 2 49 | 16 33 3 50 | 16 34 4 51 | 19 33 1 52 | 19 34 2 53 | 20 34 1 54 | 21 33 3 55 | 21 34 1 56 | 23 33 2 57 | 23 34 3 58 | 24 26 5 59 | 24 28 4 60 | 24 30 3 61 | 24 33 5 62 | 24 34 4 63 | 25 26 2 64 | 25 28 3 65 | 25 32 2 66 | 26 32 7 67 | 27 30 4 68 | 27 34 2 69 | 28 34 4 70 | 29 32 2 71 | 29 34 2 72 | 30 33 4 73 | 30 34 2 74 | 31 33 3 75 | 31 34 3 76 | 32 33 4 77 | 32 34 4 78 | 33 34 5 79 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2017, 
Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2015-2016, Joan André 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | language: scala 24 | script: 25 | - sbt clean coverage test coverageReport 26 | after_success: 27 | - bash <(curl -s https://codecov.io/bash) 28 | scala: 29 | - 2.10.5 30 | - 2.11.8 31 | jdk: 32 | - oraclejdk8 33 | env: 34 | - SPARKVERSION="2.0.1" 35 | 36 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/mllib/clustering/MCLFunSuite.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.*/ 22 | 23 | package org.apache.spark.mllib.clustering 24 | 25 | import com.holdenkarau.spark.testing.{DatasetSuiteBase, SharedSparkContext} 26 | import org.scalatest.{FunSuite, Matchers, Tag} 27 | 28 | /** MCL specific implementation of Scala Test Suite */ 29 | //TODO Why spark ? 30 | private[spark] abstract class MCLFunSuite extends FunSuite with Matchers with SharedSparkContext with DatasetSuiteBase 31 | 32 | object UnitTest extends Tag("org.apache.spark.mllib.clustering.tags.UnitTest") 33 | object IntegrationTest extends Tag("org.apache.spark.mllib.clustering.tags.IntegrationTest") -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/mllib/clustering/MCLModelSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.clustering 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.graphx._ 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.util.Utils 8 | 9 | /** 10 | * Created by andrejoan on 4/30/16. 
11 | */ 12 | class MCLModelSuite extends MCLFunSuite{ 13 | // Disable Spark messages when running program 14 | Logger.getLogger("org").setLevel(Level.OFF) 15 | Logger.getLogger("akka").setLevel(Level.OFF) 16 | 17 | test("model save/load", UnitTest){ 18 | 19 | val users: RDD[(VertexId, String)] = 20 | sc.parallelize(Array((0L,"Node1"), (1L,"Node2"), 21 | (2L,"Node3"), (3L,"Node4"),(4L,"Node5"), 22 | (5L,"Node6"), (6L,"Node7"), (7L, "Node8"), 23 | (8L, "Node9"), (9L, "Node10"), (10L, "Node11"))) 24 | 25 | val relationships: RDD[Edge[Double]] = 26 | sc.parallelize( 27 | Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), 28 | Edge(0, 2, 1.0), Edge(2, 0, 1.0), 29 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 30 | Edge(1, 2, 1.0), Edge(2, 1, 1.0), 31 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 32 | Edge(2, 3, 1.0), Edge(3, 2, 1.0), 33 | Edge(4, 5, 1.0), Edge(5, 4, 1.0), 34 | Edge(4, 6, 1.0), Edge(6, 4, 1.0), 35 | Edge(4, 7, 1.0), Edge(7, 4, 1.0), 36 | Edge(5, 6, 1.0), Edge(6, 5, 1.0), 37 | Edge(5, 7, 1.0), Edge(7, 5, 1.0), 38 | Edge(6, 7, 1.0), Edge(7, 6, 1.0), 39 | Edge(3, 8, 1.0), Edge(8, 3, 1.0), 40 | Edge(9, 8, 1.0), Edge(8, 9, 1.0), 41 | Edge(9, 10, 1.0), Edge(10, 9, 1.0), 42 | Edge(4, 10, 1.0), Edge(10, 4, 1.0) 43 | )) 44 | 45 | val graph = Graph(users, relationships) 46 | 47 | val model: MCLModel = MCL.train(graph) 48 | 49 | // Check number of clusters 50 | model.nbClusters shouldEqual 3 51 | 52 | // Check save and load methods 53 | val tempDir = Utils.createTempDir() 54 | val path = tempDir.toURI.toString 55 | 56 | Array(true, false).foreach { case selector => 57 | // Save model, load it back, and compare. 
58 | try { 59 | model.save(sc, path) 60 | val sameModel = MCLModel.load(sc, path) 61 | assertDatasetEquals(model.assignments.orderBy("id"), sameModel.assignments.orderBy("id")) 62 | } finally { 63 | Utils.deleteRecursively(tempDir) 64 | } 65 | } 66 | 67 | } 68 | 69 | test("nodes assignments", UnitTest) { 70 | val nodeId = 1.0.toLong 71 | val cluster = 2.0.toLong 72 | val newAssignment:Assignment = Assignment.apply(Row(nodeId, cluster)) 73 | 74 | newAssignment.id shouldEqual nodeId 75 | newAssignment.cluster shouldEqual cluster 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/test/scala/MainSuite.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.*/ 22 | 23 | import org.scalatest.{FunSuite, Matchers} 24 | 25 | class MainSuite extends FunSuite with Matchers{ 26 | 27 | test("toInt"){ 28 | val eR = Main.toInt(Symbol("expansionRate"), "2") 29 | eR shouldEqual 2 30 | 31 | an [Exception] should be thrownBy Main.toInt(Symbol("expansionRate"), "1.1") 32 | } 33 | 34 | test("toDouble"){ 35 | val iR = Main.toDouble(Symbol("inflationRate"), "2.0") 36 | iR shouldEqual 2.0 37 | 38 | an [Exception] should be thrownBy Main.toDouble(Symbol("inflationRate"), "test") 39 | } 40 | 41 | test("nextOption"){ 42 | val args: Array[String] = Array("--expansionRate", "3", "--inflationRate", "3.0", "--epsilon", "0.1", "--maxIterations", "20", "--selfLoopWeight", "0.1", "--graphOrientationStrategy", "directed") 43 | val arglist = args.toList 44 | 45 | val options = Main.nextOption(Map(),arglist) 46 | Main.toInt('expansionRate, options.getOrElse('expansionRate, 2).toString) shouldEqual 3 47 | Main.toDouble('inflationRate, options.getOrElse('inflationRate, 2.0).toString) shouldEqual 3.0 48 | Main.toDouble('epsilon, options.getOrElse('epsilon, 0.01).toString) shouldEqual 0.1 49 | Main.toInt('maxIterations, options.getOrElse('maxIterations, 10).toString) shouldEqual 20 50 | Main.toDouble('selfLoopWeight, options.getOrElse('selfLoopWeight, 1.0).toString) shouldEqual 0.1 51 | options.getOrElse('graphOrientationStrategy, "undirected").toString shouldEqual "directed" 52 | 53 | val args2: Array[String] = Array("--wrongOption", "test") 54 | val arglist2 = args2.toList 55 | 56 | an [Exception] should be thrownBy Main.nextOption(Map(),arglist2) 57 | } 58 | 59 | /*test("main"){ 60 | val args: Array[String] = Array("--expansionRate", "2", "--inflationRate", "2.0", "--epsilon", 
"0.01", "--maxIterations", "10", "--selfLoopWeight", "1", "--graphOrientationStrategy", "undirected") 61 | 62 | val streamIM = new java.io.ByteArrayOutputStream() 63 | Console.withOut(streamIM) { 64 | Main.main(args) 65 | } 66 | 67 | streamIM.toString.split("\n") should contain theSameElementsAs Array("0 => List(0, 1, 2, 3)", "4 => List(4, 5, 6, 7)", "9 => List(8, 9, 10)").toSeq 68 | }*/ 69 | } 70 | -------------------------------------------------------------------------------- /src/test/resources/MCLUtils/OrientedEdges.txt: -------------------------------------------------------------------------------- 1 | 1 1 1.0 2 | 1 9 1.0 3 | 1 18 1.0 4 | 1 20 1.0 5 | 1 21 1.0 6 | 1 24 1.0 7 | 1 35 1.0 8 | 1 43 1.0 9 | 1 45 1.0 10 | 1 46 1.0 11 | 2 7 1.0 12 | 2 11 1.0 13 | 2 12 1.0 14 | 2 13 1.0 15 | 2 18 1.0 16 | 2 24 1.0 17 | 2 28 1.0 18 | 2 31 1.0 19 | 2 40 1.0 20 | 3 9 1.0 21 | 3 13 1.0 22 | 3 15 1.0 23 | 3 21 1.0 24 | 3 23 1.0 25 | 3 24 1.0 26 | 3 29 1.0 27 | 3 30 1.0 28 | 3 38 1.0 29 | 3 42 1.0 30 | 3 50 1.0 31 | 4 35 1.0 32 | 4 40 1.0 33 | 4 45 1.0 34 | 4 46 1.0 35 | 4 48 1.0 36 | 4 49 1.0 37 | 5 14 1.0 38 | 5 15 1.0 39 | 5 19 1.0 40 | 5 22 1.0 41 | 5 26 1.0 42 | 5 28 1.0 43 | 5 34 1.0 44 | 5 36 1.0 45 | 5 40 1.0 46 | 6 20 1.0 47 | 6 21 1.0 48 | 6 23 1.0 49 | 6 26 1.0 50 | 6 28 1.0 51 | 6 36 1.0 52 | 6 38 1.0 53 | 6 44 1.0 54 | 7 8 1.0 55 | 7 12 1.0 56 | 7 15 1.0 57 | 7 17 1.0 58 | 7 20 1.0 59 | 7 22 1.0 60 | 7 25 1.0 61 | 7 26 1.0 62 | 7 28 1.0 63 | 7 36 1.0 64 | 7 42 1.0 65 | 8 25 1.0 66 | 8 26 1.0 67 | 8 28 1.0 68 | 8 35 1.0 69 | 8 37 1.0 70 | 8 38 1.0 71 | 8 39 1.0 72 | 9 10 1.0 73 | 9 11 1.0 74 | 9 17 1.0 75 | 9 18 1.0 76 | 9 34 1.0 77 | 9 46 1.0 78 | 9 47 1.0 79 | 10 13 1.0 80 | 10 21 1.0 81 | 10 22 1.0 82 | 10 24 1.0 83 | 10 28 1.0 84 | 10 34 1.0 85 | 10 48 1.0 86 | 10 49 1.0 87 | 10 50 1.0 88 | 11 18 1.0 89 | 11 22 1.0 90 | 11 24 1.0 91 | 11 26 1.0 92 | 11 27 1.0 93 | 11 34 1.0 94 | 11 36 1.0 95 | 11 37 1.0 96 | 11 44 1.0 97 | 12 28 1.0 98 | 12 33 1.0 99 | 12 
36 1.0 100 | 12 38 1.0 101 | 12 50 1.0 102 | 13 17 1.0 103 | 13 19 1.0 104 | 13 23 1.0 105 | 13 29 1.0 106 | 13 31 1.0 107 | 13 39 1.0 108 | 13 41 1.0 109 | 13 46 1.0 110 | 13 50 1.0 111 | 14 21 1.0 112 | 14 24 1.0 113 | 14 26 1.0 114 | 14 35 1.0 115 | 14 45 1.0 116 | 15 20 1.0 117 | 15 22 1.0 118 | 15 25 1.0 119 | 15 29 1.0 120 | 15 43 1.0 121 | 15 46 1.0 122 | 15 47 1.0 123 | 16 19 1.0 124 | 16 26 1.0 125 | 16 33 1.0 126 | 16 38 1.0 127 | 16 40 1.0 128 | 16 44 1.0 129 | 16 45 1.0 130 | 16 50 1.0 131 | 17 18 1.0 132 | 17 20 1.0 133 | 17 24 1.0 134 | 17 26 1.0 135 | 17 27 1.0 136 | 17 28 1.0 137 | 17 30 1.0 138 | 17 44 1.0 139 | 17 45 1.0 140 | 18 20 1.0 141 | 18 31 1.0 142 | 18 36 1.0 143 | 18 37 1.0 144 | 18 40 1.0 145 | 19 30 1.0 146 | 19 35 1.0 147 | 19 36 1.0 148 | 19 37 1.0 149 | 19 39 1.0 150 | 19 43 1.0 151 | 19 46 1.0 152 | 20 24 1.0 153 | 20 27 1.0 154 | 20 34 1.0 155 | 20 45 1.0 156 | 20 47 1.0 157 | 21 22 1.0 158 | 21 31 1.0 159 | 21 32 1.0 160 | 22 23 1.0 161 | 22 25 1.0 162 | 22 27 1.0 163 | 22 28 1.0 164 | 22 30 1.0 165 | 22 32 1.0 166 | 22 36 1.0 167 | 22 47 1.0 168 | 23 32 1.0 169 | 23 36 1.0 170 | 23 37 1.0 171 | 23 38 1.0 172 | 23 39 1.0 173 | 23 43 1.0 174 | 23 44 1.0 175 | 23 45 1.0 176 | 24 28 1.0 177 | 24 40 1.0 178 | 24 47 1.0 179 | 25 34 1.0 180 | 26 27 1.0 181 | 26 33 1.0 182 | 26 34 1.0 183 | 26 35 1.0 184 | 26 42 1.0 185 | 26 43 1.0 186 | 27 30 1.0 187 | 27 32 1.0 188 | 27 33 1.0 189 | 27 35 1.0 190 | 27 38 1.0 191 | 27 41 1.0 192 | 27 44 1.0 193 | 28 32 1.0 194 | 28 42 1.0 195 | 28 43 1.0 196 | 28 46 1.0 197 | 29 31 1.0 198 | 29 32 1.0 199 | 29 37 1.0 200 | 29 40 1.0 201 | 30 31 1.0 202 | 31 36 1.0 203 | 31 48 1.0 204 | 31 49 1.0 205 | 32 37 1.0 206 | 32 40 1.0 207 | 32 46 1.0 208 | 32 48 1.0 209 | 33 35 1.0 210 | 33 44 1.0 211 | 33 49 1.0 212 | 33 50 1.0 213 | 34 38 1.0 214 | 34 42 1.0 215 | 34 44 1.0 216 | 34 47 1.0 217 | 35 36 1.0 218 | 35 42 1.0 219 | 35 47 1.0 220 | 35 49 1.0 221 | 36 37 1.0 222 | 36 38 1.0 223 | 36 41 1.0 224 | 36 
43 1.0 225 | 36 47 1.0 226 | 37 41 1.0 227 | 37 48 1.0 228 | 38 45 1.0 229 | 38 48 1.0 230 | 39 50 1.0 231 | 40 41 1.0 232 | 40 42 1.0 233 | 40 50 1.0 234 | 41 42 1.0 235 | 41 45 1.0 236 | 41 47 1.0 237 | 42 47 1.0 238 | 43 44 1.0 239 | 43 46 1.0 240 | 43 49 1.0 241 | 44 47 1.0 242 | 50 50 1.0 243 | 45 49 1.0 244 | 46 49 1.0 245 | 47 48 1.0 246 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/mllib/clustering/MCLModel.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.*/ 22 | 23 | package org.apache.spark.mllib.clustering 24 | 25 | import org.apache.spark.SparkContext 26 | import org.apache.spark.mllib.util.{Loader, Saveable} 27 | import org.apache.spark.sql.functions._ 28 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 29 | import org.json4s.JsonDSL._ 30 | import org.json4s._ 31 | import org.json4s.jackson.JsonMethods._ 32 | 33 | /** A clustering model for MCL. 34 | * 35 | * @param assignments an RDD of clustering assignments 36 | * @todo complete save and load features 37 | */ 38 | 39 | class MCLModel(var assignments: Dataset[Assignment]) extends Saveable with Serializable{ 40 | 41 | /** Get number of clusters.*/ 42 | def nbClusters: Int = assignments 43 | .groupBy("cluster") 44 | .agg(collect_list(col("id"))) 45 | .collect.length 46 | 47 | /** 48 | * Save MCL clusters assignments 49 | * 50 | * @param sc current Spark Context 51 | * @param path location where MCL model is saved 52 | */ 53 | 54 | override def save(sc: SparkContext, path: String): Unit = { 55 | MCLModel.SaveLoadV1_0.save(sc, this, path) 56 | } 57 | 58 | override protected def formatVersion: String = "1.0" 59 | } 60 | 61 | object MCLModel extends Loader[MCLModel]{ 62 | 63 | /** Load MCL clusters assignments 64 | * 65 | * @param sc current Spark Context 66 | * @param path location where MCL model is saved 67 | */ 68 | 69 | override def load(sc: SparkContext, path: String): MCLModel = { 70 | MCLModel.SaveLoadV1_0.load(sc, path) 71 | } 72 | 73 | private[clustering] 74 | object SaveLoadV1_0 { 75 | 76 | private val thisFormatVersion = "1.0" 77 | 78 | private[clustering] 79 | val thisClassName = "org.apache.spark.mllib.clustering.MCLModel" 80 | 81 | def save(sc: SparkContext, model: 
MCLModel, path: String): Unit = { 82 | val metadata = compact(render( 83 | ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) 84 | )) 85 | sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path)) 86 | 87 | model.assignments.write.parquet(Loader.dataPath(path)) 88 | } 89 | 90 | def load(sc: SparkContext, path: String): MCLModel = { 91 | implicit val formats = DefaultFormats 92 | val spark = SparkSession.builder().getOrCreate() 93 | import spark.implicits._ 94 | 95 | val (className, formatVersion, metadata) = Loader.loadMetadata(sc, path) 96 | assert(className == thisClassName) 97 | assert(formatVersion == thisFormatVersion) 98 | 99 | /*val expansionRate = (metadata \ "expansionRate").extract[Double] 100 | val inflationRate = (metadata \ "inflationRate").extract[Double] 101 | val epsilon = (metadata \ "epsilon").extract[Double] 102 | val maxIterations = (metadata \ "maxIterations").extract[Int]*/ 103 | 104 | val assignments = spark.read.parquet(Loader.dataPath(path)) 105 | // Check if loading file respects Assignment class schema 106 | Loader.checkSchema[Assignment](assignments.schema) 107 | val certifiedAssignments = assignments.map { 108 | case Row(id: Long, cluster: Long) => Assignment(id, cluster) 109 | } 110 | 111 | new MCLModel(certifiedAssignments) 112 | } 113 | } 114 | } 115 | 116 | /** List which point belongs to which cluster 117 | * 118 | * @param id node id 119 | * @param cluster cluster id 120 | */ 121 | 122 | case class Assignment(id: Long, cluster: Long) 123 | 124 | /** Factory for [[MCLModel.assignments]] instances. 
*/ 125 | private object Assignment { 126 | 127 | /** Creates an assignment with a given node id and a given cluster id 128 | * 129 | * @param r a row with two columns: one for node id and one for cluster id 130 | */ 131 | def apply(r: Row): Assignment = { 132 | Assignment(r.getLong(0), r.getLong(1)) 133 | } 134 | } -------------------------------------------------------------------------------- /src/test/resources/MCLUtils/OrientedMatrixSelfLoop.txt: -------------------------------------------------------------------------------- 1 | 1.0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;1.0;1.0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;1.0;0;1.0;1.0;0;0;0;0 2 | 0;1.0;0;0;0;0;1.0;0;0;0;1.0;1.0;1.0;0;0;0;0;1.0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0 3 | 0;0;1.0;0;0;0;0;0;1.0;0;0;0;1.0;0;1.0;0;0;0;0;0;1.0;0;1.0;1.0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;1.0 4 | 0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;0;0;0;1.0;1.0;0;1.0;1.0;0 5 | 0;0;0;0;1.0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;1.0;0;0;1.0;0;0;0;1.0;0;1.0;0;0;0;0;0;1.0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0 6 | 0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;1.0;0;1.0;0;0;0;0;0;1.0;0;0;0;0;0;0 7 | 0;0;0;0;0;0;1.0;1.0;0;0;0;1.0;0;0;1.0;0;1.0;0;0;1.0;0;1.0;0;0;1.0;1.0;0;1.0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0 8 | 0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;0;0;0;0;0;1.0;0;1.0;1.0;1.0;0;0;0;0;0;0;0;0;0;0;0 9 | 0;0;0;0;0;0;0;0;1.0;1.0;1.0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0 10 | 0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;0;0;1.0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;1.0 11 | 0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;1.0;0;1.0;1.0;0;0;0;0;0;0;1.0;0;1.0;1.0;0;0;0;0;0;0;1.0;0;0;0;0;0;0 12 | 
0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;1.0 13 | 0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;1.0;0;0;0;1.0;0;0;0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;1.0;0;1.0;0;0;0;0;1.0;0;0;0;1.0 14 | 0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0 15 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;1.0;0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;1.0;0;0;0 16 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;1.0;0;0;0;1.0;1.0;0;0;0;0;1.0 17 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;0;0;1.0;0;1.0;1.0;1.0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0 18 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;1.0;0;0;1.0;0;0;0;0;0;0;0;0;0;0 19 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;1.0;1.0;0;1.0;0;0;0;1.0;0;0;1.0;0;0;0;0 20 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0;0;1.0;0;0;0 21 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 22 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;1.0;1.0;0;1.0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0 23 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;1.0;1.0;1.0;0;0;0;1.0;1.0;1.0;0;0;0;0;0 24 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;0 25 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 26 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;1.0;1.0;1.0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0 27 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;0;1.0;1.0;0;1.0;0;0;1.0;0;0;1.0;0;0;1.0;0;0;0;0;0;0 28 | 
0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;1.0;0;0;0;0 29 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;1.0;1.0;0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;0;0;0;0;0 30 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 31 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0 32 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;1.0;0;1.0;0;0 33 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;1.0 34 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;0;1.0;0;1.0;0;0;1.0;0;0;0 35 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;1.0;0 36 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;1.0;0;0;1.0;0;1.0;0;0;0;1.0;0;0;0 37 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0 38 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;1.0;0;0 39 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0 40 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;1.0;0;0;0;0;0;0;0;1.0 41 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;1.0;0;1.0;0;0;0 42 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;0;0 43 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;0;1.0;0 44 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;0;0;0 45 | 
0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0 46 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;0 47 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0 48 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0 49 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0 50 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0 51 | -------------------------------------------------------------------------------- /src/main/scala/Main.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/* The MIT License (MIT) — Copyright (c) 2015-2016, Joan André.
   See LICENSE.txt at the repository root for the full license text. */

// Import required spark classes
import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx._
import org.apache.spark.mllib.clustering.{Assignment, MCL}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._

/** Start-up example: builds a small two-community graph, runs MCL on it and
  * displays the resulting cluster assignments.
  */
object Main {

  // Disable Spark messages when running program
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  // Guide for users who want to run MCL program
  val usage = """
    Usage: mcl [--expansionRate num] [--inflationRate num] [--epsilon num] [--maxIterations num] [--selfLoopWeight num] [--graphOrientationStrategy string]
  """

  // Parsed command-line options: symbol key -> raw string value
  type OptionMap = Map[Symbol, Any]

  /** Parse `s` as an integer, failing with a message that names the offending option. */
  def toInt(key: Symbol, s: String): Int =
    try s.toInt
    catch {
      case _: Exception => throw new Exception("\n" + key.toString() + " must be an integer")
    }

  /** Parse `s` as a double, failing with a message that names the offending option. */
  def toDouble(key: Symbol, s: String): Double =
    try s.toDouble
    catch {
      case _: Exception => throw new Exception("\n" + key.toString() + " must be a double")
    }

  /** Recursively fold the remaining argument list into an option map.
    *
    * Recognized flags consume the following token as their value; anything
    * else (including a known flag given as the very last token, with no
    * value after it) is reported as an unknown option.
    *
    * @param map options accumulated so far
    * @param list remaining command-line tokens
    * @return the completed option map
    */
  def nextOption(map: OptionMap, list: List[String]): OptionMap = {
    // Table of accepted flags and the option key each one feeds
    val flags: Map[String, Symbol] = Map(
      "--expansionRate" -> 'expansionRate,
      "--inflationRate" -> 'inflationRate,
      "--epsilon" -> 'epsilon,
      "--maxIterations" -> 'maxIterations,
      "--selfLoopWeight" -> 'selfLoopWeight,
      "--graphOrientationStrategy" -> 'graphOrientationStrategy)

    list match {
      case Nil => map
      case flag :: value :: tail if flags.contains(flag) =>
        nextOption(map + (flags(flag) -> value), tail)
      case option :: tail => throw new Exception("\nUnknown option " + option)
    }
  }

  def main(args: Array[String]) {

    // Remind users of the expected arguments when none are supplied
    // (the program still runs, using the defaults below)
    if (args.isEmpty) println(usage)

    try {
      // Parse options, falling back to documented defaults
      val options = nextOption(Map(), args.toList)
      val expansionRate: Int = toInt('expansionRate, options.getOrElse('expansionRate, 2).toString)
      val inflationRate: Double = toDouble('inflationRate, options.getOrElse('inflationRate, 2.0).toString)
      val epsilon: Double = toDouble('epsilon, options.getOrElse('epsilon, 0.01).toString)
      val maxIterations: Int = toInt('maxIterations, options.getOrElse('maxIterations, 10).toString)
      val selfLoopWeight: Double = toDouble('selfLoopWeight, options.getOrElse('selfLoopWeight, 1.0).toString)
      val graphOrientationStrategy: String = options.getOrElse('graphOrientationStrategy, "undirected").toString

      // Initialise spark context (local demo configuration)
      val sparkConf = new SparkConf()
        .setMaster("local[*]")
        .set("spark.driver.memory", "1g")
        .set("spark.executor.memory", "1g")
        .setAppName("MCL")

      val sc = new SparkContext(sparkConf)

      // Vertices: eleven nodes named "Node1" .. "Node11"
      val users: RDD[(VertexId, String)] =
        sc.parallelize((0L to 10L).map(i => (i, "Node" + (i + 1))))

      // Undirected links of the example graph; each pair is expanded into
      // both directed edges since GraphX edges are directed
      val links: Seq[(Long, Long)] = Seq(
        (0L, 1L), (0L, 2L), (0L, 3L),
        (1L, 2L), (1L, 3L), (2L, 3L),
        (4L, 5L), (4L, 6L), (4L, 7L),
        (5L, 6L), (5L, 7L), (6L, 7L),
        (3L, 8L), (9L, 8L), (9L, 10L), (4L, 10L))

      val relationships: RDD[Edge[Double]] =
        sc.parallelize(
          links.flatMap { case (a, b) => Seq(Edge(a, b, 1.0), Edge(b, a, 1.0)) })

      // Build the initial Graph
      val graph = Graph(users, relationships)

      // Run MCL algorithm and get nodes assignments to generated clusters
      val clusters: Dataset[Assignment] =
        MCL.train(
          graph,
          expansionRate,
          inflationRate,
          epsilon,
          maxIterations,
          selfLoopWeight,
          graphOrientationStrategy)
          .assignments

      // Show each cluster with the sorted list of its member node ids
      clusters
        .groupBy("cluster")
        .agg(sort_array(collect_list(col("id"))))
        .show(3)

      // Terminate spark context
      sc.stop()

    }
    catch {
      case e: Exception => println(e.getMessage)
        sys.exit(1)
    }
  }
}
/* The MIT License (MIT) — Copyright (c) 2015-2016, Joan André.
   See LICENSE.txt at the repository root for the full license text. */

package org.apache.spark.mllib.clustering

import breeze.linalg.max
import org.apache.spark.graphx._
import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, IndexedRow, IndexedRowMatrix}
import org.apache.spark.mllib.linalg.{SparseVector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

/**
 * Utils functions for MCL algorithm implementation.
 */
object MCLUtils {

  /** Print an adjacency matrix in nice format.
   *
   * Rows are printed in index order as "index => ,v1,v2,..." with each
   * value formatted to four decimal places.
   *
   * @param mat an adjacency matrix
   */
  def displayMatrix(mat: IndexedRowMatrix): Unit = {
    println()
    mat
      .rows.sortBy(_.index).collect()
      .foreach(row => {
        // Use print, not printf: the row index is data and must never be
        // interpreted as a format string (a stray '%' would throw).
        print(row.index + " => ")
        row.vector.toArray
          .foreach(v => printf(",%.4f", v))
        println()
      })
  }

  /** Print a block matrix, one block at a time in block-coordinate order.
   *
   * @param mat an adjacency matrix in block form
   */
  def displayBlockMatrix(mat: BlockMatrix): Unit = {
    println()
    mat
      .blocks.sortBy(_._1).collect()
      .foreach(
        block => {
          // print, not printf: block contents are data, not a format string
          print(block._2.toString())
        })
  }

  /** Get a suitable graph for MCL model algorithm.
   *
   * Each vertex id in the graph corresponds to a row id in the adjacency matrix.
   *
   * @param graph original graph
   * @param lookupTable a matching table with nodes ids and new ordered ids
   *                    (column 0 = new matrix id, column 1 = original node id)
   * @return prepared graph for MCL algorithm, with duplicate edges merged
   *         by summing their weights
   */
  def preprocessGraph[VD](graph: Graph[VD, Double], lookupTable: DataFrame): Graph[Int, Double] = {
    val newVertices: RDD[(VertexId, Int)] =
      lookupTable.rdd.map(
        row => (row.getInt(1).toLong, row.getInt(0))
      )

    // groupEdges only merges identical edges that are colocated in the same
    // partition, so the graph must be repartitioned with partitionBy first
    // (see GraphX Graph.groupEdges documentation); without it, multi-edges
    // split across partitions would survive.
    Graph(newVertices, graph.edges)
      .partitionBy(PartitionStrategy.EdgePartition2D)
      .groupEdges((e1, e2) => e1 + e2)
  }

  /** Deal with self loop
   *
   * Add a self loop when a vertex has none (weight nil) and keep the
   * existing one otherwise. The added loop's weight is the maximum weight
   * among the vertex's incident edges, scaled by selfLoopWeight.
   *
   * @param graph original graph
   * @param selfLoopWeight a coefficient between 0 and 1 to influence clustering granularity and objective
   * @return an RDD of self loops weights and associated coordinates.
   */
  def selfLoopManager(graph: Graph[Int, Double], selfLoopWeight: Double): RDD[(Int, (Int, Double))] = {

    // Vertex attribute = all incident edges, used below to find each
    // vertex's maximum incident weight
    val graphWithLinkedEdges: Graph[Array[Edge[Double]], Double] =
      Graph(
        graph
          .collectEdges(EdgeDirection.Either),
        graph.edges
      )

    val selfLoop: RDD[(Int, (Int, Double))] =
      graph
        .triplets
        .filter(e => e.srcId == e.dstId && e.attr > 0)
        .map(e => (e.srcId, e.srcAttr))
        // full outer join against all vertices, then keep only the vertices
        // with no pre-existing self loop
        .fullOuterJoin(graph.vertices)
        .filter(join => join._2._1.isEmpty)
        .leftOuterJoin(graphWithLinkedEdges.vertices)
        .map(v =>
          (v._2._1._2.get,
            (v._2._1._2.get,
              // isolated vertices (no incident edges) fall back to weight 1.0
              v._2._2.getOrElse(Array(Edge(1.0.toLong, 1.0.toLong, 1.0))).map(e => e.attr).max * selfLoopWeight)
          )
        )

    selfLoop
  }

  /** Deal with multiple adjacency matrix filling strategy depending on graph orientation
   *
   * @param graph original graph
   * @param graphOrientationStrategy choose a graph strategy completion depending on its nature. 3 choices: undirected, directed, bidirected.
   * @return an RDD of new edges weights and associated coordinates.
   */
  def graphOrientationManager(graph: Graph[Int, Double], graphOrientationStrategy: String): RDD[(Int, (Int, Double))] = {

    graphOrientationStrategy match {

      // Undirected Graph Solution: edges are taken as-is
      case "undirected" =>

        graph.triplets.map(
          triplet => (triplet.srcAttr, (triplet.dstAttr, triplet.attr))
        )

      // Directed Graph Solution => each edge's inverse is added so the
      // resulting matrix becomes symmetric (self loops are not duplicated)
      case "directed" =>

        graph.triplets.flatMap(
          triplet => {
            if (triplet.srcAttr != triplet.dstAttr) {
              Array((triplet.srcAttr, (triplet.dstAttr, triplet.attr)), (triplet.dstAttr, (triplet.srcAttr, triplet.attr)))
            }
            else {
              Array((triplet.srcAttr, (triplet.dstAttr, triplet.attr)))
            }
          }
        )

      // Bidirected Graph Solution => some edges already exist in both
      // orientations; those keep their original weight, the rest get their
      // inverse added
      case "bidirected" =>

        // Tag each coordinate: 1 = original orientation, 2 = mirrored
        val tempEntries: RDD[((Int, Int), (Double, Int))] = graph.triplets.flatMap(
          triplet => {
            Array(
              ((triplet.srcAttr, triplet.dstAttr), (triplet.attr, 1)),
              ((triplet.dstAttr, triplet.srcAttr), (triplet.attr, 2))
            )
          }
        )

        tempEntries
          .groupByKey()
          .map(
            e =>
              // A coordinate seen more than once already existed in the
              // graph: keep the weight of the original (tag 1) edge
              if (e._2.size > 1) {
                val value = e._2.filter(v => v._2 == 1).head._1
                (e._1._1, (e._1._2, value))
              }
              else {
                (e._1._1, (e._1._2, e._2.head._1))
              }
          )
    }
  }

  /** Transform a Graph into an IndexedRowMatrix
   *
   * @param graph original graph
   * @param selfLoopWeight a coefficient between 0 and 1 to influence clustering granularity and objective
   * @param graphOrientationStrategy choose a graph strategy completion depending on its nature. 3 choices: undirected, directed, bidirected.
   * @return a ready adjacency matrix for MCL process.
   * @todo Check graphOrientationStrategy choice for current graph
   */
  def toIndexedRowMatrix(graph: Graph[Int, Double], selfLoopWeight: Double, graphOrientationStrategy: String): IndexedRowMatrix = {

    // Especially relationships values have to be checked before doing what follows
    val rawEntries: RDD[(Int, (Int, Double))] = graphOrientationManager(graph, graphOrientationStrategy)

    val numOfNodes: Int = graph.numVertices.toInt

    // Add missing self loops, then assemble one sparse row per vertex
    val selfLoop: RDD[(Int, (Int, Double))] = selfLoopManager(graph, selfLoopWeight)
    val entries: RDD[(Int, (Int, Double))] = rawEntries.union(selfLoop)

    val indexedRows = entries.groupByKey().map(e =>
      IndexedRow(e._1, Vectors.sparse(numOfNodes, e._2.toSeq))
    )

    new IndexedRowMatrix(indexedRows)
  }

  /** Transform an IndexedRowMatrix into a Graph
   *
   * Only the non-zero entries of each row become edges.
   *
   * @param mat an adjacency matrix
   * @param vertices vertices of original graph
   * @return associated graph
   */
  def toGraph(mat: IndexedRowMatrix, vertices: RDD[(VertexId, String)]): Graph[String, Double] = {
    val edges: RDD[Edge[Double]] =
      mat.rows.flatMap { row =>
        val svec: SparseVector = row.vector.toSparse
        val it: Range = svec.indices.indices
        it.map(ind => Edge(row.index, svec.indices.apply(ind), svec.values.apply(ind)))
      }
    Graph(vertices, edges)
  }

}
Spark** is an experimental project which goal is to implement a graph clustering algorithm in [Spark](https://github.com/apache/spark), using especially distributed matrix tools embedded in the scala API. 9 | 10 | Why MCL algorithm? Because it responds to Spark MLLib [contribution policy](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-MLlib-specificContributionGuidelines) first four points: 11 | * Be widely known 12 | * Be used and accepted 13 | * Be highly scalable 14 | * Be well documented 15 | 16 | Please do not hesitate to post comments or questions. 17 | 18 | Most of the following content is based on Stijn van Dongen website (http://micans.org/mcl/). 19 | 20 | Table of Contents 21 | ================= 22 | 23 | * [MCL Spark](#mcl-spark) 24 | * [Getting Started](#getting-started) 25 | * [Online Documentation](#online-documentation) 26 | * [Requirements](#requirements) 27 | * [Building From Sources](#building-from-sources) 28 | * [Use embarked example](#use-embarked-example) 29 | * [Parameters choices](#parameters-choices) 30 | * [MCL (Markov Cluster) algorithm theory](#mcl-markov-cluster-algorithm-theory) 31 | * [Expansion](#expansion) 32 | * [Inflation](#inflation) 33 | * [Convergence and clusters interpretation](#convergence-and-clusters-interpretation) 34 | * [Optimizations](#optimizations) 35 | * [Implementation thoughts](#implementation-thoughts) 36 | * [Spark matrices universe](#spark-matrices-universe) 37 | * [IndexedRowMatrix](#indexedrowmatrix) 38 | * [BlockMatrix](#blockmatrix) 39 | * [Directed graph management](#directed-graph-management) 40 | * [Hypergraph](#hypergraph) 41 | * [References](#references) 42 | 43 | ## Getting Started 44 | 45 | ### Online Documentation 46 | 47 | A Scaladoc is available [here](http://joandre.github.io/docs/MCL_Spark/api/). 
48 | 49 | ### Requirements 50 | 51 | * JDK 1.8 or higher 52 | * SBT 0.13.9 (see http://www.scala-sbt.org/download.html for more information) 53 | * Build against Spark 1.6.1+ 54 | 55 | ### Building From Sources 56 | 57 | This library is built with SBT. To build a JAR file simply run "sbt package" from the project root. Currently project was built under scala 2.10.5. 58 | 59 | ### Use embarked example 60 | 61 | ``` 62 | 63 | $MCL_SPARK_HOME/sbt "run [--expansionRate num] [--inflationRate num] [--epsilon num] [--maxIterations num] [--selfLoopWeight num] [--graphOrientationStrategy string]" 64 | 65 | ``` 66 | 67 | ### Import MCL into your Spark Shell 68 | 69 | ``` 70 | 71 | $SPARK_HOME/bin/spark-shell --jars $MCL_SPARK_HOME/target/scala-2.11/mcl_spark_2.11-1.0.0.jar 72 | 73 | ``` 74 | 75 | Then use MCL as follows: 76 | 77 | ``` 78 | import org.apache.spark.graphx._ 79 | import org.apache.spark.mllib.clustering.{Assignment, MCL} 80 | import org.apache.spark.rdd.RDD 81 | import org.apache.spark.sql.Dataset 82 | import org.apache.spark.sql.functions.{sort_array,collect_list,col} 83 | 84 | val users: RDD[(VertexId, String)] = 85 | sc.parallelize(Array((0L,"Node1"), (1L,"Node2"), 86 | (2L,"Node3"), (3L,"Node4"),(4L,"Node5"), 87 | (5L,"Node6"), (6L,"Node7"), (7L, "Node8"), 88 | (8L, "Node9"), (9L, "Node10"), (10L, "Node11"))) 89 | 90 | // Create an RDD for edges 91 | val relationships: RDD[Edge[Double]] = 92 | sc.parallelize( 93 | Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), 94 | Edge(0, 2, 1.0), Edge(2, 0, 1.0), 95 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 96 | Edge(1, 2, 1.0), Edge(2, 1, 1.0), 97 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 98 | Edge(2, 3, 1.0), Edge(3, 2, 1.0), 99 | Edge(4, 5, 1.0), Edge(5, 4, 1.0), 100 | Edge(4, 6, 1.0), Edge(6, 4, 1.0), 101 | Edge(4, 7, 1.0), Edge(7, 4, 1.0), 102 | Edge(5, 6, 1.0), Edge(6, 5, 1.0), 103 | Edge(5, 7, 1.0), Edge(7, 5, 1.0), 104 | Edge(6, 7, 1.0), Edge(7, 6, 1.0), 105 | Edge(3, 8, 1.0), Edge(8, 3, 1.0), 106 | Edge(9, 8, 1.0), Edge(8, 9, 1.0), 
107 | Edge(9, 10, 1.0), Edge(10, 9, 1.0), 108 | Edge(4, 10, 1.0), Edge(10, 4, 1.0) 109 | )) 110 | 111 | // Build the initial Graph 112 | val graph = Graph(users, relationships) 113 | graph.cache() 114 | 115 | val clusters: Dataset[Assignment] = 116 | MCL.train(graph).assignments 117 | clusters 118 | .groupBy("cluster") 119 | .agg(sort_array(collect_list(col("id")))) 120 | .show(3) 121 | 122 | ``` 123 | 124 | ### Parameters choices 125 | 126 | **Inflation and Expansion rates** => The two parameters influence what we call cluster granularity, so how many and how strong should be detected groups of nodes. Inflation increases intra cluster links and decreases inter cluster links while expansion connects nodes to further and new parts of the graph. **Default = 2** 127 | 128 | 1. A big inflation rate will strengthen existing clusters. 129 | 2. A big expansion rate will boost clusters merging. 130 | 131 | Nota bene: Only integers are accepted for expansion rate for now (for computational reasons). 132 | 133 | **Epsilon** => In order to keep the adjacency matrix associated with our graph sparse, one strategy is to remove some negligible edges regarding its weight. Let's say you chose an epsilon equal to 0.05. This means that every edge, connected to one node, which weight is inferior to 5% of the sum of every edges weight connected to our node is removed (see Optimization paragraph for more details). **Default = 0.01** 134 | 135 | **Maximum number of iterations** => It forces MCL to stop before it converges. Regarding Stijn van Dongen recommendations, a steady state is usually reached after 10 iterations. **Default = 10** 136 | 137 | **Self loops weight management** => A percentage of the maximum weight can be applied to added self loops. For example, for a binary graph, 1 is the maximum weight to allocate (see Optimization paragraph for more details). **Default = 0.1** 138 | 139 | **Directed and undirected graphs management** => To deal with directed graphs. 
**Default = "undirected"** 140 | 141 | 1. "undirected": graph is supposed undirected. No edges are added. 142 | 2. "directed": graph is supposed directed. Each edge inverse is added so graph becomes undirected. 143 | 3. "bidirected": graph already owns bidirected edges. Excepted for already existing undirected edges, each edge inverse is added so graph becomes undirected. 144 | 145 | See [Implementation thoughts](#implementation-thoughts) for more details. 146 | 147 | ## MCL (Markov Cluster) algorithm theory 148 | 149 | ### Recall about Markov chains 150 | 151 | *"A Markov chain is a sequence of random variables X1, X2, X3, ... with the Markov property, namely that the probability of moving to next state depends only on the present state and not on the previous states."* ([wikipedia definition](https://en.wikipedia.org/wiki/Markov_chain#Formal_definition)) 152 | 153 | **Definition**: a state is absorbent when it cannot be left. 154 | 155 | **Definition**: a Markov chain is aperiodic, if at least one of its states has a period of 1, so returning to the original state occurs irregularly. 156 | 157 | **Definition**: a Markov chain is irreducible, if it is possible to get to any state from any state. 158 | 159 | **Definition**: a Markov chain is ergodic, if it is both aperiodic and irreducible. 160 | 161 | ### Principle 162 | 163 | To detect clusters inside a graph, the MCL algorithm uses a Column Stochastic Matrix representation and the concept of random walks. The idea is that random walks between two nodes that belong to the same group are more frequent than between two nodes belonging to different groups. So we should compute the probability that a node reaches each other node of the graph to have a better insight of clusters. 164 | 165 | **Definition**: a Column Stochastic Matrix (CSM) is a non-negative matrix in which each column sums to 1.
In our case, we will prefer Row Stochastic Matrix (RSM) instead of CSM to use Spark API tools (see Implementation thoughts for more details). 166 | 167 | Two steps are needed to simulate random walks on a graph: expansion and inflation. Each step is associated with a specific rate (respectively eR and iR). In the following formula, n is the number of nodes in the graph. 168 | 169 | ### Expansion 170 | To perform **expansion**, we raise the stochastic matrix to the power eR using the normal matrix product. 171 | 172 |

173 | 174 | , for eR = 2. 175 | 176 | ### Inflation 177 | To perform **inflation**, we apply the Hadamard power on the RSM (powers entrywise) and we then normalize each row to get back to probabilities. 178 | 179 |

180 | 181 | ### Convergence and clusters interpretation 182 | 183 | After each loop (expansion and inflation), a convergence test is applied on the new matrix. When it remains stable regarding the previous iteration, then the algorithm stops. Otherwise, a maximum number of iterations is defined to force the process to reach a steady state. 184 | 185 |

186 | 187 | Each non-empty column (with non-zero values) of A, corresponds to a cluster and its composition. A cluster will be a star with one or several attractor(s) in the center (see example below). 188 | 189 |

Graph shape for different convergence status (http://micans.org)

190 | 191 | A node can belong to one or several cluster(s). 192 | 193 | ### Optimizations 194 | Most of the following solutions were developed by Stijn van Dongen. More could come based on matrix distribution state. 195 | 196 | * Add self loop to each node. This is generally used to satisfy aperiodic condition of graph Markov chain. More than an optimization, this is required to avoid the non-convergence of MCL because of the infinite alternation between different states (depending on the period). Default weight allocated is the maximum weight of every edges related to the current node. To stay as closed as possible of the true graph, self loop weights can be decreased. 197 | * Most of big graphs are sparse because of their nature. For example, in a social graph, people are not related to every other users but mostly to relatives, friends or colleagues (depending on the nature of the social network). In inflation and expansion steps, "weak" connections weight tends to zero (since it is the goal to detect strong connections in order to bring out clusters) without reaching it. In order to keep the graph sparse, we can adopt three strategies: 198 | 1. Set every small values to zero regarding a threshold. This can be dangerous when a large percentage of global weight belongs to small edges. Currently, this is the only strategy available. 199 | 2. Keep k largest values for each node. Can be very expensive for very large k and a high number of nonzero entries. 200 | 3. Mix the two strategies so a threshold pruning is first applied to reduce exact pruning cost. 201 | * In order to improve convergence test speed, MCL author proposed a more efficient way to proceed. (Not Implemented Yet) 202 | 203 | ## Implementation thoughts 204 | 205 | ### Spark matrices universe 206 | As explained in introduction, this program is exclusively based on scala matrices Spark API. 
Two main matrix types are explored to implement inflation, expansion and normalization steps: [IndexedRowMatrix](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix) and [BlockMatrix](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix). 207 | 208 | #### IndexedRowMatrix 209 | * Advantages: Each row can be stored in a sparse way, normalization is easy to apply since we apply it per row (instead of column like in the original implementation). 210 | * Disadvantages: No multiplication between two IndexedRowMatrix available. 211 | 212 | #### BlockMatrix 213 | * Advantages: Fully scalable => Blocks of adjustable size (1024x1024 by default), with sparse matrices using [Compressed Sparse Column](http://netlib.org/linalg/html_templates/node92.html) 214 | * Disadvantages: Hard to implement normalization. 215 | 216 | For inflation and normalization, adjacency matrix is transformed in IndexedRowMatrix, so computations are done locally. 217 | For expansion, adjacency matrix is transformed in BlockMatrix, so we take advantage of a fully distributed matrix multiplication. 218 | 219 | ### Directed graphs management 220 | To respect the irreducibility of graphs markov chain, MCL is only applied on undirected ones. For example, in a directed bipartite graph, there are a bunch of absorbent states, so associated markov chain is reducible and does not respect ergodic condition. 221 | 222 | To offer the possibility to users to apply MCL on directed graphs, the only way is to make the graph symmetric by adding each edge inverse. This is due to GraphX API where edges are only directed. For the particular case of bidirected graphs (where some edges and their inverse already exist), birected edges remain as it is. 223 | 224 | Note that symmetry (same weight for an edge and its inverse) is preferred for more efficiency. 
225 | 226 | ### Hypergraph 227 | When two nodes are related to each other with several edges, those edges are merged and their weights summed so there remains only one. 228 | 229 | ## References 230 | 231 | * Stijn van Dongen. MCL - a cluster algorithm for graphs. [Official Website](http://micans.org/mcl/) 232 | * Kathy Macropol. Clustering on Graphs: The Markov Cluster Algorithm (MCL). [A Presentation](https://www.cs.ucsb.edu/~xyan/classes/CS595D-2009winter/MCL_Presentation2.pdf) 233 | * Jean-Benoist Leger, Corinne Vacher, Jean-Jacques Daudin. Detection of structurally homogeneous subsets in graphs. [A Survey](http://vacher.corinne.free.fr/pdf/Leger_StatsComputing_2013.pdf) 234 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/mllib/clustering/MCLUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.*/ 22 | 23 | package org.apache.spark.mllib.clustering 24 | 25 | import java.util.Locale 26 | 27 | import org.apache.log4j.{Level, Logger} 28 | import org.apache.spark.graphx._ 29 | import org.apache.spark.mllib.clustering.MCLUtils._ 30 | import org.apache.spark.mllib.linalg.DenseVector 31 | import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix} 32 | import org.apache.spark.rdd.RDD 33 | import org.apache.spark.sql.{DataFrame, SQLContext} 34 | 35 | import scala.io.Source 36 | 37 | /** Scala Tests class for MCLUtils functions */ 38 | class MCLUtilsSuite extends MCLFunSuite{ 39 | 40 | // Disable Spark messages when running program 41 | Logger.getLogger("org").setLevel(Level.OFF) 42 | Logger.getLogger("akka").setLevel(Level.OFF) 43 | 44 | // Unit Tests 45 | 46 | test("Print functions", UnitTest){ 47 | val indexedMatrix: IndexedRowMatrix = 48 | new IndexedRowMatrix( 49 | sc.parallelize( 50 | Seq( 51 | IndexedRow(0, new DenseVector(Array(1,0,0,0,1,0))), 52 | IndexedRow(1, new DenseVector(Array(0,1,1,0,1,1))), 53 | IndexedRow(2, new DenseVector(Array(0,1,1,0,0,1))), 54 | IndexedRow(3, new DenseVector(Array(0,0,0,1,0,1))), 55 | IndexedRow(4, new DenseVector(Array(1,1,0,0,1,0))), 56 | IndexedRow(5, new DenseVector(Array(0,1,1,1,0,1))) 57 | ) 58 | )) 59 | 60 | // Force local number format so "." 
is the only separator used for float numbers in print tests no matter in which environment they run 61 | Locale.setDefault(new Locale("en", "US")) 62 | 63 | val streamIM = new java.io.ByteArrayOutputStream() 64 | Console.withOut(streamIM) { 65 | displayMatrix(indexedMatrix) 66 | } 67 | 68 | streamIM.toString shouldEqual "\n0 => ,1.0000,0.0000,0.0000,0.0000,1.0000,0.0000\n1 => ,0.0000,1.0000,1.0000,0.0000,1.0000,1.0000\n2 => ,0.0000,1.0000,1.0000,0.0000,0.0000,1.0000\n3 => ,0.0000,0.0000,0.0000,1.0000,0.0000,1.0000\n4 => ,1.0000,1.0000,0.0000,0.0000,1.0000,0.0000\n5 => ,0.0000,1.0000,1.0000,1.0000,0.0000,1.0000\n" 69 | 70 | val streamBM = new java.io.ByteArrayOutputStream() 71 | Console.withOut(streamBM) { 72 | displayBlockMatrix(indexedMatrix.toBlockMatrix) 73 | } 74 | 75 | streamBM.toString shouldEqual "\n6 x 6 CSCMatrix\n(0,0) 1.0\n(4,0) 1.0\n(1,1) 1.0\n(2,1) 1.0\n(4,1) 1.0\n(5,1) 1.0\n(1,2) 1.0\n(2,2) 1.0\n(5,2) 1.0\n(3,3) 1.0\n(5,3) 1.0\n(0,4) 1.0\n(1,4) 1.0\n(4,4) 1.0\n(1,5) 1.0\n(2,5) 1.0\n(3,5) 1.0\n(5,5) 1.0" 76 | 77 | } 78 | 79 | test("Preprocessing Graph (ordered id for vertices and remove multiple edges)", UnitTest){ 80 | 81 | val sqlContext = SQLContext.getOrCreate(sc) 82 | import sqlContext.implicits._ 83 | 84 | val matchingList: RDD[(Int,Int)] = sc.parallelize(Array((0,2), (1,1), (2,3), (3,5), (4,8), (5, 0))) 85 | val lookupTable: DataFrame = matchingList.toDF("matrixId", "nodeId") 86 | 87 | // Create and RDD for vertices 88 | val users: RDD[(VertexId, String)] = 89 | sc.parallelize(Array((0L,"Node5"), (1L,"Node1"), 90 | (2L, "Node0"), (3L,"Node2"), (5L,"Node3"),(8L,"Node4"))) 91 | 92 | // Create an RDD for edges 93 | val relationships: RDD[Edge[Double]] = 94 | sc.parallelize( 95 | Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), 96 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 97 | Edge(0, 5, 1.0), Edge(5, 0, 1.0), 98 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 99 | Edge(1, 8, 1.0), Edge(8, 1, 1.0), 100 | Edge(2, 8, 1.0), Edge(8, 2, 1.0), 101 | Edge(8, 2, 1.0), Edge(2, 2, 1.0), 
102 | Edge(2, 2, 1.0) 103 | )) 104 | 105 | // Build the initial Graph 106 | val graph = Graph(users, relationships) 107 | 108 | val cleanedGraph: Graph[Int, Double] = preprocessGraph(graph, lookupTable) 109 | 110 | // Create and RDD for vertices 111 | val challengeUsers: RDD[(VertexId, Int)] = 112 | sc.parallelize(Array((2L,0), (1L,1), 113 | (3L,2), (5L,3), (8L,4), (0L,5))) 114 | 115 | // Create an RDD for edges 116 | val challengeRelationships: RDD[Edge[Double]] = 117 | sc.parallelize( 118 | Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), 119 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 120 | Edge(0, 5, 1.0), Edge(5, 0, 1.0), 121 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 122 | Edge(1, 8, 1.0), Edge(8, 1, 1.0), 123 | Edge(2, 8, 1.0), Edge(8, 2, 2.0), 124 | Edge(2, 2, 2.0) 125 | )) 126 | 127 | // Build the initial Graph 128 | val challengeGraph = Graph(challengeUsers, challengeRelationships) 129 | 130 | cleanedGraph.vertices.count shouldEqual challengeGraph.vertices.count 131 | cleanedGraph.vertices.map(v => (v._1, v._2)).collect.sorted shouldEqual challengeGraph.vertices.map(v => (v._1, v._2)).collect.sorted 132 | 133 | /*cleanedGraph.edges 134 | .map(v => ((v.srcId, v.dstId), v.attr)) 135 | .collect.sortBy(tup => tup._1) shouldEqual 136 | challengeGraph.edges 137 | .map(v => ((v.srcId, v.dstId), v.attr)) 138 | .collect.sortBy(tup => tup._1)*/ 139 | 140 | } 141 | 142 | test("Add self loop too each nodes", UnitTest){ 143 | 144 | // Create and RDD for vertices 145 | val users: RDD[(VertexId, Int)] = 146 | sc.parallelize(Array((2L,0), (1L,1), 147 | (3L,2), (5L,3), (8L,4), (0L,5))) 148 | 149 | // Create an RDD for edges 150 | val relationships: RDD[Edge[Double]] = 151 | sc.parallelize( 152 | Seq(Edge(0, 1, 2.0), Edge(1, 0, 1.0), 153 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 154 | Edge(0, 5, 1.0), Edge(5, 0, 1.0), 155 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 156 | Edge(1, 8, 1.0), Edge(8, 1, 1.0), 157 | Edge(2, 8, 1.0), Edge(8, 2, 1.0), 158 | Edge(2, 2, 1.0) 159 | )) 160 | 161 | // Build the initial Graph 
162 | val graph = Graph(users, relationships) 163 | 164 | val edgesWithSelfLoops: RDD[(Int, (Int, Double))] = selfLoopManager(graph, 2) 165 | 166 | val objective: RDD[(Int, (Int, Double))] = 167 | sc.parallelize( 168 | Seq((1, (1, 4.0)), (2, (2, 2.0)), 169 | (3, (3, 2.0)), (4, (4, 2.0)), 170 | (5, (5, 4.0)) 171 | )) 172 | 173 | edgesWithSelfLoops.count shouldEqual objective.count 174 | edgesWithSelfLoops.collect.sortBy(edge => (edge._1, edge._2)) shouldEqual objective.collect.sortBy(edge => (edge._1, edge._2)) 175 | 176 | } 177 | 178 | test("Completion strategy for graph depending on its nature (oriented or not)", UnitTest){ 179 | 180 | // For undirected graphs 181 | // Create and RDD for vertices 182 | val undirectedUsers: RDD[(VertexId, Int)] = 183 | sc.parallelize(Array((2L,0), (1L,1), 184 | (3L,2), (5L,3), (8L,4), (0L,5))) 185 | 186 | // Create an RDD for edges 187 | val undirectedRelationships: RDD[Edge[Double]] = 188 | sc.parallelize( 189 | Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), 190 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 191 | Edge(0, 5, 1.0), Edge(5, 0, 1.0), 192 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 193 | Edge(1, 8, 1.0), Edge(8, 1, 1.0), 194 | Edge(2, 8, 1.0), Edge(8, 2, 1.0), 195 | Edge(2, 2, 1.0) 196 | )) 197 | 198 | // Build the initial Graph 199 | val undirectedGraph = Graph(undirectedUsers, undirectedRelationships) 200 | 201 | val undirectedEdges: RDD[(Int, (Int, Double))] = graphOrientationManager(undirectedGraph, "undirected") 202 | 203 | // For directed graphs 204 | // Create and RDD for vertices 205 | val directedUsers: RDD[(VertexId, Int)] = 206 | sc.parallelize(Array((2L,0), (1L,1), 207 | (3L,2), (5L,3), (8L,4), (0L,5))) 208 | 209 | // Create an RDD for edges 210 | val directedRelationships: RDD[Edge[Double]] = 211 | sc.parallelize( 212 | Seq(Edge(0, 1, 1.0), 213 | Edge(0, 3, 1.0), 214 | Edge(0, 5, 1.0), 215 | Edge(1, 3, 1.0), 216 | Edge(1, 8, 1.0), 217 | Edge(2, 8, 1.0), 218 | Edge(2, 2, 1.0) 219 | )) 220 | 221 | // Build the initial Graph 222 | val 
directedGraph = Graph(directedUsers, directedRelationships) 223 | 224 | val directedEdges: RDD[(Int, (Int, Double))] = graphOrientationManager(directedGraph, "directed") 225 | 226 | // For bidirected graphs 227 | // Create and RDD for vertices 228 | val bidirectedUsers: RDD[(VertexId, Int)] = 229 | sc.parallelize(Array((2L,0), (1L,1), 230 | (3L,2), (5L,3), (8L,4), (0L,5))) 231 | 232 | // Create an RDD for edges 233 | val bidirectedRelationships: RDD[Edge[Double]] = 234 | sc.parallelize( 235 | Seq(Edge(0, 1, 1.0), 236 | Edge(0, 3, 1.0), 237 | Edge(0, 5, 1.0), 238 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 239 | Edge(1, 8, 1.0), Edge(8, 1, 1.0), 240 | Edge(2, 8, 1.0), 241 | Edge(2, 2, 1.0) 242 | )) 243 | 244 | // Build the initial Graph 245 | val bidirectedGraph = Graph(bidirectedUsers, bidirectedRelationships) 246 | 247 | val bidirectedEdges: RDD[(Int, (Int, Double))] = graphOrientationManager(bidirectedGraph, "bidirected") 248 | 249 | val objective: RDD[(Int, (Int, Double))] = 250 | sc.parallelize( 251 | Seq((5, (1, 1.0)), (1, (5, 1.0)), 252 | (5, (2, 1.0)), (2, (5, 1.0)), 253 | (5, (3, 1.0)), (3, (5, 1.0)), 254 | (1, (2, 1.0)), (2, (1, 1.0)), 255 | (1, (4, 1.0)), (4, (1, 1.0)), 256 | (0, (4, 1.0)), (4, (0, 1.0)), 257 | (0, (0, 1.0)) 258 | )) 259 | 260 | undirectedEdges.count shouldEqual objective.count 261 | undirectedEdges.collect.sortBy(edge => (edge._1, edge._2)) shouldEqual objective.collect.sortBy(edge => (edge._1, edge._2)) 262 | directedEdges.count shouldEqual objective.count 263 | directedEdges.collect.sortBy(edge => (edge._1, edge._2)) shouldEqual objective.collect.sortBy(edge => (edge._1, edge._2)) 264 | bidirectedEdges.count shouldEqual objective.count 265 | bidirectedEdges.collect.sortBy(edge => (edge._1, edge._2)) shouldEqual objective.collect.sortBy(edge => (edge._1, edge._2)) 266 | 267 | } 268 | 269 | // Integration Tests 270 | 271 | test("Adjacency Matrix Transformation", IntegrationTest) { 272 | 273 | // Load data 274 | val source:Seq[String] = 
Source.fromURL(getClass.getResource("/MCLUtils/OrientedEdges.txt")).getLines().toSeq 275 | val nodesFile:Seq[String] = Source.fromURL(getClass.getResource("/MCLUtils/OrientedNodes.txt")).getLines().toSeq 276 | val matrixSelfLoop:Seq[String] = Source.fromURL(getClass.getResource("/MCLUtils/OrientedMatrixSelfLoop.txt")).getLines().toSeq 277 | 278 | val edges:RDD[Edge[Double]] = 279 | sc.parallelize( 280 | source 281 | .map(l => l.split(" ")) 282 | .map(e => Edge(e(0).toLong, e(1).toLong, e(2).toDouble)) 283 | ) 284 | val nodes:RDD[(VertexId, String)] = 285 | sc.parallelize( 286 | nodesFile 287 | .map(l => l.split(" ")) 288 | .map(e => (e(0).toLong, "default")) 289 | ) 290 | 291 | val graph = Graph(nodes, edges) 292 | 293 | var range:Long = 0 294 | val initialMatrixWithSelLoop = 295 | new IndexedRowMatrix( 296 | sc.parallelize( 297 | matrixSelfLoop 298 | .map{ 299 | line => 300 | range = range + 1 301 | new IndexedRow( 302 | range-1, 303 | new DenseVector( 304 | line.split(";").map(e => e.toDouble) 305 | ) 306 | ) 307 | } 308 | ) 309 | ) 310 | 311 | //Prepare graph for transformation 312 | 313 | val sqlContext = SQLContext.getOrCreate(sc) 314 | import sqlContext.implicits._ 315 | 316 | val lookupTable:DataFrame = 317 | graph.vertices.sortByKey().zipWithIndex() 318 | .map(indexedVertice => (indexedVertice._2.toInt, indexedVertice._1._1.toInt, indexedVertice._1._2)) 319 | .toDF("matrixId", "nodeId", "attribute") 320 | 321 | val preprocessedGraph: Graph[Int, Double] = preprocessGraph(graph, lookupTable) 322 | 323 | //Test matrix transformation 324 | 325 | val adjacencyMat:IndexedRowMatrix = toIndexedRowMatrix(preprocessedGraph, 1.0, "undirected") 326 | 327 | adjacencyMat.numRows shouldEqual initialMatrixWithSelLoop.numRows 328 | adjacencyMat.numCols shouldEqual initialMatrixWithSelLoop.numCols 329 | initialMatrixWithSelLoop.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 330 | .join( 331 | adjacencyMat.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 332 | ) 333 | 
.collect.foreach( 334 | pairOfRows => 335 | { 336 | pairOfRows._2._1 shouldEqual pairOfRows._2._2 337 | } 338 | ) 339 | 340 | //Test transformation from adjacency matrix to graph 341 | 342 | val vertices:RDD[(VertexId, String)] = lookupTable.rdd.map(row => (row.getInt(0).toLong, row.getString(2))) 343 | val resultGraph: Graph[String, Double] = toGraph(adjacencyMat, vertices) 344 | 345 | // Missing self edges are manually added 346 | val preEdges = preprocessedGraph.triplets 347 | .map(tri => Edge(tri.srcAttr, tri.dstAttr, tri.attr)).collect 348 | .union(for (i <- 1 to (preprocessedGraph.vertices.count.toInt - 2)) yield Edge(i, i , 1.0)) 349 | .sortBy(e => (e.srcId, e.dstId)) 350 | 351 | val postEdges = resultGraph.edges.collect.sortBy(e => (e.srcId, e.dstId)) 352 | 353 | preprocessedGraph.vertices.count shouldEqual resultGraph.vertices.count 354 | preEdges.toSeq.length shouldEqual postEdges.toSeq.length 355 | preEdges.toSeq shouldEqual postEdges.toSeq 356 | 357 | } 358 | 359 | } 360 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/mllib/clustering/MCL.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.*/

package org.apache.spark.mllib.clustering

import org.apache.spark.graphx._
import org.apache.spark.mllib.clustering.MCLUtils._
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/** A clustering model for MCL (Markov Cluster algorithm).
 *
 * @see README.md for more details on theory
 *
 * @constructor Constructs an MCL instance with default parameters: {expansionRate: 2, inflationRate: 2, epsilon: 0.01, maxIterations: 10, selfLoopWeight: 0.1, graphOrientationStrategy: "undirected"}.
 * @param expansionRate expansion rate of adjacency matrix at each iteration
 * @param inflationRate inflation rate of adjacency matrix at each iteration
 * @param epsilon pruning parameter. When an edge E1, starting from a node N1, has a weight which percentage is inferior to epsilon regarding other edges Ei starting from N1, this weight is set to zero
 * @param maxIterations maximal number of iterations for a non convergent algorithm
 * @param selfLoopWeight a coefficient between 0 and 1 to influence clustering granularity and objective
 * @param graphOrientationStrategy choose a graph completion strategy depending on its nature. 3 choices: undirected, directed, bidirected.
 */
class MCL private(private var expansionRate: Int,
                  private var inflationRate: Double,
                  private var epsilon: Double,
                  private var maxIterations: Int,
                  private var selfLoopWeight: Double,
                  private var graphOrientationStrategy: String) extends Serializable{

  /** Construct an MCL instance
   *
   * Default parameters: {expansionRate: 2, inflationRate: 2,
   * epsilon: 0.01, maxIterations: 10, selfLoopWeight: 0.1, graphOrientationStrategy: "undirected"}.
   *
   * @return an MCL object
   */
  def this() = this(2, 2.0, 0.01, 10, 0.1, "undirected")

  /** Available graph orientation strategy options.
   *
   * @see README.md for more details
   */
  val graphOrientationStrategyOption: Seq[String] = Seq("undirected", "directed", "bidirected")

  /** Get expansion rate */
  def getExpansionRate: Int = expansionRate

  /** Set the expansion rate.
   *
   * Default: 2.
   *
   * @throws IllegalArgumentException expansionRate must be higher than 1
   */
  def setExpansionRate(expansionRate: Int): MCL = {
    // Guard aligned with the documented contract and the error message:
    // an expansion rate of 1 would make expansion() a no-op (M^1 = M).
    this.expansionRate = expansionRate match {
      case eR if eR > 1 => eR
      case _ => throw new IllegalArgumentException("expansionRate parameter must be higher than 1")
    }
    this
  }

  /** Get inflation rate */
  def getInflationRate: Double = inflationRate

  /** Set the inflation rate.
   *
   * Default: 2.
   *
   * @throws IllegalArgumentException inflationRate must be higher than 0
   */
  def setInflationRate(inflationRate: Double): MCL = {
    this.inflationRate = inflationRate match {
      case iR if iR > 0 => iR
      case _ => throw new IllegalArgumentException("inflationRate parameter must be higher than 0")
    }
    this
  }

  /** Get epsilon coefficient
   *
   * Change an edge value to zero when the overall weight of this edge is less than a certain percentage
   */
  def getEpsilon: Double = epsilon

  /** Set the minimum percentage to get an edge weight to zero.
   *
   * Default: 0.01.
   *
   * @throws IllegalArgumentException epsilon must be higher than or equal to 0 and lower than 1
   */
  def setEpsilon(epsilon: Double): MCL = {
    // 0 is accepted (it simply disables pruning); 1 or more would prune everything.
    this.epsilon = epsilon match {
      case eps if eps < 1 && eps >= 0 => eps
      case _ => throw new IllegalArgumentException("epsilon parameter must be higher than or equal to 0 and lower than 1")
    }

    this
  }

  /** Get stop condition if MCL algorithm does not converge fairly quickly */
  def getMaxIterations: Int = maxIterations

  /** Set maximum number of iterations.
   *
   * Default: 10.
   *
   * @throws IllegalArgumentException maxIterations must be higher than 0
   */
  def setMaxIterations(maxIterations: Int): MCL = {
    this.maxIterations = maxIterations match {
      case mI if mI > 0 => mI
      case _ => throw new IllegalArgumentException("maxIterations parameter must be higher than 0")
    }
    this
  }

  /** Get weight of automatically added self loops in adjacency matrix rows */
  def getSelfLoopWeight: Double = selfLoopWeight

  /** Set self loops weights.
   *
   * Default: 0.1.
   *
   * @throws IllegalArgumentException selfLoopWeight must be higher than 0 and lower than or equal to 1
   */
  def setSelfLoopWeight(selfLoopWeight: Double): MCL = {
    // Guard accepts 1 (slw <= 1); message fixed to match the accepted range.
    this.selfLoopWeight = selfLoopWeight match {
      case slw if slw > 0 && slw <= 1 => slw
      case _ => throw new IllegalArgumentException("selfLoopWeight parameter must be higher than 0 and lower than or equal to 1")
    }
    this
  }

  /** Get graph orientation strategy selected depending on graph nature */
  def getGraphOrientationStrategy: String = graphOrientationStrategy

  /** Set graph orientation strategy.
   *
   * Default: undirected.
   *
   * @throws IllegalArgumentException graphOrientationStrategy must be contained in graphOrientationStrategyOption
   */
  def setGraphOrientationStrategy(graphOrientationStrategy: String): MCL = {
    this.graphOrientationStrategy = graphOrientationStrategy match {
      case gos if graphOrientationStrategyOption.contains(gos) => gos
      case _ => throw new IllegalArgumentException("you must select graphOrientationStrategy option in the following list: " + graphOrientationStrategyOption.mkString(", "))
    }
    this
  }


  /** Normalize matrix
   *
   * Each row is divided by the sum of its non-zero values so that every row
   * sums to 1 (column-stochastic per row representation).
   *
   * NOTE(review): a row whose values sum to 0 would yield NaN entries —
   * assumed not to occur because self loops are always added upstream.
   *
   * @param mat an unnormalized adjacency matrix
   * @return normalized adjacency matrix
   */
  def normalization(mat: IndexedRowMatrix): IndexedRowMatrix ={
    new IndexedRowMatrix(
      mat.rows
        .map{row =>
          val svec = row.vector.toSparse
          IndexedRow(row.index,
            new SparseVector(svec.size, svec.indices, svec.values.map(v => v/svec.values.sum)))
        })
  }

  /** Normalize row
   *
   * @param row an unnormalized row of the adjacency matrix
   * @return normalized row
   */
  def normalization(row: SparseVector): SparseVector ={
    new SparseVector(row.size, row.indices, row.values.map(v => v/row.values.sum))
  }

  /** Remove weakest connections from a row
   *
   * Connection weights in the adjacency matrix inferior to epsilon are set to 0.
   * The pruned entries keep their slot in the sparse structure (explicit zeros);
   * they contribute nothing to later sums or differences.
   *
   * @param row a row of the adjacency matrix
   * @return sparsed row
   * @todo Add more complex pruning strategies.
   * @see http://micans.org/mcl/index.html
   */
  def removeWeakConnections(row: SparseVector): SparseVector ={
    new SparseVector(
      row.size,
      row.indices,
      row.values.map(v => {
        if(v < epsilon) 0.0
        else v
      })
    )
  }

  /** Expand matrix
   *
   * Raises the adjacency matrix to the power expansionRate
   * (expansionRate - 1 successive multiplications).
   *
   * @param mat an adjacency matrix
   * @return expanded adjacency matrix
   */
  def expansion(mat: IndexedRowMatrix): BlockMatrix = {
    val bmat = mat.toBlockMatrix()
    var resmat = bmat
    for(i <- 1 until expansionRate){
      resmat = resmat.multiply(bmat)
    }
    resmat
  }

  /** Inflate matrix
   *
   * Prune and normalization are applied locally (on each row). So we avoid two more complete scannings of the adjacency matrix.
   * As explained in issue #8, pruning is applied on the expanded matrix, so we take advantage of its naturally normalized state.
   *
   * @param mat an adjacency matrix
   * @return inflated adjacency matrix
   */
  def inflation(mat: BlockMatrix): IndexedRowMatrix = {

    new IndexedRowMatrix(
      mat.toIndexedRowMatrix.rows
        .map{row =>
          val svec = removeWeakConnections(row.vector.toSparse) // Pruning elements locally, instead of scanning all matrix again
          IndexedRow(row.index,
            // Normalizing rows locally, instead of scanning all matrix again.
            // exp(r * log(v)) == v^r; pruned zeros map to exp(-Inf) == 0.0.
            normalization(
              new SparseVector(svec.size, svec.indices, svec.values.map(v => Math.exp(inflationRate*Math.log(v))))
            )
          )
        }
    )
  }

  /** Calculate the distance between two matrices.
   *
   * Computes the sum of squared element-wise differences between the two
   * matrices (the squared Frobenius distance — no square root is taken).
   *
   * @param m1 an adjacency matrix at step n
   * @param m2 same adjacency matrix at step n+1
   * @return the sum of squared differences between m1 and m2 entries
   * @todo Use another object to speed up join between RDD.
   */
  def difference(m1: IndexedRowMatrix, m2: IndexedRowMatrix): Double = {

    val m1RDD:RDD[((Long,Int),Double)] = m1.rows.flatMap(r => {
      val sv = r.vector.toSparse
      sv.indices.map(i => ((r.index,i), sv.apply(i)))
    })

    val m2RDD:RDD[((Long,Int),Double)] = m2.rows.flatMap(r => {
      val sv = r.vector.toSparse
      sv.indices.map(i => ((r.index,i), sv.apply(i)))
    })

    // Full outer join so entries present in only one matrix count against 0.0
    val diffRDD = m1RDD.fullOuterJoin(m2RDD).map(diff => Math.pow(diff._2._1.getOrElse(0.0) - diff._2._2.getOrElse(0.0), 2))
    diffRDD.sum()
  }

  /** Train MCL algorithm.
   *
   * @param graph a graph to be partitioned
   * @return an MCLModel where each node is associated to one or more clusters
   */
  def run[VD](graph: Graph[VD, Double]): MCLModel = {

    // Add a new attribute to each node: a unique row index starting from 0,
    // so the graph can be represented as an adjacency matrix
    val spark = SparkSession.builder().getOrCreate()
    import spark.implicits._

    val lookupTable:DataFrame =
      graph.vertices.sortBy(_._1).zipWithIndex()
        .map(indexedVertex => (indexedVertex._2.toInt, indexedVertex._1._1.toInt))
        .toDF("matrixId", "nodeId")

    val preprocessedGraph: Graph[Int, Double] = preprocessGraph(graph, lookupTable)

    val mat = toIndexedRowMatrix(preprocessedGraph, selfLoopWeight, graphOrientationStrategy)

    // Number of completed iterations
    var iter = 0
    // Convergence indicator: squared distance between two successive matrices
    var change = 1.0

    var M1:IndexedRowMatrix = normalization(mat)
    while (iter < maxIterations && change > 0) {
      val M2: IndexedRowMatrix = inflation(expansion(M1))
      change = difference(M1, M2)
      iter = iter + 1
      M1 = M2
    }

    // Get attractors in adjacency matrix (nodes with not only null values) and collect every nodes they are attached to in order to form a cluster.
    // Each node is assigned to the column holding its largest remaining weight.

    val rawDF =
      M1.rows.flatMap(
        r => {
          val sv = r.vector.toSparse
          sv.indices.map(i => (r.index, (i, sv.apply(i))))
        }
      ).groupByKey()
        .map(node => (node._1, node._2.maxBy(_._2)._1))
        .toDF("matrixId", "clusterId")

    // Reassign correct ids to each node instead of the temporary matrix id associated

    val assignments: Dataset[Assignment] =
      rawDF
        .join(lookupTable, rawDF.col("matrixId")===lookupTable.col("matrixId"))
        .select($"nodeId", $"clusterId")
        .map(row => Assignment(row.getInt(0).toLong, row.getInt(1).toLong))

    new MCLModel(assignments)
  }

}

object MCL{

  /** Train an MCL model using the given set of parameters.
   *
   * @param graph the graph to be clustered, with Double edge weights
   * @param expansionRate expansion rate of adjacency matrix at each iteration
   * @param inflationRate inflation rate of adjacency matrix at each iteration
   * @param epsilon minimum percentage of a weight edge to be significant
   * @param maxIterations maximal number of iterations for a non convergent algorithm
   * @param selfLoopWeight a coefficient between 0 and 1 to influence clustering granularity and objective
   * @param graphOrientationStrategy choose a graph completion strategy depending on its nature. 3 choices: undirected, directed, bidirected.
   * @return an MCLModel where each node is associated to one or more clusters
   */
  def train[VD](graph: Graph[VD, Double],
                expansionRate: Int = 2,
                inflationRate: Double = 2.0,
                epsilon : Double = 0.01,
                maxIterations: Int = 10,
                selfLoopWeight: Double = 0.1, // aligned with the documented default (was 1, contradicting class docs)
                graphOrientationStrategy: String = "undirected"): MCLModel = {

    new MCL()
      .setExpansionRate(expansionRate)
      .setInflationRate(inflationRate)
      .setEpsilon(epsilon)
      .setMaxIterations(maxIterations)
      .setSelfLoopWeight(selfLoopWeight)
      .setGraphOrientationStrategy(graphOrientationStrategy)
      .run(graph)
  }

}
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/mllib/clustering/MCLSuite.scala:
--------------------------------------------------------------------------------
/*The MIT License (MIT)

Copyright (c) 2015-2016, Joan André

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.*/ 22 | 23 | package org.apache.spark.mllib.clustering 24 | 25 | import org.apache.log4j.{Level, Logger} 26 | import org.apache.spark.graphx._ 27 | import org.apache.spark.mllib.linalg.DenseVector 28 | import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, IndexedRow, IndexedRowMatrix} 29 | import org.apache.spark.rdd.RDD 30 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 31 | import org.apache.spark.sql.functions._ 32 | 33 | import scala.collection.mutable 34 | import scala.io._ 35 | 36 | /** Scala Tests class for MCL algorithm */ 37 | class MCLSuite extends MCLFunSuite{ 38 | // Disable Spark messages when running program 39 | Logger.getLogger("org").setLevel(Level.OFF) 40 | Logger.getLogger("akka").setLevel(Level.OFF) 41 | 42 | // Unit Tests 43 | 44 | test("Parameters getters and setters", UnitTest){ 45 | 46 | val mcl = new MCL() 47 | 48 | mcl.getEpsilon shouldEqual 0.01 49 | mcl.getExpansionRate shouldEqual 2 50 | mcl.getGraphOrientationStrategy shouldEqual "undirected" 51 | mcl.getInflationRate shouldEqual 2.0 52 | mcl.getMaxIterations shouldEqual 10 53 | mcl.getSelfLoopWeight shouldEqual 0.1 54 | 55 | an [IllegalArgumentException] should be thrownBy mcl.setEpsilon(1) 56 | an [IllegalArgumentException] should be thrownBy mcl.setEpsilon(-0.1) 57 | an [IllegalArgumentException] should be thrownBy mcl.setExpansionRate(-1) 58 | an [IllegalArgumentException] should be thrownBy mcl.setGraphOrientationStrategy("test") 59 | an [IllegalArgumentException] should be thrownBy mcl.setInflationRate(0) 60 | an [IllegalArgumentException] should be thrownBy mcl.setMaxIterations(0) 61 | an [IllegalArgumentException] should be thrownBy mcl.setSelfLoopWeight(1.1) 62 | an 
[IllegalArgumentException] should be thrownBy mcl.setSelfLoopWeight(0) 63 | } 64 | 65 | test("Matrix Normalization", UnitTest) { 66 | 67 | val indexedMatrix: IndexedRowMatrix = 68 | new IndexedRowMatrix( 69 | sc.parallelize( 70 | Seq( 71 | IndexedRow(0, new DenseVector(Array(1,0,0,0,1,0))), 72 | IndexedRow(1, new DenseVector(Array(0,1,1,0,1,1))), 73 | IndexedRow(2, new DenseVector(Array(0,1,1,0,0,1))), 74 | IndexedRow(3, new DenseVector(Array(0,0,0,1,0,1))), 75 | IndexedRow(4, new DenseVector(Array(1,1,0,0,1,0))), 76 | IndexedRow(5, new DenseVector(Array(0,1,1,1,0,1))) 77 | ) 78 | )) 79 | 80 | val MCLObject: MCL = new MCL() 81 | val normalizedMatrix: IndexedRowMatrix = 82 | new IndexedRowMatrix( 83 | indexedMatrix.rows 84 | .map{row => 85 | val svec = row.vector.toSparse 86 | IndexedRow(row.index, 87 | MCLObject.normalization(svec) 88 | ) 89 | } 90 | ) 91 | 92 | val objective: IndexedRowMatrix = 93 | new IndexedRowMatrix( 94 | sc.parallelize( 95 | Seq( 96 | IndexedRow(0, new DenseVector(Array(0.5,0,0,0,0.5,0))), 97 | IndexedRow(1, new DenseVector(Array(0,0.25,0.25,0,0.25,0.25))), 98 | IndexedRow(2, new DenseVector(Array(0,0.3333333333333333,0.3333333333333333,0,0,0.3333333333333333))), 99 | IndexedRow(3, new DenseVector(Array(0,0,0,0.5,0,0.5))), 100 | IndexedRow(4, new DenseVector(Array(0.3333333333333333,0.3333333333333333,0,0,0.3333333333333333,0))), 101 | IndexedRow(5, new DenseVector(Array(0,0.25,0.25,0.25,0,0.25))) 102 | ) 103 | )) 104 | 105 | normalizedMatrix.numRows shouldEqual objective.numRows 106 | normalizedMatrix.numCols shouldEqual objective.numCols 107 | objective.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 108 | .join( 109 | normalizedMatrix.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 110 | ) 111 | .collect.foreach( 112 | pairOfRows => 113 | { 114 | pairOfRows._2._1 shouldEqual pairOfRows._2._2 115 | } 116 | ) 117 | 118 | } 119 | 120 | test("Remove Weak Connections", UnitTest) { 121 | 122 | val indexedMatrix: IndexedRowMatrix = 123 | 
// ScalaTest suite body for the Spark MCL (Markov Clustering) implementation.
// NOTE: this chunk is a repo-dump rendering — each original source line is prefixed
// with its own number and " | "; that numbering is preserved untouched below.
// The span below (orig. lines 124-169) is the tail of a unit test that starts
// above this chunk: it applies MCLObject.removeWeakConnections (epsilon = 0.01)
// to each sparse row of a 6x6 matrix and checks the result against a hand-built
// "objective" matrix (entries below epsilon zeroed out).
new IndexedRowMatrix( 124 | sc.parallelize( 125 | Seq( 126 | IndexedRow(0, new DenseVector(Array(0.172225,0.027225,0,0,0.172225,0))), 127 | IndexedRow(1, new DenseVector(Array(0.00680625,0.0841,0.04305625,0.00390625,0.021025,0.04305625))), 128 | IndexedRow(2, new DenseVector(Array(0,0.07502121,0.07502121,0.00680625,0.00680625,0.07502121))), 129 | IndexedRow(3, new DenseVector(Array(0,0.015625,0.015625,0.140625,0,0.140625))), 130 | IndexedRow(4, new DenseVector(Array(0.07502121,0.03663396,0.00680625,0,0.12702096,0.00680625))), 131 | IndexedRow(5, new DenseVector(Array(0,0.04305625,0.04305625,0.03515625,0.00390625,0.11055625))) 132 | ) 133 | )) 134 | 135 | val MCLObject: MCL = new MCL().setEpsilon(0.01) 136 | val sparsedMatrix: IndexedRowMatrix = 137 | new IndexedRowMatrix( 138 | indexedMatrix.rows 139 | .map{row => 140 | val svec = row.vector.toSparse 141 | IndexedRow(row.index, 142 | MCLObject.removeWeakConnections(svec) 143 | ) 144 | } 145 | ) 146 | 147 | val objective: IndexedRowMatrix = 148 | new IndexedRowMatrix( 149 | sc.parallelize( 150 | Seq( 151 | IndexedRow(0, new DenseVector(Array(0.172225,0.027225,0,0,0.172225,0))), 152 | IndexedRow(1, new DenseVector(Array(0,0.0841,0.04305625,0,0.021025,0.04305625))), 153 | IndexedRow(2, new DenseVector(Array(0,0.07502121,0.07502121,0,0,0.07502121))), 154 | IndexedRow(3, new DenseVector(Array(0,0.015625,0.015625,0.140625,0,0.140625))), 155 | IndexedRow(4, new DenseVector(Array(0.07502121,0.03663396,0,0,0.12702096,0))), 156 | IndexedRow(5, new DenseVector(Array(0,0.04305625,0.04305625,0.03515625,0,0.11055625))) 157 | ) 158 | )) 159 | 160 | sparsedMatrix.numRows shouldEqual objective.numRows 161 | sparsedMatrix.numCols shouldEqual objective.numCols 162 | objective.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 163 | .join( 164 | sparsedMatrix.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 165 | ) 166 | .collect.sortBy(row => row._1).foreach( 167 | pairOfRows => 168 | { 169 | val sparsedRows = 
// Rows are joined by index and compared element-wise after rounding the computed
// values to 8 decimal places via BigDecimal HALF_UP, so float noise from the
// Spark computation does not fail the equality check.
// Next: test("Matrix Expansion") — squares a 6x6 row-stochastic matrix through
// MCLObject.expansion and compares against expected values rounded to 4 decimals.
pairOfRows._2._2.map(e => BigDecimal(e).setScale(8, BigDecimal.RoundingMode.HALF_UP).toDouble) 170 | pairOfRows._2._1 shouldEqual sparsedRows 171 | } 172 | ) 173 | 174 | } 175 | 176 | test("Matrix Expansion", UnitTest) { 177 | 178 | val indexedMatrix: IndexedRowMatrix = 179 | new IndexedRowMatrix( 180 | sc.parallelize( 181 | Seq( 182 | IndexedRow(0, new DenseVector(Array(0.5,0,0,0,0.5,0))), 183 | IndexedRow(1, new DenseVector(Array(0,0.25,0.25,0,0.25,0.25))), 184 | IndexedRow(2, new DenseVector(Array(0,0.33,0.33,0,0,0.33))), 185 | IndexedRow(3, new DenseVector(Array(0,0,0,0.5,0,0.5))), 186 | IndexedRow(4, new DenseVector(Array(0.33,0.33,0,0,0.33,0))), 187 | IndexedRow(5, new DenseVector(Array(0,0.25,0.25,0.25,0,0.25))) 188 | ) 189 | )) 190 | 191 | val MCLObject: MCL = new MCL() 192 | val expandedMatrix: IndexedRowMatrix = MCLObject.expansion(indexedMatrix).toIndexedRowMatrix() 193 | 194 | val objective: IndexedRowMatrix = 195 | new IndexedRowMatrix( 196 | sc.parallelize( 197 | Seq( 198 | IndexedRow(0, new DenseVector(Array(0.4150,0.1650,0,0,0.4150,0))), 199 | IndexedRow(1, new DenseVector(Array(0.0825,0.2900,0.2075,0.0625,0.1450,0.2075))), 200 | IndexedRow(2, new DenseVector(Array(0,0.2739,0.2739,0.0825,0.0825,0.2739))), 201 | IndexedRow(3, new DenseVector(Array(0,0.1250,0.1250,0.3750,0,0.3750))), 202 | IndexedRow(4, new DenseVector(Array(0.2739,0.1914,0.0825,0,0.3564,0.0825))), 203 | IndexedRow(5, new DenseVector(Array(0,0.2075,0.2075,0.1875,0.0625,0.3325))) 204 | ) 205 | )) 206 | 207 | expandedMatrix.numRows shouldEqual objective.numRows 208 | expandedMatrix.numCols shouldEqual objective.numCols 209 | objective.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 210 | .join( 211 | expandedMatrix.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 212 | ) 213 | .collect.sortBy(row => row._1).foreach( 214 | pairOfRows => 215 | { 216 | val expandedRows = pairOfRows._2._2.map(e => BigDecimal(e).setScale(4, BigDecimal.RoundingMode.HALF_UP).toDouble) 217 | 
// Next: test("Matrix Inflation") — builds the expansion result as a BlockMatrix
// (MCLObject.inflation takes a BlockMatrix and returns an IndexedRowMatrix),
// then compares against the expected inflated matrix rounded to 8 decimals.
pairOfRows._2._1 shouldEqual expandedRows 218 | } 219 | ) 220 | 221 | } 222 | 223 | test("Matrix Inflation", UnitTest) { 224 | 225 | val indexedMatrix: BlockMatrix = 226 | new IndexedRowMatrix( 227 | sc.parallelize( 228 | Seq( 229 | IndexedRow(0, new DenseVector(Array(0.4150,0.1650,0,0,0.4150,0))), 230 | IndexedRow(1, new DenseVector(Array(0.0825,0.2900,0.2075,0.0625,0.1450,0.2075))), 231 | IndexedRow(2, new DenseVector(Array(0,0.2739,0.2739,0.0825,0.0825,0.2739))), 232 | IndexedRow(3, new DenseVector(Array(0,0.1250,0.1250,0.3750,0,0.3750))), 233 | IndexedRow(4, new DenseVector(Array(0.2739,0.1914,0.0825,0,0.3564,0.0825))), 234 | IndexedRow(5, new DenseVector(Array(0,0.2075,0.2075,0.1875,0.0625,0.3325))) 235 | ) 236 | )).toBlockMatrix 237 | 238 | val MCLObject: MCL = new MCL() 239 | val inflatedMatrix: IndexedRowMatrix = MCLObject.inflation(indexedMatrix) 240 | 241 | val objective: IndexedRowMatrix = 242 | new IndexedRowMatrix( 243 | sc.parallelize( 244 | Seq( 245 | IndexedRow(0, new DenseVector(Array(0.46337526,0.07324948,0,0,0.46337526,0))), 246 | IndexedRow(1, new DenseVector(Array(0.03370265,0.41643971,0.21320253,0.01934266,0.10410993,0.21320253))), 247 | IndexedRow(2, new DenseVector(Array(0,0.31432222,0.31432222,0.02851668,0.02851668,0.31432222))), 248 | IndexedRow(3, new DenseVector(Array(0,0.05000000,0.05000000,0.45000000,0,0.45000000))), 249 | IndexedRow(4, new DenseVector(Array(0.29736263,0.14520654,0.02697803,0,0.50347477,0.02697803))), 250 | IndexedRow(5, new DenseVector(Array(0,0.18264973,0.18264973,0.14913699,0.01657078,0.46899276))) 251 | ) 252 | )) 253 | 254 | inflatedMatrix.numRows shouldEqual objective.numRows 255 | inflatedMatrix.numCols shouldEqual objective.numCols 256 | objective.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 257 | .join( 258 | inflatedMatrix.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 259 | ) 260 | .collect.sortBy(row => row._1).foreach( 261 | pairOfRows => 262 | { 263 | val inflatedRows = pairOfRows._2._2.map(e 
// Next: test("Difference Between Two Matrices") — checks MCLObject.difference
// between a binary adjacency-style matrix and a weighted one against the scalar
// 15.1434766 (rounded to 7 decimals). Then the IntegrationTest loads the karate
// club graph fixtures from /MCL/*.csv and the reference clustering from
// /MCL/clusters.tab (produced by the official MCL tool).
=> BigDecimal(e).setScale(8, BigDecimal.RoundingMode.HALF_UP).toDouble) 264 | pairOfRows._2._1 shouldEqual inflatedRows 265 | } 266 | ) 267 | 268 | } 269 | 270 | test("Difference Between Two Matrices", UnitTest) { 271 | 272 | val startMatrix: IndexedRowMatrix = 273 | new IndexedRowMatrix( 274 | sc.parallelize( 275 | Seq( 276 | IndexedRow(0, new DenseVector(Array(1,0,0,0,1,0))), 277 | IndexedRow(1, new DenseVector(Array(0,1,1,0,1,1))), 278 | IndexedRow(2, new DenseVector(Array(0,1,1,0,0,1))), 279 | IndexedRow(3, new DenseVector(Array(0,0,0,1,0,1))), 280 | IndexedRow(4, new DenseVector(Array(1,1,0,0,1,0))), 281 | IndexedRow(5, new DenseVector(Array(0,1,1,1,0,1))) 282 | ) 283 | )) 284 | 285 | val stopMatrix: IndexedRowMatrix = 286 | new IndexedRowMatrix( 287 | sc.parallelize( 288 | Seq( 289 | IndexedRow(0, new DenseVector(Array(0.172225,0.027225,0,0,0.172225,0))), 290 | IndexedRow(1, new DenseVector(Array(0,0.0841,0.04305625,0,0.021025,0.04305625))), 291 | IndexedRow(2, new DenseVector(Array(0,0.07502121,0.07502121,0,0,0.07502121))), 292 | IndexedRow(3, new DenseVector(Array(0,0.015625,0.015625,0.140625,0,0.140625))), 293 | IndexedRow(4, new DenseVector(Array(0.07502121,0.03663396,0,0,0.12702096,0))), 294 | IndexedRow(5, new DenseVector(Array(0,0.04305625,0.04305625,0.03515625,0,0.11055625))) 295 | ) 296 | )) 297 | 298 | val MCLObject: MCL = new MCL() 299 | val diff: Double = MCLObject.difference(startMatrix, stopMatrix) 300 | 301 | BigDecimal(diff).setScale(7, BigDecimal.RoundingMode.HALF_UP).toDouble shouldEqual 15.1434766 302 | 303 | } 304 | 305 | // Integration Tests 306 | 307 | test("Official MCL Algorithm Versus Spark MCL", IntegrationTest) { 308 | 309 | val relationshipsFile:Seq[String] = Source.fromURL(getClass.getResource("/MCL/karateEdges.csv")).getLines().toSeq 310 | val nodesFile:Seq[String] = Source.fromURL(getClass.getResource("/MCL/karateNodes.csv")).getLines().toSeq 311 | val clustersFile:Seq[String] = 
// The integration test trains MCL (epsilon=0.01, maxIterations=30, self-loop
// weight 1.0, "bidirected" orientation) on the karate graph, groups assignments
// by cluster, keys each cluster by its max node id, and joins against the
// reference clustering keyed the same way; the join must match every reference
// cluster (test.count == clustersChallenge.count).
// NOTE(review): `case Row(group: mutable.WrappedArray[Long])` is an unchecked
// match — the element type Long is erased at runtime, so this only verifies the
// wrapper type; presumably safe here because collect_list of a Long column is
// produced upstream, but worth confirming.
Source.fromURL(getClass.getResource("/MCL/clusters.tab")).getLines().toSeq 312 | 313 | val relationships: RDD[Edge[Double]] = 314 | sc.parallelize( 315 | relationshipsFile 316 | .map(line => line.split(" ")) 317 | .map(e => Edge(e(0).toLong, e(1).toLong, e(2).toDouble)) 318 | ) 319 | 320 | val users: RDD[(VertexId, String)] = 321 | sc.parallelize( 322 | nodesFile 323 | .map(line => line.split(" ")) 324 | .map(n => (n(0).toLong, n(1))) 325 | ) 326 | 327 | val graph: Graph[String, Double] = Graph(users, relationships) 328 | 329 | val spark = SparkSession.builder().getOrCreate() 330 | import spark.implicits._ 331 | 332 | val assignments:Dataset[Assignment] = MCL.train(graph, epsilon=0.01, maxIterations=30, selfLoopWeight = 1.0, graphOrientationStrategy = "bidirected").assignments 333 | val clusters = 334 | assignments 335 | .groupBy("cluster") 336 | .agg(collect_list(col("id"))) 337 | .withColumn("group", sort_array(col("collect_list(id)"))) 338 | .select("group").map{ 339 | case Row(group: mutable.WrappedArray[Long]) => (group.max, group) 340 | } 341 | .withColumnRenamed("_1", "clusterIdTest") 342 | .withColumnRenamed("_2", "group") 343 | 344 | val clustersChallenge = 345 | sc.parallelize( 346 | clustersFile 347 | .map(line => line.split("\t").map(node => node.toInt).toList) 348 | .map(assignment => (assignment.max, assignment.toArray.sorted)) 349 | ).toDF("clusterIdReal", "group") 350 | 351 | val test = clusters.join(clustersChallenge, clusters.col("clusterIdTest")===clustersChallenge.col("clusterIdReal")) 352 | test.count shouldEqual clustersChallenge.count 353 | 354 | } 355 | 356 | } 357 | 