├── project ├── build.properties └── plugins.sbt ├── images ├── MCL.png ├── Expansion.png ├── Inflation.png └── Difference.png ├── src ├── test │ ├── resources │ │ ├── MCL │ │ │ ├── clusters.tab │ │ │ ├── karateNodes.csv │ │ │ └── karateEdges.csv │ │ └── MCLUtils │ │ │ ├── OrientedNodes.txt │ │ │ ├── OrientedEdges.txt │ │ │ └── OrientedMatrixSelfLoop.txt │ └── scala │ │ ├── org │ │ └── apache │ │ │ └── spark │ │ │ └── mllib │ │ │ └── clustering │ │ │ ├── MCLFunSuite.scala │ │ │ ├── MCLModelSuite.scala │ │ │ ├── MCLUtilsSuite.scala │ │ │ └── MCLSuite.scala │ │ └── MainSuite.scala └── main │ └── scala │ ├── org │ └── apache │ │ └── spark │ │ └── mllib │ │ └── clustering │ │ ├── MCLModel.scala │ │ ├── MCLUtils.scala │ │ └── MCL.scala │ └── Main.scala ├── .gitignore ├── LICENSE.txt ├── .travis.yml └── README.md /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=0.13.11 -------------------------------------------------------------------------------- /images/MCL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joandre/MCL_spark/HEAD/images/MCL.png -------------------------------------------------------------------------------- /images/Expansion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joandre/MCL_spark/HEAD/images/Expansion.png -------------------------------------------------------------------------------- /images/Inflation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joandre/MCL_spark/HEAD/images/Inflation.png -------------------------------------------------------------------------------- /images/Difference.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/joandre/MCL_spark/HEAD/images/Difference.png 
-------------------------------------------------------------------------------- /src/test/resources/MCL/clusters.tab: -------------------------------------------------------------------------------- 1 | 1 2 3 4 5 6 7 8 11 12 13 14 18 20 22 17 2 | 9 31 10 28 29 33 34 15 16 19 21 23 24 30 27 3 | 32 26 25 4 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.4.0") 2 | 3 | resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/" 4 | 5 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.4") -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | # sbt specific 4 | .cache 5 | .lib/ 6 | dist/* 7 | target/ 8 | lib_managed/ 9 | src_managed/ 10 | project/boot/ 11 | project/plugins/project/ 12 | # Scala-IDE specific 13 | .idea/ 14 | .scala_dependencies 15 | .worksheet -------------------------------------------------------------------------------- /src/test/resources/MCL/karateNodes.csv: -------------------------------------------------------------------------------- 1 | 1 "1" 2 | 2 "2" 3 | 3 "3" 4 | 4 "4" 5 | 5 "5" 6 | 6 "6" 7 | 7 "7" 8 | 8 "8" 9 | 9 "9" 10 | 10 "10" 11 | 11 "11" 12 | 12 "12" 13 | 13 "13" 14 | 14 "14" 15 | 15 "15" 16 | 16 "16" 17 | 17 "17" 18 | 18 "18" 19 | 19 "19" 20 | 20 "20" 21 | 21 "21" 22 | 22 "22" 23 | 23 "23" 24 | 24 "24" 25 | 25 "25" 26 | 26 "26" 27 | 27 "27" 28 | 28 "28" 29 | 29 "29" 30 | 30 "30" 31 | 31 "31" 32 | 32 "32" 33 | 33 "33" 34 | 34 "34" 35 | -------------------------------------------------------------------------------- /src/test/resources/MCLUtils/OrientedNodes.txt: -------------------------------------------------------------------------------- 1 | 1 "0" 2 | 2 
"1" 3 | 3 "2" 4 | 4 "3" 5 | 5 "4" 6 | 6 "5" 7 | 7 "6" 8 | 8 "7" 9 | 9 "8" 10 | 10 "9" 11 | 11 "10" 12 | 12 "11" 13 | 13 "12" 14 | 14 "13" 15 | 15 "14" 16 | 16 "15" 17 | 17 "16" 18 | 18 "17" 19 | 19 "18" 20 | 20 "19" 21 | 21 "20" 22 | 22 "21" 23 | 23 "22" 24 | 24 "23" 25 | 25 "24" 26 | 26 "25" 27 | 27 "26" 28 | 28 "27" 29 | 29 "28" 30 | 30 "29" 31 | 31 "30" 32 | 32 "31" 33 | 33 "32" 34 | 34 "33" 35 | 35 "34" 36 | 36 "35" 37 | 37 "36" 38 | 38 "37" 39 | 39 "38" 40 | 40 "39" 41 | 41 "40" 42 | 42 "41" 43 | 43 "42" 44 | 44 "43" 45 | 45 "44" 46 | 46 "45" 47 | 47 "46" 48 | 48 "47" 49 | 49 "48" 50 | 50 "49" 51 | -------------------------------------------------------------------------------- /src/test/resources/MCL/karateEdges.csv: -------------------------------------------------------------------------------- 1 | 1 2 4 2 | 1 3 5 3 | 1 4 3 4 | 1 5 3 5 | 1 6 3 6 | 1 7 3 7 | 1 8 2 8 | 1 9 2 9 | 1 11 2 10 | 1 12 3 11 | 1 13 1 12 | 1 14 3 13 | 1 18 2 14 | 1 20 2 15 | 1 22 2 16 | 1 32 2 17 | 2 3 6 18 | 2 4 3 19 | 2 8 4 20 | 2 14 5 21 | 2 18 1 22 | 2 20 2 23 | 2 22 2 24 | 2 31 2 25 | 3 4 3 26 | 3 8 4 27 | 3 9 5 28 | 3 10 1 29 | 3 14 3 30 | 3 28 2 31 | 3 29 2 32 | 3 33 2 33 | 4 8 3 34 | 4 13 3 35 | 4 14 3 36 | 5 7 2 37 | 5 11 3 38 | 6 7 5 39 | 6 11 3 40 | 6 17 3 41 | 7 17 3 42 | 9 31 3 43 | 9 33 3 44 | 9 34 4 45 | 10 34 2 46 | 14 34 3 47 | 15 33 3 48 | 15 34 2 49 | 16 33 3 50 | 16 34 4 51 | 19 33 1 52 | 19 34 2 53 | 20 34 1 54 | 21 33 3 55 | 21 34 1 56 | 23 33 2 57 | 23 34 3 58 | 24 26 5 59 | 24 28 4 60 | 24 30 3 61 | 24 33 5 62 | 24 34 4 63 | 25 26 2 64 | 25 28 3 65 | 25 32 2 66 | 26 32 7 67 | 27 30 4 68 | 27 34 2 69 | 28 34 4 70 | 29 32 2 71 | 29 34 2 72 | 30 33 4 73 | 30 34 2 74 | 31 33 3 75 | 31 34 3 76 | 32 33 4 77 | 32 34 4 78 | 33 34 5 79 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2017, 
Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | # 3 | # Copyright (c) 2015-2016, Joan André 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | language: scala 24 | script: 25 | - sbt clean coverage test coverageReport 26 | after_success: 27 | - bash <(curl -s https://codecov.io/bash) 28 | scala: 29 | - 2.10.5 30 | - 2.11.8 31 | jdk: 32 | - oraclejdk8 33 | env: 34 | - SPARKVERSION="2.0.1" 35 | 36 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/mllib/clustering/MCLFunSuite.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.*/ 22 | 23 | package org.apache.spark.mllib.clustering 24 | 25 | import com.holdenkarau.spark.testing.{DatasetSuiteBase, SharedSparkContext} 26 | import org.scalatest.{FunSuite, Matchers, Tag} 27 | 28 | /** MCL specific implementation of Scala Test Suite */ 29 | //TODO Why spark ? 30 | private[spark] abstract class MCLFunSuite extends FunSuite with Matchers with SharedSparkContext with DatasetSuiteBase 31 | 32 | object UnitTest extends Tag("org.apache.spark.mllib.clustering.tags.UnitTest") 33 | object IntegrationTest extends Tag("org.apache.spark.mllib.clustering.tags.IntegrationTest") -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/mllib/clustering/MCLModelSuite.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.mllib.clustering 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.graphx._ 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.util.Utils 8 | 9 | /** 10 | * Created by andrejoan on 4/30/16. 
11 | */ 12 | class MCLModelSuite extends MCLFunSuite{ 13 | // Disable Spark messages when running program 14 | Logger.getLogger("org").setLevel(Level.OFF) 15 | Logger.getLogger("akka").setLevel(Level.OFF) 16 | 17 | test("model save/load", UnitTest){ 18 | 19 | val users: RDD[(VertexId, String)] = 20 | sc.parallelize(Array((0L,"Node1"), (1L,"Node2"), 21 | (2L,"Node3"), (3L,"Node4"),(4L,"Node5"), 22 | (5L,"Node6"), (6L,"Node7"), (7L, "Node8"), 23 | (8L, "Node9"), (9L, "Node10"), (10L, "Node11"))) 24 | 25 | val relationships: RDD[Edge[Double]] = 26 | sc.parallelize( 27 | Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), 28 | Edge(0, 2, 1.0), Edge(2, 0, 1.0), 29 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 30 | Edge(1, 2, 1.0), Edge(2, 1, 1.0), 31 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 32 | Edge(2, 3, 1.0), Edge(3, 2, 1.0), 33 | Edge(4, 5, 1.0), Edge(5, 4, 1.0), 34 | Edge(4, 6, 1.0), Edge(6, 4, 1.0), 35 | Edge(4, 7, 1.0), Edge(7, 4, 1.0), 36 | Edge(5, 6, 1.0), Edge(6, 5, 1.0), 37 | Edge(5, 7, 1.0), Edge(7, 5, 1.0), 38 | Edge(6, 7, 1.0), Edge(7, 6, 1.0), 39 | Edge(3, 8, 1.0), Edge(8, 3, 1.0), 40 | Edge(9, 8, 1.0), Edge(8, 9, 1.0), 41 | Edge(9, 10, 1.0), Edge(10, 9, 1.0), 42 | Edge(4, 10, 1.0), Edge(10, 4, 1.0) 43 | )) 44 | 45 | val graph = Graph(users, relationships) 46 | 47 | val model: MCLModel = MCL.train(graph) 48 | 49 | // Check number of clusters 50 | model.nbClusters shouldEqual 3 51 | 52 | // Check save and load methods 53 | val tempDir = Utils.createTempDir() 54 | val path = tempDir.toURI.toString 55 | 56 | Array(true, false).foreach { case selector => 57 | // Save model, load it back, and compare. 
58 | try { 59 | model.save(sc, path) 60 | val sameModel = MCLModel.load(sc, path) 61 | assertDatasetEquals(model.assignments.orderBy("id"), sameModel.assignments.orderBy("id")) 62 | } finally { 63 | Utils.deleteRecursively(tempDir) 64 | } 65 | } 66 | 67 | } 68 | 69 | test("nodes assignments", UnitTest) { 70 | val nodeId = 1.0.toLong 71 | val cluster = 2.0.toLong 72 | val newAssignment:Assignment = Assignment.apply(Row(nodeId, cluster)) 73 | 74 | newAssignment.id shouldEqual nodeId 75 | newAssignment.cluster shouldEqual cluster 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/test/scala/MainSuite.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.*/ 22 | 23 | import org.scalatest.{FunSuite, Matchers} 24 | 25 | class MainSuite extends FunSuite with Matchers{ 26 | 27 | test("toInt"){ 28 | val eR = Main.toInt(Symbol("expansionRate"), "2") 29 | eR shouldEqual 2 30 | 31 | an [Exception] should be thrownBy Main.toInt(Symbol("expansionRate"), "1.1") 32 | } 33 | 34 | test("toDouble"){ 35 | val iR = Main.toDouble(Symbol("inflationRate"), "2.0") 36 | iR shouldEqual 2.0 37 | 38 | an [Exception] should be thrownBy Main.toDouble(Symbol("inflationRate"), "test") 39 | } 40 | 41 | test("nextOption"){ 42 | val args: Array[String] = Array("--expansionRate", "3", "--inflationRate", "3.0", "--epsilon", "0.1", "--maxIterations", "20", "--selfLoopWeight", "0.1", "--graphOrientationStrategy", "directed") 43 | val arglist = args.toList 44 | 45 | val options = Main.nextOption(Map(),arglist) 46 | Main.toInt('expansionRate, options.getOrElse('expansionRate, 2).toString) shouldEqual 3 47 | Main.toDouble('inflationRate, options.getOrElse('inflationRate, 2.0).toString) shouldEqual 3.0 48 | Main.toDouble('epsilon, options.getOrElse('epsilon, 0.01).toString) shouldEqual 0.1 49 | Main.toInt('maxIterations, options.getOrElse('maxIterations, 10).toString) shouldEqual 20 50 | Main.toDouble('selfLoopWeight, options.getOrElse('selfLoopWeight, 1.0).toString) shouldEqual 0.1 51 | options.getOrElse('graphOrientationStrategy, "undirected").toString shouldEqual "directed" 52 | 53 | val args2: Array[String] = Array("--wrongOption", "test") 54 | val arglist2 = args2.toList 55 | 56 | an [Exception] should be thrownBy Main.nextOption(Map(),arglist2) 57 | } 58 | 59 | /*test("main"){ 60 | val args: Array[String] = Array("--expansionRate", "2", "--inflationRate", "2.0", "--epsilon", 
"0.01", "--maxIterations", "10", "--selfLoopWeight", "1", "--graphOrientationStrategy", "undirected") 61 | 62 | val streamIM = new java.io.ByteArrayOutputStream() 63 | Console.withOut(streamIM) { 64 | Main.main(args) 65 | } 66 | 67 | streamIM.toString.split("\n") should contain theSameElementsAs Array("0 => List(0, 1, 2, 3)", "4 => List(4, 5, 6, 7)", "9 => List(8, 9, 10)").toSeq 68 | }*/ 69 | } 70 | -------------------------------------------------------------------------------- /src/test/resources/MCLUtils/OrientedEdges.txt: -------------------------------------------------------------------------------- 1 | 1 1 1.0 2 | 1 9 1.0 3 | 1 18 1.0 4 | 1 20 1.0 5 | 1 21 1.0 6 | 1 24 1.0 7 | 1 35 1.0 8 | 1 43 1.0 9 | 1 45 1.0 10 | 1 46 1.0 11 | 2 7 1.0 12 | 2 11 1.0 13 | 2 12 1.0 14 | 2 13 1.0 15 | 2 18 1.0 16 | 2 24 1.0 17 | 2 28 1.0 18 | 2 31 1.0 19 | 2 40 1.0 20 | 3 9 1.0 21 | 3 13 1.0 22 | 3 15 1.0 23 | 3 21 1.0 24 | 3 23 1.0 25 | 3 24 1.0 26 | 3 29 1.0 27 | 3 30 1.0 28 | 3 38 1.0 29 | 3 42 1.0 30 | 3 50 1.0 31 | 4 35 1.0 32 | 4 40 1.0 33 | 4 45 1.0 34 | 4 46 1.0 35 | 4 48 1.0 36 | 4 49 1.0 37 | 5 14 1.0 38 | 5 15 1.0 39 | 5 19 1.0 40 | 5 22 1.0 41 | 5 26 1.0 42 | 5 28 1.0 43 | 5 34 1.0 44 | 5 36 1.0 45 | 5 40 1.0 46 | 6 20 1.0 47 | 6 21 1.0 48 | 6 23 1.0 49 | 6 26 1.0 50 | 6 28 1.0 51 | 6 36 1.0 52 | 6 38 1.0 53 | 6 44 1.0 54 | 7 8 1.0 55 | 7 12 1.0 56 | 7 15 1.0 57 | 7 17 1.0 58 | 7 20 1.0 59 | 7 22 1.0 60 | 7 25 1.0 61 | 7 26 1.0 62 | 7 28 1.0 63 | 7 36 1.0 64 | 7 42 1.0 65 | 8 25 1.0 66 | 8 26 1.0 67 | 8 28 1.0 68 | 8 35 1.0 69 | 8 37 1.0 70 | 8 38 1.0 71 | 8 39 1.0 72 | 9 10 1.0 73 | 9 11 1.0 74 | 9 17 1.0 75 | 9 18 1.0 76 | 9 34 1.0 77 | 9 46 1.0 78 | 9 47 1.0 79 | 10 13 1.0 80 | 10 21 1.0 81 | 10 22 1.0 82 | 10 24 1.0 83 | 10 28 1.0 84 | 10 34 1.0 85 | 10 48 1.0 86 | 10 49 1.0 87 | 10 50 1.0 88 | 11 18 1.0 89 | 11 22 1.0 90 | 11 24 1.0 91 | 11 26 1.0 92 | 11 27 1.0 93 | 11 34 1.0 94 | 11 36 1.0 95 | 11 37 1.0 96 | 11 44 1.0 97 | 12 28 1.0 98 | 12 33 1.0 99 | 12 
36 1.0 100 | 12 38 1.0 101 | 12 50 1.0 102 | 13 17 1.0 103 | 13 19 1.0 104 | 13 23 1.0 105 | 13 29 1.0 106 | 13 31 1.0 107 | 13 39 1.0 108 | 13 41 1.0 109 | 13 46 1.0 110 | 13 50 1.0 111 | 14 21 1.0 112 | 14 24 1.0 113 | 14 26 1.0 114 | 14 35 1.0 115 | 14 45 1.0 116 | 15 20 1.0 117 | 15 22 1.0 118 | 15 25 1.0 119 | 15 29 1.0 120 | 15 43 1.0 121 | 15 46 1.0 122 | 15 47 1.0 123 | 16 19 1.0 124 | 16 26 1.0 125 | 16 33 1.0 126 | 16 38 1.0 127 | 16 40 1.0 128 | 16 44 1.0 129 | 16 45 1.0 130 | 16 50 1.0 131 | 17 18 1.0 132 | 17 20 1.0 133 | 17 24 1.0 134 | 17 26 1.0 135 | 17 27 1.0 136 | 17 28 1.0 137 | 17 30 1.0 138 | 17 44 1.0 139 | 17 45 1.0 140 | 18 20 1.0 141 | 18 31 1.0 142 | 18 36 1.0 143 | 18 37 1.0 144 | 18 40 1.0 145 | 19 30 1.0 146 | 19 35 1.0 147 | 19 36 1.0 148 | 19 37 1.0 149 | 19 39 1.0 150 | 19 43 1.0 151 | 19 46 1.0 152 | 20 24 1.0 153 | 20 27 1.0 154 | 20 34 1.0 155 | 20 45 1.0 156 | 20 47 1.0 157 | 21 22 1.0 158 | 21 31 1.0 159 | 21 32 1.0 160 | 22 23 1.0 161 | 22 25 1.0 162 | 22 27 1.0 163 | 22 28 1.0 164 | 22 30 1.0 165 | 22 32 1.0 166 | 22 36 1.0 167 | 22 47 1.0 168 | 23 32 1.0 169 | 23 36 1.0 170 | 23 37 1.0 171 | 23 38 1.0 172 | 23 39 1.0 173 | 23 43 1.0 174 | 23 44 1.0 175 | 23 45 1.0 176 | 24 28 1.0 177 | 24 40 1.0 178 | 24 47 1.0 179 | 25 34 1.0 180 | 26 27 1.0 181 | 26 33 1.0 182 | 26 34 1.0 183 | 26 35 1.0 184 | 26 42 1.0 185 | 26 43 1.0 186 | 27 30 1.0 187 | 27 32 1.0 188 | 27 33 1.0 189 | 27 35 1.0 190 | 27 38 1.0 191 | 27 41 1.0 192 | 27 44 1.0 193 | 28 32 1.0 194 | 28 42 1.0 195 | 28 43 1.0 196 | 28 46 1.0 197 | 29 31 1.0 198 | 29 32 1.0 199 | 29 37 1.0 200 | 29 40 1.0 201 | 30 31 1.0 202 | 31 36 1.0 203 | 31 48 1.0 204 | 31 49 1.0 205 | 32 37 1.0 206 | 32 40 1.0 207 | 32 46 1.0 208 | 32 48 1.0 209 | 33 35 1.0 210 | 33 44 1.0 211 | 33 49 1.0 212 | 33 50 1.0 213 | 34 38 1.0 214 | 34 42 1.0 215 | 34 44 1.0 216 | 34 47 1.0 217 | 35 36 1.0 218 | 35 42 1.0 219 | 35 47 1.0 220 | 35 49 1.0 221 | 36 37 1.0 222 | 36 38 1.0 223 | 36 41 1.0 224 | 36 
43 1.0 225 | 36 47 1.0 226 | 37 41 1.0 227 | 37 48 1.0 228 | 38 45 1.0 229 | 38 48 1.0 230 | 39 50 1.0 231 | 40 41 1.0 232 | 40 42 1.0 233 | 40 50 1.0 234 | 41 42 1.0 235 | 41 45 1.0 236 | 41 47 1.0 237 | 42 47 1.0 238 | 43 44 1.0 239 | 43 46 1.0 240 | 43 49 1.0 241 | 44 47 1.0 242 | 50 50 1.0 243 | 45 49 1.0 244 | 46 49 1.0 245 | 47 48 1.0 246 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/mllib/clustering/MCLModel.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.*/ 22 | 23 | package org.apache.spark.mllib.clustering 24 | 25 | import org.apache.spark.SparkContext 26 | import org.apache.spark.mllib.util.{Loader, Saveable} 27 | import org.apache.spark.sql.functions._ 28 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 29 | import org.json4s.JsonDSL._ 30 | import org.json4s._ 31 | import org.json4s.jackson.JsonMethods._ 32 | 33 | /** A clustering model for MCL. 34 | * 35 | * @param assignments an RDD of clustering assignments 36 | * @todo complete save and load features 37 | */ 38 | 39 | class MCLModel(var assignments: Dataset[Assignment]) extends Saveable with Serializable{ 40 | 41 | /** Get number of clusters.*/ 42 | def nbClusters: Int = assignments 43 | .groupBy("cluster") 44 | .agg(collect_list(col("id"))) 45 | .collect.length 46 | 47 | /** 48 | * Save MCL clusters assignments 49 | * 50 | * @param sc current Spark Context 51 | * @param path location where MCL model is saved 52 | */ 53 | 54 | override def save(sc: SparkContext, path: String): Unit = { 55 | MCLModel.SaveLoadV1_0.save(sc, this, path) 56 | } 57 | 58 | override protected def formatVersion: String = "1.0" 59 | } 60 | 61 | object MCLModel extends Loader[MCLModel]{ 62 | 63 | /** Load MCL clusters assignments 64 | * 65 | * @param sc current Spark Context 66 | * @param path location where MCL model is saved 67 | */ 68 | 69 | override def load(sc: SparkContext, path: String): MCLModel = { 70 | MCLModel.SaveLoadV1_0.load(sc, path) 71 | } 72 | 73 | private[clustering] 74 | object SaveLoadV1_0 { 75 | 76 | private val thisFormatVersion = "1.0" 77 | 78 | private[clustering] 79 | val thisClassName = "org.apache.spark.mllib.clustering.MCLModel" 80 | 81 | def save(sc: SparkContext, model: 
MCLModel, path: String): Unit = { 82 | val metadata = compact(render( 83 | ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) 84 | )) 85 | sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path)) 86 | 87 | model.assignments.write.parquet(Loader.dataPath(path)) 88 | } 89 | 90 | def load(sc: SparkContext, path: String): MCLModel = { 91 | implicit val formats = DefaultFormats 92 | val spark = SparkSession.builder().getOrCreate() 93 | import spark.implicits._ 94 | 95 | val (className, formatVersion, metadata) = Loader.loadMetadata(sc, path) 96 | assert(className == thisClassName) 97 | assert(formatVersion == thisFormatVersion) 98 | 99 | /*val expansionRate = (metadata \ "expansionRate").extract[Double] 100 | val inflationRate = (metadata \ "inflationRate").extract[Double] 101 | val epsilon = (metadata \ "epsilon").extract[Double] 102 | val maxIterations = (metadata \ "maxIterations").extract[Int]*/ 103 | 104 | val assignments = spark.read.parquet(Loader.dataPath(path)) 105 | // Check if loading file respects Assignment class schema 106 | Loader.checkSchema[Assignment](assignments.schema) 107 | val certifiedAssignments = assignments.map { 108 | case Row(id: Long, cluster: Long) => Assignment(id, cluster) 109 | } 110 | 111 | new MCLModel(certifiedAssignments) 112 | } 113 | } 114 | } 115 | 116 | /** List which point belongs to which cluster 117 | * 118 | * @param id node id 119 | * @param cluster cluster id 120 | */ 121 | 122 | case class Assignment(id: Long, cluster: Long) 123 | 124 | /** Factory for [[MCLModel.assignments]] instances. 
*/ 125 | private object Assignment { 126 | 127 | /** Creates an assignment with a given node id and a given cluster id 128 | * 129 | * @param r a row with two columns: one for node id and one for cluster id 130 | */ 131 | def apply(r: Row): Assignment = { 132 | Assignment(r.getLong(0), r.getLong(1)) 133 | } 134 | } -------------------------------------------------------------------------------- /src/test/resources/MCLUtils/OrientedMatrixSelfLoop.txt: -------------------------------------------------------------------------------- 1 | 1.0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;1.0;1.0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;1.0;0;1.0;1.0;0;0;0;0 2 | 0;1.0;0;0;0;0;1.0;0;0;0;1.0;1.0;1.0;0;0;0;0;1.0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0 3 | 0;0;1.0;0;0;0;0;0;1.0;0;0;0;1.0;0;1.0;0;0;0;0;0;1.0;0;1.0;1.0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;1.0 4 | 0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;0;0;0;1.0;1.0;0;1.0;1.0;0 5 | 0;0;0;0;1.0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;1.0;0;0;1.0;0;0;0;1.0;0;1.0;0;0;0;0;0;1.0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0 6 | 0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;1.0;0;1.0;0;0;0;0;0;1.0;0;0;0;0;0;0 7 | 0;0;0;0;0;0;1.0;1.0;0;0;0;1.0;0;0;1.0;0;1.0;0;0;1.0;0;1.0;0;0;1.0;1.0;0;1.0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0 8 | 0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;0;0;0;0;0;1.0;0;1.0;1.0;1.0;0;0;0;0;0;0;0;0;0;0;0 9 | 0;0;0;0;0;0;0;0;1.0;1.0;1.0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0 10 | 0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;0;0;1.0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;1.0 11 | 0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;1.0;0;1.0;1.0;0;0;0;0;0;0;1.0;0;1.0;1.0;0;0;0;0;0;0;1.0;0;0;0;0;0;0 12 | 
0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;1.0 13 | 0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;1.0;0;0;0;1.0;0;0;0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;1.0;0;1.0;0;0;0;0;1.0;0;0;0;1.0 14 | 0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0 15 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;1.0;0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;1.0;0;0;0 16 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;1.0;0;0;0;1.0;1.0;0;0;0;0;1.0 17 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;0;0;1.0;0;1.0;1.0;1.0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0 18 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;1.0;0;0;1.0;0;0;0;0;0;0;0;0;0;0 19 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;1.0;1.0;0;1.0;0;0;0;1.0;0;0;1.0;0;0;0;0 20 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0;0;1.0;0;0;0 21 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 22 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;1.0;1.0;0;1.0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0 23 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;1.0;1.0;1.0;0;0;0;1.0;1.0;1.0;0;0;0;0;0 24 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;0 25 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 26 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;1.0;1.0;1.0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0 27 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;0;1.0;1.0;0;1.0;0;0;1.0;0;0;1.0;0;0;1.0;0;0;0;0;0;0 28 | 
0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;1.0;0;0;0;0 29 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;1.0;1.0;0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;0;0;0;0;0 30 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 31 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0 32 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;0;1.0;0;0;0;0;0;1.0;0;1.0;0;0 33 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;1.0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;1.0 34 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;0;1.0;0;1.0;0;0;1.0;0;0;0 35 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;1.0;0 36 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;1.0;0;0;1.0;0;1.0;0;0;0;1.0;0;0;0 37 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0 38 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;1.0;0;0;1.0;0;0 39 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;0;0;0;0;0;0;1.0 40 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;1.0;0;0;0;0;0;0;0;1.0 41 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0;1.0;0;1.0;0;0;0 42 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;0;1.0;0;0;0 43 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;1.0;0;0;1.0;0 44 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;0;0;0 45 | 
0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;0;1.0;0 46 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0;1.0;0 47 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;1.0;0;0 48 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0;0 49 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0;0 50 | 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;1.0 51 | -------------------------------------------------------------------------------- /src/main/scala/Main.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/* The MIT License (MIT) — Copyright (c) 2015-2016, Joan André.
   See LICENSE.txt at the repository root for the full license text. */

// Import required spark classes
import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx._
import org.apache.spark.mllib.clustering.{Assignment, MCL}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._

/** Start-up example: builds a small two-community graph, runs MCL on it and
  * displays the resulting cluster assignments.
  */
object Main {

  // Disable Spark messages when running program
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  // Guide for users who want to run MCL program
  val usage = """
    Usage: mcl [--expansionRate num] [--inflationRate num] [--epsilon num] [--maxIterations num] [--selfLoopWeight num] [--graphOrientationStrategy string]
  """

  // Parsed command-line options: symbol key -> raw string value
  type OptionMap = Map[Symbol, Any]

  /** Parse `s` as an integer, failing with a message that names the offending option. */
  def toInt(key: Symbol, s: String): Int =
    try s.toInt
    catch {
      case _: Exception => throw new Exception("\n" + key.toString() + " must be an integer")
    }

  /** Parse `s` as a double, failing with a message that names the offending option. */
  def toDouble(key: Symbol, s: String): Double =
    try s.toDouble
    catch {
      case _: Exception => throw new Exception("\n" + key.toString() + " must be a double")
    }

  /** Recursively fold the remaining argument list into an option map.
    *
    * Recognized flags consume the following token as their value; anything
    * else (including a known flag given as the very last token, with no
    * value after it) is reported as an unknown option.
    *
    * @param map options accumulated so far
    * @param list remaining command-line tokens
    * @return the completed option map
    */
  def nextOption(map: OptionMap, list: List[String]): OptionMap = {
    // Table of accepted flags and the option key each one feeds
    val flags: Map[String, Symbol] = Map(
      "--expansionRate" -> 'expansionRate,
      "--inflationRate" -> 'inflationRate,
      "--epsilon" -> 'epsilon,
      "--maxIterations" -> 'maxIterations,
      "--selfLoopWeight" -> 'selfLoopWeight,
      "--graphOrientationStrategy" -> 'graphOrientationStrategy)

    list match {
      case Nil => map
      case flag :: value :: tail if flags.contains(flag) =>
        nextOption(map + (flags(flag) -> value), tail)
      case option :: tail => throw new Exception("\nUnknown option " + option)
    }
  }

  def main(args: Array[String]) {

    // Remind users of the expected arguments when none are supplied
    // (the program still runs, using the defaults below)
    if (args.isEmpty) println(usage)

    try {
      // Parse options, falling back to documented defaults
      val options = nextOption(Map(), args.toList)
      val expansionRate: Int = toInt('expansionRate, options.getOrElse('expansionRate, 2).toString)
      val inflationRate: Double = toDouble('inflationRate, options.getOrElse('inflationRate, 2.0).toString)
      val epsilon: Double = toDouble('epsilon, options.getOrElse('epsilon, 0.01).toString)
      val maxIterations: Int = toInt('maxIterations, options.getOrElse('maxIterations, 10).toString)
      val selfLoopWeight: Double = toDouble('selfLoopWeight, options.getOrElse('selfLoopWeight, 1.0).toString)
      val graphOrientationStrategy: String = options.getOrElse('graphOrientationStrategy, "undirected").toString

      // Initialise spark context (local demo configuration)
      val sparkConf = new SparkConf()
        .setMaster("local[*]")
        .set("spark.driver.memory", "1g")
        .set("spark.executor.memory", "1g")
        .setAppName("MCL")

      val sc = new SparkContext(sparkConf)

      // Vertices: eleven nodes named "Node1" .. "Node11"
      val users: RDD[(VertexId, String)] =
        sc.parallelize((0L to 10L).map(i => (i, "Node" + (i + 1))))

      // Undirected links of the example graph; each pair is expanded into
      // both directed edges since GraphX edges are directed
      val links: Seq[(Long, Long)] = Seq(
        (0L, 1L), (0L, 2L), (0L, 3L),
        (1L, 2L), (1L, 3L), (2L, 3L),
        (4L, 5L), (4L, 6L), (4L, 7L),
        (5L, 6L), (5L, 7L), (6L, 7L),
        (3L, 8L), (9L, 8L), (9L, 10L), (4L, 10L))

      val relationships: RDD[Edge[Double]] =
        sc.parallelize(
          links.flatMap { case (a, b) => Seq(Edge(a, b, 1.0), Edge(b, a, 1.0)) })

      // Build the initial Graph
      val graph = Graph(users, relationships)

      // Run MCL algorithm and get nodes assignments to generated clusters
      val clusters: Dataset[Assignment] =
        MCL.train(
          graph,
          expansionRate,
          inflationRate,
          epsilon,
          maxIterations,
          selfLoopWeight,
          graphOrientationStrategy)
          .assignments

      // Show each cluster with the sorted list of its member node ids
      clusters
        .groupBy("cluster")
        .agg(sort_array(collect_list(col("id"))))
        .show(3)

      // Terminate spark context
      sc.stop()

    }
    catch {
      case e: Exception => println(e.getMessage)
        sys.exit(1)
    }
  }
}
/* The MIT License (MIT) — Copyright (c) 2015-2016, Joan André.
   See LICENSE.txt at the repository root for the full license text. */

package org.apache.spark.mllib.clustering

import breeze.linalg.max
import org.apache.spark.graphx._
import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, IndexedRow, IndexedRowMatrix}
import org.apache.spark.mllib.linalg.{SparseVector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

/**
 * Utils functions for MCL algorithm implementation.
 */
object MCLUtils {

  /** Print an adjacency matrix in nice format.
   *
   * Rows are printed in index order as "index => ,v1,v2,..." with each
   * value formatted to four decimal places.
   *
   * @param mat an adjacency matrix
   */
  def displayMatrix(mat: IndexedRowMatrix): Unit = {
    println()
    mat
      .rows.sortBy(_.index).collect()
      .foreach(row => {
        // Use print, not printf: the row index is data and must never be
        // interpreted as a format string (a stray '%' would throw).
        print(row.index + " => ")
        row.vector.toArray
          .foreach(v => printf(",%.4f", v))
        println()
      })
  }

  /** Print a block matrix, one block at a time in block-coordinate order.
   *
   * @param mat an adjacency matrix in block form
   */
  def displayBlockMatrix(mat: BlockMatrix): Unit = {
    println()
    mat
      .blocks.sortBy(_._1).collect()
      .foreach(
        block => {
          // print, not printf: block contents are data, not a format string
          print(block._2.toString())
        })
  }

  /** Get a suitable graph for MCL model algorithm.
   *
   * Each vertex id in the graph corresponds to a row id in the adjacency matrix.
   *
   * @param graph original graph
   * @param lookupTable a matching table with nodes ids and new ordered ids
   *                    (column 0 = new matrix id, column 1 = original node id)
   * @return prepared graph for MCL algorithm, with duplicate edges merged
   *         by summing their weights
   */
  def preprocessGraph[VD](graph: Graph[VD, Double], lookupTable: DataFrame): Graph[Int, Double] = {
    val newVertices: RDD[(VertexId, Int)] =
      lookupTable.rdd.map(
        row => (row.getInt(1).toLong, row.getInt(0))
      )

    // groupEdges only merges identical edges that are colocated in the same
    // partition, so the graph must be repartitioned with partitionBy first
    // (see GraphX Graph.groupEdges documentation); without it, multi-edges
    // split across partitions would survive.
    Graph(newVertices, graph.edges)
      .partitionBy(PartitionStrategy.EdgePartition2D)
      .groupEdges((e1, e2) => e1 + e2)
  }

  /** Deal with self loop
   *
   * Add a self loop when a vertex has none (weight nil) and keep the
   * existing one otherwise. The added loop's weight is the maximum weight
   * among the vertex's incident edges, scaled by selfLoopWeight.
   *
   * @param graph original graph
   * @param selfLoopWeight a coefficient between 0 and 1 to influence clustering granularity and objective
   * @return an RDD of self loops weights and associated coordinates.
   */
  def selfLoopManager(graph: Graph[Int, Double], selfLoopWeight: Double): RDD[(Int, (Int, Double))] = {

    // Vertex attribute = all incident edges, used below to find each
    // vertex's maximum incident weight
    val graphWithLinkedEdges: Graph[Array[Edge[Double]], Double] =
      Graph(
        graph
          .collectEdges(EdgeDirection.Either),
        graph.edges
      )

    val selfLoop: RDD[(Int, (Int, Double))] =
      graph
        .triplets
        .filter(e => e.srcId == e.dstId && e.attr > 0)
        .map(e => (e.srcId, e.srcAttr))
        // full outer join against all vertices, then keep only the vertices
        // with no pre-existing self loop
        .fullOuterJoin(graph.vertices)
        .filter(join => join._2._1.isEmpty)
        .leftOuterJoin(graphWithLinkedEdges.vertices)
        .map(v =>
          (v._2._1._2.get,
            (v._2._1._2.get,
              // isolated vertices (no incident edges) fall back to weight 1.0
              v._2._2.getOrElse(Array(Edge(1.0.toLong, 1.0.toLong, 1.0))).map(e => e.attr).max * selfLoopWeight)
          )
        )

    selfLoop
  }

  /** Deal with multiple adjacency matrix filling strategy depending on graph orientation
   *
   * @param graph original graph
   * @param graphOrientationStrategy choose a graph strategy completion depending on its nature. 3 choices: undirected, directed, bidirected.
   * @return an RDD of new edges weights and associated coordinates.
   */
  def graphOrientationManager(graph: Graph[Int, Double], graphOrientationStrategy: String): RDD[(Int, (Int, Double))] = {

    graphOrientationStrategy match {

      // Undirected Graph Solution: edges are taken as-is
      case "undirected" =>

        graph.triplets.map(
          triplet => (triplet.srcAttr, (triplet.dstAttr, triplet.attr))
        )

      // Directed Graph Solution => each edge's inverse is added so the
      // resulting matrix becomes symmetric (self loops are not duplicated)
      case "directed" =>

        graph.triplets.flatMap(
          triplet => {
            if (triplet.srcAttr != triplet.dstAttr) {
              Array((triplet.srcAttr, (triplet.dstAttr, triplet.attr)), (triplet.dstAttr, (triplet.srcAttr, triplet.attr)))
            }
            else {
              Array((triplet.srcAttr, (triplet.dstAttr, triplet.attr)))
            }
          }
        )

      // Bidirected Graph Solution => some edges already exist in both
      // orientations; those keep their original weight, the rest get their
      // inverse added
      case "bidirected" =>

        // Tag each coordinate: 1 = original orientation, 2 = mirrored
        val tempEntries: RDD[((Int, Int), (Double, Int))] = graph.triplets.flatMap(
          triplet => {
            Array(
              ((triplet.srcAttr, triplet.dstAttr), (triplet.attr, 1)),
              ((triplet.dstAttr, triplet.srcAttr), (triplet.attr, 2))
            )
          }
        )

        tempEntries
          .groupByKey()
          .map(
            e =>
              // A coordinate seen more than once already existed in the
              // graph: keep the weight of the original (tag 1) edge
              if (e._2.size > 1) {
                val value = e._2.filter(v => v._2 == 1).head._1
                (e._1._1, (e._1._2, value))
              }
              else {
                (e._1._1, (e._1._2, e._2.head._1))
              }
          )
    }
  }

  /** Transform a Graph into an IndexedRowMatrix
   *
   * @param graph original graph
   * @param selfLoopWeight a coefficient between 0 and 1 to influence clustering granularity and objective
   * @param graphOrientationStrategy choose a graph strategy completion depending on its nature. 3 choices: undirected, directed, bidirected.
   * @return a ready adjacency matrix for MCL process.
   * @todo Check graphOrientationStrategy choice for current graph
   */
  def toIndexedRowMatrix(graph: Graph[Int, Double], selfLoopWeight: Double, graphOrientationStrategy: String): IndexedRowMatrix = {

    // Especially relationships values have to be checked before doing what follows
    val rawEntries: RDD[(Int, (Int, Double))] = graphOrientationManager(graph, graphOrientationStrategy)

    val numOfNodes: Int = graph.numVertices.toInt

    // Add missing self loops, then assemble one sparse row per vertex
    val selfLoop: RDD[(Int, (Int, Double))] = selfLoopManager(graph, selfLoopWeight)
    val entries: RDD[(Int, (Int, Double))] = rawEntries.union(selfLoop)

    val indexedRows = entries.groupByKey().map(e =>
      IndexedRow(e._1, Vectors.sparse(numOfNodes, e._2.toSeq))
    )

    new IndexedRowMatrix(indexedRows)
  }

  /** Transform an IndexedRowMatrix into a Graph
   *
   * Only the non-zero entries of each row become edges.
   *
   * @param mat an adjacency matrix
   * @param vertices vertices of original graph
   * @return associated graph
   */
  def toGraph(mat: IndexedRowMatrix, vertices: RDD[(VertexId, String)]): Graph[String, Double] = {
    val edges: RDD[Edge[Double]] =
      mat.rows.flatMap { row =>
        val svec: SparseVector = row.vector.toSparse
        val it: Range = svec.indices.indices
        it.map(ind => Edge(row.index, svec.indices.apply(ind), svec.values.apply(ind)))
      }
    Graph(vertices, edges)
  }

}
Spark** is an experimental project which goal is to implement a graph clustering algorithm in [Spark](https://github.com/apache/spark), using especially distributed matrix tools embedded in the scala API. 9 | 10 | Why MCL algorithm? Because it responds to Spark MLLib [contribution policy](https://cwiki.apache.org/confluence/display/SPARK/Contributing+to+Spark#ContributingtoSpark-MLlib-specificContributionGuidelines) first four points: 11 | * Be widely known 12 | * Be used and accepted 13 | * Be highly scalable 14 | * Be well documented 15 | 16 | Please do not hesitate to post comments or questions. 17 | 18 | Most of the following content is based on Stijn van Dongen website (http://micans.org/mcl/). 19 | 20 | Table of Contents 21 | ================= 22 | 23 | * [MCL Spark](#mcl-spark) 24 | * [Getting Started](#getting-started) 25 | * [Online Documentation](#online-documentation) 26 | * [Requirements](#requirements) 27 | * [Building From Sources](#building-from-sources) 28 | * [Use embarked example](#use-embarked-example) 29 | * [Parameters choices](#parameters-choices) 30 | * [MCL (Markov Cluster) algorithm theory](#mcl-markov-cluster-algorithm-theory) 31 | * [Expansion](#expansion) 32 | * [Inflation](#inflation) 33 | * [Convergence and clusters interpretation](#convergence-and-clusters-interpretation) 34 | * [Optimizations](#optimizations) 35 | * [Implementation thoughts](#implementation-thoughts) 36 | * [Spark matrices universe](#spark-matrices-universe) 37 | * [IndexedRowMatrix](#indexedrowmatrix) 38 | * [BlockMatrix](#blockmatrix) 39 | * [Directed graph management](#directed-graph-management) 40 | * [Hypergraph](#hypergraph) 41 | * [References](#references) 42 | 43 | ## Getting Started 44 | 45 | ### Online Documentation 46 | 47 | A Scaladoc is available [here](http://joandre.github.io/docs/MCL_Spark/api/). 
48 | 49 | ### Requirements 50 | 51 | * JDK 1.8 or higher 52 | * SBT 0.13.9 (see http://www.scala-sbt.org/download.html for more information) 53 | * Build against Spark 1.6.1+ 54 | 55 | ### Building From Sources 56 | 57 | This library is built with SBT. To build a JAR file simply run "sbt package" from the project root. Currently project was built under scala 2.10.5. 58 | 59 | ### Use embarked example 60 | 61 | ``` 62 | 63 | $MCL_SPARK_HOME/sbt "run [--expansionRate num] [--inflationRate num] [--epsilon num] [--maxIterations num] [--selfLoopWeight num] [--graphOrientationStrategy string]" 64 | 65 | ``` 66 | 67 | ### Import MCL into your Spark Shell 68 | 69 | ``` 70 | 71 | $SPARK_HOME/bin/spark-shell --jars $MCL_SPARK_HOME/target/scala-2.11/mcl_spark_2.11-1.0.0.jar 72 | 73 | ``` 74 | 75 | Then use MCL as follows: 76 | 77 | ``` 78 | import org.apache.spark.graphx._ 79 | import org.apache.spark.mllib.clustering.{Assignment, MCL} 80 | import org.apache.spark.rdd.RDD 81 | import org.apache.spark.sql.Dataset 82 | import org.apache.spark.sql.functions.{sort_array,collect_list,col} 83 | 84 | val users: RDD[(VertexId, String)] = 85 | sc.parallelize(Array((0L,"Node1"), (1L,"Node2"), 86 | (2L,"Node3"), (3L,"Node4"),(4L,"Node5"), 87 | (5L,"Node6"), (6L,"Node7"), (7L, "Node8"), 88 | (8L, "Node9"), (9L, "Node10"), (10L, "Node11"))) 89 | 90 | // Create an RDD for edges 91 | val relationships: RDD[Edge[Double]] = 92 | sc.parallelize( 93 | Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), 94 | Edge(0, 2, 1.0), Edge(2, 0, 1.0), 95 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 96 | Edge(1, 2, 1.0), Edge(2, 1, 1.0), 97 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 98 | Edge(2, 3, 1.0), Edge(3, 2, 1.0), 99 | Edge(4, 5, 1.0), Edge(5, 4, 1.0), 100 | Edge(4, 6, 1.0), Edge(6, 4, 1.0), 101 | Edge(4, 7, 1.0), Edge(7, 4, 1.0), 102 | Edge(5, 6, 1.0), Edge(6, 5, 1.0), 103 | Edge(5, 7, 1.0), Edge(7, 5, 1.0), 104 | Edge(6, 7, 1.0), Edge(7, 6, 1.0), 105 | Edge(3, 8, 1.0), Edge(8, 3, 1.0), 106 | Edge(9, 8, 1.0), Edge(8, 9, 1.0), 
107 | Edge(9, 10, 1.0), Edge(10, 9, 1.0), 108 | Edge(4, 10, 1.0), Edge(10, 4, 1.0) 109 | )) 110 | 111 | // Build the initial Graph 112 | val graph = Graph(users, relationships) 113 | graph.cache() 114 | 115 | val clusters: Dataset[Assignment] = 116 | MCL.train(graph).assignments 117 | clusters 118 | .groupBy("cluster") 119 | .agg(sort_array(collect_list(col("id")))) 120 | .show(3) 121 | 122 | ``` 123 | 124 | ### Parameters choices 125 | 126 | **Inflation and Expansion rates** => The two parameters influence what we call cluster granularity, so how many and how strong should be detected groups of nodes. Inflation increases intra cluster links and decreases inter cluster links while expansion connects nodes to further and new parts of the graph. **Default = 2** 127 | 128 | 1. A big inflation rate will strengthen existing clusters. 129 | 2. A big expansion rate will boost clusters merging. 130 | 131 | Nota bene: Only integers are accepted for expansion rate for now (for computational reasons). 132 | 133 | **Epsilon** => In order to keep the adjacency matrix associated with our graph sparse, one strategy is to remove some negligible edges regarding its weight. Let's say you chose an epsilon equal to 0.05. This means that every edge, connected to one node, which weight is inferior to 5% of the sum of every edges weight connected to our node is removed (see Optimization paragraph for more details). **Default = 0.01** 134 | 135 | **Maximum number of iterations** => It forces MCL to stop before it converges. Regarding Stijn van Dongen recommendations, a steady state is usually reached after 10 iterations. **Default = 10** 136 | 137 | **Self loops weight management** => A percentage of the maximum weight can be applied to added self loops. For example, for a binary graph, 1 is the maximum weight to allocate (see Optimization paragraph for more details). **Default = 0.1** 138 | 139 | **Directed and undirected graphs management** => To deal with directed graphs. 
**Default = "undirected"** 140 | 141 | 1. "undirected": graph is supposed undirected. No edges are added. 142 | 2. "directed": graph is supposed directed. Each edge inverse is added so graph becomes undirected. 143 | 3. "bidirected": graph already owns bidirected edges. Excepted for already existing undirected edges, each edge inverse is added so graph becomes undirected. 144 | 145 | See [Implementation thoughts](#implementation-thoughts) for more details. 146 | 147 | ## MCL (Markov Cluster) algorithm theory 148 | 149 | ### Recall about Markov chains 150 | 151 | *"A Markov chain is a sequence of random variables X1, X2, X3, ... with the Markov property, namely that the probability of moving to next state depends only on the present state and not on the previous states."* ([wikipedia definition](https://en.wikipedia.org/wiki/Markov_chain#Formal_definition)) 152 | 153 | **Definition**: a state is absorbent when it cannot be left. 154 | 155 | **Definition**: a Markov chain is aperiodic, if at least one of its states has a period of 1, so returning to the original state occurs irregularly. 156 | 157 | **Definition**: a Markov chain is irreducible, if it is possible to get to any state from any state. 158 | 159 | **Definition**: a Markov chain is ergodic, if it is both aperiodic and irreducible. 160 | 161 | ### Principle 162 | 163 | To detect clusters inside a graph, the MCL algorithm uses a Column Stochastic Matrix representation and the concept of random walks. The idea is that random walks between two nodes that belong to the same group are more frequent than between two nodes belonging to different groups. So we should compute the probability that a node reaches each other node of the graph to have a better insight of clusters. 164 | 165 | **Definition**: a Column Stochastic Matrix (CSM) is a non-negative matrix in which each column sums to 1.
In our case, we will prefer Row Stochastic Matrix (RSM) instead of CSM to use Spark API tools (see Implementation thoughts for more details). 166 | 167 | Two steps are needed to simulate random walks on a graph: expansion and inflation. Each step is associated with a specific rate (respectively eR and iR). In the following formula, n is the number of nodes in the graph. 168 | 169 | ### Expansion 170 | To perform **expansion**, we raise the stochastic matrix to the power eR using the normal matrix product. 171 | 172 |

173 | 174 | , for eR = 2. 175 | 176 | ### Inflation 177 | To perform **inflation**, we apply the Hadamard power on the RSM (powers entrywise) and we then normalize each row to get back to probabilities. 178 | 179 |

180 | 181 | ### Convergence and clusters interpretation 182 | 183 | After each loop (expansion and inflation), a convergence test is applied on the new matrix. When it remains stable regarding the previous iteration, then the algorithm stops. Otherwise, a maximum number of iterations is defined to force the process to reach a steady state. 184 | 185 |

186 | 187 | Each non-empty column (with non-zero values) of A, corresponds to a cluster and its composition. A cluster will be a star with one or several attractor(s) in the center (see example below). 188 | 189 |

Graph shape for different convergence status (http://micans.org)

190 | 191 | A node can belong to one or several cluster(s). 192 | 193 | ### Optimizations 194 | Most of the following solutions were developed by Stijn van Dongen. More could come based on matrix distribution state. 195 | 196 | * Add self loop to each node. This is generally used to satisfy aperiodic condition of graph Markov chain. More than an optimization, this is required to avoid the non-convergence of MCL because of the infinite alternation between different states (depending on the period). Default weight allocated is the maximum weight of every edges related to the current node. To stay as closed as possible of the true graph, self loop weights can be decreased. 197 | * Most of big graphs are sparse because of their nature. For example, in a social graph, people are not related to every other users but mostly to relatives, friends or colleagues (depending on the nature of the social network). In inflation and expansion steps, "weak" connections weight tends to zero (since it is the goal to detect strong connections in order to bring out clusters) without reaching it. In order to keep the graph sparse, we can adopt three strategies: 198 | 1. Set every small values to zero regarding a threshold. This can be dangerous when a large percentage of global weight belongs to small edges. Currently, this is the only strategy available. 199 | 2. Keep k largest values for each node. Can be very expensive for very large k and a high number of nonzero entries. 200 | 3. Mix the two strategies so a threshold pruning is first applied to reduce exact pruning cost. 201 | * In order to improve convergence test speed, MCL author proposed a more efficient way to proceed. (Not Implemented Yet) 202 | 203 | ## Implementation thoughts 204 | 205 | ### Spark matrices universe 206 | As explained in introduction, this program is exclusively based on scala matrices Spark API. 
Two main matrix types are explored to implement inflation, expansion and normalization steps: [IndexedRowMatrix](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix) and [BlockMatrix](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.mllib.linalg.distributed.BlockMatrix). 207 | 208 | #### IndexedRowMatrix 209 | * Advantages: Each row can be stored in a sparse way, normalization is easy to apply since we apply it per row (instead of column like in the original implementation). 210 | * Disadvantages: No multiplication between two IndexedRowMatrix available. 211 | 212 | #### BlockMatrix 213 | * Advantages: Fully scalable => Blocks of adjustable size (1024x1024 by default), with sparse matrices using [Compressed Sparse Column](http://netlib.org/linalg/html_templates/node92.html) 214 | * Disadvantages: Hard to implement normalization. 215 | 216 | For inflation and normalization, adjacency matrix is transformed in IndexedRowMatrix, so computations are done locally. 217 | For expansion, adjacency matrix is transformed in BlockMatrix, so we take advantage of a fully distributed matrix multiplication. 218 | 219 | ### Directed graphs management 220 | To respect the irreducibility of graphs markov chain, MCL is only applied on undirected ones. For example, in a directed bipartite graph, there are a bunch of absorbent states, so associated markov chain is reducible and does not respect ergodic condition. 221 | 222 | To offer the possibility to users to apply MCL on directed graphs, the only way is to make the graph symmetric by adding each edge inverse. This is due to GraphX API where edges are only directed. For the particular case of bidirected graphs (where some edges and their inverse already exist), birected edges remain as it is. 223 | 224 | Note that symmetry (same weight for an edge and its inverse) is preferred for more efficiency. 
225 | 226 | ### Hypergraph 227 | When two nodes are related to each other with several edges, those edges are merged and their weights summed so there remains only one. 228 | 229 | ## References 230 | 231 | * Stijn van Dongen. MCL - a cluster algorithm for graphs. [Official Website](http://micans.org/mcl/) 232 | * Kathy Macropol. Clustering on Graphs: The Markov Cluster Algorithm (MCL). [A Presentation](https://www.cs.ucsb.edu/~xyan/classes/CS595D-2009winter/MCL_Presentation2.pdf) 233 | * Jean-Benoist Leger, Corinne Vacher, Jean-Jacques Daudin. Detection of structurally homogeneous subsets in graphs. [A Survey](http://vacher.corinne.free.fr/pdf/Leger_StatsComputing_2013.pdf) 234 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/mllib/clustering/MCLUtilsSuite.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.*/ 22 | 23 | package org.apache.spark.mllib.clustering 24 | 25 | import java.util.Locale 26 | 27 | import org.apache.log4j.{Level, Logger} 28 | import org.apache.spark.graphx._ 29 | import org.apache.spark.mllib.clustering.MCLUtils._ 30 | import org.apache.spark.mllib.linalg.DenseVector 31 | import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix} 32 | import org.apache.spark.rdd.RDD 33 | import org.apache.spark.sql.{DataFrame, SQLContext} 34 | 35 | import scala.io.Source 36 | 37 | /** Scala Tests class for MCLUtils functions */ 38 | class MCLUtilsSuite extends MCLFunSuite{ 39 | 40 | // Disable Spark messages when running program 41 | Logger.getLogger("org").setLevel(Level.OFF) 42 | Logger.getLogger("akka").setLevel(Level.OFF) 43 | 44 | // Unit Tests 45 | 46 | test("Print functions", UnitTest){ 47 | val indexedMatrix: IndexedRowMatrix = 48 | new IndexedRowMatrix( 49 | sc.parallelize( 50 | Seq( 51 | IndexedRow(0, new DenseVector(Array(1,0,0,0,1,0))), 52 | IndexedRow(1, new DenseVector(Array(0,1,1,0,1,1))), 53 | IndexedRow(2, new DenseVector(Array(0,1,1,0,0,1))), 54 | IndexedRow(3, new DenseVector(Array(0,0,0,1,0,1))), 55 | IndexedRow(4, new DenseVector(Array(1,1,0,0,1,0))), 56 | IndexedRow(5, new DenseVector(Array(0,1,1,1,0,1))) 57 | ) 58 | )) 59 | 60 | // Force local number format so "." 
is the only separator used for float numbers in print tests no matter in which environment they run 61 | Locale.setDefault(new Locale("en", "US")) 62 | 63 | val streamIM = new java.io.ByteArrayOutputStream() 64 | Console.withOut(streamIM) { 65 | displayMatrix(indexedMatrix) 66 | } 67 | 68 | streamIM.toString shouldEqual "\n0 => ,1.0000,0.0000,0.0000,0.0000,1.0000,0.0000\n1 => ,0.0000,1.0000,1.0000,0.0000,1.0000,1.0000\n2 => ,0.0000,1.0000,1.0000,0.0000,0.0000,1.0000\n3 => ,0.0000,0.0000,0.0000,1.0000,0.0000,1.0000\n4 => ,1.0000,1.0000,0.0000,0.0000,1.0000,0.0000\n5 => ,0.0000,1.0000,1.0000,1.0000,0.0000,1.0000\n" 69 | 70 | val streamBM = new java.io.ByteArrayOutputStream() 71 | Console.withOut(streamBM) { 72 | displayBlockMatrix(indexedMatrix.toBlockMatrix) 73 | } 74 | 75 | streamBM.toString shouldEqual "\n6 x 6 CSCMatrix\n(0,0) 1.0\n(4,0) 1.0\n(1,1) 1.0\n(2,1) 1.0\n(4,1) 1.0\n(5,1) 1.0\n(1,2) 1.0\n(2,2) 1.0\n(5,2) 1.0\n(3,3) 1.0\n(5,3) 1.0\n(0,4) 1.0\n(1,4) 1.0\n(4,4) 1.0\n(1,5) 1.0\n(2,5) 1.0\n(3,5) 1.0\n(5,5) 1.0" 76 | 77 | } 78 | 79 | test("Preprocessing Graph (ordered id for vertices and remove multiple edges)", UnitTest){ 80 | 81 | val sqlContext = SQLContext.getOrCreate(sc) 82 | import sqlContext.implicits._ 83 | 84 | val matchingList: RDD[(Int,Int)] = sc.parallelize(Array((0,2), (1,1), (2,3), (3,5), (4,8), (5, 0))) 85 | val lookupTable: DataFrame = matchingList.toDF("matrixId", "nodeId") 86 | 87 | // Create and RDD for vertices 88 | val users: RDD[(VertexId, String)] = 89 | sc.parallelize(Array((0L,"Node5"), (1L,"Node1"), 90 | (2L, "Node0"), (3L,"Node2"), (5L,"Node3"),(8L,"Node4"))) 91 | 92 | // Create an RDD for edges 93 | val relationships: RDD[Edge[Double]] = 94 | sc.parallelize( 95 | Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), 96 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 97 | Edge(0, 5, 1.0), Edge(5, 0, 1.0), 98 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 99 | Edge(1, 8, 1.0), Edge(8, 1, 1.0), 100 | Edge(2, 8, 1.0), Edge(8, 2, 1.0), 101 | Edge(8, 2, 1.0), Edge(2, 2, 1.0), 
102 | Edge(2, 2, 1.0) 103 | )) 104 | 105 | // Build the initial Graph 106 | val graph = Graph(users, relationships) 107 | 108 | val cleanedGraph: Graph[Int, Double] = preprocessGraph(graph, lookupTable) 109 | 110 | // Create and RDD for vertices 111 | val challengeUsers: RDD[(VertexId, Int)] = 112 | sc.parallelize(Array((2L,0), (1L,1), 113 | (3L,2), (5L,3), (8L,4), (0L,5))) 114 | 115 | // Create an RDD for edges 116 | val challengeRelationships: RDD[Edge[Double]] = 117 | sc.parallelize( 118 | Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), 119 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 120 | Edge(0, 5, 1.0), Edge(5, 0, 1.0), 121 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 122 | Edge(1, 8, 1.0), Edge(8, 1, 1.0), 123 | Edge(2, 8, 1.0), Edge(8, 2, 2.0), 124 | Edge(2, 2, 2.0) 125 | )) 126 | 127 | // Build the initial Graph 128 | val challengeGraph = Graph(challengeUsers, challengeRelationships) 129 | 130 | cleanedGraph.vertices.count shouldEqual challengeGraph.vertices.count 131 | cleanedGraph.vertices.map(v => (v._1, v._2)).collect.sorted shouldEqual challengeGraph.vertices.map(v => (v._1, v._2)).collect.sorted 132 | 133 | /*cleanedGraph.edges 134 | .map(v => ((v.srcId, v.dstId), v.attr)) 135 | .collect.sortBy(tup => tup._1) shouldEqual 136 | challengeGraph.edges 137 | .map(v => ((v.srcId, v.dstId), v.attr)) 138 | .collect.sortBy(tup => tup._1)*/ 139 | 140 | } 141 | 142 | test("Add self loop too each nodes", UnitTest){ 143 | 144 | // Create and RDD for vertices 145 | val users: RDD[(VertexId, Int)] = 146 | sc.parallelize(Array((2L,0), (1L,1), 147 | (3L,2), (5L,3), (8L,4), (0L,5))) 148 | 149 | // Create an RDD for edges 150 | val relationships: RDD[Edge[Double]] = 151 | sc.parallelize( 152 | Seq(Edge(0, 1, 2.0), Edge(1, 0, 1.0), 153 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 154 | Edge(0, 5, 1.0), Edge(5, 0, 1.0), 155 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 156 | Edge(1, 8, 1.0), Edge(8, 1, 1.0), 157 | Edge(2, 8, 1.0), Edge(8, 2, 1.0), 158 | Edge(2, 2, 1.0) 159 | )) 160 | 161 | // Build the initial Graph 
162 | val graph = Graph(users, relationships) 163 | 164 | val edgesWithSelfLoops: RDD[(Int, (Int, Double))] = selfLoopManager(graph, 2) 165 | 166 | val objective: RDD[(Int, (Int, Double))] = 167 | sc.parallelize( 168 | Seq((1, (1, 4.0)), (2, (2, 2.0)), 169 | (3, (3, 2.0)), (4, (4, 2.0)), 170 | (5, (5, 4.0)) 171 | )) 172 | 173 | edgesWithSelfLoops.count shouldEqual objective.count 174 | edgesWithSelfLoops.collect.sortBy(edge => (edge._1, edge._2)) shouldEqual objective.collect.sortBy(edge => (edge._1, edge._2)) 175 | 176 | } 177 | 178 | test("Completion strategy for graph depending on its nature (oriented or not)", UnitTest){ 179 | 180 | // For undirected graphs 181 | // Create and RDD for vertices 182 | val undirectedUsers: RDD[(VertexId, Int)] = 183 | sc.parallelize(Array((2L,0), (1L,1), 184 | (3L,2), (5L,3), (8L,4), (0L,5))) 185 | 186 | // Create an RDD for edges 187 | val undirectedRelationships: RDD[Edge[Double]] = 188 | sc.parallelize( 189 | Seq(Edge(0, 1, 1.0), Edge(1, 0, 1.0), 190 | Edge(0, 3, 1.0), Edge(3, 0, 1.0), 191 | Edge(0, 5, 1.0), Edge(5, 0, 1.0), 192 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 193 | Edge(1, 8, 1.0), Edge(8, 1, 1.0), 194 | Edge(2, 8, 1.0), Edge(8, 2, 1.0), 195 | Edge(2, 2, 1.0) 196 | )) 197 | 198 | // Build the initial Graph 199 | val undirectedGraph = Graph(undirectedUsers, undirectedRelationships) 200 | 201 | val undirectedEdges: RDD[(Int, (Int, Double))] = graphOrientationManager(undirectedGraph, "undirected") 202 | 203 | // For directed graphs 204 | // Create and RDD for vertices 205 | val directedUsers: RDD[(VertexId, Int)] = 206 | sc.parallelize(Array((2L,0), (1L,1), 207 | (3L,2), (5L,3), (8L,4), (0L,5))) 208 | 209 | // Create an RDD for edges 210 | val directedRelationships: RDD[Edge[Double]] = 211 | sc.parallelize( 212 | Seq(Edge(0, 1, 1.0), 213 | Edge(0, 3, 1.0), 214 | Edge(0, 5, 1.0), 215 | Edge(1, 3, 1.0), 216 | Edge(1, 8, 1.0), 217 | Edge(2, 8, 1.0), 218 | Edge(2, 2, 1.0) 219 | )) 220 | 221 | // Build the initial Graph 222 | val 
directedGraph = Graph(directedUsers, directedRelationships) 223 | 224 | val directedEdges: RDD[(Int, (Int, Double))] = graphOrientationManager(directedGraph, "directed") 225 | 226 | // For bidirected graphs 227 | // Create and RDD for vertices 228 | val bidirectedUsers: RDD[(VertexId, Int)] = 229 | sc.parallelize(Array((2L,0), (1L,1), 230 | (3L,2), (5L,3), (8L,4), (0L,5))) 231 | 232 | // Create an RDD for edges 233 | val bidirectedRelationships: RDD[Edge[Double]] = 234 | sc.parallelize( 235 | Seq(Edge(0, 1, 1.0), 236 | Edge(0, 3, 1.0), 237 | Edge(0, 5, 1.0), 238 | Edge(1, 3, 1.0), Edge(3, 1, 1.0), 239 | Edge(1, 8, 1.0), Edge(8, 1, 1.0), 240 | Edge(2, 8, 1.0), 241 | Edge(2, 2, 1.0) 242 | )) 243 | 244 | // Build the initial Graph 245 | val bidirectedGraph = Graph(bidirectedUsers, bidirectedRelationships) 246 | 247 | val bidirectedEdges: RDD[(Int, (Int, Double))] = graphOrientationManager(bidirectedGraph, "bidirected") 248 | 249 | val objective: RDD[(Int, (Int, Double))] = 250 | sc.parallelize( 251 | Seq((5, (1, 1.0)), (1, (5, 1.0)), 252 | (5, (2, 1.0)), (2, (5, 1.0)), 253 | (5, (3, 1.0)), (3, (5, 1.0)), 254 | (1, (2, 1.0)), (2, (1, 1.0)), 255 | (1, (4, 1.0)), (4, (1, 1.0)), 256 | (0, (4, 1.0)), (4, (0, 1.0)), 257 | (0, (0, 1.0)) 258 | )) 259 | 260 | undirectedEdges.count shouldEqual objective.count 261 | undirectedEdges.collect.sortBy(edge => (edge._1, edge._2)) shouldEqual objective.collect.sortBy(edge => (edge._1, edge._2)) 262 | directedEdges.count shouldEqual objective.count 263 | directedEdges.collect.sortBy(edge => (edge._1, edge._2)) shouldEqual objective.collect.sortBy(edge => (edge._1, edge._2)) 264 | bidirectedEdges.count shouldEqual objective.count 265 | bidirectedEdges.collect.sortBy(edge => (edge._1, edge._2)) shouldEqual objective.collect.sortBy(edge => (edge._1, edge._2)) 266 | 267 | } 268 | 269 | // Integration Tests 270 | 271 | test("Adjacency Matrix Transformation", IntegrationTest) { 272 | 273 | // Load data 274 | val source:Seq[String] = 
Source.fromURL(getClass.getResource("/MCLUtils/OrientedEdges.txt")).getLines().toSeq 275 | val nodesFile:Seq[String] = Source.fromURL(getClass.getResource("/MCLUtils/OrientedNodes.txt")).getLines().toSeq 276 | val matrixSelfLoop:Seq[String] = Source.fromURL(getClass.getResource("/MCLUtils/OrientedMatrixSelfLoop.txt")).getLines().toSeq 277 | 278 | val edges:RDD[Edge[Double]] = 279 | sc.parallelize( 280 | source 281 | .map(l => l.split(" ")) 282 | .map(e => Edge(e(0).toLong, e(1).toLong, e(2).toDouble)) 283 | ) 284 | val nodes:RDD[(VertexId, String)] = 285 | sc.parallelize( 286 | nodesFile 287 | .map(l => l.split(" ")) 288 | .map(e => (e(0).toLong, "default")) 289 | ) 290 | 291 | val graph = Graph(nodes, edges) 292 | 293 | var range:Long = 0 294 | val initialMatrixWithSelLoop = 295 | new IndexedRowMatrix( 296 | sc.parallelize( 297 | matrixSelfLoop 298 | .map{ 299 | line => 300 | range = range + 1 301 | new IndexedRow( 302 | range-1, 303 | new DenseVector( 304 | line.split(";").map(e => e.toDouble) 305 | ) 306 | ) 307 | } 308 | ) 309 | ) 310 | 311 | //Prepare graph for transformation 312 | 313 | val sqlContext = SQLContext.getOrCreate(sc) 314 | import sqlContext.implicits._ 315 | 316 | val lookupTable:DataFrame = 317 | graph.vertices.sortByKey().zipWithIndex() 318 | .map(indexedVertice => (indexedVertice._2.toInt, indexedVertice._1._1.toInt, indexedVertice._1._2)) 319 | .toDF("matrixId", "nodeId", "attribute") 320 | 321 | val preprocessedGraph: Graph[Int, Double] = preprocessGraph(graph, lookupTable) 322 | 323 | //Test matrix transformation 324 | 325 | val adjacencyMat:IndexedRowMatrix = toIndexedRowMatrix(preprocessedGraph, 1.0, "undirected") 326 | 327 | adjacencyMat.numRows shouldEqual initialMatrixWithSelLoop.numRows 328 | adjacencyMat.numCols shouldEqual initialMatrixWithSelLoop.numCols 329 | initialMatrixWithSelLoop.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 330 | .join( 331 | adjacencyMat.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 332 | ) 333 | 
.collect.foreach( 334 | pairOfRows => 335 | { 336 | pairOfRows._2._1 shouldEqual pairOfRows._2._2 337 | } 338 | ) 339 | 340 | //Test transformation from adjacency matrix to graph 341 | 342 | val vertices:RDD[(VertexId, String)] = lookupTable.rdd.map(row => (row.getInt(0).toLong, row.getString(2))) 343 | val resultGraph: Graph[String, Double] = toGraph(adjacencyMat, vertices) 344 | 345 | // Missing self edges are manually added 346 | val preEdges = preprocessedGraph.triplets 347 | .map(tri => Edge(tri.srcAttr, tri.dstAttr, tri.attr)).collect 348 | .union(for (i <- 1 to (preprocessedGraph.vertices.count.toInt - 2)) yield Edge(i, i , 1.0)) 349 | .sortBy(e => (e.srcId, e.dstId)) 350 | 351 | val postEdges = resultGraph.edges.collect.sortBy(e => (e.srcId, e.dstId)) 352 | 353 | preprocessedGraph.vertices.count shouldEqual resultGraph.vertices.count 354 | preEdges.toSeq.length shouldEqual postEdges.toSeq.length 355 | preEdges.toSeq shouldEqual postEdges.toSeq 356 | 357 | } 358 | 359 | } 360 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/mllib/clustering/MCL.scala: -------------------------------------------------------------------------------- 1 | /*The MIT License (MIT) 2 | 3 | Copyright (c) 2015-2016, Joan André 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.*/

package org.apache.spark.mllib.clustering

import org.apache.spark.graphx._
import org.apache.spark.mllib.clustering.MCLUtils._
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/** A clustering model for MCL (Markov Cluster algorithm).
 *
 * @see README.md for more details on theory
 *
 * @constructor Constructs an MCL instance with default parameters: {expansionRate: 2, inflationRate: 2, epsilon: 0.01, maxIterations: 10, selfLoopWeight: 0.1, graphOrientationStrategy: "undirected"}.
 * @param expansionRate expansion rate of adjacency matrix at each iteration
 * @param inflationRate inflation rate of adjacency matrix at each iteration
 * @param epsilon pruning parameter. When an edge E1, starting from a node N1, has a weight which percentage is inferior to epsilon regarding other edges Ei starting from N1, this weight is set to zero
 * @param maxIterations maximal number of iterations for a non convergent algorithm
 * @param selfLoopWeight a coefficient between 0 and 1 to influence clustering granularity and objective
 * @param graphOrientationStrategy choose a graph completion strategy depending on its nature. 3 choices: undirected, directed, bidirected.
 */
class MCL private(private var expansionRate: Int,
                  private var inflationRate: Double,
                  private var epsilon: Double,
                  private var maxIterations: Int,
                  private var selfLoopWeight: Double,
                  private var graphOrientationStrategy: String) extends Serializable{

  /** Construct an MCL instance
   *
   * Default parameters: {expansionRate: 2, inflationRate: 2,
   * epsilon: 0.01, maxIterations: 10, selfLoopWeight: 0.1, graphOrientationStrategy: "undirected"}.
   *
   * @return an MCL object
   */
  def this() = this(2, 2.0, 0.01, 10, 0.1, "undirected")

  /** Available graph orientation strategy options.
   *
   * @see README.md for more details
   */
  val graphOrientationStrategyOption: Seq[String] = Seq("undirected", "directed", "bidirected")

  /** Get expansion rate */
  def getExpansionRate: Int = expansionRate

  /** Set the expansion rate.
   *
   * Default: 2.
   *
   * @throws IllegalArgumentException expansionRate must be higher than 1
   */
  def setExpansionRate(expansionRate: Int): MCL = {
    // Guard aligned with the documented contract and the error message:
    // an expansion rate of 1 would make expansion() a no-op (M^1 = M).
    this.expansionRate = expansionRate match {
      case eR if eR > 1 => eR
      case _ => throw new IllegalArgumentException("expansionRate parameter must be higher than 1")
    }
    this
  }

  /** Get inflation rate */
  def getInflationRate: Double = inflationRate

  /** Set the inflation rate.
   *
   * Default: 2.
   *
   * @throws IllegalArgumentException inflationRate must be higher than 0
   */
  def setInflationRate(inflationRate: Double): MCL = {
    this.inflationRate = inflationRate match {
      case iR if iR > 0 => iR
      case _ => throw new IllegalArgumentException("inflationRate parameter must be higher than 0")
    }
    this
  }

  /** Get epsilon coefficient
   *
   * Change an edge value to zero when the overall weight of this edge is less than a certain percentage
   */
  def getEpsilon: Double = epsilon

  /** Set the minimum percentage to get an edge weight to zero.
   *
   * Default: 0.01.
   *
   * @throws IllegalArgumentException epsilon must be higher than or equal to 0 and lower than 1
   */
  def setEpsilon(epsilon: Double): MCL = {
    // 0 is accepted (it simply disables pruning); 1 or more would prune everything.
    this.epsilon = epsilon match {
      case eps if eps < 1 && eps >= 0 => eps
      case _ => throw new IllegalArgumentException("epsilon parameter must be higher than or equal to 0 and lower than 1")
    }

    this
  }

  /** Get stop condition if MCL algorithm does not converge fairly quickly */
  def getMaxIterations: Int = maxIterations

  /** Set maximum number of iterations.
   *
   * Default: 10.
   *
   * @throws IllegalArgumentException maxIterations must be higher than 0
   */
  def setMaxIterations(maxIterations: Int): MCL = {
    this.maxIterations = maxIterations match {
      case mI if mI > 0 => mI
      case _ => throw new IllegalArgumentException("maxIterations parameter must be higher than 0")
    }
    this
  }

  /** Get weight of automatically added self loops in adjacency matrix rows */
  def getSelfLoopWeight: Double = selfLoopWeight

  /** Set self loops weights.
   *
   * Default: 0.1.
   *
   * @throws IllegalArgumentException selfLoopWeight must be higher than 0 and lower than or equal to 1
   */
  def setSelfLoopWeight(selfLoopWeight: Double): MCL = {
    // Guard accepts 1 (slw <= 1); message fixed to match the accepted range.
    this.selfLoopWeight = selfLoopWeight match {
      case slw if slw > 0 && slw <= 1 => slw
      case _ => throw new IllegalArgumentException("selfLoopWeight parameter must be higher than 0 and lower than or equal to 1")
    }
    this
  }

  /** Get graph orientation strategy selected depending on graph nature */
  def getGraphOrientationStrategy: String = graphOrientationStrategy

  /** Set graph orientation strategy.
   *
   * Default: undirected.
   *
   * @throws IllegalArgumentException graphOrientationStrategy must be contained in graphOrientationStrategyOption
   */
  def setGraphOrientationStrategy(graphOrientationStrategy: String): MCL = {
    this.graphOrientationStrategy = graphOrientationStrategy match {
      case gos if graphOrientationStrategyOption.contains(gos) => gos
      case _ => throw new IllegalArgumentException("you must select graphOrientationStrategy option in the following list: " + graphOrientationStrategyOption.mkString(", "))
    }
    this
  }


  /** Normalize matrix
   *
   * Each row is divided by the sum of its non-zero values so that every row
   * sums to 1 (column-stochastic per row representation).
   *
   * NOTE(review): a row whose values sum to 0 would yield NaN entries —
   * assumed not to occur because self loops are always added upstream.
   *
   * @param mat an unnormalized adjacency matrix
   * @return normalized adjacency matrix
   */
  def normalization(mat: IndexedRowMatrix): IndexedRowMatrix ={
    new IndexedRowMatrix(
      mat.rows
        .map{row =>
          val svec = row.vector.toSparse
          IndexedRow(row.index,
            new SparseVector(svec.size, svec.indices, svec.values.map(v => v/svec.values.sum)))
        })
  }

  /** Normalize row
   *
   * @param row an unnormalized row of the adjacency matrix
   * @return normalized row
   */
  def normalization(row: SparseVector): SparseVector ={
    new SparseVector(row.size, row.indices, row.values.map(v => v/row.values.sum))
  }

  /** Remove weakest connections from a row
   *
   * Connection weights in the adjacency matrix inferior to epsilon are set to 0.
   * The pruned entries keep their slot in the sparse structure (explicit zeros);
   * they contribute nothing to later sums or differences.
   *
   * @param row a row of the adjacency matrix
   * @return sparsed row
   * @todo Add more complex pruning strategies.
   * @see http://micans.org/mcl/index.html
   */
  def removeWeakConnections(row: SparseVector): SparseVector ={
    new SparseVector(
      row.size,
      row.indices,
      row.values.map(v => {
        if(v < epsilon) 0.0
        else v
      })
    )
  }

  /** Expand matrix
   *
   * Raises the adjacency matrix to the power expansionRate
   * (expansionRate - 1 successive multiplications).
   *
   * @param mat an adjacency matrix
   * @return expanded adjacency matrix
   */
  def expansion(mat: IndexedRowMatrix): BlockMatrix = {
    val bmat = mat.toBlockMatrix()
    var resmat = bmat
    for(i <- 1 until expansionRate){
      resmat = resmat.multiply(bmat)
    }
    resmat
  }

  /** Inflate matrix
   *
   * Prune and normalization are applied locally (on each row). So we avoid two more complete scannings of the adjacency matrix.
   * As explained in issue #8, pruning is applied on the expanded matrix, so we take advantage of its naturally normalized state.
   *
   * @param mat an adjacency matrix
   * @return inflated adjacency matrix
   */
  def inflation(mat: BlockMatrix): IndexedRowMatrix = {

    new IndexedRowMatrix(
      mat.toIndexedRowMatrix.rows
        .map{row =>
          val svec = removeWeakConnections(row.vector.toSparse) // Pruning elements locally, instead of scanning all matrix again
          IndexedRow(row.index,
            // Normalizing rows locally, instead of scanning all matrix again.
            // exp(r * log(v)) == v^r; pruned zeros map to exp(-Inf) == 0.0.
            normalization(
              new SparseVector(svec.size, svec.indices, svec.values.map(v => Math.exp(inflationRate*Math.log(v))))
            )
          )
        }
    )
  }

  /** Calculate the distance between two matrices.
   *
   * Computes the sum of squared element-wise differences between the two
   * matrices (the squared Frobenius distance — no square root is taken).
   *
   * @param m1 an adjacency matrix at step n
   * @param m2 same adjacency matrix at step n+1
   * @return the sum of squared differences between m1 and m2 entries
   * @todo Use another object to speed up join between RDD.
   */
  def difference(m1: IndexedRowMatrix, m2: IndexedRowMatrix): Double = {

    val m1RDD:RDD[((Long,Int),Double)] = m1.rows.flatMap(r => {
      val sv = r.vector.toSparse
      sv.indices.map(i => ((r.index,i), sv.apply(i)))
    })

    val m2RDD:RDD[((Long,Int),Double)] = m2.rows.flatMap(r => {
      val sv = r.vector.toSparse
      sv.indices.map(i => ((r.index,i), sv.apply(i)))
    })

    // Full outer join so entries present in only one matrix count against 0.0
    val diffRDD = m1RDD.fullOuterJoin(m2RDD).map(diff => Math.pow(diff._2._1.getOrElse(0.0) - diff._2._2.getOrElse(0.0), 2))
    diffRDD.sum()
  }

  /** Train MCL algorithm.
   *
   * @param graph a graph to be partitioned
   * @return an MCLModel where each node is associated to one or more clusters
   */
  def run[VD](graph: Graph[VD, Double]): MCLModel = {

    // Add a new attribute to each node: a unique row index starting from 0,
    // so the graph can be represented as an adjacency matrix
    val spark = SparkSession.builder().getOrCreate()
    import spark.implicits._

    val lookupTable:DataFrame =
      graph.vertices.sortBy(_._1).zipWithIndex()
        .map(indexedVertex => (indexedVertex._2.toInt, indexedVertex._1._1.toInt))
        .toDF("matrixId", "nodeId")

    val preprocessedGraph: Graph[Int, Double] = preprocessGraph(graph, lookupTable)

    val mat = toIndexedRowMatrix(preprocessedGraph, selfLoopWeight, graphOrientationStrategy)

    // Number of completed iterations
    var iter = 0
    // Convergence indicator: squared distance between two successive matrices
    var change = 1.0

    var M1:IndexedRowMatrix = normalization(mat)
    while (iter < maxIterations && change > 0) {
      val M2: IndexedRowMatrix = inflation(expansion(M1))
      change = difference(M1, M2)
      iter = iter + 1
      M1 = M2
    }

    // Get attractors in adjacency matrix (nodes with not only null values) and collect every nodes they are attached to in order to form a cluster.
    // Each node is assigned to the column holding its largest remaining weight.

    val rawDF =
      M1.rows.flatMap(
        r => {
          val sv = r.vector.toSparse
          sv.indices.map(i => (r.index, (i, sv.apply(i))))
        }
      ).groupByKey()
        .map(node => (node._1, node._2.maxBy(_._2)._1))
        .toDF("matrixId", "clusterId")

    // Reassign correct ids to each node instead of the temporary matrix id associated

    val assignments: Dataset[Assignment] =
      rawDF
        .join(lookupTable, rawDF.col("matrixId")===lookupTable.col("matrixId"))
        .select($"nodeId", $"clusterId")
        .map(row => Assignment(row.getInt(0).toLong, row.getInt(1).toLong))

    new MCLModel(assignments)
  }

}

object MCL{

  /** Train an MCL model using the given set of parameters.
   *
   * @param graph the graph to be clustered, with Double edge weights
   * @param expansionRate expansion rate of adjacency matrix at each iteration
   * @param inflationRate inflation rate of adjacency matrix at each iteration
   * @param epsilon minimum percentage of a weight edge to be significant
   * @param maxIterations maximal number of iterations for a non convergent algorithm
   * @param selfLoopWeight a coefficient between 0 and 1 to influence clustering granularity and objective
   * @param graphOrientationStrategy choose a graph completion strategy depending on its nature. 3 choices: undirected, directed, bidirected.
   * @return an MCLModel where each node is associated to one or more clusters
   */
  def train[VD](graph: Graph[VD, Double],
                expansionRate: Int = 2,
                inflationRate: Double = 2.0,
                epsilon : Double = 0.01,
                maxIterations: Int = 10,
                selfLoopWeight: Double = 0.1, // aligned with the documented default (was 1, contradicting class docs)
                graphOrientationStrategy: String = "undirected"): MCLModel = {

    new MCL()
      .setExpansionRate(expansionRate)
      .setInflationRate(inflationRate)
      .setEpsilon(epsilon)
      .setMaxIterations(maxIterations)
      .setSelfLoopWeight(selfLoopWeight)
      .setGraphOrientationStrategy(graphOrientationStrategy)
      .run(graph)
  }

}
--------------------------------------------------------------------------------
/src/test/scala/org/apache/spark/mllib/clustering/MCLSuite.scala:
--------------------------------------------------------------------------------
/*The MIT License (MIT)

Copyright (c) 2015-2016, Joan André

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE.*/ 22 | 23 | package org.apache.spark.mllib.clustering 24 | 25 | import org.apache.log4j.{Level, Logger} 26 | import org.apache.spark.graphx._ 27 | import org.apache.spark.mllib.linalg.DenseVector 28 | import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, IndexedRow, IndexedRowMatrix} 29 | import org.apache.spark.rdd.RDD 30 | import org.apache.spark.sql.{Dataset, Row, SparkSession} 31 | import org.apache.spark.sql.functions._ 32 | 33 | import scala.collection.mutable 34 | import scala.io._ 35 | 36 | /** Scala Tests class for MCL algorithm */ 37 | class MCLSuite extends MCLFunSuite{ 38 | // Disable Spark messages when running program 39 | Logger.getLogger("org").setLevel(Level.OFF) 40 | Logger.getLogger("akka").setLevel(Level.OFF) 41 | 42 | // Unit Tests 43 | 44 | test("Parameters getters and setters", UnitTest){ 45 | 46 | val mcl = new MCL() 47 | 48 | mcl.getEpsilon shouldEqual 0.01 49 | mcl.getExpansionRate shouldEqual 2 50 | mcl.getGraphOrientationStrategy shouldEqual "undirected" 51 | mcl.getInflationRate shouldEqual 2.0 52 | mcl.getMaxIterations shouldEqual 10 53 | mcl.getSelfLoopWeight shouldEqual 0.1 54 | 55 | an [IllegalArgumentException] should be thrownBy mcl.setEpsilon(1) 56 | an [IllegalArgumentException] should be thrownBy mcl.setEpsilon(-0.1) 57 | an [IllegalArgumentException] should be thrownBy mcl.setExpansionRate(-1) 58 | an [IllegalArgumentException] should be thrownBy mcl.setGraphOrientationStrategy("test") 59 | an [IllegalArgumentException] should be thrownBy mcl.setInflationRate(0) 60 | an [IllegalArgumentException] should be thrownBy mcl.setMaxIterations(0) 61 | an [IllegalArgumentException] should be thrownBy mcl.setSelfLoopWeight(1.1) 62 | an 
[IllegalArgumentException] should be thrownBy mcl.setSelfLoopWeight(0) 63 | } 64 | 65 | test("Matrix Normalization", UnitTest) { 66 | 67 | val indexedMatrix: IndexedRowMatrix = 68 | new IndexedRowMatrix( 69 | sc.parallelize( 70 | Seq( 71 | IndexedRow(0, new DenseVector(Array(1,0,0,0,1,0))), 72 | IndexedRow(1, new DenseVector(Array(0,1,1,0,1,1))), 73 | IndexedRow(2, new DenseVector(Array(0,1,1,0,0,1))), 74 | IndexedRow(3, new DenseVector(Array(0,0,0,1,0,1))), 75 | IndexedRow(4, new DenseVector(Array(1,1,0,0,1,0))), 76 | IndexedRow(5, new DenseVector(Array(0,1,1,1,0,1))) 77 | ) 78 | )) 79 | 80 | val MCLObject: MCL = new MCL() 81 | val normalizedMatrix: IndexedRowMatrix = 82 | new IndexedRowMatrix( 83 | indexedMatrix.rows 84 | .map{row => 85 | val svec = row.vector.toSparse 86 | IndexedRow(row.index, 87 | MCLObject.normalization(svec) 88 | ) 89 | } 90 | ) 91 | 92 | val objective: IndexedRowMatrix = 93 | new IndexedRowMatrix( 94 | sc.parallelize( 95 | Seq( 96 | IndexedRow(0, new DenseVector(Array(0.5,0,0,0,0.5,0))), 97 | IndexedRow(1, new DenseVector(Array(0,0.25,0.25,0,0.25,0.25))), 98 | IndexedRow(2, new DenseVector(Array(0,0.3333333333333333,0.3333333333333333,0,0,0.3333333333333333))), 99 | IndexedRow(3, new DenseVector(Array(0,0,0,0.5,0,0.5))), 100 | IndexedRow(4, new DenseVector(Array(0.3333333333333333,0.3333333333333333,0,0,0.3333333333333333,0))), 101 | IndexedRow(5, new DenseVector(Array(0,0.25,0.25,0.25,0,0.25))) 102 | ) 103 | )) 104 | 105 | normalizedMatrix.numRows shouldEqual objective.numRows 106 | normalizedMatrix.numCols shouldEqual objective.numCols 107 | objective.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 108 | .join( 109 | normalizedMatrix.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 110 | ) 111 | .collect.foreach( 112 | pairOfRows => 113 | { 114 | pairOfRows._2._1 shouldEqual pairOfRows._2._2 115 | } 116 | ) 117 | 118 | } 119 | 120 | test("Remove Weak Connections", UnitTest) { 121 | 122 | val indexedMatrix: IndexedRowMatrix = 123 | 
// ScalaTest suite body for the Spark MCL (Markov Clustering) implementation.
// NOTE: this chunk is a repo-dump rendering — each original source line is prefixed
// with its own number and " | "; that numbering is preserved untouched below.
// The span below (orig. lines 124-169) is the tail of a unit test that starts
// above this chunk: it applies MCLObject.removeWeakConnections (epsilon = 0.01)
// to each sparse row of a 6x6 matrix and checks the result against a hand-built
// "objective" matrix (entries below epsilon zeroed out).
new IndexedRowMatrix( 124 | sc.parallelize( 125 | Seq( 126 | IndexedRow(0, new DenseVector(Array(0.172225,0.027225,0,0,0.172225,0))), 127 | IndexedRow(1, new DenseVector(Array(0.00680625,0.0841,0.04305625,0.00390625,0.021025,0.04305625))), 128 | IndexedRow(2, new DenseVector(Array(0,0.07502121,0.07502121,0.00680625,0.00680625,0.07502121))), 129 | IndexedRow(3, new DenseVector(Array(0,0.015625,0.015625,0.140625,0,0.140625))), 130 | IndexedRow(4, new DenseVector(Array(0.07502121,0.03663396,0.00680625,0,0.12702096,0.00680625))), 131 | IndexedRow(5, new DenseVector(Array(0,0.04305625,0.04305625,0.03515625,0.00390625,0.11055625))) 132 | ) 133 | )) 134 | 135 | val MCLObject: MCL = new MCL().setEpsilon(0.01) 136 | val sparsedMatrix: IndexedRowMatrix = 137 | new IndexedRowMatrix( 138 | indexedMatrix.rows 139 | .map{row => 140 | val svec = row.vector.toSparse 141 | IndexedRow(row.index, 142 | MCLObject.removeWeakConnections(svec) 143 | ) 144 | } 145 | ) 146 | 147 | val objective: IndexedRowMatrix = 148 | new IndexedRowMatrix( 149 | sc.parallelize( 150 | Seq( 151 | IndexedRow(0, new DenseVector(Array(0.172225,0.027225,0,0,0.172225,0))), 152 | IndexedRow(1, new DenseVector(Array(0,0.0841,0.04305625,0,0.021025,0.04305625))), 153 | IndexedRow(2, new DenseVector(Array(0,0.07502121,0.07502121,0,0,0.07502121))), 154 | IndexedRow(3, new DenseVector(Array(0,0.015625,0.015625,0.140625,0,0.140625))), 155 | IndexedRow(4, new DenseVector(Array(0.07502121,0.03663396,0,0,0.12702096,0))), 156 | IndexedRow(5, new DenseVector(Array(0,0.04305625,0.04305625,0.03515625,0,0.11055625))) 157 | ) 158 | )) 159 | 160 | sparsedMatrix.numRows shouldEqual objective.numRows 161 | sparsedMatrix.numCols shouldEqual objective.numCols 162 | objective.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 163 | .join( 164 | sparsedMatrix.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 165 | ) 166 | .collect.sortBy(row => row._1).foreach( 167 | pairOfRows => 168 | { 169 | val sparsedRows = 
// Rows are joined by index and compared element-wise after rounding the computed
// values to 8 decimal places via BigDecimal HALF_UP, so float noise from the
// Spark computation does not fail the equality check.
// Next: test("Matrix Expansion") — squares a 6x6 row-stochastic matrix through
// MCLObject.expansion and compares against expected values rounded to 4 decimals.
pairOfRows._2._2.map(e => BigDecimal(e).setScale(8, BigDecimal.RoundingMode.HALF_UP).toDouble) 170 | pairOfRows._2._1 shouldEqual sparsedRows 171 | } 172 | ) 173 | 174 | } 175 | 176 | test("Matrix Expansion", UnitTest) { 177 | 178 | val indexedMatrix: IndexedRowMatrix = 179 | new IndexedRowMatrix( 180 | sc.parallelize( 181 | Seq( 182 | IndexedRow(0, new DenseVector(Array(0.5,0,0,0,0.5,0))), 183 | IndexedRow(1, new DenseVector(Array(0,0.25,0.25,0,0.25,0.25))), 184 | IndexedRow(2, new DenseVector(Array(0,0.33,0.33,0,0,0.33))), 185 | IndexedRow(3, new DenseVector(Array(0,0,0,0.5,0,0.5))), 186 | IndexedRow(4, new DenseVector(Array(0.33,0.33,0,0,0.33,0))), 187 | IndexedRow(5, new DenseVector(Array(0,0.25,0.25,0.25,0,0.25))) 188 | ) 189 | )) 190 | 191 | val MCLObject: MCL = new MCL() 192 | val expandedMatrix: IndexedRowMatrix = MCLObject.expansion(indexedMatrix).toIndexedRowMatrix() 193 | 194 | val objective: IndexedRowMatrix = 195 | new IndexedRowMatrix( 196 | sc.parallelize( 197 | Seq( 198 | IndexedRow(0, new DenseVector(Array(0.4150,0.1650,0,0,0.4150,0))), 199 | IndexedRow(1, new DenseVector(Array(0.0825,0.2900,0.2075,0.0625,0.1450,0.2075))), 200 | IndexedRow(2, new DenseVector(Array(0,0.2739,0.2739,0.0825,0.0825,0.2739))), 201 | IndexedRow(3, new DenseVector(Array(0,0.1250,0.1250,0.3750,0,0.3750))), 202 | IndexedRow(4, new DenseVector(Array(0.2739,0.1914,0.0825,0,0.3564,0.0825))), 203 | IndexedRow(5, new DenseVector(Array(0,0.2075,0.2075,0.1875,0.0625,0.3325))) 204 | ) 205 | )) 206 | 207 | expandedMatrix.numRows shouldEqual objective.numRows 208 | expandedMatrix.numCols shouldEqual objective.numCols 209 | objective.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 210 | .join( 211 | expandedMatrix.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 212 | ) 213 | .collect.sortBy(row => row._1).foreach( 214 | pairOfRows => 215 | { 216 | val expandedRows = pairOfRows._2._2.map(e => BigDecimal(e).setScale(4, BigDecimal.RoundingMode.HALF_UP).toDouble) 217 | 
// Next: test("Matrix Inflation") — builds the expansion result as a BlockMatrix
// (MCLObject.inflation takes a BlockMatrix and returns an IndexedRowMatrix),
// then compares against the expected inflated matrix rounded to 8 decimals.
pairOfRows._2._1 shouldEqual expandedRows 218 | } 219 | ) 220 | 221 | } 222 | 223 | test("Matrix Inflation", UnitTest) { 224 | 225 | val indexedMatrix: BlockMatrix = 226 | new IndexedRowMatrix( 227 | sc.parallelize( 228 | Seq( 229 | IndexedRow(0, new DenseVector(Array(0.4150,0.1650,0,0,0.4150,0))), 230 | IndexedRow(1, new DenseVector(Array(0.0825,0.2900,0.2075,0.0625,0.1450,0.2075))), 231 | IndexedRow(2, new DenseVector(Array(0,0.2739,0.2739,0.0825,0.0825,0.2739))), 232 | IndexedRow(3, new DenseVector(Array(0,0.1250,0.1250,0.3750,0,0.3750))), 233 | IndexedRow(4, new DenseVector(Array(0.2739,0.1914,0.0825,0,0.3564,0.0825))), 234 | IndexedRow(5, new DenseVector(Array(0,0.2075,0.2075,0.1875,0.0625,0.3325))) 235 | ) 236 | )).toBlockMatrix 237 | 238 | val MCLObject: MCL = new MCL() 239 | val inflatedMatrix: IndexedRowMatrix = MCLObject.inflation(indexedMatrix) 240 | 241 | val objective: IndexedRowMatrix = 242 | new IndexedRowMatrix( 243 | sc.parallelize( 244 | Seq( 245 | IndexedRow(0, new DenseVector(Array(0.46337526,0.07324948,0,0,0.46337526,0))), 246 | IndexedRow(1, new DenseVector(Array(0.03370265,0.41643971,0.21320253,0.01934266,0.10410993,0.21320253))), 247 | IndexedRow(2, new DenseVector(Array(0,0.31432222,0.31432222,0.02851668,0.02851668,0.31432222))), 248 | IndexedRow(3, new DenseVector(Array(0,0.05000000,0.05000000,0.45000000,0,0.45000000))), 249 | IndexedRow(4, new DenseVector(Array(0.29736263,0.14520654,0.02697803,0,0.50347477,0.02697803))), 250 | IndexedRow(5, new DenseVector(Array(0,0.18264973,0.18264973,0.14913699,0.01657078,0.46899276))) 251 | ) 252 | )) 253 | 254 | inflatedMatrix.numRows shouldEqual objective.numRows 255 | inflatedMatrix.numCols shouldEqual objective.numCols 256 | objective.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 257 | .join( 258 | inflatedMatrix.rows.map(iRow => (iRow.index, iRow.vector.toArray)) 259 | ) 260 | .collect.sortBy(row => row._1).foreach( 261 | pairOfRows => 262 | { 263 | val inflatedRows = pairOfRows._2._2.map(e 
// Next: test("Difference Between Two Matrices") — checks MCLObject.difference
// between a binary adjacency-style matrix and a weighted one against the scalar
// 15.1434766 (rounded to 7 decimals). Then the IntegrationTest loads the karate
// club graph fixtures from /MCL/*.csv and the reference clustering from
// /MCL/clusters.tab (produced by the official MCL tool).
=> BigDecimal(e).setScale(8, BigDecimal.RoundingMode.HALF_UP).toDouble) 264 | pairOfRows._2._1 shouldEqual inflatedRows 265 | } 266 | ) 267 | 268 | } 269 | 270 | test("Difference Between Two Matrices", UnitTest) { 271 | 272 | val startMatrix: IndexedRowMatrix = 273 | new IndexedRowMatrix( 274 | sc.parallelize( 275 | Seq( 276 | IndexedRow(0, new DenseVector(Array(1,0,0,0,1,0))), 277 | IndexedRow(1, new DenseVector(Array(0,1,1,0,1,1))), 278 | IndexedRow(2, new DenseVector(Array(0,1,1,0,0,1))), 279 | IndexedRow(3, new DenseVector(Array(0,0,0,1,0,1))), 280 | IndexedRow(4, new DenseVector(Array(1,1,0,0,1,0))), 281 | IndexedRow(5, new DenseVector(Array(0,1,1,1,0,1))) 282 | ) 283 | )) 284 | 285 | val stopMatrix: IndexedRowMatrix = 286 | new IndexedRowMatrix( 287 | sc.parallelize( 288 | Seq( 289 | IndexedRow(0, new DenseVector(Array(0.172225,0.027225,0,0,0.172225,0))), 290 | IndexedRow(1, new DenseVector(Array(0,0.0841,0.04305625,0,0.021025,0.04305625))), 291 | IndexedRow(2, new DenseVector(Array(0,0.07502121,0.07502121,0,0,0.07502121))), 292 | IndexedRow(3, new DenseVector(Array(0,0.015625,0.015625,0.140625,0,0.140625))), 293 | IndexedRow(4, new DenseVector(Array(0.07502121,0.03663396,0,0,0.12702096,0))), 294 | IndexedRow(5, new DenseVector(Array(0,0.04305625,0.04305625,0.03515625,0,0.11055625))) 295 | ) 296 | )) 297 | 298 | val MCLObject: MCL = new MCL() 299 | val diff: Double = MCLObject.difference(startMatrix, stopMatrix) 300 | 301 | BigDecimal(diff).setScale(7, BigDecimal.RoundingMode.HALF_UP).toDouble shouldEqual 15.1434766 302 | 303 | } 304 | 305 | // Integration Tests 306 | 307 | test("Official MCL Algorithm Versus Spark MCL", IntegrationTest) { 308 | 309 | val relationshipsFile:Seq[String] = Source.fromURL(getClass.getResource("/MCL/karateEdges.csv")).getLines().toSeq 310 | val nodesFile:Seq[String] = Source.fromURL(getClass.getResource("/MCL/karateNodes.csv")).getLines().toSeq 311 | val clustersFile:Seq[String] = 
// The integration test trains MCL (epsilon=0.01, maxIterations=30, self-loop
// weight 1.0, "bidirected" orientation) on the karate graph, groups assignments
// by cluster, keys each cluster by its max node id, and joins against the
// reference clustering keyed the same way; the join must match every reference
// cluster (test.count == clustersChallenge.count).
// NOTE(review): `case Row(group: mutable.WrappedArray[Long])` is an unchecked
// match — the element type Long is erased at runtime, so this only verifies the
// wrapper type; presumably safe here because collect_list of a Long column is
// produced upstream, but worth confirming.
Source.fromURL(getClass.getResource("/MCL/clusters.tab")).getLines().toSeq 312 | 313 | val relationships: RDD[Edge[Double]] = 314 | sc.parallelize( 315 | relationshipsFile 316 | .map(line => line.split(" ")) 317 | .map(e => Edge(e(0).toLong, e(1).toLong, e(2).toDouble)) 318 | ) 319 | 320 | val users: RDD[(VertexId, String)] = 321 | sc.parallelize( 322 | nodesFile 323 | .map(line => line.split(" ")) 324 | .map(n => (n(0).toLong, n(1))) 325 | ) 326 | 327 | val graph: Graph[String, Double] = Graph(users, relationships) 328 | 329 | val spark = SparkSession.builder().getOrCreate() 330 | import spark.implicits._ 331 | 332 | val assignments:Dataset[Assignment] = MCL.train(graph, epsilon=0.01, maxIterations=30, selfLoopWeight = 1.0, graphOrientationStrategy = "bidirected").assignments 333 | val clusters = 334 | assignments 335 | .groupBy("cluster") 336 | .agg(collect_list(col("id"))) 337 | .withColumn("group", sort_array(col("collect_list(id)"))) 338 | .select("group").map{ 339 | case Row(group: mutable.WrappedArray[Long]) => (group.max, group) 340 | } 341 | .withColumnRenamed("_1", "clusterIdTest") 342 | .withColumnRenamed("_2", "group") 343 | 344 | val clustersChallenge = 345 | sc.parallelize( 346 | clustersFile 347 | .map(line => line.split("\t").map(node => node.toInt).toList) 348 | .map(assignment => (assignment.max, assignment.toArray.sorted)) 349 | ).toDF("clusterIdReal", "group") 350 | 351 | val test = clusters.join(clustersChallenge, clusters.col("clusterIdTest")===clustersChallenge.col("clusterIdReal")) 352 | test.count shouldEqual clustersChallenge.count 353 | 354 | } 355 | 356 | } 357 | 