├── .gitignore
├── .idea
│   ├── .name
│   ├── artifacts
│   │   └── canopy_clustering_spark_jar.xml
│   ├── compiler.xml
│   ├── copyright
│   │   └── profiles_settings.xml
│   ├── dictionaries
│   │   └── abose.xml
│   ├── libraries
│   │   ├── scala_sdk_2_10_4.xml
│   │   └── spark.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── uiDesigner.xml
│   ├── vcs.xml
│   └── workspace.xml
├── META-INF
│   └── MANIFEST.MF
├── README.md
├── canopy-clustering-spark.iml
├── docs
│   └── design.md
├── lib
│   └── LIST_OF_LIBRARIES
├── out
│   └── README
├── src
│   └── main
│       └── scala
│           └── ml
│               └── dolphin
│                   └── personas
│                       ├── canopy
│                       │   ├── Attributes.scala
│                       │   ├── CanopyKMeans.scala
│                       │   ├── EuclideanVectorSpace.scala
│                       │   ├── PersonaCommon.scala
│                       │   ├── VectorSpace.scala
│                       │   └── XORShiftRandom.scala
│                       └── examples
│                           └── ExampleCanopyKMeans.scala
└── target
    └── TARGET_README

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
lib/*.jar
.DS_Store
*.class

--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
canopy-clustering-spark
--------------------------------------------------------------------------------
/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
Manifest-Version: 1.0
Main-Class: ExampleCanopyKMeans

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# canopy-clustering-spark
Canopy k-means clustering using Spark and Scala

An implementation of [Canopy Clustering](https://en.wikipedia.org/wiki/Canopy_clustering_algorithm) using Spark and Scala.

--------------------------------------------------------------------------------
/docs/design.md:
--------------------------------------------------------------------------------
Code Walkthrough


Reading Input Data

Take a look at the PersonaCommon.readHivePoints and PersonaCommon.readCsvMetadata
methods, which read tab-delimited files (the format used for Hive tables) and
comma-delimited CSV files and create an RDD of Vectors from them. This is standard
boilerplate code for Spark. What is slightly different is that I also calculate a hash of
each row and store each row as a tuple of (Vector, Int). The canopy algorithm calls for keeping
track of which data points are within a given distance of a canopy center. Instead of storing
the original features (i.e., the columns of the dataset) of a Vector, I store just the hash to
keep track of these proximities. For large datasets, the small amount of computation needed to
calculate and compare hashes is usually cheaper than trying to hold all the data objects in
memory. So our data is now stored as RDD[(Vector, Int)].

Vector Space Operations

Take a look at VectorSpace.scala and EuclideanVectorSpace.scala.

The basic operations on a vector space are defined in VectorSpace.scala as a trait.
Note the generic type "A" used to define the attributes of the points; we will later
bind it to Vector, since our data points are defined as (Vector, Int).
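For reference, the trait declares two pairwise operations and two groupwise operations
over a generic point type A; this sketch mirrors VectorSpace.scala further down in this
repository:

    trait VectorSpace[A] {

      // Distance between two points x and y
      def distance(x: A, y: A): Double

      // Cosine similarity measure between two points x and y
      def cosine(x: A, y: A): Double

      // Centroid of a set of points
      def centroid(points: Seq[A]): A

      // Index and distance of the point in Array x that is closest to a given point y
      def closest(x: Array[A], y: A): (Int, Double)
    }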
The trait declares pairwise operations (distance() and cosine() similarity between two points)
and groupwise operations (centroid() of a sequence of points, and closest(), which finds the
member of a group of points nearest to a given point).

I extend this to a Euclidean space of Vectors in EuclideanVectorSpace.scala:

    object EuclideanVectorSpace extends VectorSpace[Vector] {
    ..

and define the methods there.

--------------------------------------------------------------------------------
/lib/LIST_OF_LIBRARIES:
--------------------------------------------------------------------------------
The following libraries were used to build the project:

datanucleus-api-jdo-3.2.6.jar
datanucleus-core-3.2.10.jar
datanucleus-rdbms-3.2.9.jar
spark-1.2.0.2.2.0.0-82-yarn-shuffle.jar
spark-assembly-1.2.0.2.2.0.0-82-hadoop2.6.0.2.2.0.0-2041.jar

The above Spark 1.2 libraries are from the Hortonworks Hadoop distribution, but other
distributions should work without issues.

--------------------------------------------------------------------------------
/out/README:
--------------------------------------------------------------------------------
The intermediate class files are created in this directory tree.

--------------------------------------------------------------------------------
/src/main/scala/ml/dolphin/personas/canopy/Attributes.scala:
--------------------------------------------------------------------------------
package ml.dolphin.personas.canopy

/**
 * Definitions of an attribute list and an attribute map.
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 *
 */

class Attributes(xL: List[((String, Int, Int), Int)], xM: Map[String, (Int, Int, Int)]) {
  var l: List[((String, Int, Int), Int)] = xL
  var m: Map[String, (Int, Int, Int)] = xM
}

--------------------------------------------------------------------------------
/src/main/scala/ml/dolphin/personas/canopy/CanopyKMeans.scala:
--------------------------------------------------------------------------------
package org.apache.spark.mllib.linalg.canopy

import ml.dolphin.personas.canopy.{EuclideanVectorSpace, PersonaCommon, XORShiftRandom}
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.BLAS.axpy
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.{Logging, SparkContext}

import scala.collection.mutable
import scala.util.Random
import scala.util.control.Breaks._

/**
 * An implementation of Canopy K-means clustering in Spark
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 *
 */

class CanopyKMeans private(
    private var k: Int,
    private var maxIterations: Int,
    private var epsilon: Double,
    private var seed: Int,
    private var t1: Double,
    private var t2: Double) extends Serializable with Logging {

  // Default constructor for the class
  def this() = this(2, 10, 1.e-04, new scala.util.Random().nextInt(), 0.0, 0.0)

  /**
   * Definitions for getter and setter methods for algorithm parameters follow.
37 | */ 38 | 39 | // Number of centroids 40 | def getK: Int = k 41 | 42 | def setK(k: Int): this.type = { 43 | this.k = k 44 | this 45 | } 46 | 47 | // Maximum number of iterations 48 | def getMaxIterations: Int = maxIterations 49 | 50 | def setMaxIterations(maxIterations: Int): this.type = { 51 | this.maxIterations = maxIterations 52 | this 53 | } 54 | 55 | // Delta of centroid distances between successive iterations. Used to decide convergence. 56 | def getEpsilon: Double = epsilon 57 | 58 | def setEpsilon(epsilon: Double): this.type = { 59 | this.epsilon = epsilon 60 | this 61 | } 62 | 63 | // Random seed for cluster initialization 64 | def getSeed: Int = seed 65 | 66 | def setSeed(seed: Int): this.type = { 67 | this.seed = seed 68 | this 69 | } 70 | 71 | // T1: Distance from a canopy center beyond which the points can belong to other canopies. 72 | def getT1: Double = t1 73 | 74 | def setT1(value: Double): this.type = { 75 | this.t1 = value 76 | this 77 | } 78 | 79 | // T2: Distance from a canopy center within which all points belong to the same canopy. 80 | // T1 > T2 must be set. 81 | def getT2: Double = t2 82 | 83 | def setT2(value: Double): this.type = { 84 | this.t2 = value 85 | this 86 | } 87 | 88 | /** 89 | * Algorithm to be invoked for performing canopy clustering. Other methods are private. 90 | * 91 | * @param sc Spark Context 92 | * @param input Input file locator as a String 93 | * @return org.apache.spark.mllib.clustering.KMeansModel object containing the k-means model 94 | */ 95 | def runAlgorithm(sc: SparkContext, input: String): KMeansModel = { 96 | 97 | // Read input CSV files generated as the result of a Hive query. 98 | // @todo Generalize input file options. 99 | val features = PersonaCommon.readHivePoints(sc, input) 100 | 101 | /* 102 | * Generate k random centers from the input points. Each point is (Vector, Int) where the 103 | * second element represents the hashcode of the Vector. 104 | */ 105 | var costDiff = Double.PositiveInfinity 106 | val centers = initRandom(features) 107 | println("Initial Centers => " + centers.foreach(println)) 108 | /* 109 | * Apply the Canopy algorithm to find the canopy centers and assign canopies to each point. 110 | * To reduce the amount of storage, we keep the canopy->points associations as 111 | * canopy->Set[hashCode(point)] where point is a Vector. To check whether a Vector belongs to 112 | * a canopy, first produce the hash: hashcode(point), and then check which canopy it belongs to. 113 | * 114 | * We broadcast the canopy centers to all partitions so lloyd's algorithm can be performed 115 | * locally. 116 | */ 117 | val canopies = canopyCenters(features) 118 | val bcCanopies = sc.broadcast(canopies) 119 | 120 | /* 121 | * Main loop for k-means iterations. Within each iteration, we do all operations using 122 | * mapPartitions to calculate results locally within partitions and then do a global 123 | * collection/operation in order to avoid excessive shuffling of data. 124 | */ 125 | var iteration = 0 126 | while (iteration < maxIterations && costDiff >= epsilon) { 127 | println("Iteration Number => " + iteration) 128 | val bcCenters = sc.broadcast(centers).value 129 | 130 | var costAccumulator = sc.accumulator(0.0, "k-means cost function") 131 | /* 132 | * For each RDD partition of data, do a mapPartition operation as follows: 133 | * 134 | * - Within a partition p: 135 | * 1. 
Initialize Array for (a) keeping a running sum of all points closest to a centroid 136 | * and (b) count of such points to the centroid, so we can calculate average distances. 137 | * 2. For each point x in p: 138 | * 2.1 Find the closest centroid c of x based on distance(centers, x) 139 | * 2.2 Add to running sum for c (associative): contribution of x, increment count of points 140 | * 2.3 Add to accumulator sum of total cost (associative): distance(c, x) 141 | * 2.4 Return c -> (running sum, count) as iterator of mapPartition 142 | * - Perform reduceByKey over all the partitions to merge the results 143 | */ 144 | val partContribs = features.mapPartitions { points => { 145 | // local computations within a partition. Within a partition, the following are global: 146 | val k = bcCenters.length 147 | val dims = bcCenters(0)._1.size 148 | val runningSums = Array.fill(k)(Vectors.zeros(dims)) 149 | //val ones = Array.fill(dims)(1.0) 150 | val counts = Array.fill(k)(0L) 151 | 152 | 153 | // Operations for each point x in points 154 | points.foreach(x => { 155 | // check which center belongs to the same canopy as x. Return the index of that center. 156 | var index = 0 157 | val isCanopied = isWithinCanopy(bcCanopies.value, bcCenters, x) 158 | var distance = 0.0 159 | 160 | if (isCanopied >= 0) { 161 | distance = EuclideanVectorSpace.distance(bcCenters(index)._1, x._1) 162 | index = isCanopied 163 | } else { 164 | // Brute-force distance calculation over all centers and find the minimum 165 | val (i, d) = EuclideanVectorSpace.closest(EuclideanVectorSpace.toVector(centers), 166 | x._1) 167 | index = i 168 | distance = d 169 | } 170 | val sum = runningSums(index) 171 | 172 | axpy(1.0, x._1, sum) 173 | 174 | counts(index) += 1 175 | costAccumulator += distance 176 | }) 177 | val contribs = for (i <- 0 until k) yield { 178 | (i, (runningSums(i), counts(i))) 179 | } 180 | contribs.iterator 181 | }} 182 | 183 | // Sum up the running sum and count contributions from all partitions in costContribs 184 | type SumCount = (Vector, Long) 185 | val totalContribs = partContribs.reduceByKey((x: SumCount, y: SumCount) => { 186 | axpy(1.0, x._1, y._1) 187 | (y._1, x._2 + y._2) 188 | }).collectAsMap() 189 | 190 | // Update cluster centers 191 | costDiff = 0.0 192 | for (i <- 0 until k) { 193 | val (sum, count) = totalContribs(i) 194 | if (count != 0) { 195 | val newCenter = Vectors.dense(sum.toArray.map(_ / count)) 196 | costDiff += EuclideanVectorSpace.distance(newCenter, centers(i)._1) 197 | centers(i) = (newCenter, newCenter.hashCode()) 198 | } 199 | } 200 | iteration += 1 201 | } 202 | val cv = centers.map(_._1) // only need the center Vector's 203 | println("CVs => " + cv.foreach(println)) 204 | new KMeansModel(cv) 205 | } 206 | 207 | /** 208 | * Algorithm for canopy clustering. 209 | * 1. Find local canopy centers from each RDD partition of input data 210 | * 2. Merge local canopies to generate a global set of canopy centers. 211 | * 212 | * @param data RDD of [Vector, Int] where Vector corresponds to features or attributes 213 | * for a given point. Int corresponds to hash code of the Vector elements. 214 | * @return An array of Vector's corresponding to the canopy centers. 215 | */ 216 | private def canopyCenters(data: RDD[(Vector, Int)]): mutable.Map[Vector, mutable.Set[Int]] = { 217 | 218 | // Find local canopies from each partition 219 | val c = data.mapPartitions { points => { 220 | // Copy points into a mutable Array so we can access and modify the elements. 
221 | // This needs to be readdressed if it becomes a memory bottleneck. 222 | var ptArray = mutable.ArrayBuffer[(Vector, Int)]() 223 | points.foreach(x => ptArray += x) 224 | 225 | val canopies = findCanopies(ptArray) 226 | canopies.foreach { x => 227 | println("canopyCenters from partitions => " + x) 228 | } 229 | canopies.iterator 230 | } 231 | }.collect.toMap 232 | 233 | // Merge local canopies across partitions to generate global canopies 234 | val centers = mutable.ArrayBuffer[(Vector, Int)]() 235 | c.foreach(x => centers.append((x._1, x._1.hashCode()))) 236 | // Use the same algorithm again on the local canopy centers 237 | println("canopyCenters: CENTERS => " + centers) 238 | 239 | val cpCenters = findCanopies(centers) 240 | 241 | // Create the final canopy centers by merging hash codes from canopy centers 242 | // that merged 243 | val canopies = mutable.Map[Vector, mutable.Set[Int]]() 244 | cpCenters.foreach(x => { 245 | val setX = c(x._1) 246 | println("ABOSE..." + "x._1 => " + x._1 + " " + setX) 247 | for (hX <- setX) { 248 | if (canopies.contains(x._1)) 249 | canopies(x._1).add(hX) 250 | else 251 | canopies += (x._1 -> mutable.Set[Int](hX)) 252 | } 253 | if (x._2.size > 0) { 254 | var h = 0 255 | for (h <- x._2) { 256 | centers.foreach(y => { 257 | if (y._2 == h) { 258 | val setY = c(y._1) 259 | for (hY <- setY) { 260 | canopies(x._1).add(hY) 261 | } 262 | } 263 | }) 264 | } 265 | } 266 | }) 267 | 268 | println("Final canopies " + canopies) 269 | canopies 270 | } 271 | 272 | /** * 273 | * Canopy finding algorithm for a given set of points. Note we send the hashcode 274 | * of a Vector along with the Vector as a new type: (Vector, Int). 275 | * 276 | * @param points 277 | * @return 278 | */ 279 | private def findCanopies(points: mutable.ArrayBuffer[(Vector, Int)]): 280 | mutable.Map[Vector, mutable.Set[Int]] = { 281 | var r = points 282 | var canopies = mutable.Map[Vector, mutable.Set[Int]]() 283 | println("findCanopies: POINTS SIZE " + r.size + "POINTS VALUES " + r) 284 | 285 | while (r.size > 0) { 286 | // Choose a point as canopy center 287 | //val canopyIdx = scala.util.Random.nextInt(points.size) 288 | //val canopy = points(canopyIdx) 289 | val shuffled = Random.shuffle(r) 290 | val canopy = shuffled.head 291 | println("INSIDE WHILE: New canopy => " + canopy) 292 | if (canopies.size > 0) { 293 | canopies.foreach(x => canopies(x._1).remove(canopy._2)) 294 | } 295 | canopies += (canopy._1 -> scala.collection.mutable.Set()) 296 | 297 | //val r = points.filter(x => x != canopy) 298 | //points.remove(canopyIdx) 299 | r = r.filter(x => x != canopy) 300 | 301 | for (point <- r) { 302 | //val point = r(idx) 303 | println("INNER LOOP....POINT => " + point + ", POINTS => " + r + ", POINTS_SIZE => " + r.size) 304 | 305 | val distance = EuclideanVectorSpace.distance(point._1, canopy._1) 306 | println("INNER LOOP....POINT => " + point._1 + " Canopy => " + canopy._1 + " distance => " + distance) 307 | if (distance <= getT1) { 308 | // Check if we are inserting into canopies for the first time 309 | //if (canopies.contains(canopy._1)) { 310 | canopies(canopy._1).add(point._2) 311 | //} else { 312 | // canopies += (canopy._1 -> scala.collection.mutable.Set(point._2)) 313 | //} 314 | } 315 | if (distance < getT2) { 316 | //points.remove(idx) 317 | r = r.filter(x => x != point) 318 | println("Point removed => " + point) 319 | } 320 | println("...CANOPIES SO FAR => " + canopies) 321 | } 322 | } 323 | // Add self to the list of canopy edges. 
324 | canopies.foreach(x => canopies(x._1).add(x._1.hashCode())) 325 | println("..Reached end of findCanopies. Canopies => " + canopies) 326 | canopies 327 | } 328 | 329 | /** 330 | * Sample data points randomly to pick k initial cluster centers 331 | * @param data 332 | * @return An Array of (Vector, hashcode()) as k sampled points 333 | */ 334 | private def initRandom(data: RDD[(Vector, Int)]): Array[(Vector, Int)] = { 335 | val random = new XORShiftRandom(this.seed) 336 | data.takeSample(false, k, new XORShiftRandom(this.seed).nextInt()) 337 | } 338 | 339 | /** 340 | * For a given point "x" and a set of cluster centers, find which cluster center and 341 | * the point are both within a canopy. If such a co-occurrence cannot be found, return -1. 342 | * @param canopies 343 | * @param centers 344 | * @param x 345 | * @return 346 | */ 347 | private def isWithinCanopy(canopies: mutable.Map[Vector, mutable.Set[Int]], 348 | centers: Array[(Vector, Int)], 349 | x: (Vector, Int)): Int = { 350 | var index = 0 351 | breakable { 352 | for (center <- centers) { 353 | for ((k, v) <- canopies) { 354 | //println("isWithinCanopy: " + "canopy -> k " + k + " , v => " + v + ", center -> " + 355 | //center + " x -> " + x ) 356 | if (v.contains(center._2) && (v.contains(x._2))) { 357 | //println("isWithinCanopy: " + "canopy -> k " + k + " , v => " + v + ", center -> " + 358 | //center + " x -> " + x + " , index -> " + index) 359 | break 360 | } 361 | } 362 | index += 1 363 | } 364 | } 365 | if (index == centers.size) { 366 | index = -1 367 | } 368 | println("isWithinCanopy: " + "canopies -> " + canopies + " centers -> " + centers + 369 | " x -> " + x + "index -> " + index) 370 | index 371 | } 372 | } 373 | 374 | /** 375 | * User-callable methods for running Canopy k-Means package 376 | */ 377 | 378 | object CanopyKMeans { 379 | 380 | /** 381 | * Builds a canopy clustering model with all parameters specified by the user 382 | * 383 | * @param sc Spark Context 384 | * @param input Location of file(s) with input data points 385 | * @param k Number of clusters 386 | * @param maxIterations Maximum number of iterations 387 | * @param epsilon Distance threshold to determine convergence 388 | * @param seed Seed value for randomly picking k initial centers 389 | * @param t1 Distance from canopy center beyond which points can belong to other canopies 390 | * @param t2 Distance from canopy center within which all points belong to same canopy 391 | * @return 392 | */ 393 | def train( 394 | sc: SparkContext, 395 | input: String, 396 | k: Int, 397 | maxIterations: Int, 398 | epsilon: Double, 399 | seed: Int, 400 | t1: Double, 401 | t2: Double): KMeansModel = { 402 | new CanopyKMeans() 403 | .setK(k) 404 | .setMaxIterations(maxIterations) 405 | .setEpsilon(epsilon) 406 | .setSeed(seed) 407 | .setT1(t1) 408 | .setT2(t2).runAlgorithm(sc, input) 409 | } 410 | 411 | /** 412 | * Builds a canopy clustering model with a mix of default parameters and parameters specified 413 | * by the user 414 | * 415 | * @param sc Spark Context 416 | * @param input Location of file(s) with input data points 417 | * @param k Number of clusters 418 | * @param t1 Distance from canopy center beyond which points can belong to other canopies 419 | * @param t2 Distance from canopy center within which all points belong to same canopy 420 | * @return 421 | */ 422 | def train( 423 | sc: SparkContext, 424 | input: String, 425 | k: Int, 426 | t1: Double, 427 | t2: Double): KMeansModel = { 428 | if (t1 <= t2) { 429 | println("Parameter T1 (" + t1 + ") must 
be larger than T2 (" + t2 + "). Run aborted.") 430 | sc.stop() 431 | sys.exit() 432 | } 433 | train(sc, input, k, 4, 1.e-04, new scala.util.Random().nextInt(), t1, t2) 434 | } 435 | } 436 | -------------------------------------------------------------------------------- /src/main/scala/ml/dolphin/personas/canopy/EuclideanVectorSpace.scala: -------------------------------------------------------------------------------- 1 | 2 | //package org.apache.spark.mllib.linalg.canopy 3 | 4 | package ml.dolphin.personas.canopy 5 | /** 6 | * Euclidean Vector Space extended from VectorSpace. 7 | * @note Methods have no side effects 8 | * 9 | * @author Abhijit Bose 10 | * @version 1.0 06/24/2015 11 | * @since 1.0 06/24/2015 12 | */ 13 | 14 | import breeze.numerics.sqrt 15 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 16 | 17 | import scala.math.pow 18 | 19 | object EuclideanVectorSpace extends VectorSpace[Vector] { 20 | 21 | /** 22 | * Euclidean Distance between two vectors x and y 23 | * 24 | * @param x Input Vector x 25 | * @param y Input Vector y 26 | * @return Double 27 | */ 28 | override def distance(x: Vector, y: Vector): Double = { 29 | val dSquared = x.toArray.zip(y.toArray).foldLeft(0.0)( 30 | (r, c) => r + pow(c._1 - c._2, 2) 31 | ) 32 | sqrt(dSquared) 33 | } 34 | 35 | /** 36 | * Centroid of a finite set of points represented as a sequence of Vector's 37 | * 38 | * @param points Input set of points 39 | * @return Vector with the centroid 40 | */ 41 | override def centroid(points: Seq[Vector]) = { 42 | val numCols = points(0).size 43 | val center = points.foldLeft(new Array[Double](numCols))( 44 | (r, c) => r.toArray.zip(c.toArray).map(t => t._1 + t._2) 45 | ) 46 | Vectors.dense(center.map(_ / points.size)) 47 | } 48 | 49 | /** 50 | * Cosine similarity distance measure between two Vector's x and y 51 | * 52 | * @param x Input Vector x 53 | * @param y Input Vector y 54 | * @return Double 55 | */ 56 | override def cosine(x: Vector, y: Vector): Double = { 57 | val normX = sqrt(x.toArray.foldLeft(0.0)( 58 | (r, c) => r + c * c 59 | )) 60 | val normY = sqrt(y.toArray.foldLeft(0.0)( 61 | (r, c) => r + c * c 62 | )) 63 | val inner = x.toArray.zip(y.toArray).foldLeft(0.0)( 64 | (r, c) => r + c._1 * c._2 65 | ) 66 | 1.0 * inner / (normX * normY) 67 | } 68 | 69 | /** 70 | * Finds closest point and shortest distance between a given array of points x, and a given 71 | * point y. Uses brute-force L2-distance pairwise calculation. 72 | * 73 | * @todo Use better algorithm such as triangle inequality to find shortest distance 74 | * @param x Given Array of points, e.g. centroids in K-means clustering 75 | * @param y Given point from which distance needs to be calculated 76 | * @return (index in x, distance) of the closest point to y 77 | */ 78 | override def closest(x: Array[Vector], y: Vector): (Int, Double) = { 79 | var shortestDistance = Double.PositiveInfinity 80 | var closestIndex = 0 81 | var index = 0 82 | x.foreach(center => { 83 | val thisDistance = distance(center, y) 84 | if (thisDistance < shortestDistance) { 85 | shortestDistance = thisDistance 86 | closestIndex = index 87 | } 88 | index += 1 89 | }) 90 | (closestIndex, shortestDistance) 91 | } 92 | 93 | /** 94 | * Converts Array[(Vector, hashcode)] data structure of centers and points to an array of Vectors 95 | * only. Mostly used as a precursor to closest and other operations. 
96 | * @param x Array of (Vector, Int) 97 | * @return Array[Vector] 98 | */ 99 | def toVector(x: Array[(Vector, Int)]): Array[Vector] = { 100 | x.map(_._1) 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /src/main/scala/ml/dolphin/personas/canopy/PersonaCommon.scala: -------------------------------------------------------------------------------- 1 | package ml.dolphin.personas.canopy 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.rdd.RDD 6 | 7 | import scala.io.Source 8 | 9 | /** 10 | * Common methods for canopy clustering. Still to be developed. mostly input manipulations. 11 | * 12 | * @author Abhijit Bose 13 | * @version 1.0 06/24/2015 14 | * @since 1.0 06/24/2015 15 | */ 16 | 17 | object PersonaCommon { 18 | 19 | /** 20 | * Returns a Attribute object given an input file containing attribute metadata in CSV format. 21 | * 22 | * @param input Input file with attributes metadata 23 | * @return attribute: Attribute containing Attribute object that has information about all 24 | * attributes and their processing information 25 | */ 26 | def readCsvMetadata(input: String): Attributes = { 27 | 28 | /* 29 | * Each line of input CSV file with attributes metadata has the following: 30 | * name, flag, flatten 31 | * where: 32 | * name: String = name of the attribute 33 | * flag: Boolean == 1 => the attribute will be used for clustering 34 | * flatten: Boolean == 1 => the attribute is categorical and will be flattened. 35 | */ 36 | 37 | val schema = Source.fromFile(input).getLines() 38 | // List of attributes in the same order they appear in the data 39 | val aList = schema.map(s => { 40 | val elem = s.split(',') 41 | (elem(0).trim, elem(1).toInt, elem(2).toInt) 42 | }).zipWithIndex.toList 43 | // Map of attributes 44 | val aMap = aList.map(s => { 45 | s._1._1 ->(s._1._2, s._1._3, s._2) 46 | }).toMap 47 | new Attributes(aList, aMap) 48 | } 49 | 50 | /** 51 | * Reads data points in Hive format. Each row is converted into a Vector along with its hashcode. 52 | * @param sc 53 | * @param input 54 | * @return 55 | */ 56 | def readHivePoints(sc: SparkContext, input: String): RDD[(Vector, Int)] = { 57 | val data = sc.textFile(input) 58 | val rows = data.map(s => { 59 | val buffer = s.split('\t').toBuffer 60 | val features = Vectors.dense(buffer.map(_.toDouble).toArray) 61 | (features, features.hashCode()) 62 | }) 63 | rows 64 | } 65 | 66 | 67 | /* Flattening is a procedure by which each distinct value of a categorical attribute is 68 | * converted into an additional Boolean attribute. For example, 69 | * city = ["new york", "london", "delhi", "tokyo"] are distinct values of attribute "city" 70 | * Flattened Boolean attributes created: city_new_york, city_london, city_delhi, city_tokyo 71 | * 72 | * The attributes are named by concatenating the parent attribute with the attribute value 73 | * with "_" in between. Any blank space(s) in the attribute value will be converted into 74 | * "_" as shown in the above example. 75 | */ 76 | 77 | 78 | /** 79 | * 80 | * @param num 81 | * @return 82 | */ 83 | def toIntegerBucket(num: Double): Int = { 84 | if (num < 0.0) { 85 | println("ERROR: ml.dolphin.personas.PersonaCommon: toIntegerBucket(..) 
cannot handle negative values") 86 | sys.exit() 87 | } 88 | val leftover = num - num.floor 89 | if (leftover < 0.5) 90 | num.floor.toInt 91 | else 92 | num.ceil.toInt 93 | } 94 | 95 | 96 | } 97 | -------------------------------------------------------------------------------- /src/main/scala/ml/dolphin/personas/canopy/VectorSpace.scala: -------------------------------------------------------------------------------- 1 | package ml.dolphin.personas.canopy 2 | 3 | /** 4 | * Common algebraic operations in vector space. Define the functions for a class 5 | * mixed in with this trait that will be appropriate for a specific type of vector space. 6 | * @example EuclideanVectorSpace which extends this trait. 7 | * 8 | * @author Abhijit Bose 9 | * @version 1.0 06/24/2015 10 | * @since 1.0 06/24/2015 11 | */ 12 | trait VectorSpace[A] { 13 | 14 | // Distance between two points x and y 15 | def distance(x: A, y: A): Double 16 | 17 | // Cosine similarity measure between two points x and y 18 | def cosine(x: A, y: A): Double 19 | 20 | // Centroid of a set of points 21 | def centroid(points: Seq[A]): A 22 | 23 | // Index and Distance of the point in Array x that is closest to a given point y 24 | def closest(x: Array[A], y: A): (Int, Double) 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/ml/dolphin/personas/canopy/XORShiftRandom.scala: -------------------------------------------------------------------------------- 1 | package ml.dolphin.personas.canopy 2 | 3 | //package org.apache.spark.mllib.linalg.canopy 4 | 5 | import java.util.Random 6 | 7 | /** 8 | * XORShift Random Number Generator extended from Java's Random class 9 | * https://en.wikipedia.org/wiki/Xorshift 10 | * 11 | * @note This method is NOT thread-safe. For safe parallel execution, a parallel pseudo random 12 | * generator such as SPRNG (http://www.sprng.org) should be used to generate the seeds 13 | * across the different threads. 
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 *
 */

class XORShiftRandom(private var seed: Long) extends Random {

  // Default constructor for the class
  def this() = this(System.nanoTime())

  // Override java.util.Random.next to produce the next pseudo-random number.
  // The 21/35/4 shift triple is the 64-bit xorshift variant, so the internal state is kept
  // as a Long (shifting an Int by 35 bits would silently wrap to a 3-bit shift).
  override def next(nBits: Int): Int = {
    var x = this.seed
    x ^= (x << 21)
    x ^= (x >>> 35)
    x ^= (x << 4)
    this.seed = x
    x &= ((1L << nBits) - 1)
    x.toInt
  }

}

--------------------------------------------------------------------------------
/src/main/scala/ml/dolphin/personas/examples/ExampleCanopyKMeans.scala:
--------------------------------------------------------------------------------
import org.apache.spark.mllib.linalg.canopy._
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example driver code for using CanopyKMeans
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 *
 */

object ExampleCanopyKMeans {

  def main(args: Array[String]): Unit = {
    // define example points
    println("Starting Application....")
    val conf = new SparkConf()
      .setAppName("Example usage of Canopy Clustering")
      .set("spark.akka.frameSize", "10")
      .set("spark.akka.threads", "4")
      .set("spark.akka.timeout", "1000")
      .set("spark.akka.heartbeat.pauses", "6000")
      .set("spark.akka.failure-detector.threshold", "3000")
      .set("spark.akka.heartbeat.interval", "1000")
      .set("spark.eventLog.enabled", "true")

    //.set("spark.storage.memoryFraction", "") // Set RDD caching limit as a fraction of overall JVM heap (60% default)
    //.set("spark.shuffle.memoryFraction", "") // limit the total amount of memory used in shuffle-related buffers (20% default). Rest 20% is for user code memory
    //.set("spark.shuffle.io.retryWait", )

    val sc = new SparkContext(conf)
    //val model = CanopyKMeans.train(sc, "/Users/r551839/canopy/points.csv", 2, 30.0, 20.0)
    //val model = CanopyKMeans.train(sc, "/Users/r551839/canopy/example2.csv", 3, 7.0, 3.0)
    val model = CanopyKMeans.train(sc, "/Users/r551839/canopy/wine_attribs_only.tsv", 3, 50.0, 1.0)
    model.clusterCenters.foreach(println)
  }
}

--------------------------------------------------------------------------------
/target/TARGET_README:
--------------------------------------------------------------------------------
The executable jar files are created here.
--------------------------------------------------------------------------------
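The example driver above uses the short train(sc, input, k, t1, t2) overload. As a sketch of
the fully parameterized overload defined in the CanopyKMeans companion object, the snippet
below is illustrative only and is not part of the repository: the object name, input path,
cluster count, iteration cap, epsilon, seed, and the T1/T2 canopy thresholds are all
placeholder values.

    import org.apache.spark.mllib.linalg.canopy.CanopyKMeans
    import org.apache.spark.{SparkConf, SparkContext}

    // Hypothetical driver: every literal below is a placeholder chosen for illustration.
    object ExampleCanopyKMeansFullParams {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("Canopy k-means, all parameters"))
        // k clusters, a hard cap on Lloyd iterations, a convergence threshold, a fixed seed
        // for reproducibility, and canopy thresholds with T1 > T2 (required by the algorithm).
        val model = CanopyKMeans.train(sc, "hdfs:///path/to/points.tsv",
          k = 3, maxIterations = 20, epsilon = 1.0e-4, seed = 42, t1 = 50.0, t2 = 1.0)
        model.clusterCenters.foreach(println)
        sc.stop()
      }
    }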