├── .gitignore
├── .idea
│   ├── .name
│   ├── artifacts
│   │   └── canopy_clustering_spark_jar.xml
│   ├── compiler.xml
│   ├── copyright
│   │   └── profiles_settings.xml
│   ├── dictionaries
│   │   └── abose.xml
│   ├── libraries
│   │   ├── scala_sdk_2_10_4.xml
│   │   └── spark.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── uiDesigner.xml
│   ├── vcs.xml
│   └── workspace.xml
├── META-INF
│   └── MANIFEST.MF
├── README.md
├── canopy-clustering-spark.iml
├── docs
│   └── design.md
├── lib
│   └── LIST_OF_LIBRARIES
├── out
│   └── README
├── src
│   └── main
│       └── scala
│           └── ml
│               └── dolphin
│                   └── personas
│                       ├── canopy
│                       │   ├── Attributes.scala
│                       │   ├── CanopyKMeans.scala
│                       │   ├── EuclideanVectorSpace.scala
│                       │   ├── PersonaCommon.scala
│                       │   ├── VectorSpace.scala
│                       │   └── XORShiftRandom.scala
│                       └── examples
│                           └── ExampleCanopyKMeans.scala
└── target
    └── TARGET_README
/.gitignore:
--------------------------------------------------------------------------------
lib/*.jar
.DS_Store
*.class
--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
canopy-clustering-spark
--------------------------------------------------------------------------------
/META-INF/MANIFEST.MF:
--------------------------------------------------------------------------------
Manifest-Version: 1.0
Main-Class: ExampleCanopyKMeans
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# canopy-clustering-spark

Canopy k-means clustering using Spark and Scala.

An implementation of [Canopy Clustering](https://en.wikipedia.org/wiki/Canopy_clustering_algorithm) using Spark and Scala.
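
## Usage

The entry point is `CanopyKMeans.train`, which takes the number of clusters and the two canopy thresholds (T1 must be larger than T2). A minimal sketch, assuming a tab-delimited input file at a path of your choosing (see `ExampleCanopyKMeans.scala` for the full driver):

```scala
import org.apache.spark.mllib.linalg.canopy.CanopyKMeans
import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("canopy-kmeans-example"))

// k = 3 clusters; T1 = 50.0 (outer canopy threshold), T2 = 1.0 (inner threshold)
val model = CanopyKMeans.train(sc, "/path/to/points.tsv", 3, 50.0, 1.0)
model.clusterCenters.foreach(println)
```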
--------------------------------------------------------------------------------
/docs/design.md:
--------------------------------------------------------------------------------
Code Walkthrough

Reading Input Data

Take a look at the PersonaCommon.readHivePoints and PersonaCommon.readCsvMetadata
methods, which read tab-delimited files (the format for Hive tables) and
comma-delimited CSV files, and create an RDD of Vectors out of them. This is standard
boilerplate code for Spark. What is slightly different is that I also calculate a hash
of each row and store each row as a tuple of (Vector, Int). The canopy algorithm calls
for keeping track of which data points are within a given distance of a canopy center.
Instead of storing the original features (i.e. the columns of the dataset) of a Vector,
I store just the hash to keep track of these proximities. For large datasets, the small
amount of computation spent calculating and comparing hashes is usually cheaper than
trying to hold all the data objects in memory. So, our data is now stored as
RDD[(Vector, Int)]. A simplified sketch of this pattern is shown below.

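The essence of readHivePoints, assuming the all-numeric row format the method expects
and the usual org.apache.spark.mllib.linalg.Vectors import (the real method lives in
PersonaCommon.scala):

    val rows = sc.textFile(input).map { line =>
      val features = Vectors.dense(line.split('\t').map(_.toDouble))
      (features, features.hashCode()) // keep the hash alongside the Vector
    }
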
Vector Space Operations

Take a look at VectorSpace.scala and EuclideanVectorSpace.scala.

The basic operations on a vector space are defined in VectorSpace.scala as a Scala
trait. Note the generic type parameter in "trait VectorSpace[A] {...": "A" stands for
the type of the points, and we will later bind it to Vector since our data points are
defined as (Vector, Int). The trait declares pairwise operations, distance() and
cosine() between two points, and groupwise operations: centroid() of a sequence of
points, and closest(), which finds the nearest of a group of points to a given point.

I extend this to a Euclidean space of Vectors in EuclideanVectorSpace.scala:

    object EuclideanVectorSpace extends VectorSpace[Vector] {
      ...

and define the methods there. A short usage sketch follows.

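For instance, assuming two small dense Vectors (a minimal sketch, not from the repo):

    import org.apache.spark.mllib.linalg.Vectors

    val a = Vectors.dense(0.0, 0.0)
    val b = Vectors.dense(3.0, 4.0)
    EuclideanVectorSpace.distance(a, b)                                // 5.0
    EuclideanVectorSpace.closest(Array(a, b), Vectors.dense(3.0, 3.9)) // ≈ (1, 0.1)
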
--------------------------------------------------------------------------------
/lib/LIST_OF_LIBRARIES:
--------------------------------------------------------------------------------
The following libraries were used to build the project:

datanucleus-api-jdo-3.2.6.jar
datanucleus-core-3.2.10.jar
datanucleus-rdbms-3.2.9.jar
spark-1.2.0.2.2.0.0-82-yarn-shuffle.jar
spark-assembly-1.2.0.2.2.0.0-82-hadoop2.6.0.2.2.0.0-2041.jar

The above Spark 1.2 libraries are from the Hortonworks Hadoop distribution, but other
distributions should work as well.
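
If you build with sbt instead of jars dropped into lib/, a roughly equivalent
dependency declaration (an assumption; the project itself was built against the jars
above, with the Scala 2.10.4 SDK) would be:

    scalaVersion := "2.10.4"
    libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.2.0"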
--------------------------------------------------------------------------------
/out/README:
--------------------------------------------------------------------------------
The intermediate class files are created in this directory tree.
--------------------------------------------------------------------------------
/src/main/scala/ml/dolphin/personas/canopy/Attributes.scala:
--------------------------------------------------------------------------------
package ml.dolphin.personas.canopy

/**
 * Definitions of an attribute list and an attribute map.
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 */

class Attributes(xL: List[((String, Int, Int), Int)], xM: Map[String, (Int, Int, Int)]) {
  // Attribute list in data order: ((name, flag, flatten), column index)
  var l: List[((String, Int, Int), Int)] = xL
  // Attribute map: name -> (flag, flatten, column index)
  var m: Map[String, (Int, Int, Int)] = xM
}
--------------------------------------------------------------------------------
/src/main/scala/ml/dolphin/personas/canopy/CanopyKMeans.scala:
--------------------------------------------------------------------------------
package org.apache.spark.mllib.linalg.canopy

import ml.dolphin.personas.canopy.{EuclideanVectorSpace, PersonaCommon, XORShiftRandom}
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.BLAS.axpy
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD
import org.apache.spark.{Logging, SparkContext}

import scala.collection.mutable
import scala.util.Random
import scala.util.control.Breaks._

/**
 * An implementation of Canopy K-means clustering in Spark
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 */

class CanopyKMeans private(
    private var k: Int,
    private var maxIterations: Int,
    private var epsilon: Double,
    private var seed: Int,
    private var t1: Double,
    private var t2: Double) extends Serializable with Logging {

  // Default constructor for the class
  def this() = this(2, 10, 1.e-04, new scala.util.Random().nextInt(), 0.0, 0.0)

  /**
   * Getter and setter methods for the algorithm parameters follow.
   */

  // Number of centroids
  def getK: Int = k

  def setK(k: Int): this.type = {
    this.k = k
    this
  }

  // Maximum number of iterations
  def getMaxIterations: Int = maxIterations

  def setMaxIterations(maxIterations: Int): this.type = {
    this.maxIterations = maxIterations
    this
  }

  // Delta of centroid distances between successive iterations. Used to decide convergence.
  def getEpsilon: Double = epsilon

  def setEpsilon(epsilon: Double): this.type = {
    this.epsilon = epsilon
    this
  }

  // Random seed for cluster initialization
  def getSeed: Int = seed

  def setSeed(seed: Int): this.type = {
    this.seed = seed
    this
  }

  // T1: Distance from a canopy center beyond which points can belong to other canopies.
  def getT1: Double = t1

  def setT1(value: Double): this.type = {
    this.t1 = value
    this
  }

  // T2: Distance from a canopy center within which all points belong to the same canopy.
  // T1 > T2 must hold.
  def getT2: Double = t2

  def setT2(value: Double): this.type = {
    this.t2 = value
    this
  }

  /**
   * Algorithm to be invoked for performing canopy clustering. Other methods are private.
   *
   * @param sc Spark Context
   * @param input Input file locator as a String
   * @return org.apache.spark.mllib.clustering.KMeansModel object containing the k-means model
   */
  def runAlgorithm(sc: SparkContext, input: String): KMeansModel = {

    // Read input files generated as the result of a Hive query.
    // @todo Generalize input file options.
    val features = PersonaCommon.readHivePoints(sc, input)

    /*
     * Generate k random centers from the input points. Each point is (Vector, Int) where the
     * second element represents the hashcode of the Vector.
     */
    var costDiff = Double.PositiveInfinity
    val centers = initRandom(features)
    println("Initial Centers => " + centers.mkString(", "))
    /*
     * Apply the Canopy algorithm to find the canopy centers and assign canopies to each point.
     * To reduce the amount of storage, we keep the canopy->points associations as
     * canopy->Set[hashCode(point)] where point is a Vector. To check whether a Vector belongs to
     * a canopy, first produce the hash: hashcode(point), and then check which canopy it belongs to.
     *
     * We broadcast the canopy centers to all partitions so Lloyd's algorithm can be performed
     * locally.
     */
    val canopies = canopyCenters(features)
    val bcCanopies = sc.broadcast(canopies)

    /*
     * Main loop for k-means iterations. Within each iteration, we do all operations using
     * mapPartitions to calculate results locally within partitions and then do a global
     * collection/operation in order to avoid excessive shuffling of data.
     */
    var iteration = 0
    while (iteration < maxIterations && costDiff >= epsilon) {
      println("Iteration Number => " + iteration)
      // Broadcast the current centers and dereference with .value inside the closure below,
      // so executors read the broadcast copy instead of a serialized copy of the driver's array.
      val bcCenters = sc.broadcast(centers)

      val costAccumulator = sc.accumulator(0.0, "k-means cost function")
      /*
       * For each RDD partition of data, do a mapPartitions operation as follows:
       *
       * - Within a partition p:
       *   1. Initialize Arrays for (a) keeping a running sum of all points closest to a centroid
       *      and (b) the count of such points per centroid, so we can calculate average distances.
       *   2. For each point x in p:
       *     2.1 Find the closest centroid c of x based on distance(centers, x)
       *     2.2 Add to running sum for c (associative): contribution of x; increment count of points
       *     2.3 Add to accumulator sum of total cost (associative): distance(c, x)
       *     2.4 Return c -> (running sum, count) as the iterator of mapPartitions
       * - Perform reduceByKey over all the partitions to merge the results
       */
      val partContribs = features.mapPartitions { points => {
        // Local computations within a partition. Within a partition, the following are global:
        val localCenters = bcCenters.value
        val k = localCenters.length
        val dims = localCenters(0)._1.size
        val runningSums = Array.fill(k)(Vectors.zeros(dims))
        val counts = Array.fill(k)(0L)

        // Operations for each point x in points
        points.foreach(x => {
          // Check which center belongs to the same canopy as x. Return the index of that center.
          var index = 0
          val isCanopied = isWithinCanopy(bcCanopies.value, localCenters, x)
          var distance = 0.0

          if (isCanopied >= 0) {
            index = isCanopied
            distance = EuclideanVectorSpace.distance(localCenters(index)._1, x._1)
          } else {
            // Brute-force distance calculation over all centers and find the minimum
            val (i, d) = EuclideanVectorSpace.closest(EuclideanVectorSpace.toVector(localCenters),
              x._1)
            index = i
            distance = d
          }
          val sum = runningSums(index)

          axpy(1.0, x._1, sum)

          counts(index) += 1
          costAccumulator += distance
        })
        val contribs = for (i <- 0 until k) yield {
          (i, (runningSums(i), counts(i)))
        }
        contribs.iterator
      }}

      // Sum up the running sum and count contributions from all partitions in partContribs
      type SumCount = (Vector, Long)
      val totalContribs = partContribs.reduceByKey((x: SumCount, y: SumCount) => {
        axpy(1.0, x._1, y._1)
        (y._1, x._2 + y._2)
      }).collectAsMap()

      // Update cluster centers
      costDiff = 0.0
      for (i <- 0 until k) {
        val (sum, count) = totalContribs(i)
        if (count != 0) {
          val newCenter = Vectors.dense(sum.toArray.map(_ / count))
          costDiff += EuclideanVectorSpace.distance(newCenter, centers(i)._1)
          centers(i) = (newCenter, newCenter.hashCode())
        }
      }
      iteration += 1
    }
    val cv = centers.map(_._1) // only need the center Vectors
    println("CVs => " + cv.mkString(", "))
    new KMeansModel(cv)
  }

  /**
   * Algorithm for canopy clustering.
   * 1. Find local canopy centers from each RDD partition of input data.
   * 2. Merge local canopies to generate a global set of canopy centers.
   *
   * @param data RDD of (Vector, Int) where Vector corresponds to the features or attributes
   *             for a given point and Int corresponds to the hash code of the Vector elements.
   * @return A map from each canopy center Vector to the set of hash codes of the points
   *         assigned to that canopy.
   */
  private def canopyCenters(data: RDD[(Vector, Int)]): mutable.Map[Vector, mutable.Set[Int]] = {

    // Find local canopies from each partition
    val c = data.mapPartitions { points => {
      // Copy points into a mutable ArrayBuffer so we can access and modify the elements.
      // This needs to be readdressed if it becomes a memory bottleneck.
      val ptArray = mutable.ArrayBuffer[(Vector, Int)]()
      points.foreach(x => ptArray += x)

      val canopies = findCanopies(ptArray)
      canopies.foreach { x =>
        println("canopyCenters from partitions => " + x)
      }
      canopies.iterator
    }
    }.collect.toMap

    // Merge local canopies across partitions to generate global canopies
    val centers = mutable.ArrayBuffer[(Vector, Int)]()
    c.foreach(x => centers.append((x._1, x._1.hashCode())))
    // Use the same algorithm again on the local canopy centers
    println("canopyCenters: CENTERS => " + centers)

    val cpCenters = findCanopies(centers)

    // Create the final canopy centers by merging hash codes from canopy centers
    // that were themselves merged
    val canopies = mutable.Map[Vector, mutable.Set[Int]]()
    cpCenters.foreach(x => {
      val setX = c(x._1)
      println("canopyCenters: x._1 => " + x._1 + " " + setX)
      for (hX <- setX) {
        if (canopies.contains(x._1))
          canopies(x._1).add(hX)
        else
          canopies += (x._1 -> mutable.Set[Int](hX))
      }
      if (x._2.nonEmpty) {
        for (h <- x._2) {
          centers.foreach(y => {
            if (y._2 == h) {
              val setY = c(y._1)
              for (hY <- setY) {
                canopies(x._1).add(hY)
              }
            }
          })
        }
      }
    })

    println("Final canopies " + canopies)
    canopies
  }

  /**
   * Canopy finding algorithm for a given set of points. Note we carry the hashcode
   * of a Vector along with the Vector as a pair: (Vector, Int).
   *
   * @param points Points as (Vector, hash code) pairs to group into canopies
   * @return Map from each canopy center to the set of hash codes of the points within
   *         distance T1 of that center
   */
  private def findCanopies(points: mutable.ArrayBuffer[(Vector, Int)]):
      mutable.Map[Vector, mutable.Set[Int]] = {
    var r = points
    val canopies = mutable.Map[Vector, mutable.Set[Int]]()
    println("findCanopies: POINTS SIZE " + r.size + ", POINTS VALUES " + r)

    while (r.nonEmpty) {
      // Choose a random remaining point as the next canopy center
      val shuffled = Random.shuffle(r)
      val canopy = shuffled.head
      println("INSIDE WHILE: New canopy => " + canopy)
      if (canopies.nonEmpty) {
        canopies.foreach(x => canopies(x._1).remove(canopy._2))
      }
      canopies += (canopy._1 -> mutable.Set[Int]())

      r = r.filter(x => x != canopy)

      for (point <- r) {
        val distance = EuclideanVectorSpace.distance(point._1, canopy._1)
        println("INNER LOOP: POINT => " + point._1 + " Canopy => " + canopy._1 +
          " distance => " + distance)
        if (distance <= getT1) {
          canopies(canopy._1).add(point._2)
        }
        if (distance < getT2) {
          // Points within T2 are removed and cannot seed new canopies
          r = r.filter(x => x != point)
          println("Point removed => " + point)
        }
      }
    }
    // Add each canopy center's own hash to its member set.
    canopies.foreach(x => canopies(x._1).add(x._1.hashCode()))
    println("Reached end of findCanopies. Canopies => " + canopies)
    canopies
  }

  /**
   * Sample data points randomly to pick k initial cluster centers.
   * @param data Input points as (Vector, hash code) pairs
   * @return An Array of (Vector, hashcode()) as k sampled points
   */
  private def initRandom(data: RDD[(Vector, Int)]): Array[(Vector, Int)] = {
    val random = new XORShiftRandom(this.seed)
    data.takeSample(false, k, random.nextInt())
  }

  /**
   * For a given point "x" and a set of cluster centers, find a cluster center such that
   * the center and the point are both within some canopy. If no such co-occurrence can
   * be found, return -1.
   * @param canopies Map from each canopy center to the set of member hash codes
   * @param centers Current cluster centers as (Vector, hash code) pairs
   * @param x The point, as a (Vector, hash code) pair
   * @return Index into centers of a co-canopied center, or -1 if none exists
   */
  private def isWithinCanopy(canopies: mutable.Map[Vector, mutable.Set[Int]],
                             centers: Array[(Vector, Int)],
                             x: (Vector, Int)): Int = {
    var index = 0
    breakable {
      for (center <- centers) {
        for ((k, v) <- canopies) {
          if (v.contains(center._2) && v.contains(x._2)) {
            break
          }
        }
        index += 1
      }
    }
    if (index == centers.length) {
      index = -1
    }
    println("isWithinCanopy: canopies -> " + canopies + ", centers -> " + centers.mkString(", ") +
      ", x -> " + x + ", index -> " + index)
    index
  }
}

/**
 * User-callable methods for running the Canopy k-means package
 */

object CanopyKMeans {

  /**
   * Builds a canopy clustering model with all parameters specified by the user
   *
   * @param sc Spark Context
   * @param input Location of file(s) with input data points
   * @param k Number of clusters
   * @param maxIterations Maximum number of iterations
   * @param epsilon Distance threshold to determine convergence
   * @param seed Seed value for randomly picking k initial centers
   * @param t1 Distance from canopy center beyond which points can belong to other canopies
   * @param t2 Distance from canopy center within which all points belong to the same canopy
   * @return org.apache.spark.mllib.clustering.KMeansModel containing the fitted model
   */
  def train(
      sc: SparkContext,
      input: String,
      k: Int,
      maxIterations: Int,
      epsilon: Double,
      seed: Int,
      t1: Double,
      t2: Double): KMeansModel = {
    new CanopyKMeans()
      .setK(k)
      .setMaxIterations(maxIterations)
      .setEpsilon(epsilon)
      .setSeed(seed)
      .setT1(t1)
      .setT2(t2)
      .runAlgorithm(sc, input)
  }

  /**
   * Builds a canopy clustering model with a mix of default parameters and parameters
   * specified by the user
   *
   * @param sc Spark Context
   * @param input Location of file(s) with input data points
   * @param k Number of clusters
   * @param t1 Distance from canopy center beyond which points can belong to other canopies
   * @param t2 Distance from canopy center within which all points belong to the same canopy
   * @return org.apache.spark.mllib.clustering.KMeansModel containing the fitted model
   */
  def train(
      sc: SparkContext,
      input: String,
      k: Int,
      t1: Double,
      t2: Double): KMeansModel = {
    if (t1 <= t2) {
      println("Parameter T1 (" + t1 + ") must be larger than T2 (" + t2 + "). Run aborted.")
      sc.stop()
      sys.exit()
    }
    train(sc, input, k, 4, 1.e-04, new scala.util.Random().nextInt(), t1, t2)
  }
}
--------------------------------------------------------------------------------
/src/main/scala/ml/dolphin/personas/canopy/EuclideanVectorSpace.scala:
--------------------------------------------------------------------------------

//package org.apache.spark.mllib.linalg.canopy

package ml.dolphin.personas.canopy

/**
 * Euclidean vector space extended from VectorSpace.
 * @note Methods have no side effects
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 */

import breeze.numerics.sqrt
import org.apache.spark.mllib.linalg.{Vector, Vectors}

import scala.math.pow

object EuclideanVectorSpace extends VectorSpace[Vector] {

  /**
   * Euclidean distance between two vectors x and y
   *
   * @param x Input Vector x
   * @param y Input Vector y
   * @return Double
   */
  override def distance(x: Vector, y: Vector): Double = {
    val dSquared = x.toArray.zip(y.toArray).foldLeft(0.0)(
      (r, c) => r + pow(c._1 - c._2, 2)
    )
    sqrt(dSquared)
  }

  /**
   * Centroid of a finite set of points represented as a sequence of Vectors
   *
   * @param points Input set of points
   * @return Vector with the centroid
   */
  override def centroid(points: Seq[Vector]): Vector = {
    val numCols = points(0).size
    val center = points.foldLeft(new Array[Double](numCols))(
      (r, c) => r.zip(c.toArray).map(t => t._1 + t._2)
    )
    Vectors.dense(center.map(_ / points.size))
  }

  /**
   * Cosine similarity between two Vectors x and y
   *
   * @param x Input Vector x
   * @param y Input Vector y
   * @return Double
   */
  override def cosine(x: Vector, y: Vector): Double = {
    val normX = sqrt(x.toArray.foldLeft(0.0)(
      (r, c) => r + c * c
    ))
    val normY = sqrt(y.toArray.foldLeft(0.0)(
      (r, c) => r + c * c
    ))
    val inner = x.toArray.zip(y.toArray).foldLeft(0.0)(
      (r, c) => r + c._1 * c._2
    )
    inner / (normX * normY)
  }

  /**
   * Finds the closest point and shortest distance between a given array of points x and a
   * given point y. Uses brute-force pairwise L2-distance calculation.
   *
   * @todo Use a better algorithm, such as the triangle inequality, to find the shortest distance
   * @param x Given Array of points, e.g. centroids in k-means clustering
   * @param y Given point from which the distance needs to be calculated
   * @return (index in x, distance) of the closest point to y
   */
  override def closest(x: Array[Vector], y: Vector): (Int, Double) = {
    var shortestDistance = Double.PositiveInfinity
    var closestIndex = 0
    var index = 0
    x.foreach(center => {
      val thisDistance = distance(center, y)
      if (thisDistance < shortestDistance) {
        shortestDistance = thisDistance
        closestIndex = index
      }
      index += 1
    })
    (closestIndex, shortestDistance)
  }

  /**
   * Converts an Array[(Vector, hashcode)] of centers or points to an array of Vectors only.
   * Mostly used as a precursor to closest() and other operations.
   * @param x Array of (Vector, Int)
   * @return Array[Vector]
   */
  def toVector(x: Array[(Vector, Int)]): Array[Vector] = {
    x.map(_._1)
  }

}
--------------------------------------------------------------------------------
/src/main/scala/ml/dolphin/personas/canopy/PersonaCommon.scala:
--------------------------------------------------------------------------------
package ml.dolphin.personas.canopy

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

import scala.io.Source

/**
 * Common methods for canopy clustering, mostly input manipulations. Still to be developed.
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 */

object PersonaCommon {

  /**
   * Returns an Attributes object given an input file containing attribute metadata in CSV format.
   *
   * @param input Input file with attribute metadata
   * @return An Attributes object holding information about all attributes and their
   *         processing information
   */
  def readCsvMetadata(input: String): Attributes = {

    /*
     * Each line of the input CSV file with attribute metadata has the following fields:
     *   name, flag, flatten
     * where:
     *   name: String = name of the attribute
     *   flag: Boolean == 1 => the attribute will be used for clustering
     *   flatten: Boolean == 1 => the attribute is categorical and will be flattened.
     */
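    // For example, a metadata file might contain (hypothetical values):
    //   age,1,0
    //   city,1,1
    //   account_id,0,0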

    val schema = Source.fromFile(input).getLines()
    // List of attributes in the same order they appear in the data
    val aList = schema.map(s => {
      val elem = s.split(',')
      (elem(0).trim, elem(1).toInt, elem(2).toInt)
    }).zipWithIndex.toList
    // Map of attributes: name -> (flag, flatten, column index)
    val aMap = aList.map(s => {
      s._1._1 -> (s._1._2, s._1._3, s._2)
    }).toMap
    new Attributes(aList, aMap)
  }

  /**
   * Reads data points in Hive format. Each row is converted into a Vector along with its hashcode.
   * @param sc Spark Context
   * @param input Location of the tab-delimited input file(s)
   * @return RDD of (Vector, hash code) pairs
   */
  def readHivePoints(sc: SparkContext, input: String): RDD[(Vector, Int)] = {
    val data = sc.textFile(input)
    val rows = data.map(s => {
      val buffer = s.split('\t').toBuffer
      val features = Vectors.dense(buffer.map(_.toDouble).toArray)
      (features, features.hashCode())
    })
    rows
  }


  /* Flattening is a procedure by which each distinct value of a categorical attribute is
   * converted into an additional Boolean attribute. For example,
   * city = ["new york", "london", "delhi", "tokyo"] are distinct values of attribute "city".
   * Flattened Boolean attributes created: city_new_york, city_london, city_delhi, city_tokyo
   *
   * The attributes are named by concatenating the parent attribute with the attribute value,
   * with "_" in between. Any blank space(s) in the attribute value will be converted into
   * "_" as shown in the above example.
   */
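
  // Illustrative sketch of the naming convention described above (an assumption, not yet
  // used elsewhere in the project): blanks in the value become "_" and the parent
  // attribute name is prefixed.
  def flattenedName(attribute: String, value: String): String =
    attribute + "_" + value.trim.replaceAll("\\s+", "_")
  // e.g. flattenedName("city", "new york") == "city_new_york"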


  /**
   * Rounds a non-negative Double to the nearest integer bucket, e.g. 2.4 -> 2 and 2.5 -> 3.
   * @param num Non-negative value to bucket
   * @return The nearest Int
   */
  def toIntegerBucket(num: Double): Int = {
    if (num < 0.0) {
      println("ERROR: ml.dolphin.personas.PersonaCommon: toIntegerBucket(..) cannot handle negative values")
      sys.exit()
    }
    val leftover = num - num.floor
    if (leftover < 0.5)
      num.floor.toInt
    else
      num.ceil.toInt
  }


}
--------------------------------------------------------------------------------
/src/main/scala/ml/dolphin/personas/canopy/VectorSpace.scala:
--------------------------------------------------------------------------------
package ml.dolphin.personas.canopy

/**
 * Common algebraic operations in a vector space. Define the functions for a class
 * mixed in with this trait as appropriate for the specific type of vector space.
 * @example EuclideanVectorSpace, which extends this trait.
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 */
trait VectorSpace[A] {

  // Distance between two points x and y
  def distance(x: A, y: A): Double

  // Cosine similarity measure between two points x and y
  def cosine(x: A, y: A): Double

  // Centroid of a set of points
  def centroid(points: Seq[A]): A

  // Index and distance of the point in Array x that is closest to a given point y
  def closest(x: Array[A], y: A): (Int, Double)
}
--------------------------------------------------------------------------------
/src/main/scala/ml/dolphin/personas/canopy/XORShiftRandom.scala:
--------------------------------------------------------------------------------
package ml.dolphin.personas.canopy

//package org.apache.spark.mllib.linalg.canopy

import java.util.Random

/**
 * XORShift random number generator extended from Java's Random class.
 * https://en.wikipedia.org/wiki/Xorshift
 *
 * @note This method is NOT thread-safe. For safe parallel execution, a parallel pseudo-random
 *       generator such as SPRNG (http://www.sprng.org) should be used to generate the seeds
 *       across the different threads.
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 */

class XORShiftRandom(private var seed: Int) extends Random {

  // Default constructor for the class
  def this() = this(System.nanoTime().toInt)

  // Override java.util.Random.next to produce the next pseudo-random number.
  // Note: the shift triple (13, 17, 5) is a standard full-period choice for the 32-bit
  // xorshift state used here (triples such as (21, 35, 4) apply to 64-bit state), and
  // the state must never be zero. Returning the high-order bits mirrors
  // java.util.Random and remains correct when nBits == 32.
  override def next(nBits: Int): Int = {
    if (seed == 0) seed = 1 // xorshift gets stuck at zero
    var x = seed
    x ^= (x << 13)
    x ^= (x >>> 17)
    x ^= (x << 5)
    seed = x
    x >>> (32 - nBits)
  }

}
--------------------------------------------------------------------------------
/src/main/scala/ml/dolphin/personas/examples/ExampleCanopyKMeans.scala:
--------------------------------------------------------------------------------
import org.apache.spark.mllib.linalg.canopy._
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example driver code for using CanopyKMeans
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 */

object ExampleCanopyKMeans {

  def main(args: Array[String]): Unit = {
    println("Starting Application....")
    val conf = new SparkConf()
      .setAppName("Example usage of Canopy Clustering")
      .set("spark.akka.frameSize", "10")
      .set("spark.akka.threads", "4")
      .set("spark.akka.timeout", "1000")
      .set("spark.akka.heartbeat.pauses", "6000")
      .set("spark.akka.failure-detector.threshold", "3000")
      .set("spark.akka.heartbeat.interval", "1000")
      .set("spark.eventLog.enabled", "true")
    //.set("spark.storage.memoryFraction", "") // RDD cache limit as a fraction of JVM heap (60% default)
    //.set("spark.shuffle.memoryFraction", "") // limit on shuffle-related buffers (20% default)

    val sc = new SparkContext(conf)
    //val model = CanopyKMeans.train(sc, "/Users/r551839/canopy/points.csv", 2, 30.0, 20.0)
    //val model = CanopyKMeans.train(sc, "/Users/r551839/canopy/example2.csv", 3, 7.0, 3.0)
    val model = CanopyKMeans.train(sc, "/Users/r551839/canopy/wine_attribs_only.tsv", 3, 50.0, 1.0)
    model.clusterCenters.foreach(println)
  }
}
--------------------------------------------------------------------------------
/target/TARGET_README:
--------------------------------------------------------------------------------
The executable jar files are created here.
--------------------------------------------------------------------------------