├── .gitignore
├── LICENSE
├── README.md
├── ensemble-clustering-spark
├── docstyle.css
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ └── com
│ │ └── oculusinfo
│ │ └── ml
│ │ └── spark
│ │ ├── CsvParser.java
│ │ ├── Field.java
│ │ ├── SparkDataSet.java
│ │ ├── SparkInstanceParser.java
│ │ ├── SparkInstanceParserHelper.java
│ │ └── unsupervised
│ │ └── cluster
│ │ ├── SparkClusterResult.java
│ │ ├── dpmeans
│ │ └── DPMeansClusterer.java
│ │ ├── functions
│ │ ├── AggregateClusterFunction.java
│ │ ├── AggregateClustersFunction.java
│ │ ├── BestClusterFunction.java
│ │ ├── ComputeCentroidFunction.java
│ │ ├── DistanceFunction.java
│ │ ├── FindBestClusterFunction.java
│ │ ├── IncrementalClusterFunction.java
│ │ └── InstanceToClusterFunction.java
│ │ ├── kmeans
│ │ └── KMeansClusterer.java
│ │ └── threshold
│ │ └── ThresholdClusterer.java
│ └── test
│ └── java
│ └── com
│ └── oculusinfo
│ └── ml
│ └── spark
│ └── unsupervised
│ ├── InstanceParser.java
│ ├── TestDPMeans.java
│ ├── TestKMeans.java
│ └── TestThresholdClusterer.java
├── ensemble-clustering
├── docstyle.css
├── pom.xml
└── src
│ ├── main
│ └── java
│ │ └── com
│ │ └── oculusinfo
│ │ ├── geometry
│ │ ├── SphereUtilities.java
│ │ ├── cartesian
│ │ │ ├── CubicBSpline.java
│ │ │ ├── QuarticSpline.java
│ │ │ └── Spline.java
│ │ └── geodesic
│ │ │ ├── Position.java
│ │ │ ├── PositionCalculationParameters.java
│ │ │ ├── PositionCalculationType.java
│ │ │ ├── Track.java
│ │ │ ├── TrackPlotter.java
│ │ │ ├── WrappingRectangle.java
│ │ │ └── tracks
│ │ │ ├── Cartesian2DTrack.java
│ │ │ ├── Cartesian3DTrack.java
│ │ │ ├── GeodeticTrack.java
│ │ │ └── SphericalTrack.java
│ │ ├── math
│ │ ├── algebra
│ │ │ └── AngleUtilities.java
│ │ ├── linearalgebra
│ │ │ ├── ListUtilities.java
│ │ │ ├── TriDiagonalMatrix.java
│ │ │ └── Vector.java
│ │ └── statistics
│ │ │ └── StatTracker.java
│ │ └── ml
│ │ ├── DataSet.java
│ │ ├── Instance.java
│ │ ├── InstanceJsonMapper.java
│ │ ├── centroid
│ │ └── Centroid.java
│ │ ├── distance
│ │ └── DistanceFunction.java
│ │ ├── feature
│ │ ├── Feature.java
│ │ ├── FeatureTable.java
│ │ ├── bagofwords
│ │ │ ├── BagOfWordsFeature.java
│ │ │ ├── centroid
│ │ │ │ └── BagOfWordsCentroid.java
│ │ │ └── distance
│ │ │ │ ├── CosineDistance.java
│ │ │ │ ├── EditDistance.java
│ │ │ │ └── ExactTokenMatchDistance.java
│ │ ├── numeric
│ │ │ ├── NumericVectorFeature.java
│ │ │ ├── centroid
│ │ │ │ └── MeanNumericVectorCentroid.java
│ │ │ └── distance
│ │ │ │ └── EuclideanDistance.java
│ │ ├── semantic
│ │ │ ├── SemanticFeature.java
│ │ │ ├── centroid
│ │ │ │ └── SemanticCentroid.java
│ │ │ └── distance
│ │ │ │ ├── Concept.java
│ │ │ │ ├── SemMFDistance.java
│ │ │ │ └── WuPalmerDistance.java
│ │ ├── spatial
│ │ │ ├── GeoSpatialFeature.java
│ │ │ ├── TrackFeature.java
│ │ │ ├── centroid
│ │ │ │ ├── FastGeoSpatialCentroid.java
│ │ │ │ ├── GeoSpatialCentroid.java
│ │ │ │ └── TrackCentroid.java
│ │ │ └── distance
│ │ │ │ ├── EquitangularDistance.java
│ │ │ │ ├── EuclideanDistance.java
│ │ │ │ ├── HaversineDistance.java
│ │ │ │ ├── SphericalCosineDistance.java
│ │ │ │ └── TrackDistance.java
│ │ ├── string
│ │ │ ├── StringFeature.java
│ │ │ ├── centroid
│ │ │ │ └── StringMedianCentroid.java
│ │ │ └── distance
│ │ │ │ ├── EditDistance.java
│ │ │ │ └── ExactTokenMatchDistance.java
│ │ └── temporal
│ │ │ ├── TemporalFeature.java
│ │ │ ├── centroid
│ │ │ ├── TemporalCentroid.java
│ │ │ └── TemporalMinMaxCentroid.java
│ │ │ └── distance
│ │ │ └── TemporalDistance.java
│ │ ├── search
│ │ ├── ObjectiveFunction.java
│ │ ├── SearchException.java
│ │ ├── Solution.java
│ │ └── stochastic
│ │ │ └── SimulatedAnnealing.java
│ │ ├── stats
│ │ ├── FeatureFreqComparator.java
│ │ ├── FeatureFrequency.java
│ │ ├── FeatureFrequencyTable.java
│ │ └── TrackClusterWrapper.java
│ │ ├── unsupervised
│ │ └── cluster
│ │ │ ├── AbstractClusterer.java
│ │ │ ├── BaseClusterer.java
│ │ │ ├── Cluster.java
│ │ │ ├── ClusterFactory.java
│ │ │ ├── ClusterJsonMapper.java
│ │ │ ├── ClusterResult.java
│ │ │ ├── Clusterer.java
│ │ │ ├── FeatureTypeDefinition.java
│ │ │ ├── InMemoryClusterResult.java
│ │ │ ├── dpmeans
│ │ │ └── DPMeans.java
│ │ │ ├── kmeans
│ │ │ └── KMeans.java
│ │ │ └── threshold
│ │ │ └── ThresholdClusterer.java
│ │ ├── utils
│ │ └── StringTools.java
│ │ └── validation
│ │ └── unsupervised
│ │ ├── external
│ │ ├── BCubed.java
│ │ ├── BCubedHierarchical.java
│ │ ├── Hierarchical.java
│ │ └── NormMutualInformation.java
│ │ └── internal
│ │ ├── Cohesion.java
│ │ └── Separation.java
│ └── test
│ └── java
│ ├── TestGeoSpatialCentroid.java
│ └── com
│ └── oculusinfo
│ ├── geometry
│ ├── SphereUtilityTests.java
│ ├── cartesian
│ │ ├── ProgramaticSplineTest.java
│ │ └── VisualSplineTest.java
│ └── geodesic
│ │ ├── PositionTests.java
│ │ ├── TrackSimplificationTests.java
│ │ └── TrackTest.java
│ ├── math
│ ├── algebra
│ │ └── TestAngleUtilities.java
│ ├── linearalgebra
│ │ ├── ListUtilitiesTests.java
│ │ ├── TestTriDiagonalMatrix.java
│ │ └── VectorTests.java
│ └── statistics
│ │ └── TestStats.java
│ └── ml
│ ├── TestNormalization.java
│ ├── TestStringTools.java
│ ├── distance
│ ├── TestBagOfWordsEditDistance.java
│ ├── TestCosineDistance.java
│ ├── TestEquitangularDistance.java
│ ├── TestEuclideanDistance.java
│ ├── TestExactStringMatchDistance.java
│ ├── TestExactTokenMatchDistance.java
│ ├── TestHaversineDistance.java
│ ├── TestSphericalCosineDistance.java
│ ├── TestStringEditDistance.java
│ └── TestTemporalDistance.java
│ ├── search
│ └── TestAnnealer.java
│ ├── tracks
│ ├── TestFrame.java
│ ├── TestTrackCluster.java
│ ├── TrackCentroidTests.java
│ └── TrackClusteringTests.java
│ └── unsupervised
│ ├── TestBagOfWordsClustering.java
│ ├── TestDPMeans.java
│ ├── TestGeoClusteringWithDPMeans.java
│ ├── TestKMeans.java
│ ├── TestNameLocationClustering.java
│ ├── TestStringClustering.java
│ ├── TestStringClusteringWithDPMeans.java
│ ├── TestStringClusteringWithMissingFeatures.java
│ └── TestThresholdClusterer.java
└── pom.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.class
3 | *.prefs
4 | bin
5 | .classpath
6 | .settings
7 | .project
8 | target
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2013 Oculus Info Inc.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 |
--------------------------------------------------------------------------------
/ensemble-clustering-spark/pom.xml:
--------------------------------------------------------------------------------
1 |
2 | 4.0.0
3 |
4 | Spark Ensemble Clustering Library
5 | Spark Ensemble Clustering Library
6 | ensemble-clustering-spark
7 | jar
8 |
9 |
10 | ml
11 | com.oculusinfo
12 | 0.1.0-SNAPSHOT
13 |
14 |
15 |
16 |
17 | cloudera-releases
18 | https://repository.cloudera.com/artifactory/cloudera-repos
19 |
20 |
21 |
22 |
23 |
24 | com.oculusinfo
25 | ensemble-clustering
26 | 0.1.0-SNAPSHOT
27 |
28 |
29 |
30 | org.apache.spark
31 | spark-core_2.10
32 | 1.0.0
33 |
34 |
35 |
36 | org.apache.hadoop
37 | hadoop-client
38 | 2.0.0-mr1-cdh4.6.0
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 | org.apache.maven.plugins
47 | maven-javadoc-plugin
48 | 2.9
49 |
50 | Spark Ensemble Clustering Library API
51 | Spark Ensemble Clustering Library
52 | ${basedir}/docstyle.css
53 |
54 |
55 |
56 | package-javadoc
57 |
58 | jar
59 |
60 |
61 |
62 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/ensemble-clustering-spark/src/main/java/com/oculusinfo/ml/spark/CsvParser.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2013 Oculus Info Inc.
3 | * http://www.oculusinfo.com/
4 | *
5 | * Released under the MIT License.
6 | *
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy of
8 | * this software and associated documentation files (the "Software"), to deal in
9 | * the Software without restriction, including without limitation the rights to
10 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
11 | * of the Software, and to permit persons to whom the Software is furnished to do
12 | * so, subject to the following conditions:
13 | *
14 | * The above copyright notice and this permission notice shall be included in all
15 | * copies or substantial portions of the Software.
16 | *
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | * SOFTWARE.
24 | */
25 | package com.oculusinfo.ml.spark;
26 | import java.util.ArrayList;
27 | import java.util.List;
28 |
29 | public class CsvParser {
30 | public static List fsmParse(String input) {
31 | ArrayList result = new ArrayList();
32 | int startChar = 0;
33 | int endChar = 0;
34 | boolean inString = false;
35 | while(endChar instances;
46 |
47 | /***
48 | * The constructor must be passed a Spark Context used to communicate with the Spark installation.
49 | *
50 | * @param sc - the Spark context
51 | */
52 | public SparkDataSet(JavaSparkContext sc) {
53 | this.sc = sc; // kept so the load(path, ...) overloads can call sc.textFile
54 | }
55 |
56 | /***
57 | * The SparkDataSet loads data using this method to populate the DataSet with Instances.
58 | *
59 | * @param path the location of the data to load (filesystem or HDFS path)
60 | * @param parser is the object that converts each line in the data into an Instance
61 | * @param minSplits the minimum number of Spark partitions to split the data into
62 | */
63 | public void load(String path, SparkInstanceParser parser, int minSplits) {
64 | try {
65 | // Read the text at path as lines (minSplits partitions requested) and map each line through the parser.
66 | this.instances = this.sc.textFile(path, minSplits).mapToPair(parser);
67 | } catch (Exception failure) {
68 | failure.printStackTrace();
69 | }
70 | }
71 |
72 | /***
73 | * The SparkDataSet loads data using this method to populate the DataSet with Instances.
74 | *
75 | * @param path the location of the data to load (filesystem or HDFS path)
76 | * @param parser is the object that converts each line in the data into an Instance
77 | */
78 | public void load(String path, SparkInstanceParser parser) {
79 | try {
80 | // Read the text at path as lines and map each line through the parser.
81 | this.instances = this.sc.textFile(path).mapToPair(parser);
82 | } catch (Exception failure) {
83 | failure.printStackTrace();
84 | }
85 | }
86 |
87 | /***
88 | * The SparkDataSet can be directly provided a pre-defined RDD of Instances
89 | *
90 | * @param rdd the RDD of Instances
91 | */
92 | public void load(JavaPairRDD rdd) {
93 | instances = rdd; // adopt the caller's RDD as-is; nothing is read or parsed here
94 | }
95 |
96 | /***
97 | * Return the underlying Spark RDD containing the DataSet Instances
98 | * @return the RDD
99 | */
100 | public JavaPairRDD getRDD() {
101 | return instances; // whatever the most recent load(...) call stored
102 | }
103 |
104 | /***
105 | * Return the Spark context this DataSet is bound to
106 | *
107 | * @return the JavaSparkContext
108 | */
109 | public JavaSparkContext getContext() {
110 | return sc; // the context passed to the constructor
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
/ensemble-clustering-spark/src/main/java/com/oculusinfo/ml/spark/SparkInstanceParser.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2013 Oculus Info Inc.
3 | * http://www.oculusinfo.com/
4 | *
5 | * Released under the MIT License.
6 | *
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy of
8 | * this software and associated documentation files (the "Software"), to deal in
9 | * the Software without restriction, including without limitation the rights to
10 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
11 | * of the Software, and to permit persons to whom the Software is furnished to do
12 | * so, subject to the following conditions:
13 |
14 | * The above copyright notice and this permission notice shall be included in all
15 | * copies or substantial portions of the Software.
16 |
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | * SOFTWARE.
24 | */
25 | package com.oculusinfo.ml.spark;
26 |
27 | import scala.Tuple2;
28 | import org.apache.spark.api.java.function.PairFunction;
29 |
30 | import com.oculusinfo.ml.Instance;
31 |
32 | /***
33 | * Base class for parsers that convert one line of data into an Instance that is added to a
34 | * SparkDataSet. Implements Spark's PairFunction so a parser can be handed straight to mapToPair.
35 | *
36 | * @author slangevin
37 | *
38 | */
39 | public abstract class SparkInstanceParser implements PairFunction {
40 | private static final long serialVersionUID = -8686959633799632078L;
41 | // Parses a single input line; presumably the Tuple2 pairs a key with the parsed Instance - TODO confirm.
42 | @Override
43 | public abstract Tuple2 call(String arg0) throws Exception;
44 | }
45 |
--------------------------------------------------------------------------------
/ensemble-clustering-spark/src/main/java/com/oculusinfo/ml/spark/unsupervised/cluster/SparkClusterResult.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2013 Oculus Info Inc.
3 | * http://www.oculusinfo.com/
4 | *
5 | * Released under the MIT License.
6 | *
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy of
8 | * this software and associated documentation files (the "Software"), to deal in
9 | * the Software without restriction, including without limitation the rights to
10 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
11 | * of the Software, and to permit persons to whom the Software is furnished to do
12 | * so, subject to the following conditions:
13 |
14 | * The above copyright notice and this permission notice shall be included in all
15 | * copies or substantial portions of the Software.
16 |
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | * SOFTWARE.
24 | */
25 | package com.oculusinfo.ml.spark.unsupervised.cluster;
26 |
27 | import java.util.Iterator;
28 |
29 | import org.apache.spark.api.java.JavaPairRDD;
30 |
31 | import com.oculusinfo.ml.Instance;
32 | import com.oculusinfo.ml.unsupervised.cluster.Cluster;
33 | import com.oculusinfo.ml.unsupervised.cluster.ClusterResult;
34 |
35 | /***
36 | * This class represents a cluster result that stores cluster membership in a Spark RDD.
37 | * The RDD is read through getRDD(); iterator(), isEmpty() and size() are not supported
38 | * and always throw UnsupportedOperationException (a RuntimeException subclass).
39 | * @author slangevin
40 | */
41 | public class SparkClusterResult implements ClusterResult {
42 | private static final long serialVersionUID = -1586537333107747750L;
43 | /** The RDD handed to the constructor; stored and returned as-is, never copied. */
44 | JavaPairRDD rdd;
45 | /** @param clusters the RDD to wrap (no validation is performed, so null is stored as-is) */
46 | public SparkClusterResult(JavaPairRDD clusters) {
47 | rdd = clusters;
48 | }
49 | /** @return the RDD supplied at construction time */
50 | public JavaPairRDD getRDD() {
51 | return rdd;
52 | }
53 | /** @throws UnsupportedOperationException always */
54 | @Override
55 | public Iterator iterator() {
56 | throw new UnsupportedOperationException("Iterator is not supported for SparkClusterResult");
57 | }
58 | /** @throws UnsupportedOperationException always */
59 | @Override
60 | public boolean isEmpty() {
61 | throw new UnsupportedOperationException("isEmpty is not supported for SparkClusterResult");
62 | }
63 | /** @throws UnsupportedOperationException always */
64 | @Override
65 | public int size() {
66 | throw new UnsupportedOperationException("size is not supported for SparkClusterResult");
67 | }
68 |
69 | }
70 |
--------------------------------------------------------------------------------
/ensemble-clustering-spark/src/main/java/com/oculusinfo/ml/spark/unsupervised/cluster/functions/AggregateClusterFunction.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2013 Oculus Info Inc.
3 | * http://www.oculusinfo.com/
4 | *
5 | * Released under the MIT License.
6 | *
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy of
8 | * this software and associated documentation files (the "Software"), to deal in
9 | * the Software without restriction, including without limitation the rights to
10 | * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
11 | * of the Software, and to permit persons to whom the Software is furnished to do
12 | * so, subject to the following conditions:
13 |
14 | * The above copyright notice and this permission notice shall be included in all
15 | * copies or substantial portions of the Software.
16 |
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | * SOFTWARE.
24 | */
25 | package com.oculusinfo.ml.spark.unsupervised.cluster.functions;
26 |
27 | import java.util.Map;
28 |
29 | import scala.Tuple2;
30 | import org.apache.spark.api.java.function.Function2;
31 |
32 | import com.oculusinfo.ml.Instance;
33 | import com.oculusinfo.ml.unsupervised.cluster.Cluster;
34 |
35 | public class AggregateClusterFunction implements Function2