├── .gitignore ├── main ├── HDBSCAN-CPP ├── Runner │ ├── hdbscanParameters.cpp │ ├── hdbscanRunner.hpp │ ├── hdbscanResult.cpp │ ├── hdbscanResult.hpp │ ├── hdbscanParameters.hpp │ └── hdbscanRunner.cpp ├── Distance │ ├── IDistanceCalculator.cpp │ ├── ManhattanDistance.hpp │ ├── EuclideanDistance.hpp │ ├── ManhattanDistance.cpp │ ├── EuclideanDistance.cpp │ └── IDistanceCalculator.hpp ├── run.sh ├── README.md ├── Utils │ ├── bitSet.hpp │ └── bitSet.cpp ├── HdbscanStar │ ├── hdbscanConstraint.cpp │ ├── outlierScore.cpp │ ├── hdbscanConstraint.hpp │ ├── outlierScore.hpp │ ├── cluster.hpp │ ├── undirectedGraph.hpp │ ├── undirectedGraph.cpp │ ├── cluster.cpp │ ├── hdbscanAlgorithm.hpp │ └── hdbscanAlgorithm.cpp ├── Hdbscan │ ├── hdbscan.hpp │ └── hdbscan.cpp ├── .vscode │ ├── launch.json │ └── tasks.json ├── .gitattributes └── .gitignore ├── Makefile ├── .travis.yml ├── HDBSCAN-FourProminentClusterExample └── FourProminentClusterExample.cpp ├── LICENSE.md ├── README.md └── HDBSCANDataset └── FourProminentClusterDataset.csv /.gitignore: -------------------------------------------------------------------------------- 1 | HDBSCANUnitTest/Debug 2 | HDBSCAN-CPP/Debug -------------------------------------------------------------------------------- /main: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rohanmohapatra/hdbscan-cpp/HEAD/main -------------------------------------------------------------------------------- /HDBSCAN-CPP/Runner/hdbscanParameters.cpp: -------------------------------------------------------------------------------- 1 | #include "hdbscanParameters.hpp" 2 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/Distance/IDistanceCalculator.cpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include"IDistanceCalculator.hpp" 
-------------------------------------------------------------------------------- /HDBSCAN-CPP/run.sh: -------------------------------------------------------------------------------- 1 | g++ -c -g -O3 */*.cpp 2 | g++ -g -O3 main.cpp *.o 3 | ./a.out 4 | rm *.o 5 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/README.md: -------------------------------------------------------------------------------- 1 | # hdbscan-cpp 2 | Fast and Efficient Implementation of HDBSCAN in C++ using STL 3 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/Runner/hdbscanRunner.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include"hdbscanResult.hpp" 3 | #include"hdbscanParameters.hpp" 4 | class hdbscanRunner 5 | { 6 | public: 7 | static hdbscanResult run(hdbscanParameters parameters); 8 | }; 9 | 10 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/Utils/bitSet.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | class bitSet 4 | { 5 | private: 6 | std::vector _bits; 7 | public: 8 | bool get(int pos); 9 | 10 | void set(int pos); 11 | 12 | void ensure(int pos); 13 | }; 14 | 15 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SOURCES=$(shell find . 
-name "*.cpp") 2 | CXXFLAGS= -std=c++11 -Wall 3 | OBJECTS=$(SOURCES:%.cpp=%.o) 4 | TARGET=main 5 | 6 | .PHONY: all 7 | all: $(TARGET) 8 | 9 | $(TARGET): $(OBJECTS) 10 | $(LINK.cpp) $^ -std=c++11 $(LOADLIBES) $(LDLIBS) -o $@ 11 | 12 | .PHONY: clean 13 | clean: 14 | rm -f $(OBJECTS) 15 | 16 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/Utils/bitSet.cpp: -------------------------------------------------------------------------------- 1 | #include "bitSet.hpp" 2 | 3 | 4 | bool bitSet::get(int pos) { 5 | return pos < _bits.size() && _bits[pos]; 6 | } 7 | 8 | void bitSet::set(int pos) { 9 | ensure(pos); 10 | _bits[pos] = true; 11 | } 12 | 13 | void bitSet::ensure(int pos) { 14 | if (pos >= _bits.size()) 15 | { 16 | _bits.resize(pos + 64); 17 | } 18 | } -------------------------------------------------------------------------------- /HDBSCAN-CPP/Distance/ManhattanDistance.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include"IDistanceCalculator.hpp" 3 | /// 4 | /// Computes the manhattan distance between two points, d = |x1-y1| + |x2-y2| + ... + |xn-yn|. 5 | /// 6 | class ManhattanDistance : IDistanceCalculator 7 | { 8 | public: 9 | double computeDistance(std::vector attributesOne, std::vector attributesTwo); 10 | }; 11 | 12 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/Distance/EuclideanDistance.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include"IDistanceCalculator.hpp" 3 | /// 4 | /// Computes the euclidean distance between two points, d = sqrt((x1-y1)^2 + (x2-y2)^2 + ... + (xn-yn)^2). 
5 | /// 6 | class EuclideanDistance : IDistanceCalculator 7 | { 8 | public: 9 | double computeDistance(std::vector attributesOne, std::vector attributesTwo); 10 | 11 | }; 12 | 13 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/Runner/hdbscanResult.cpp: -------------------------------------------------------------------------------- 1 | #include "hdbscanResult.hpp" 2 | 3 | hdbscanResult::hdbscanResult() { 4 | ; 5 | } 6 | hdbscanResult::hdbscanResult(vector pLables, vector pOutlierScores, vector pmembershipProbabilities, bool pHsInfiniteStability) { 7 | labels = pLables; 8 | outliersScores = pOutlierScores; 9 | membershipProbabilities = pmembershipProbabilities; 10 | hasInfiniteStability = pHsInfiniteStability; 11 | } -------------------------------------------------------------------------------- /HDBSCAN-CPP/Distance/ManhattanDistance.cpp: -------------------------------------------------------------------------------- 1 | #include "ManhattanDistance.hpp" 2 | #include 3 | #include 4 | #include 5 | double ManhattanDistance::computeDistance(std::vector attributesOne, std::vector attributesTwo) { 6 | double distance = 0; 7 | for (uint32_t i = 0; i < attributesOne.size() && i < attributesTwo.size(); i++) { 8 | distance += fabs(attributesOne[i] - attributesTwo[i]); 9 | } 10 | 11 | return distance; 12 | } -------------------------------------------------------------------------------- /HDBSCAN-CPP/HdbscanStar/hdbscanConstraint.cpp: -------------------------------------------------------------------------------- 1 | #include "hdbscanConstraint.hpp" 2 | 3 | hdbscanConstraint::hdbscanConstraint(int pointA, int pointB, hdbscanConstraintType type) { 4 | _pointA = pointA; 5 | _pointB = pointB; 6 | _constraintType = type; 7 | } 8 | 9 | int hdbscanConstraint::getPointA() { 10 | return _pointA; 11 | } 12 | 13 | int hdbscanConstraint::getPointB() { 14 | return _pointB; 15 | } 16 | 17 | hdbscanConstraintType 
hdbscanConstraint::getConstraintType() { 18 | return _constraintType; 19 | } -------------------------------------------------------------------------------- /HDBSCAN-CPP/Runner/hdbscanResult.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include"../HdbscanStar/outlierScore.hpp" 4 | using namespace std; 5 | class hdbscanResult 6 | { 7 | public: 8 | vector labels; 9 | vector outliersScores; 10 | vector membershipProbabilities; 11 | bool hasInfiniteStability; 12 | hdbscanResult(); 13 | hdbscanResult(vector pLables, vector pOutlierScores, vector pmembershipProbabilities, bool pHsInfiniteStability); 14 | }; 15 | 16 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/Distance/EuclideanDistance.cpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include"EuclideanDistance.hpp" 3 | #include 4 | #include 5 | #include 6 | double EuclideanDistance::computeDistance(std::vector attributesOne, std::vector attributesTwo) { 7 | double distance = 0; 8 | for (uint32_t i = 0; i < attributesOne.size() && i < attributesTwo.size(); i++) { 9 | distance += ((attributesOne[i] - attributesTwo[i]) * (attributesOne[i] - attributesTwo[i])); 10 | } 11 | 12 | return sqrt(distance); 13 | } -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Enable C++ support 2 | language: cpp 3 | 4 | # Compiler selection 5 | compiler: 6 | - g++ 7 | - gcc 8 | 9 | before_install: 10 | - pip install --user cpp-coveralls 11 | 12 | sudo: required 13 | script: 14 | - sudo unlink /usr/bin/g++ && sudo ln -s /usr/bin/g++-5 /usr/bin/g++ 15 | - sudo unlink /usr/bin/gcc && sudo ln -s /usr/bin/gcc-5 /usr/bin/gcc 16 | - gcc --version 17 | - make all clean 18 | 19 | after_success: 20 | - coveralls --gcov-options '\-lp' 21 | 22 | 
#pragma once
#include <vector>

///
/// An interface for classes which compute the distance between two points (where points are
/// represented as arrays of doubles).
///
class IDistanceCalculator
{
public:
	/// Virtual destructor: implementations may be owned and destroyed through an
	/// IDistanceCalculator pointer, which is undefined behaviour without it.
	virtual ~IDistanceCalculator() = default;

	///
	/// Computes the distance between two points.
	/// Note that larger values indicate that the two points are farther apart.
	///
	/// attributesOne: the attributes of the first point
	/// attributesTwo: the attributes of the second point
	/// Returns a double for the distance between the two points.
	///
	/// The parameters are taken by value so the signature keeps matching the
	/// overrides declared by EuclideanDistance and ManhattanDistance.
	virtual double computeDistance(std::vector<double> attributesOne, std::vector<double> attributesTwo) = 0;
};
16 | /// 17 | /// The outlier score of the point 18 | /// The point's core distance 19 | /// The id (index) of the point 20 | outlierScore(double score, double coreDistance, int id); 21 | outlierScore(); 22 | /// 23 | /// Method Overridden to compare two objects. 24 | /// 25 | bool operator<(const outlierScore& other) const; 26 | 27 | 28 | }; 29 | 30 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/Hdbscan/hdbscan.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include"../Runner/hdbscanRunner.hpp" 5 | #include"../Runner/hdbscanParameters.hpp" 6 | #include"../Runner/hdbscanResult.hpp" 7 | #include"../HdbscanStar/outlierScore.hpp" 8 | 9 | using namespace std; 10 | 11 | 12 | class Hdbscan 13 | 14 | { 15 | 16 | private: 17 | 18 | string fileName; 19 | 20 | hdbscanResult result; 21 | 22 | public: 23 | 24 | vector < vector > dataset; 25 | 26 | std::vector labels_; 27 | 28 | std::vector normalizedLabels_; 29 | 30 | std::vectoroutlierScores_; 31 | 32 | std::vector membershipProbabilities_; 33 | 34 | uint32_t noisyPoints_; 35 | 36 | uint32_t numClusters_; 37 | 38 | 39 | 40 | Hdbscan(string readFileName) { 41 | 42 | fileName = readFileName; 43 | 44 | } 45 | 46 | string getFileName(); 47 | 48 | int loadCsv(int numberOfValues, bool skipHeader=false); 49 | 50 | void execute(int minPoints, int minClusterSize, string distanceMetric); 51 | 52 | void displayResult(); 53 | 54 | 55 | }; 56 | 57 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | 8 | { 9 | "name": "g++ build and debug active file", 10 | "type": "cppdbg", 11 | "request": "launch", 12 | "program": "${workspaceFolder}/a.out", 13 | "args": [], 14 | "stopAtEntry": false, 15 | "cwd": "${workspaceFolder}", 16 | "environment": [], 17 | "externalConsole": false, 18 | "MIMode": "gdb", 19 | "setupCommands": [ 20 | { 21 | "description": "Enable pretty-printing for gdb", 22 | "text": "-enable-pretty-printing", 23 | "ignoreFailures": true 24 | } 25 | ], 26 | "preLaunchTask": "g++ build active file", 27 | "miDebuggerPath": "/usr/bin/gdb" 28 | } 29 | ] 30 | } -------------------------------------------------------------------------------- /HDBSCAN-CPP/Runner/hdbscanParameters.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include"../Distance/IDistanceCalculator.hpp" 4 | #include 5 | #include 6 | #include"../HdbscanStar/hdbscanConstraint.hpp" 7 | 8 | using namespace std; 9 | class hdbscanParameters 10 | { 11 | public: 12 | 13 | /// 14 | /// Parameters to be Passed to the HDBSCAN Algorithm 15 | /// 16 | /// The attributes of the first point 17 | /// The attributes of the second point 18 | /// The attributes of the second point 19 | /// Defines the type of distance measure to use : Euclidean, Manhattan ,.. 
20 | /// Min Points in the cluster 21 | /// The minimum number of points which a cluster needs to be a valid cluster 22 | vector< vector > distances; 23 | vector< vector > dataset; 24 | string distanceFunction; 25 | uint32_t minPoints; 26 | uint32_t minClusterSize; 27 | vector constraints; 28 | }; 29 | 30 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Rohan Mohapatra, Sumedh Basarkod 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
#pragma once
#include <cstddef>
#include <stdexcept>
#include <utility>
#include <vector>

///
/// An undirected, weighted graph stored as three parallel edge arrays
/// (first vertex, second vertex, weight) plus a per-vertex adjacency list.
///
class undirectedGraph
{
private:
	int _numVertices;
	std::vector<int> _verticesA;
	std::vector<int> _verticesB;
	std::vector<double> _edgeWeights;
	std::vector<std::vector<int>> _edges;

public:
	///
	/// Builds the graph and its adjacency lists.
	///
	/// numVertices: number of vertices; valid vertex ids are 0 .. numVertices - 1
	/// verticesA:   first endpoint of every edge
	/// verticesB:   second endpoint of every edge
	/// edgeWeights: weight of every edge; its length defines the number of edges
	///
	/// Throws std::invalid_argument when numVertices is negative or when an
	/// endpoint array is shorter than edgeWeights, and std::out_of_range when an
	/// endpoint is not a valid vertex id. These inputs previously caused
	/// out-of-bounds accesses.
	undirectedGraph(int numVertices, std::vector<int> verticesA, std::vector<int> verticesB, std::vector<double> edgeWeights)
		: _numVertices(numVertices),
		  // The arguments are already private copies; move them instead of copying again.
		  _verticesA(std::move(verticesA)),
		  _verticesB(std::move(verticesB)),
		  _edgeWeights(std::move(edgeWeights))
	{
		if (numVertices < 0)
			throw std::invalid_argument("undirectedGraph: numVertices must not be negative");

		const std::size_t edgeCount = _edgeWeights.size();
		if (_verticesA.size() < edgeCount || _verticesB.size() < edgeCount)
			throw std::invalid_argument("undirectedGraph: every edge weight needs two endpoints");

		_edges.resize(static_cast<std::size_t>(numVertices));
		for (std::size_t i = 0; i < edgeCount; i++)
		{
			const int a = _verticesA[i];
			const int b = _verticesB[i];
			if (a < 0 || a >= numVertices || b < 0 || b >= numVertices)
				throw std::out_of_range("undirectedGraph: edge endpoint is not a valid vertex");

			_edges[a].push_back(b);

			// A self-edge is listed once, not twice, in its vertex's adjacency list.
			if (a != b)
				_edges[b].push_back(a);
		}
	}

	/// Sorts the edges in ascending order of weight.
	void quicksortByEdgeWeight();

	int getNumVertices();

	int getNumEdges();

	int getFirstVertexAtIndex(int index);
	int getSecondVertexAtIndex(int index);

	double getEdgeWeightAtIndex(int index);

	/// Returns the adjacency list of `vertex` by reference.
	std::vector<int> &getEdgeListForVertex(int vertex);

private:
	int selectPivotIndex(int startIndex, int endIndex);

	int partition(int startIndex, int endIndex, int pivotIndex);
	void swapEdges(int indexOne, int indexTwo);
};
"problemMatcher": [ 48 | "$gcc" 49 | ], 50 | "group": { 51 | "kind": "build", 52 | "isDefault": true 53 | } 54 | } 55 | ], 56 | "version": "2.0.0" 57 | } -------------------------------------------------------------------------------- /HDBSCAN-CPP/.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. 
To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/Runner/hdbscanRunner.cpp: -------------------------------------------------------------------------------- 1 | #include "hdbscanRunner.hpp" 2 | #include "hdbscanResult.hpp" 3 | #include "hdbscanParameters.hpp" 4 | #include"../Distance/EuclideanDistance.hpp" 5 | #include"../Distance/ManhattanDistance.hpp" 6 | #include"../HdbscanStar/hdbscanAlgorithm.hpp" 7 | #include"../HdbscanStar/undirectedGraph.hpp" 8 | #include"../HdbscanStar/cluster.hpp" 9 | #include"../HdbscanStar/outlierScore.hpp" 10 | 11 | using namespace hdbscanStar; 12 | 13 | hdbscanResult hdbscanRunner::run(hdbscanParameters parameters) { 14 | int numPoints = parameters.dataset.size() != 0 ? 
parameters.dataset.size() : parameters.distances.size(); 15 | 16 | hdbscanAlgorithm algorithm; 17 | hdbscanResult result; 18 | if (parameters.distances.size() == 0) { 19 | std::vector> distances(numPoints); 20 | for (int i = 0; i < numPoints; i++) { 21 | distances[i].resize(numPoints); 22 | //distances[i]=std::vector(numPoints); 23 | for (int j = 0; j < i; j++) { 24 | if (parameters.distanceFunction.length() == 0) { 25 | //Default to Euclidean 26 | EuclideanDistance EDistance; 27 | double distance; 28 | distance = EDistance.computeDistance(parameters.dataset[i], parameters.dataset[j]); 29 | distances[i][j] = distance; 30 | distances[j][i] = distance; 31 | 32 | } 33 | else if (parameters.distanceFunction == "Euclidean") { 34 | EuclideanDistance EDistance; 35 | double distance; 36 | distance = EDistance.computeDistance(parameters.dataset[i], parameters.dataset[j]); 37 | distances[i][j] = distance; 38 | distances[j][i] = distance; 39 | } 40 | else if (parameters.distanceFunction == "Manhattan") { 41 | ManhattanDistance MDistance; 42 | double distance; 43 | distance = MDistance.computeDistance(parameters.dataset[i], parameters.dataset[j]); 44 | distances[i][j] = distance; 45 | distances[j][i] = distance; 46 | } 47 | } 48 | } 49 | 50 | parameters.distances = distances; 51 | } 52 | 53 | std::vector coreDistances = algorithm.calculateCoreDistances( 54 | parameters.distances, 55 | parameters.minPoints); 56 | 57 | undirectedGraph mst = algorithm.constructMst( 58 | parameters.distances, 59 | coreDistances, 60 | true); 61 | mst.quicksortByEdgeWeight(); 62 | 63 | std::vector pointNoiseLevels(numPoints); 64 | std::vector pointLastClusters(numPoints); 65 | 66 | std::vector< std::vector > hierarchy; 67 | 68 | std::vector clusters; 69 | algorithm.computeHierarchyAndClusterTree( 70 | &mst, 71 | parameters.minClusterSize, 72 | parameters.constraints, 73 | hierarchy, 74 | pointNoiseLevels, 75 | pointLastClusters, 76 | clusters); 77 | bool infiniteStability = 
algorithm.propagateTree(clusters); 78 | 79 | std::vector prominentClusters = algorithm.findProminentClusters(clusters, hierarchy, numPoints); 80 | std::vector membershipProbabilities = algorithm.findMembershipScore(prominentClusters, coreDistances); 81 | std::vector scores = algorithm.calculateOutlierScores( 82 | clusters, 83 | pointNoiseLevels, 84 | pointLastClusters, 85 | coreDistances); 86 | 87 | return hdbscanResult(prominentClusters, scores, membershipProbabilities, infiniteStability); 88 | } 89 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/Hdbscan/hdbscan.cpp: -------------------------------------------------------------------------------- 1 | #include "hdbscan.hpp" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | using namespace std; 9 | 10 | string Hdbscan::getFileName() { 11 | return this->fileName; 12 | } 13 | /// 14 | /// Loads the csv file as specified by the constructor.CSV 15 | /// 16 | /// A List of attributes to be choosen 17 | /// Bool value to skip header or not 18 | /// 1 if successful, 0 otherwise 19 | 20 | int Hdbscan::loadCsv(int numberOfValues, bool skipHeader) { 21 | string attribute; 22 | 23 | string line = ""; 24 | 25 | int currentAttributes; 26 | vector > dataset; 27 | 28 | string fileName = this->getFileName(); 29 | ifstream file(fileName, ios::in); 30 | if (!file) 31 | return 0; 32 | if (skipHeader) { 33 | getline(file, line); 34 | 35 | } 36 | while (getline(file, line)) { //Read through each line 37 | stringstream s(line); 38 | vector row; 39 | currentAttributes = numberOfValues; 40 | while (getline(s, attribute, ',') && currentAttributes != 0) { 41 | row.push_back(stod(attribute)); 42 | currentAttributes--; 43 | } 44 | dataset.push_back(row); 45 | 46 | } 47 | this->dataset = dataset; 48 | return 1; 49 | } 50 | 51 | void Hdbscan::execute(int minPoints, int minClusterSize, string distanceMetric) { 52 | //Call The Runner Class here 53 | hdbscanRunner 
runner; 54 | hdbscanParameters parameters; 55 | uint32_t noisyPoints = 0; 56 | set numClustersSet; 57 | map clustersMap; 58 | vector normalizedLabels; 59 | 60 | parameters.dataset = this->dataset; 61 | parameters.minPoints = minPoints; 62 | parameters.minClusterSize = minClusterSize; 63 | parameters.distanceFunction = distanceMetric; 64 | this->result = runner.run(parameters); 65 | this->labels_ = result.labels; 66 | this->outlierScores_ = result.outliersScores; 67 | for (uint32_t i = 0; i < result.labels.size(); i++) { 68 | if (result.labels[i] == 0) { 69 | noisyPoints++; 70 | } 71 | else { 72 | numClustersSet.insert(result.labels[i]); 73 | } 74 | } 75 | this->numClusters_ = numClustersSet.size(); 76 | this->noisyPoints_ = noisyPoints; 77 | int iNdex = 1; 78 | for (auto it = numClustersSet.begin(); it != numClustersSet.end(); it++) { 79 | clustersMap[*it] = iNdex++; 80 | } 81 | for (int i = 0; i < labels_.size(); i++) { 82 | if (labels_[i] != 0) 83 | normalizedLabels.push_back(clustersMap[labels_[i]]); 84 | else if (labels_[i] == 0) { 85 | normalizedLabels.push_back(-1); 86 | } 87 | 88 | } 89 | this->normalizedLabels_ = normalizedLabels; 90 | this->membershipProbabilities_ = result.membershipProbabilities; 91 | } 92 | 93 | void Hdbscan::displayResult() { 94 | hdbscanResult result = this->result; 95 | uint32_t numClusters = 0; 96 | 97 | cout << "HDBSCAN clustering for " << this->dataset.size() << " objects." << endl; 98 | 99 | for (uint32_t i = 0; i < result.labels.size(); i++) { 100 | cout << result.labels[i] << " "; 101 | } 102 | 103 | cout << endl << endl; 104 | 105 | cout << "The Clustering contains " << this->numClusters_ << " clusters with " << this->noisyPoints_ << " noise Points." 
<< endl; 106 | 107 | } 108 | -------------------------------------------------------------------------------- /HDBSCAN-CPP/HdbscanStar/undirectedGraph.cpp: -------------------------------------------------------------------------------- 1 | #include "undirectedGraph.hpp" 2 | 3 | void undirectedGraph::quicksortByEdgeWeight() 4 | { 5 | int _edgeWeightsLength = _edgeWeights.size(); 6 | if (_edgeWeightsLength <= 1) 7 | return; 8 | 9 | std::vector startIndexStack(_edgeWeightsLength / 2); 10 | std::vector endIndexStack(_edgeWeightsLength / 2); 11 | 12 | startIndexStack[0] = 0; 13 | endIndexStack[0] = _edgeWeightsLength - 1; 14 | 15 | int stackTop = 0; 16 | while (stackTop >= 0) 17 | { 18 | int startIndex = startIndexStack[stackTop]; 19 | int endIndex = endIndexStack[stackTop]; 20 | stackTop--; 21 | int pivotIndex = selectPivotIndex(startIndex, endIndex); 22 | pivotIndex = partition(startIndex, endIndex, pivotIndex); 23 | if (pivotIndex > startIndex + 1) 24 | { 25 | startIndexStack[stackTop + 1] = startIndex; 26 | endIndexStack[stackTop + 1] = pivotIndex - 1; 27 | stackTop++; 28 | } 29 | if (pivotIndex < endIndex - 1) 30 | { 31 | startIndexStack[stackTop + 1] = pivotIndex + 1; 32 | endIndexStack[stackTop + 1] = endIndex; 33 | stackTop++; 34 | } 35 | 36 | } 37 | } 38 | int undirectedGraph::selectPivotIndex(int startIndex, int endIndex) 39 | { 40 | if (startIndex - endIndex <= 1) 41 | return startIndex; 42 | 43 | double first = _edgeWeights[startIndex]; 44 | double middle = _edgeWeights[startIndex + (endIndex - startIndex) / 2]; 45 | double last = _edgeWeights[endIndex]; 46 | 47 | if (first <= middle) 48 | { 49 | if (middle <= last) 50 | return startIndex + (endIndex - startIndex) / 2; 51 | 52 | if (last >= first) 53 | return endIndex; 54 | 55 | return startIndex; 56 | } 57 | 58 | if (first <= last) 59 | return startIndex; 60 | 61 | if (last >= middle) 62 | return endIndex; 63 | 64 | return startIndex + (endIndex - startIndex) / 2; 65 | } 66 | 67 | int 
undirectedGraph::partition(int startIndex, int endIndex, int pivotIndex) 68 | { 69 | double pivotValue = _edgeWeights[pivotIndex]; 70 | swapEdges(pivotIndex, endIndex); 71 | int lowIndex = startIndex; 72 | for (int i = startIndex; i < endIndex; i++) 73 | { 74 | if (_edgeWeights[i] < pivotValue) 75 | { 76 | swapEdges(i, lowIndex); 77 | lowIndex++; 78 | } 79 | } 80 | swapEdges(lowIndex, endIndex); 81 | return lowIndex; 82 | } 83 | 84 | void undirectedGraph::swapEdges(int indexOne, int indexTwo) 85 | { 86 | if (indexOne == indexTwo) 87 | return; 88 | 89 | int tempVertexA = _verticesA[indexOne]; 90 | int tempVertexB = _verticesB[indexOne]; 91 | double tempEdgeDistance = _edgeWeights[indexOne]; 92 | _verticesA[indexOne] = _verticesA[indexTwo]; 93 | _verticesB[indexOne] = _verticesB[indexTwo]; 94 | _edgeWeights[indexOne] = _edgeWeights[indexTwo]; 95 | _verticesA[indexTwo] = tempVertexA; 96 | _verticesB[indexTwo] = tempVertexB; 97 | _edgeWeights[indexTwo] = tempEdgeDistance; 98 | } 99 | 100 | int undirectedGraph::getNumVertices() 101 | { 102 | return _numVertices; 103 | } 104 | 105 | int undirectedGraph::getNumEdges() 106 | { 107 | return _edgeWeights.size(); 108 | } 109 | 110 | int undirectedGraph::getFirstVertexAtIndex(int index) 111 | { 112 | return _verticesA[index]; 113 | } 114 | 115 | int undirectedGraph::getSecondVertexAtIndex(int index) 116 | { 117 | return _verticesB[index]; 118 | } 119 | 120 | double undirectedGraph::getEdgeWeightAtIndex(int index) 121 | { 122 | return _edgeWeights[index]; 123 | } 124 | 125 | std::vector& undirectedGraph::getEdgeListForVertex(int vertex) 126 | { 127 | return _edges[vertex]; 128 | } -------------------------------------------------------------------------------- /HDBSCAN-CPP/HdbscanStar/cluster.cpp: -------------------------------------------------------------------------------- 1 | #include"cluster.hpp" 2 | #include 3 | int cluster::counter = 0; 4 | cluster::cluster() 5 | { 6 | 7 | _id = ++counter; 8 | } 9 | 10 | 
cluster::cluster(int label, cluster* parent, double birthLevel, int numPoints) //:Label(label), Parent(parent), _birthLevel(birthLevel), _numPoints(numPoints) 11 | { 12 | _id = ++counter; 13 | _deathLevel = 0; 14 | 15 | _propagatedStability = 0; 16 | _numConstraintsSatisfied = 0; 17 | _propagatedNumConstraintsSatisfied = 0; 18 | 19 | Parent = parent; 20 | Label = label; 21 | _birthLevel = birthLevel; 22 | _numPoints = numPoints; 23 | HierarchyPosition = 0; 24 | Stability = 0; 25 | PropagatedLowestChildDeathLevel = std::numeric_limits::max(); 26 | 27 | if (Parent != NULL) 28 | Parent->HasChildren = true; 29 | HasChildren = false; 30 | PropagatedDescendants.resize(0); 31 | } 32 | bool cluster ::operator==(const cluster& other) const { 33 | return (this->_id == other._id); 34 | } 35 | void cluster::detachPoints(int numPoints, double level) 36 | { 37 | _numPoints -= numPoints; 38 | Stability += (numPoints * (1 / level - 1 / _birthLevel)); 39 | 40 | if (_numPoints == 0) 41 | _deathLevel = level; 42 | else if (_numPoints < 0) 43 | throw std::invalid_argument("Cluster cannot have less than 0 points."); 44 | } 45 | 46 | void cluster::propagate() 47 | { 48 | if (Parent != NULL) 49 | { 50 | if (PropagatedLowestChildDeathLevel == std::numeric_limits::max()) 51 | PropagatedLowestChildDeathLevel = _deathLevel; 52 | if (PropagatedLowestChildDeathLevel < Parent->PropagatedLowestChildDeathLevel) 53 | Parent->PropagatedLowestChildDeathLevel = PropagatedLowestChildDeathLevel; 54 | if (!HasChildren) 55 | { 56 | Parent->_propagatedNumConstraintsSatisfied += _numConstraintsSatisfied; 57 | Parent->_propagatedStability += Stability; 58 | Parent->PropagatedDescendants.push_back(this); 59 | } 60 | else if (_numConstraintsSatisfied > _propagatedNumConstraintsSatisfied) 61 | { 62 | Parent->_propagatedNumConstraintsSatisfied += _numConstraintsSatisfied; 63 | Parent->_propagatedStability += Stability; 64 | Parent->PropagatedDescendants.push_back(this); 65 | } 66 | else if 
(_numConstraintsSatisfied < _propagatedNumConstraintsSatisfied) 67 | { 68 | Parent->_propagatedNumConstraintsSatisfied += _propagatedNumConstraintsSatisfied; 69 | Parent->_propagatedStability += _propagatedStability; 70 | Parent->PropagatedDescendants.insert(Parent->PropagatedDescendants.end(), PropagatedDescendants.begin(), PropagatedDescendants.end()); 71 | } 72 | else if (_numConstraintsSatisfied == _propagatedNumConstraintsSatisfied) 73 | { 74 | //Chose the parent over descendants if there is a tie in stability: 75 | if (Stability >= _propagatedStability) 76 | { 77 | Parent->_propagatedNumConstraintsSatisfied += _numConstraintsSatisfied; 78 | Parent->_propagatedStability += Stability; 79 | Parent->PropagatedDescendants.push_back(this); 80 | } 81 | else 82 | { 83 | Parent->_propagatedNumConstraintsSatisfied += _propagatedNumConstraintsSatisfied; 84 | Parent->_propagatedStability += _propagatedStability; 85 | Parent->PropagatedDescendants.insert(Parent->PropagatedDescendants.end(), PropagatedDescendants.begin(), PropagatedDescendants.end()); 86 | } 87 | } 88 | } 89 | } 90 | void cluster::addPointsToVirtualChildCluster(std::set points) 91 | { 92 | for (std::set::iterator it = points.begin(); it != points.end(); ++it) { 93 | _virtualChildCluster.insert(*it); 94 | } 95 | } 96 | bool cluster::virtualChildClusterConstraintsPoint(int point) 97 | { 98 | return (_virtualChildCluster.find(point) != _virtualChildCluster.end()); 99 | } 100 | 101 | void cluster::addVirtualChildConstraintsSatisfied(int numConstraints) 102 | { 103 | _propagatedNumConstraintsSatisfied += numConstraints; 104 | } 105 | 106 | void cluster::addConstraintsSatisfied(int numConstraints) 107 | { 108 | _numConstraintsSatisfied += numConstraints; 109 | } 110 | 111 | void cluster::releaseVirtualChildCluster() 112 | { 113 | _virtualChildCluster.clear(); 114 | } 115 | 116 | int cluster::getClusterId() { 117 | return this->_id; 118 | } 119 | 120 | 121 | 
-------------------------------------------------------------------------------- /HDBSCAN-CPP/HdbscanStar/hdbscanAlgorithm.hpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "../Utils/bitSet.hpp" 7 | #include 8 | #include "undirectedGraph.hpp" 9 | #include"outlierScore.hpp" 10 | #include"cluster.hpp" 11 | #include"hdbscanConstraint.hpp" 12 | 13 | namespace hdbscanStar 14 | { 15 | class hdbscanAlgorithm 16 | { 17 | public: 18 | /// 19 | /// Calculates the core distances for each point in the data set, given some value for k. 20 | /// 21 | /// A vector of vectors where index [i][j] indicates the jth attribute of data point i 22 | /// Each point's core distance will be it's distance to the kth nearest neighbor 23 | /// An array of core distances 24 | static std::vector calculateCoreDistances(std::vector> distances, int k); 25 | 26 | static undirectedGraph constructMst(std::vector> distances, std::vector coreDistances, bool selfEdges); 27 | 28 | 29 | /// 30 | /// Propagates constraint satisfaction, stability, and lowest child death level from each child 31 | /// cluster to each parent cluster in the tree. This method must be called before calling 32 | /// findProminentClusters() or calculateOutlierScores(). 
33 | /// 34 | /// A list of Clusters forming a cluster tree 35 | /// true if there are any clusters with infinite stability, false otherwise 36 | 37 | 38 | static void computeHierarchyAndClusterTree(undirectedGraph *mst, int minClusterSize, std::vector constraints, std::vector> &hierarchy, std::vector &pointNoiseLevels, std::vector &pointLastClusters, std::vector &clusters); 39 | 40 | static std::vector findProminentClusters(std::vector &clusters, std::vector> &hierarchy, int numPoints); 41 | 42 | static std::vector findMembershipScore(std::vector clusterids, std::vector coreDistances); 43 | 44 | static bool propagateTree(std::vector &sclusters); 45 | 46 | /// 47 | /// Produces the outlier score for each point in the data set, and returns a sorted list of outlier 48 | /// scores. propagateTree() must be called before calling this method. 49 | /// 50 | /// A list of Clusters forming a cluster tree which has already been propagated 51 | /// A double[] with the levels at which each point became noise 52 | /// An int[] with the last label each point had before becoming noise 53 | /// An array of core distances for each data point 54 | /// An List of OutlierScores, sorted in descending order 55 | static std::vector calculateOutlierScores( 56 | std::vector &clusters, 57 | std::vector &pointNoiseLevels, 58 | std::vector &pointLastClusters, 59 | std::vector coreDistances); 60 | 61 | /// 62 | /// Removes the set of points from their parent Cluster, and creates a new Cluster, provided the 63 | /// clusterId is not 0 (noise). 
64 | /// 65 | /// The set of points to be in the new Cluster 66 | /// An array of cluster labels, which will be modified 67 | /// The parent Cluster of the new Cluster being created 68 | /// The label of the new Cluster 69 | /// The edge weight at which to remove the points from their previous Cluster 70 | /// The new Cluster, or null if the clusterId was 0 71 | static cluster* createNewCluster( 72 | std::set& points, 73 | std::vector &clusterLabels, 74 | cluster *parentCluster, 75 | int clusterLabel, 76 | double edgeWeight); 77 | 78 | /// 79 | /// Calculates the number of constraints satisfied by the new clusters and virtual children of the 80 | /// parents of the new clusters. 81 | /// 82 | /// Labels of new clusters 83 | /// An List of clusters 84 | /// An List of constraints 85 | /// An array of current cluster labels for points 86 | static void calculateNumConstraintsSatisfied( 87 | std::set& newClusterLabels, 88 | std::vector& clusters, 89 | std::vector& constraints, 90 | std::vector& clusterLabels); 91 | 92 | }; 93 | 94 | } 95 | 96 | 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HDBSCAN-CPP 2 | [![HDBSCAN](https://img.shields.io/badge/HDBSCAN-Clustering-yellowgreen.svg)](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html) 3 | [![C++](https://img.shields.io/badge/C%2B%2B-14-green.svg )](https://en.wikipedia.org/wiki/C%2B%2B14) 4 | [![STL](https://img.shields.io/badge/C%2B%2B-STL-brightgreen.svg )](https://en.wikipedia.org/wiki/Standard_Template_Library) 5 | [![MIT License](https://img.shields.io/badge/License-MIT-blue.svg )](https://github.com/rohanmohapatra/hdbscan-cpp/blob/master/LICENSE.md) 6 | [![Build Status](https://travis-ci.org/rohanmohapatra/hdbscan-cpp.svg?branch=master)](https://travis-ci.org/rohanmohapatra/hdbscan-cpp) 7 | [![Coverage 
Status](https://coveralls.io/repos/github/rohanmohapatra/hdbscan-cpp/badge.svg?branch=master)](https://coveralls.io/github/rohanmohapatra/hdbscan-cpp?branch=master) 8 | 9 | Fast and Efficient Implementation of HDBSCAN in C++ using STL. 10 | -------------------------------------------------------------------------------------------------------------- 11 | 12 | Authored by: 13 | * [Sumedh Basarkod](https://github.com/sumedhpb) 14 | * [Rohan Mohapatra](https://github.com/rohanmohapatra) 15 | 16 | The Standard Template Library (STL) is a set of C++ template classes to provide common programming 17 | data structures and functions such as lists, stacks, arrays, etc. It is a library of container classes, algorithms, and iterators. 18 | 19 | # About HDBSCAN 20 | HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications with Noise. Performs DBSCAN over varying epsilon values and integrates the result to find a clustering that gives the best stability over epsilon. This allows HDBSCAN to find clusters of varying densities (unlike DBSCAN), and be more robust to parameter selection. 21 | 22 | In practice this means that HDBSCAN returns a good clustering straight away with little or no parameter tuning -- and the primary parameter, minimum cluster size, is intuitive and easy to select. 23 | 24 | HDBSCAN is ideal for exploratory data analysis; it's a fast and robust algorithm that you can trust to return meaningful clusters (if there are any). 25 | 26 | Based on the paper: 27 | > R. Campello, D. Moulavi, and J. Sander, Density-Based Clustering Based on Hierarchical Density Estimates In: Advances in Knowledge Discovery and Data Mining, Springer, pp 160-172. 2013 28 | 29 | Design of the Clustering algorithm is referenced from [this](https://github.com/doxakis/HdbscanSharp). 30 | 31 | ### How to Run this code? 32 | 33 | Clone this project as this contains the library. 
34 | ``` 35 | git clone https://github.com/rohanmohapatra/hdbscan-cpp.git 36 | ``` 37 | 38 | Run the Makefile 39 | ``` 40 | make all clean 41 | ``` 42 | 43 | Wait for it to complete; this will run the example already present in the Four Prominent Cluster Example folder. Plot the points and see the clustering. 44 | To run: 45 | ``` 46 | ./main 47 | ``` 48 | 49 | If you want to use the library in your own code, have a look at the example below and adapt it. 50 | 51 | 52 | 53 | ### Outlier Detection 54 | The HDBSCAN clusterer objects also support the GLOSH outlier detection algorithm. After fitting the clusterer to 55 | data, the outlier scores can be accessed via the `outlierScores_` member of the `Hdbscan` object. The result is a vector of score values, 56 | one for each data point that was fit. Higher scores represent more outlier-like objects. Selecting outliers via upper 57 | quantiles is often a good approach. 58 | 59 | Based on the paper: 60 | > R.J.G.B. Campello, D. Moulavi, A. Zimek and J. Sander Hierarchical Density Estimates for Data Clustering, Visualization, and Outlier Detection, ACM Trans. on Knowledge Discovery from Data, Vol 10, 1 (July 2015), 1-51. 
61 | 62 | ## Examples 63 | ``` 64 | #include 65 | #include"../HDBSCAN-CPP/Hdbscan/hdbscan.hpp" 66 | using namespace std; 67 | int main() { 68 | 69 | Hdbscan hdbscan("HDBSCANDataset/FourProminentClusterDataset.csv"); 70 | hdbscan.loadCsv(2); 71 | hdbscan.execute(5, 5, "Euclidean"); 72 | hdbscan.displayResult(); 73 | cout << "You can access other fields like cluster labels, membership probabilities and outlier scores."< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "../Utils/bitSet.hpp" 8 | #include 9 | #include "undirectedGraph.hpp" 10 | #include"outlierScore.hpp" 11 | #include"cluster.hpp" 12 | #include"hdbscanConstraint.hpp" 13 | #include"hdbscanAlgorithm.hpp" 14 | 15 | 16 | std::vector hdbscanStar::hdbscanAlgorithm::calculateCoreDistances(std::vector> distances, int k) 17 | { 18 | int length = distances.size(); 19 | 20 | int numNeighbors = k - 1; 21 | std::vectorcoreDistances(length); 22 | if (k == 1) 23 | { 24 | for (int point = 0; point < length; point++) 25 | { 26 | coreDistances[point] = 0; 27 | } 28 | return coreDistances; 29 | } 30 | for (int point = 0; point < length; point++) 31 | { 32 | std::vector kNNDistances(numNeighbors); //Sorted nearest distances found so far 33 | for (int i = 0; i < numNeighbors; i++) 34 | { 35 | kNNDistances[i] = std::numeric_limits::max(); 36 | } 37 | 38 | for (int neighbor = 0; neighbor < length; neighbor++) 39 | { 40 | if (point == neighbor) 41 | continue; 42 | double distance = distances[point][neighbor]; 43 | int neighborIndex = numNeighbors; 44 | //Check at which position in the nearest distances the current distance would fit: 45 | while (neighborIndex >= 1 && distance < kNNDistances[neighborIndex - 1]) 46 | { 47 | neighborIndex--; 48 | } 49 | //Shift elements in the array to make room for the current distance: 50 | if (neighborIndex < numNeighbors) 51 | { 52 | for (int shiftIndex = numNeighbors - 1; shiftIndex > neighborIndex; shiftIndex--) 53 | { 54 | kNNDistances[shiftIndex] = 
kNNDistances[shiftIndex - 1]; 55 | } 56 | kNNDistances[neighborIndex] = distance; 57 | } 58 | 59 | } 60 | coreDistances[point] = kNNDistances[numNeighbors - 1]; 61 | } 62 | return coreDistances; 63 | } 64 | undirectedGraph hdbscanStar::hdbscanAlgorithm::constructMst(std::vector> distances, std::vector coreDistances, bool selfEdges) 65 | { 66 | int length = distances.size(); 67 | int selfEdgeCapacity = 0; 68 | if (selfEdges) 69 | selfEdgeCapacity = length; 70 | bitSet attachedPoints; 71 | 72 | std::vector nearestMRDNeighbors(length - 1 + selfEdgeCapacity); 73 | std::vector nearestMRDDistances(length - 1 + selfEdgeCapacity); 74 | 75 | for (int i = 0; i < length - 1; i++) 76 | { 77 | nearestMRDDistances[i] = std::numeric_limits::max(); 78 | } 79 | 80 | int currentPoint = length - 1; 81 | int numAttachedPoints = 1; 82 | attachedPoints.set(length - 1); 83 | 84 | while (numAttachedPoints < length) 85 | { 86 | 87 | int nearestMRDPoint = -1; 88 | double nearestMRDDistance = std::numeric_limits::max(); 89 | for (int neighbor = 0; neighbor < length; neighbor++) 90 | { 91 | if (currentPoint == neighbor) 92 | continue; 93 | if (attachedPoints.get(neighbor) == true) 94 | continue; 95 | double distance = distances[currentPoint][neighbor]; 96 | double mutualReachabiltiyDistance = distance; 97 | if (coreDistances[currentPoint] > mutualReachabiltiyDistance) 98 | mutualReachabiltiyDistance = coreDistances[currentPoint]; 99 | 100 | if (coreDistances[neighbor] > mutualReachabiltiyDistance) 101 | mutualReachabiltiyDistance = coreDistances[neighbor]; 102 | 103 | if (mutualReachabiltiyDistance < nearestMRDDistances[neighbor]) 104 | { 105 | nearestMRDDistances[neighbor] = mutualReachabiltiyDistance; 106 | nearestMRDNeighbors[neighbor] = currentPoint; 107 | } 108 | 109 | if (nearestMRDDistances[neighbor] <= nearestMRDDistance) 110 | { 111 | nearestMRDDistance = nearestMRDDistances[neighbor]; 112 | nearestMRDPoint = neighbor; 113 | } 114 | 115 | } 116 | attachedPoints.set(nearestMRDPoint); 
117 | numAttachedPoints++; 118 | currentPoint = nearestMRDPoint; 119 | } 120 | std::vector otherVertexIndices(length - 1 + selfEdgeCapacity); 121 | for (int i = 0; i < length - 1; i++) 122 | { 123 | otherVertexIndices[i] = i; 124 | } 125 | if (selfEdges) 126 | { 127 | for (int i = length - 1; i < length * 2 - 1; i++) 128 | { 129 | int vertex = i - (length - 1); 130 | nearestMRDNeighbors[i] = vertex; 131 | otherVertexIndices[i] = vertex; 132 | nearestMRDDistances[i] = coreDistances[vertex]; 133 | } 134 | } 135 | undirectedGraph undirectedGraphObject(length, nearestMRDNeighbors, otherVertexIndices, nearestMRDDistances); 136 | return undirectedGraphObject; 137 | 138 | } 139 | 140 | void hdbscanStar::hdbscanAlgorithm::computeHierarchyAndClusterTree(undirectedGraph* mst, int minClusterSize, std::vector constraints, std::vector>& hierarchy, std::vector& pointNoiseLevels, std::vector& pointLastClusters, std::vector& clusters) 141 | { 142 | int hierarchyPosition = 0; 143 | 144 | //The current edge being removed from the MST: 145 | int currentEdgeIndex = mst->getNumEdges() - 1; 146 | int nextClusterLabel = 2; 147 | bool nextLevelSignificant = true; 148 | 149 | //The previous and current cluster numbers of each point in the data set: 150 | std::vector previousClusterLabels(mst->getNumVertices()); 151 | std::vector currentClusterLabels(mst->getNumVertices()); 152 | 153 | for (int i = 0; i < currentClusterLabels.size(); i++) 154 | { 155 | currentClusterLabels[i] = 1; 156 | previousClusterLabels[i] = 1; 157 | } 158 | //std::vector clusters; 159 | clusters.push_back(NULL); 160 | //cluster cluster_object(1, NULL, std::numeric_limits::quiet_NaN(), mst->getNumVertices()); 161 | clusters.push_back(new cluster(1, NULL, std::numeric_limits::quiet_NaN(), mst->getNumVertices())); 162 | 163 | std::set clusterOne; 164 | clusterOne.insert(1); 165 | calculateNumConstraintsSatisfied( 166 | clusterOne, 167 | clusters, 168 | constraints, 169 | currentClusterLabels); 170 | std::set 
affectedClusterLabels; 171 | std::set affectedVertices; 172 | while (currentEdgeIndex >= 0) 173 | { 174 | double currentEdgeWeight = mst->getEdgeWeightAtIndex(currentEdgeIndex); 175 | std::vector newClusters; 176 | while (currentEdgeIndex >= 0 && mst->getEdgeWeightAtIndex(currentEdgeIndex) == currentEdgeWeight) 177 | { 178 | int firstVertex = mst->getFirstVertexAtIndex(currentEdgeIndex); 179 | int secondVertex = mst->getSecondVertexAtIndex(currentEdgeIndex); 180 | std::vector& firstVertexEdgeList = mst->getEdgeListForVertex(firstVertex); 181 | std::vector::iterator secondVertexInFirstEdgeList = std::find(firstVertexEdgeList.begin(), firstVertexEdgeList.end(), secondVertex); 182 | if (secondVertexInFirstEdgeList != mst->getEdgeListForVertex(firstVertex).end()) 183 | mst->getEdgeListForVertex(firstVertex).erase(secondVertexInFirstEdgeList); 184 | std::vector& secondVertexEdgeList = mst->getEdgeListForVertex(secondVertex); 185 | std::vector::iterator firstVertexInSecondEdgeList = std::find(secondVertexEdgeList.begin(), secondVertexEdgeList.end(), firstVertex); 186 | if (firstVertexInSecondEdgeList != mst->getEdgeListForVertex(secondVertex).end()) 187 | mst->getEdgeListForVertex(secondVertex).erase(firstVertexInSecondEdgeList); 188 | 189 | if (currentClusterLabels[firstVertex] == 0) 190 | { 191 | currentEdgeIndex--; 192 | continue; 193 | } 194 | affectedVertices.insert(firstVertex); 195 | affectedVertices.insert(secondVertex); 196 | affectedClusterLabels.insert(currentClusterLabels[firstVertex]); 197 | currentEdgeIndex--; 198 | } 199 | if (!affectedClusterLabels.size()) 200 | continue; 201 | while (affectedClusterLabels.size()) 202 | { 203 | int examinedClusterLabel = *prev(affectedClusterLabels.end()); 204 | affectedClusterLabels.erase(prev(affectedClusterLabels.end())); 205 | std::set examinedVertices; 206 | //std::set::iterator affectedIt; 207 | for (auto affectedIt = affectedVertices.begin(); affectedIt != affectedVertices.end();) 208 | { 209 | int vertex = 
*affectedIt; 210 | if (currentClusterLabels[vertex] == examinedClusterLabel) 211 | { 212 | examinedVertices.insert(vertex); 213 | affectedIt = affectedVertices.erase(affectedIt); 214 | 215 | } 216 | else { 217 | ++affectedIt; 218 | } 219 | } 220 | std::set firstChildCluster; 221 | std::list unexploredFirstChildClusterPoints; 222 | int numChildClusters = 0; 223 | while (examinedVertices.size()) 224 | { 225 | 226 | std::set constructingSubCluster; 227 | int iters = 0; 228 | std::list unexploredSubClusterPoints; 229 | bool anyEdges = false; 230 | bool incrementedChildCount = false; 231 | int rootVertex = *prev(examinedVertices.end()); 232 | constructingSubCluster.insert(rootVertex); 233 | unexploredSubClusterPoints.push_back(rootVertex); 234 | examinedVertices.erase(prev(examinedVertices.end())); 235 | while (unexploredSubClusterPoints.size()) 236 | { 237 | int vertexToExplore = *unexploredSubClusterPoints.begin(); 238 | unexploredSubClusterPoints.erase(unexploredSubClusterPoints.begin()); 239 | std::vector& vertexToExploreEdgeList = mst->getEdgeListForVertex(vertexToExplore); 240 | for (std::vector::iterator it = vertexToExploreEdgeList.begin(); it != vertexToExploreEdgeList.end();) 241 | { 242 | int neighbor = *it; 243 | anyEdges = true; 244 | if (std::find(constructingSubCluster.begin(), constructingSubCluster.end(), neighbor) == constructingSubCluster.end()) 245 | { 246 | constructingSubCluster.insert(neighbor); 247 | unexploredSubClusterPoints.push_back(neighbor); 248 | if (std::find(examinedVertices.begin(), examinedVertices.end(), neighbor) != examinedVertices.end()) 249 | examinedVertices.erase(std::find(examinedVertices.begin(), examinedVertices.end(), neighbor)); 250 | 251 | } 252 | else { 253 | ++it; 254 | } 255 | } 256 | if (!incrementedChildCount && constructingSubCluster.size() >= minClusterSize && anyEdges) 257 | { 258 | incrementedChildCount = true; 259 | numChildClusters++; 260 | 261 | //If this is the first valid child cluster, stop exploring it: 262 
| if (firstChildCluster.size() == 0) 263 | { 264 | firstChildCluster = constructingSubCluster; 265 | unexploredFirstChildClusterPoints = unexploredSubClusterPoints; 266 | break; 267 | } 268 | } 269 | 270 | } 271 | //If there could be a split, and this child cluster is valid: 272 | if (numChildClusters >= 2 && constructingSubCluster.size() >= minClusterSize && anyEdges) 273 | { 274 | //Check this child cluster is not equal to the unexplored first child cluster: 275 | int firstChildClusterMember = *prev(firstChildCluster.end()); 276 | if (std::find(constructingSubCluster.begin(), constructingSubCluster.end(), firstChildClusterMember) != constructingSubCluster.end()) 277 | numChildClusters--; 278 | //Otherwise, c a new cluster: 279 | else 280 | { 281 | cluster* newCluster = createNewCluster(constructingSubCluster, currentClusterLabels, 282 | clusters[examinedClusterLabel], nextClusterLabel, currentEdgeWeight); 283 | newClusters.push_back(newCluster); 284 | clusters.push_back(newCluster); 285 | nextClusterLabel++; 286 | } 287 | } 288 | else if (constructingSubCluster.size() < minClusterSize || !anyEdges) 289 | { 290 | createNewCluster(constructingSubCluster, currentClusterLabels, 291 | clusters[examinedClusterLabel], 0, currentEdgeWeight); 292 | 293 | for (std::set::iterator it = constructingSubCluster.begin(); it != constructingSubCluster.end(); it++) 294 | { 295 | int point = *it; 296 | pointNoiseLevels[point] = currentEdgeWeight; 297 | pointLastClusters[point] = examinedClusterLabel; 298 | } 299 | } 300 | } 301 | if (numChildClusters >= 2 && currentClusterLabels[*firstChildCluster.begin()] == examinedClusterLabel) 302 | { 303 | while (unexploredFirstChildClusterPoints.size()) 304 | { 305 | int vertexToExplore = *unexploredFirstChildClusterPoints.begin(); 306 | unexploredFirstChildClusterPoints.pop_front(); 307 | for (std::vector::iterator it = mst->getEdgeListForVertex(vertexToExplore).begin(); it != mst->getEdgeListForVertex(vertexToExplore).end(); it++) 308 | { 
309 | int neighbor = *it; 310 | if (std::find(firstChildCluster.begin(), firstChildCluster.end(), neighbor) == firstChildCluster.end()) 311 | { 312 | firstChildCluster.insert(neighbor); 313 | unexploredFirstChildClusterPoints.push_back(neighbor); 314 | } 315 | } 316 | } 317 | cluster* newCluster = createNewCluster(firstChildCluster, currentClusterLabels, 318 | clusters[examinedClusterLabel], nextClusterLabel, currentEdgeWeight); 319 | newClusters.push_back(newCluster); 320 | clusters.push_back(newCluster); 321 | nextClusterLabel++; 322 | } 323 | } 324 | if (nextLevelSignificant || newClusters.size()) 325 | { 326 | std::vector lineContents(previousClusterLabels.size()); 327 | for (int i = 0; i < previousClusterLabels.size(); i++) 328 | lineContents[i] = previousClusterLabels[i]; 329 | hierarchy.push_back(lineContents); 330 | hierarchyPosition++; 331 | } 332 | std::set newClusterLabels; 333 | for (std::vector::iterator it = newClusters.begin(); it != newClusters.end(); it++) 334 | { 335 | cluster* newCluster = *it; 336 | newCluster->HierarchyPosition = hierarchyPosition; 337 | newClusterLabels.insert(newCluster->Label); 338 | } 339 | if (newClusterLabels.size()) 340 | calculateNumConstraintsSatisfied(newClusterLabels, clusters, constraints, currentClusterLabels); 341 | 342 | for (int i = 0; i < previousClusterLabels.size(); i++) 343 | { 344 | previousClusterLabels[i] = currentClusterLabels[i]; 345 | } 346 | if (!newClusters.size()) 347 | nextLevelSignificant = false; 348 | else 349 | nextLevelSignificant = true; 350 | } 351 | 352 | { 353 | std::vector lineContents(previousClusterLabels.size() + 1); 354 | for (int i = 0; i < previousClusterLabels.size(); i++) 355 | lineContents[i] = 0; 356 | hierarchy.push_back(lineContents); 357 | } 358 | } 359 | std::vector hdbscanStar::hdbscanAlgorithm::findProminentClusters(std::vector& clusters, std::vector>& hierarchy, int numPoints) 360 | { 361 | //Take the list of propagated clusters from the root cluster: 362 | std::vector 
solution = clusters[1]->PropagatedDescendants; 363 | std::vector flatPartitioning(numPoints); 364 | 365 | //Store all the hierarchy positions at which to find the birth points for the flat clustering: 366 | std::map> significantHierarchyPositions; 367 | 368 | std::vector::iterator it = solution.begin(); 369 | while (it != solution.end()) 370 | { 371 | int hierarchyPosition = (*it)->HierarchyPosition; 372 | if (significantHierarchyPositions.count(hierarchyPosition) > 0) 373 | significantHierarchyPositions[hierarchyPosition].push_back((*it)->Label); 374 | else 375 | significantHierarchyPositions[hierarchyPosition].push_back((*it)->Label); 376 | it++; 377 | } 378 | 379 | //Go through the hierarchy file, setting labels for the flat clustering: 380 | while (significantHierarchyPositions.size()) 381 | { 382 | std::map>::iterator entry = significantHierarchyPositions.begin(); 383 | std::vector clusterList = entry->second; 384 | int hierarchyPosition = entry->first; 385 | significantHierarchyPositions.erase(entry->first); 386 | 387 | std::vector lineContents = hierarchy[hierarchyPosition]; 388 | 389 | for (int i = 0; i < lineContents.size(); i++) 390 | { 391 | int label = lineContents[i]; 392 | if (std::find(clusterList.begin(), clusterList.end(), label) != clusterList.end()) 393 | flatPartitioning[i] = label; 394 | } 395 | } 396 | return flatPartitioning; 397 | } 398 | std::vector hdbscanStar::hdbscanAlgorithm::findMembershipScore(std::vector clusterids, std::vector coreDistances) 399 | { 400 | 401 | int length = clusterids.size(); 402 | std::vector prob(length, std::numeric_limits::max()); 403 | int i=0; 404 | 405 | while(i::max()) 408 | { 409 | 410 | int clusterno = clusterids[i]; 411 | std::vector::iterator iter = clusterids.begin()+i; 412 | std::vector indices; 413 | while ((iter = std::find(iter, clusterids.end(), clusterno)) != clusterids.end()) 414 | { 415 | 416 | indices.push_back(distance(clusterids.begin(), iter)); 417 | iter++; 418 | if(iter==clusterids.end()) 
419 | break; 420 | 421 | } 422 | if(clusterno==0) 423 | { 424 | for(int j=0; j tempCoreDistances(indices.size()); 432 | for(int j=0; j& clusters) 451 | { 452 | std::map clustersToExamine; 453 | bitSet addedToExaminationList; 454 | bool infiniteStability = false; 455 | 456 | //Find all leaf clusters in the cluster tree: 457 | for (cluster* cluster : clusters) 458 | { 459 | if (cluster != NULL && !cluster->HasChildren) 460 | { 461 | int label = cluster->Label; 462 | clustersToExamine.erase(label); 463 | clustersToExamine.insert({ label, cluster }); 464 | addedToExaminationList.set(label); 465 | } 466 | } 467 | //Iterate through every cluster, propagating stability from children to parents: 468 | while (clustersToExamine.size()) 469 | { 470 | std::map::iterator currentKeyValue = prev(clustersToExamine.end()); 471 | cluster* currentCluster = currentKeyValue->second; 472 | clustersToExamine.erase(currentKeyValue->first); 473 | currentCluster->propagate(); 474 | 475 | if (currentCluster->Stability == std::numeric_limits::infinity()) 476 | infiniteStability = true; 477 | 478 | if (currentCluster->Parent != NULL) 479 | { 480 | cluster* parent = currentCluster->Parent; 481 | int label = parent->Label; 482 | 483 | if (!addedToExaminationList.get(label)) 484 | { 485 | clustersToExamine.erase(label); 486 | clustersToExamine.insert({ label, parent }); 487 | addedToExaminationList.set(label); 488 | } 489 | } 490 | } 491 | 492 | return infiniteStability; 493 | } 494 | 495 | /// 496 | /// Produces the outlier score for each point in the data set, and returns a sorted list of outlier 497 | /// scores. propagateTree() must be called before calling this method. 
498 | /// 499 | /// A list of Clusters forming a cluster tree which has already been propagated 500 | /// A double[] with the levels at which each point became noise 501 | /// An int[] with the last label each point had before becoming noise 502 | /// An array of core distances for each data point 503 | /// An List of OutlierScores, sorted in descending order 504 | std::vector hdbscanStar::hdbscanAlgorithm::calculateOutlierScores( 505 | std::vector& clusters, 506 | std::vector& pointNoiseLevels, 507 | std::vector& pointLastClusters, 508 | std::vector coreDistances) 509 | { 510 | int numPoints = pointNoiseLevels.size(); 511 | std::vector outlierScores; 512 | 513 | //Iterate through each point, calculating its outlier score: 514 | for (int i = 0; i < numPoints; i++) 515 | { 516 | double epsilonMax = clusters[pointLastClusters[i]]->PropagatedLowestChildDeathLevel; 517 | double epsilon = pointNoiseLevels[i]; 518 | double score = 0; 519 | 520 | if (epsilon != 0) 521 | score = 1 - (epsilonMax / epsilon); 522 | 523 | outlierScores.push_back(outlierScore(score, coreDistances[i], i)); 524 | } 525 | //Sort the outlier scores: 526 | sort(outlierScores.begin(), outlierScores.end()); 527 | 528 | return outlierScores; 529 | } 530 | 531 | /// 532 | /// Removes the set of points from their parent Cluster, and creates a new Cluster, provided the 533 | /// clusterId is not 0 (noise). 
534 | /// 535 | /// The set of points to be in the new Cluster 536 | /// An array of cluster labels, which will be modified 537 | /// The parent Cluster of the new Cluster being created 538 | /// The label of the new Cluster 539 | /// The edge weight at which to remove the points from their previous Cluster 540 | /// The new Cluster, or null if the clusterId was 0 541 | cluster* hdbscanStar::hdbscanAlgorithm::createNewCluster( 542 | std::set& points, 543 | std::vector& clusterLabels, 544 | cluster* parentCluster, 545 | int clusterLabel, 546 | double edgeWeight) 547 | { 548 | std::set::iterator it = points.begin(); 549 | while (it != points.end()) 550 | { 551 | clusterLabels[*it] = clusterLabel; 552 | ++it; 553 | } 554 | parentCluster->detachPoints(points.size(), edgeWeight); 555 | 556 | if (clusterLabel != 0) 557 | { 558 | return new cluster(clusterLabel, parentCluster, edgeWeight, points.size()); 559 | } 560 | 561 | parentCluster->addPointsToVirtualChildCluster(points); 562 | return NULL; 563 | } 564 | /// 565 | /// Calculates the number of constraints satisfied by the new clusters and virtual children of the 566 | /// parents of the new clusters. 
567 | /// 568 | /// Labels of new clusters 569 | /// An List of clusters 570 | /// An List of constraints 571 | /// An array of current cluster labels for points 572 | void hdbscanStar::hdbscanAlgorithm::calculateNumConstraintsSatisfied( 573 | std::set& newClusterLabels, 574 | std::vector& clusters, 575 | std::vector& constraints, 576 | std::vector& clusterLabels) 577 | { 578 | 579 | if (constraints.size() == 0) 580 | return; 581 | 582 | std::vector parents; 583 | std::vector ::iterator it; 584 | for (int label : newClusterLabels) 585 | { 586 | cluster* parent = clusters[label]->Parent; 587 | if (parent != NULL && !(find(parents.begin(), parents.end(), *parent) != parents.end())) 588 | parents.push_back(*parent); 589 | } 590 | 591 | for (hdbscanConstraint constraint : constraints) 592 | { 593 | int labelA = clusterLabels[constraint.getPointA()]; 594 | int labelB = clusterLabels[constraint.getPointB()]; 595 | 596 | if (constraint.getConstraintType() == hdbscanConstraintType::mustLink && labelA == labelB) 597 | { 598 | if (find(newClusterLabels.begin(), newClusterLabels.end(), labelA) != newClusterLabels.end()) 599 | clusters[labelA]->addConstraintsSatisfied(2); 600 | } 601 | else if (constraint.getConstraintType() == hdbscanConstraintType::cannotLink && (labelA != labelB || labelA == 0)) 602 | { 603 | if (labelA != 0 && find(newClusterLabels.begin(), newClusterLabels.end(), labelA) != newClusterLabels.end()) 604 | clusters[labelA]->addConstraintsSatisfied(1); 605 | if (labelB != 0 && (find(newClusterLabels.begin(), newClusterLabels.end(), labelA) != newClusterLabels.end())) 606 | clusters[labelB]->addConstraintsSatisfied(1); 607 | if (labelA == 0) 608 | { 609 | for (cluster parent : parents) 610 | { 611 | if (parent.virtualChildClusterConstraintsPoint(constraint.getPointA())) 612 | { 613 | parent.addVirtualChildConstraintsSatisfied(1); 614 | break; 615 | } 616 | } 617 | } 618 | if (labelB == 0) 619 | { 620 | for (cluster parent : parents) 621 | { 622 | if 
(parent.virtualChildClusterConstraintsPoint(constraint.getPointB())) 623 | { 624 | parent.addVirtualChildConstraintsSatisfied(1); 625 | break; 626 | } 627 | } 628 | } 629 | } 630 | } 631 | 632 | for (cluster parent : parents) 633 | { 634 | parent.releaseVirtualChildCluster(); 635 | } 636 | } 637 | --------------------------------------------------------------------------------