├── .github └── workflows │ └── test-code.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── images ├── illustration_circles.gif └── performance.png ├── incdbscan ├── __init__.py ├── _bfscomponentfinder.py ├── _deleter.py ├── _inserter.py ├── _labels.py ├── _neighbor_searcher.py ├── _object.py ├── _objects.py ├── _utils.py ├── incrementaldbscan.py └── tests │ ├── conftest.py │ ├── test_deleter.py │ ├── test_incrementaldbscan.py │ ├── test_inserter.py │ ├── test_testutils.py │ ├── test_with_data.py │ └── testutils.py ├── notebooks ├── incdbscan-usage.ipynb └── performance.ipynb ├── notes └── notes-on-paper.md ├── poetry.lock ├── profiling.py └── pyproject.toml /.github/workflows/test-code.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Test code with tests on Python 3.10 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dependencies with Poetry 27 | run: | 28 | python -m pip install --upgrade pip 29 | curl -sSL https://install.python-poetry.org | python3 - 30 | poetry install 31 | - name: Lint 32 | run: | 33 | poetry run make lint 34 | poetry run make isort-check 35 | - name: Test 36 | run: | 37 | poetry run make test 38 | - name: Slow test 39 | run: | 40 | poetry run make test-slow 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | .pytest_cache/* 3 | .vscode/* 4 | .ignore/* 5 | .idea/* 6 | src/__pytest__/* 7 | *pyc 8 | profiling/* 9 | *.egg-info/ 10 | .python-version 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2022 Arpad Fulop 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | 1. Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 13 | 3. Neither the name of the copyright holder nor the 14 | names of its contributors may be used to endorse or promote products 15 | derived from this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 21 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | isort: 2 | isort . 3 | 4 | isort-check: 5 | isort . -c -v 6 | 7 | lint: 8 | pylint \ 9 | --disable=missing-class-docstring \ 10 | --disable=missing-function-docstring \ 11 | --disable=missing-module-docstring \ 12 | --disable=too-few-public-methods \ 13 | --fail-under 5 \ 14 | --fail-on E,F \ 15 | incdbscan/* 16 | 17 | test: 18 | python -m pytest -m "not slow" incdbscan/tests/* 19 | 20 | test-slow: 21 | python -m pytest -m slow incdbscan/tests/* 22 | 23 | profile: 24 | mkdir -p profiling 25 | python profiling.py $(tag) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IncrementalDBSCAN 2 | 3 | `incdbscan` is an implementation of IncrementalDBSCAN, the incremental version of the DBSCAN clustering algorithm. 4 | 5 | IncrementalDBSCAN lets the user update the clustering by inserting or deleting data points. The algorithm yields the same result as DBSCAN but without reapplying DBSCAN to the modified data set. 6 | 7 | Thus, IncrementalDBSCAN is ideal to use when the size of the data set to cluster is so large that applying DBSCAN to the whole data set would be costly but for the purpose of the application it is enough to update an already existing clustering by inserting or deleting some data points. 8 | 9 | The implementation is based on the following paper. To see what's new compared to the paper, jump to [Notes on the IncrementalDBSCAN paper](https://github.com/DataOmbudsman/incdbscan/blob/master/notes/notes-on-paper.md). 10 | 11 | > Ester, Martin; Kriegel, Hans-Peter; Sander, Jörg; Wimmer, Michael; Xu, Xiaowei (1998). *Incremental Clustering for Mining in a Data Warehousing Environment.* In: Proceedings of the 24rd International Conference on Very Large Data Bases (VLDB 1998). 12 | 13 |

14 | indbscan illustration 15 |

16 | 17 | ## Table of Contents 18 | 19 | - [Highlights](#Highlights) 20 | - [Installation](#installation) 21 | - [Usage](#usage) 22 | - [Performance](#Performance) 23 | 24 | ## Highlights 25 | 26 | The `incdbscan` package is an implementation of the IncrementalDBSCAN algorithm by Ester et al., with about 40 unit tests covering diverse cases, and with [additional corrections](https://github.com/DataOmbudsman/incdbscan/blob/master/notes/notes-on-paper.md) to the original paper. 27 | 28 | ## Installation 29 | 30 | `incdbscan` is on PyPI, and can be installed with `pip`: 31 | ``` 32 | pip install incdbscan 33 | ``` 34 | 35 | The latest version of the package requires at least Python 3.9. 36 | 37 | ## Usage 38 | 39 | The algorithm is implemented in the `IncrementalDBSCAN` class. 40 | 41 | There are 3 methods to use: 42 | - `insert` for inserting data points into the clustering 43 | - `delete` for deleting data points from the clustering 44 | - `get_cluster_labels` for obtaining cluster labels 45 | 46 | All methods take a batch of data points in the form of an array of shape `(n_samples, n_features)` (similar to the `scikit-learn` API). 47 | 48 | ```python 49 | from sklearn.datasets import load_iris 50 | X = load_iris()['data'] 51 | X_1, X_2 = X[:100], X[100:] 52 | 53 | from incdbscan import IncrementalDBSCAN 54 | clusterer = IncrementalDBSCAN(eps=0.5, min_pts=5) 55 | 56 | # Insert 1st batch of data points and get their labels 57 | clusterer.insert(X_1) 58 | labels_part1 = clusterer.get_cluster_labels(X_1) 59 | 60 | # Insert 2nd batch and get labels of all points in a one-liner 61 | labels_all = clusterer.insert(X_2).get_cluster_labels(X) 62 | 63 | # Delete 1st batch and get labels for 2nd batch 64 | clusterer.delete(X_1) 65 | labels_part2 = clusterer.get_cluster_labels(X_2) 66 | ``` 67 | 68 | For a longer description of usage check out the [notebook](https://github.com/DataOmbudsman/incdbscan/blob/master/notebooks/incdbscan-usage.ipynb) developed just for that! 69 | 70 | ## Performance 71 | 72 | Performance has two components: insertion and deletion cost. 73 | 74 |

75 | indbscan performance 76 |

77 | 78 | The cost of **inserting** a new data point into IncrementalDBSCAN is quite small and **grows slower** than the cost of applying (`scikit-learns`'s) DBSCAN to a whole data set. In other words, *given that* we have a data set _D_ clustered with IncrementalDBSCAN, and we want to see which cluster would a new object _P_ belong to, it is faster to insert _P_ into the current IncrementalDBSCAN clustering than to apply DBSCAN to the union of _D_ and _P_. 79 | 80 | The cost of **deleting** a data point from IncrementalDBSCAN **grows faster** than the cost of applying (`scikit-learns`'s) DBSCAN to the data set minus that data point. Thus, the cost of deletion in IncrementalDBSCAN is quite small below a certain data set size, but becomes larger as data set size grows. 81 | 82 | These results do not imply that it is very efficient to cluster a whole data set with a series of IncrementalDBSCAN insertions. If we measure the time to cluster a data set with DBSCAN versus to cluster the data by adding the data points one by one to IncrementalDBSCAN, IncrementalDBSCAN will be slower compared to DBSCAN. A typical performance number is that clustering 8,000 data points takes about 10-20 seconds with this implementation. 83 | 84 | See [this notebook](https://github.com/DataOmbudsman/incdbscan/blob/master/notebooks/performance.ipynb) about performance for more details. 85 | 86 | ### Known limitations 87 | 88 | - **Batch insertion**: In the current implementation batch insertion of data points is not efficient, since pairwise distance calculation between new and existing data points is not yet vectorized. 89 | - **Deletion**: Data point deletion can take long in big data sets (big clusters) because of a graph traversal step. There isn't any clear direction of making it more efficient algorithmically. 90 | -------------------------------------------------------------------------------- /images/illustration_circles.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataOmbudsman/incdbscan/2488c191fbd805e6f116d773986d25402e73a9c6/images/illustration_circles.gif -------------------------------------------------------------------------------- /images/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataOmbudsman/incdbscan/2488c191fbd805e6f116d773986d25402e73a9c6/images/performance.png -------------------------------------------------------------------------------- /incdbscan/__init__.py: -------------------------------------------------------------------------------- 1 | from .incrementaldbscan import ( 2 | IncrementalDBSCAN, 3 | IncrementalDBSCANWarning 4 | ) 5 | 6 | 7 | __version__ = '0.3.0' 8 | -------------------------------------------------------------------------------- /incdbscan/_bfscomponentfinder.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from typing import ( 3 | Dict, 4 | List 5 | ) 6 | 7 | import rustworkx as rx 8 | from rustworkx.visit import ( 9 | BFSVisitor, 10 | PruneSearch 11 | ) 12 | 13 | from ._object import ( 14 | NodeId, 15 | Object 16 | ) 17 | 18 | 19 | class BFSComponentFinder(BFSVisitor): 20 | 21 | # Traverse the objects in a BFS manner to find those components of 22 | # objects that need to be split away. A component here is a group of 23 | # objects that all can be linked to the same seed object. Starting from 24 | # the seed objects, expand the graph by adding neighboring objects. 25 | # Note that it could be even faster if the traversal terminted when all of 26 | # the next nodes to be visited are linked to the same seed object -- this 27 | # means that all but one component are traversed completely and they can 28 | # be split away. 29 | 30 | def __init__(self, graph): 31 | self.graph: rx.PyGraph = graph # graph of Objects # pylint: disable=no-member 32 | self.seed_to_component: Dict[NodeId, List[Object]] = defaultdict(set) 33 | self._node_to_seed: Dict[NodeId, NodeId] = defaultdict(int) 34 | 35 | def discover_vertex(self, vertex_node_id): 36 | # If this is the first time discovering a node then the node itself 37 | # will be its own seed. This is the way we keep track of singleton 38 | # nodes (i.e., ones without edges). 39 | 40 | if vertex_node_id not in self._node_to_seed: 41 | self._node_to_seed[vertex_node_id] = vertex_node_id 42 | self.seed_to_component[vertex_node_id].add(self.graph[vertex_node_id]) 43 | 44 | # If the node does not represent a core object then we don't want 45 | # traversal to go in that direction. 46 | 47 | if not self.graph[vertex_node_id].is_core: 48 | raise PruneSearch 49 | 50 | def tree_edge(self, edge): 51 | source_node_id, target_node_id, _ = edge 52 | 53 | # The target of the edge is a new node we see for the first time. Its 54 | # seed will be the seed of the source. 55 | 56 | seed = self._node_to_seed[source_node_id] 57 | self._node_to_seed[target_node_id] = seed 58 | self.seed_to_component[seed].add(self.graph[target_node_id]) 59 | 60 | def non_tree_edge(self, edge): 61 | source_node_id, target_node_id, _ = edge 62 | 63 | # A non-tree edge is the case of merge, that is, when two components 64 | # with different seeds meet. However, we only merge them if the target 65 | # represents a core object in the graph (i.e., dense connection). 66 | 67 | source_seed = self._node_to_seed[source_node_id] 68 | target_seed = self._node_to_seed[target_node_id] 69 | different_seeds = source_seed != target_seed 70 | 71 | if different_seeds and self.graph[target_node_id].is_core: 72 | if source_seed > target_seed: 73 | self._node_to_seed[target_node_id] = source_seed 74 | else: 75 | self._node_to_seed[source_node_id] = target_seed 76 | 77 | seed = self._node_to_seed[target_node_id] 78 | self.seed_to_component[seed].add(self.graph[target_node_id]) 79 | -------------------------------------------------------------------------------- /incdbscan/_deleter.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import rustworkx as rx 4 | 5 | from ._bfscomponentfinder import BFSComponentFinder 6 | from ._labels import CLUSTER_LABEL_NOISE 7 | 8 | 9 | class Deleter: 10 | def __init__(self, eps, min_pts, objects): 11 | self.eps = eps 12 | self.min_pts = min_pts 13 | self.objects = objects 14 | 15 | def delete(self, object_to_delete): 16 | self.objects.delete_object(object_to_delete) 17 | object_deleted = object_to_delete 18 | 19 | ex_cores = self._get_objects_that_lost_core_property(object_deleted) 20 | 21 | update_seeds, non_core_neighbors_of_ex_cores = \ 22 | self._get_update_seeds_and_non_core_neighbors_of_ex_cores( 23 | ex_cores, object_deleted) 24 | 25 | if update_seeds: 26 | # Only for update seeds belonging to the same cluster do we 27 | # have to consider if split is needed. 28 | 29 | update_seeds_by_cluster = \ 30 | self._group_objects_by_cluster(update_seeds) 31 | 32 | for seeds in update_seeds_by_cluster.values(): 33 | components = self._find_components_to_split_away(seeds) 34 | for component in components: 35 | self.objects.set_labels( 36 | component, self.objects.get_next_cluster_label()) 37 | 38 | # Updating labels of border objects that were in the neighborhood 39 | # of objects that lost their core property is always needed. They 40 | # become either borders of other clusters or noise. 41 | 42 | self._set_each_border_object_labels_to_largest_around( 43 | non_core_neighbors_of_ex_cores) 44 | 45 | def _get_objects_that_lost_core_property(self, object_deleted): 46 | for obj in object_deleted.neighbors: 47 | if obj.neighbor_count == self.min_pts - 1: 48 | yield obj 49 | 50 | # The result has to contain the deleted object if it was core 51 | if object_deleted.is_core: 52 | yield object_deleted 53 | 54 | def _get_update_seeds_and_non_core_neighbors_of_ex_cores( 55 | self, 56 | ex_cores, 57 | object_deleted): 58 | 59 | update_seeds = set() 60 | non_core_neighbors_of_ex_cores = set() 61 | 62 | for ex_core in ex_cores: 63 | # The is-core property of objects that became non core need to be 64 | # re-cached 65 | ex_core._clear_is_core_cache() 66 | for neighbor in ex_core.neighbors: 67 | if neighbor.is_core: 68 | update_seeds.add(neighbor) 69 | else: 70 | non_core_neighbors_of_ex_cores.add(neighbor) 71 | 72 | if object_deleted.count == 0: 73 | update_seeds = update_seeds.difference({object_deleted}) 74 | non_core_neighbors_of_ex_cores = \ 75 | non_core_neighbors_of_ex_cores.difference({object_deleted}) 76 | 77 | return update_seeds, non_core_neighbors_of_ex_cores 78 | 79 | def _group_objects_by_cluster(self, objects): 80 | grouped_objects = defaultdict(list) 81 | 82 | for obj in objects: 83 | label = self.objects.get_label(obj) 84 | grouped_objects[label].append(obj) 85 | 86 | return grouped_objects 87 | 88 | def _find_components_to_split_away(self, seed_objects): 89 | if len(seed_objects) == 1: 90 | return [] 91 | 92 | if self._objects_are_neighbors_of_each_other(seed_objects): 93 | return [] 94 | 95 | seed_node_ids = [obj.node_id for obj in seed_objects] 96 | finder = BFSComponentFinder(self.objects.graph) 97 | rx.bfs_search(self.objects.graph, seed_node_ids, finder) 98 | 99 | seed_of_largest, size_of_largest = 0, 0 100 | for seed_id, component in finder.seed_to_component.items(): 101 | component_size = len(component) 102 | if component_size > size_of_largest: 103 | size_of_largest = component_size 104 | seed_of_largest = seed_id 105 | 106 | for seed_id, component in finder.seed_to_component.items(): 107 | if seed_id != seed_of_largest: 108 | yield component 109 | 110 | @staticmethod 111 | def _objects_are_neighbors_of_each_other(objects): 112 | for obj1 in objects: 113 | for obj2 in objects: 114 | if obj2 not in obj1.neighbors: 115 | return False 116 | return True 117 | 118 | def _set_each_border_object_labels_to_largest_around(self, objects_to_set): 119 | cluster_updates = {} 120 | 121 | for obj in objects_to_set: 122 | labels = self._get_cluster_labels_in_neighborhood(obj) 123 | if not labels: 124 | labels.add(CLUSTER_LABEL_NOISE) 125 | 126 | cluster_updates[obj] = max(labels) 127 | 128 | for obj, new_cluster_label in cluster_updates.items(): 129 | self.objects.set_label(obj, new_cluster_label) 130 | 131 | def _get_cluster_labels_in_neighborhood(self, obj): 132 | return {self.objects.get_label(neighbor) 133 | for neighbor in obj.neighbors 134 | if neighbor.is_core} 135 | -------------------------------------------------------------------------------- /incdbscan/_inserter.py: -------------------------------------------------------------------------------- 1 | from ._labels import ( 2 | CLUSTER_LABEL_NOISE, 3 | CLUSTER_LABEL_UNCLASSIFIED 4 | ) 5 | 6 | 7 | class Inserter: 8 | def __init__(self, eps, min_pts, objects): 9 | self.eps = eps 10 | self.min_pts = min_pts 11 | self.objects = objects 12 | 13 | def insert(self, object_value): 14 | object_inserted = self.objects.insert_object(object_value) 15 | 16 | new_core_neighbors, old_core_neighbors = \ 17 | self._separate_core_neighbors_by_novelty(object_inserted) 18 | 19 | if not new_core_neighbors: 20 | # If there is no new core object, only the new object has to be 21 | # put in a cluster. 22 | 23 | if old_core_neighbors: 24 | # If there are already core objects near to the new object, 25 | # the new object is put in the most recent cluster. This is 26 | # similar to case "Absorption" in the paper but not defined 27 | # there. 28 | 29 | label_of_new_object = max([ 30 | self.objects.get_label(obj) for obj in old_core_neighbors 31 | ]) 32 | 33 | else: 34 | # If the new object does not have any core neighbors, 35 | # it becomes a noise. Called case "Noise" in the paper. 36 | 37 | label_of_new_object = CLUSTER_LABEL_NOISE 38 | 39 | self.objects.set_label(object_inserted, label_of_new_object) 40 | return 41 | 42 | update_seeds = self._get_update_seeds(new_core_neighbors) 43 | 44 | connected_components_in_update_seeds = \ 45 | self.objects.get_connected_components_within_objects(update_seeds) 46 | 47 | for component in connected_components_in_update_seeds: 48 | effective_cluster_labels = \ 49 | self._get_effective_cluster_labels_of_objects(component) 50 | 51 | if not effective_cluster_labels: 52 | # If in a connected component of update seeds there are only 53 | # previously unclassified and noise objects, a new cluster is 54 | # created. Corresponds to case "Creation" in the paper. 55 | 56 | next_cluster_label = self.objects.get_next_cluster_label() 57 | self.objects.set_labels(component, next_cluster_label) 58 | 59 | else: 60 | # If in a connected component of update seeds there are 61 | # already clustered objects, all objects in the component 62 | # will be merged into the most recent cluster. 63 | # Corresponds to cases "Absorption" and "Merge" in the paper. 64 | 65 | max_label = max(effective_cluster_labels) 66 | self.objects.set_labels(component, max_label) 67 | 68 | for label in effective_cluster_labels: 69 | self.objects.change_labels(label, max_label) 70 | 71 | # Finally all neighbors of each new core object inherits a label from 72 | # its new core neighbor, thereby affecting border and noise objects, 73 | # and the object being inserted. 74 | 75 | self._set_cluster_label_around_new_core_neighbors(new_core_neighbors) 76 | 77 | def _separate_core_neighbors_by_novelty(self, object_inserted): 78 | new_cores = set() 79 | old_cores = set() 80 | 81 | for obj in object_inserted.neighbors: 82 | if obj.neighbor_count == self.min_pts: 83 | new_cores.add(obj) 84 | elif obj.neighbor_count > self.min_pts: 85 | old_cores.add(obj) 86 | 87 | # If the inserted object is core, it is a new core 88 | 89 | if object_inserted in old_cores: 90 | old_cores.remove(object_inserted) 91 | new_cores.add(object_inserted) 92 | 93 | return new_cores, old_cores 94 | 95 | def _get_update_seeds(self, new_core_neighbors): 96 | seeds = set() 97 | 98 | for new_core_neighbor in new_core_neighbors: 99 | core_neighbors = [obj for obj in new_core_neighbor.neighbors 100 | if obj.neighbor_count >= self.min_pts] 101 | seeds.update(core_neighbors) 102 | 103 | return seeds 104 | 105 | def _get_effective_cluster_labels_of_objects(self, objects): 106 | non_effective_cluster_labels = {CLUSTER_LABEL_UNCLASSIFIED, 107 | CLUSTER_LABEL_NOISE} 108 | effective_cluster_labels = set() 109 | 110 | for obj in objects: 111 | label = self.objects.get_label(obj) 112 | if label not in non_effective_cluster_labels: 113 | effective_cluster_labels.add(label) 114 | 115 | return effective_cluster_labels 116 | 117 | def _set_cluster_label_around_new_core_neighbors(self, new_core_neighbors): 118 | for obj in new_core_neighbors: 119 | label = self.objects.get_label(obj) 120 | self.objects.set_labels(obj.neighbors, label) 121 | -------------------------------------------------------------------------------- /incdbscan/_labels.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | 4 | ClusterLabel = int 5 | 6 | CLUSTER_LABEL_UNCLASSIFIED: ClusterLabel = -2 7 | CLUSTER_LABEL_NOISE: ClusterLabel = -1 8 | CLUSTER_LABEL_FIRST_CLUSTER: ClusterLabel = 0 9 | 10 | 11 | class LabelHandler: 12 | def __init__(self): 13 | self._label_to_objects = defaultdict(set) 14 | self._object_to_label = {} 15 | 16 | def set_label(self, obj, label): 17 | previous_label = self._object_to_label[obj] 18 | self._label_to_objects[previous_label].remove(obj) 19 | self._label_to_objects[label].add(obj) 20 | self._object_to_label[obj] = label 21 | 22 | def set_label_of_inserted_object(self, obj): 23 | self._object_to_label[obj] = CLUSTER_LABEL_UNCLASSIFIED 24 | self._label_to_objects[CLUSTER_LABEL_UNCLASSIFIED].add(obj) 25 | 26 | def set_labels(self, objects, label): 27 | for obj in objects: 28 | self.set_label(obj, label) 29 | 30 | def delete_label_of_deleted_object(self, obj): 31 | label = self.get_label(obj) 32 | self._label_to_objects[label].remove(obj) 33 | 34 | def get_label(self, obj): 35 | return self._object_to_label[obj] 36 | 37 | def get_next_cluster_label(self): 38 | return max(self._label_to_objects.keys()) + 1 39 | 40 | def change_labels(self, change_from, change_to): 41 | affected_objects = self._label_to_objects.pop(change_from) 42 | self._label_to_objects[change_to].update(affected_objects) 43 | 44 | for obj in affected_objects: 45 | self._object_to_label[obj] = change_to 46 | -------------------------------------------------------------------------------- /incdbscan/_neighbor_searcher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.neighbors import NearestNeighbors 3 | from sortedcontainers import SortedList 4 | 5 | 6 | class NeighborSearcher: 7 | def __init__(self, radius, metric, p): 8 | self.neighbor_searcher = \ 9 | NearestNeighbors(radius=radius, metric=metric, p=p) 10 | self.values = np.array([]) 11 | self.ids = SortedList() 12 | 13 | def insert(self, new_value, new_id): 14 | self.ids.add(new_id) 15 | position = self.ids.index(new_id) 16 | 17 | self._insert_into_array(new_value, position) 18 | self.neighbor_searcher = self.neighbor_searcher.fit(self.values) 19 | 20 | def _insert_into_array(self, new_value, position): 21 | extended = np.insert(self.values, position, new_value, axis=0) 22 | if not self.values.size: 23 | extended = extended.reshape(1, -1) 24 | self.values = extended 25 | 26 | def query_neighbors(self, query_value): 27 | neighbor_indices = self.neighbor_searcher.radius_neighbors( 28 | [query_value], return_distance=False)[0] 29 | 30 | for ix in neighbor_indices: 31 | yield self.ids[ix] 32 | 33 | def delete(self, id_): 34 | position = self.ids.index(id_) 35 | del self.ids[position] 36 | self.values = np.delete(self.values, position, axis=0) 37 | -------------------------------------------------------------------------------- /incdbscan/_object.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property 2 | 3 | 4 | NodeId = int 5 | ObjectId = int 6 | 7 | 8 | class Object: 9 | def __init__(self, id_, min_pts): 10 | self.id: ObjectId = id_ 11 | self.node_id: NodeId = None 12 | self.count = 1 13 | self.neighbors = {self} 14 | self.neighbor_count = 0 15 | self.min_pts = min_pts 16 | 17 | @cached_property 18 | def is_core(self): 19 | # Note that this property is only valid during deletion 20 | return self.neighbor_count >= self.min_pts 21 | 22 | def _clear_is_core_cache(self): 23 | self.__dict__.pop('is_core', None) 24 | 25 | def __repr__(self): 26 | return f'{self.id}_' 27 | -------------------------------------------------------------------------------- /incdbscan/_objects.py: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Dict, 3 | List, 4 | Set 5 | ) 6 | 7 | import rustworkx as rx 8 | 9 | from ._labels import LabelHandler 10 | from ._neighbor_searcher import NeighborSearcher 11 | from ._object import ( 12 | NodeId, 13 | Object, 14 | ObjectId 15 | ) 16 | from ._utils import hash_ 17 | 18 | 19 | class Objects(LabelHandler): 20 | def __init__(self, eps, min_pts, metric, p): 21 | super().__init__() 22 | 23 | self.graph = rx.PyGraph(multigraph=False) # pylint: disable=no-member 24 | self._object_id_to_node_id: Dict[ObjectId, NodeId] = {} 25 | 26 | self.neighbor_searcher = \ 27 | NeighborSearcher(radius=eps, metric=metric, p=p) 28 | self.min_pts = min_pts 29 | 30 | def get_object(self, value): 31 | object_id = hash_(value) 32 | if object_id in self._object_id_to_node_id: 33 | obj = self._get_object_from_object_id(object_id) 34 | return obj 35 | return None 36 | 37 | def insert_object(self, value): 38 | object_id = hash_(value) 39 | 40 | if object_id in self._object_id_to_node_id: 41 | obj = self._get_object_from_object_id(object_id) 42 | obj.count += 1 43 | for neighbor in obj.neighbors: 44 | neighbor.neighbor_count += 1 45 | return obj 46 | 47 | new_object = Object(object_id, self.min_pts) 48 | 49 | self._insert_graph_metadata(new_object) 50 | self.set_label_of_inserted_object(new_object) 51 | self.neighbor_searcher.insert(value, object_id) 52 | self._update_neighbors_during_insertion(new_object, value) 53 | return new_object 54 | 55 | def _insert_graph_metadata(self, new_object): 56 | node_id = self.graph.add_node(new_object) 57 | new_object.node_id = node_id 58 | object_id = new_object.id 59 | self._object_id_to_node_id[object_id] = node_id 60 | 61 | def _update_neighbors_during_insertion(self, object_inserted, new_value): 62 | neighbors = self._get_neighbors(new_value) 63 | for obj in neighbors: 64 | obj.neighbor_count += 1 65 | if obj.id != object_inserted.id: 66 | object_inserted.neighbor_count += obj.count 67 | obj.neighbors.add(object_inserted) 68 | object_inserted.neighbors.add(obj) 69 | self.graph.add_edge(object_inserted.node_id, obj.node_id, None) 70 | 71 | def _get_neighbors(self, query_value): 72 | neighbor_ids = self.neighbor_searcher.query_neighbors(query_value) 73 | 74 | for id_ in neighbor_ids: 75 | obj = self._get_object_from_object_id(id_) 76 | yield obj 77 | 78 | def _get_object_from_object_id(self, object_id): 79 | node_id = self._object_id_to_node_id[object_id] 80 | obj = self.graph[node_id] 81 | return obj 82 | 83 | def delete_object(self, obj): 84 | obj.count -= 1 85 | remove_from_data = obj.count == 0 86 | 87 | for neighbor in obj.neighbors: 88 | neighbor.neighbor_count -= 1 89 | if remove_from_data: 90 | if neighbor.id != obj.id: 91 | neighbor.neighbors.remove(obj) 92 | 93 | if remove_from_data: 94 | self._delete_graph_metadata(obj) 95 | self.neighbor_searcher.delete(obj.id) 96 | self.delete_label_of_deleted_object(obj) 97 | 98 | def _delete_graph_metadata(self, deleted_object): 99 | node_id = deleted_object.node_id 100 | self.graph.remove_node(node_id) 101 | del self._object_id_to_node_id[deleted_object.id] 102 | 103 | def get_connected_components_within_objects( 104 | self, objects: Set[Object]) -> List[Set[Object]]: 105 | 106 | if len(objects) == 1: 107 | return [objects] 108 | 109 | node_ids = [obj.node_id for obj in objects] 110 | subgraph = self.graph.subgraph(node_ids) 111 | components_as_ids: List[Set[NodeId]] = rx.connected_components(subgraph) # pylint: disable=no-member 112 | 113 | def _get_original_object(subgraph, subgraph_node_id): 114 | original_node_id = subgraph[subgraph_node_id].node_id 115 | return self.graph[original_node_id] 116 | 117 | components_as_objects = [] 118 | for component in components_as_ids: 119 | component_objects = { 120 | _get_original_object(subgraph, subgraph_node_id) 121 | for subgraph_node_id in component 122 | } 123 | components_as_objects.append(component_objects) 124 | 125 | return components_as_objects 126 | -------------------------------------------------------------------------------- /incdbscan/_utils.py: -------------------------------------------------------------------------------- 1 | import xxhash 2 | from sklearn.utils.validation import check_array 3 | 4 | 5 | def hash_(array): 6 | return xxhash.xxh64_intdigest(array.tobytes()) >> 1 7 | 8 | 9 | def input_check(X): 10 | return check_array(X, dtype=float, accept_large_sparse=False) 11 | -------------------------------------------------------------------------------- /incdbscan/incrementaldbscan.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import numpy as np 4 | 5 | from ._deleter import Deleter 6 | from ._inserter import Inserter 7 | from ._objects import Objects 8 | from ._utils import input_check 9 | 10 | 11 | class IncrementalDBSCAN: 12 | """The incremental version of DBSCAN, a density-based clustering algorithm 13 | that handles outliers. 14 | 15 | After an initial clustering of an initial object set (i.e., set of data 16 | points), the object set can at any time be updated by increments of any 17 | size. An increment can be either the insertion or the deletion of objects. 18 | 19 | After each update, the result of the clustering is the same as if the 20 | updated object set (i.e., the initial object set modified by all 21 | increments) was clustered by DBSCAN. However, this result is reached by 22 | using information from the previous state of the clustering, and without 23 | the need of applying DBSCAN to the whole updated object set. 24 | 25 | Parameters 26 | ---------- 27 | eps : float, optional (default=0.5) 28 | The radius of neighborhood calculation. An object is the neighbor of 29 | another if the distance between them is no more than eps. 30 | 31 | min_pts : int, optional (default=1) 32 | The minimum number of neighbors that an object needs to have to be a 33 | core object of a cluster. 34 | 35 | metric : string or callable, optional (default='minkowski') 36 | The distance metric to use to calculate distance between data objects. 37 | Accepts metrics that are accepted by scikit-learn's NearestNeighbors 38 | class, excluding 'precomputed'. The default is 'minkowski', which is 39 | equivalent to the Euclidean distance if p=2. 40 | 41 | p : float or int, optional (default=2) 42 | Parameter for Minkowski distance if metric='minkowski'. 43 | 44 | References 45 | ---------- 46 | Ester et al. 1998. Incremental Clustering for Mining in a Data Warehousing 47 | Environment. In: Proceedings of the 24th International Conference on Very 48 | Large Data Bases (VLDB 1998). 49 | 50 | """ 51 | 52 | def __init__(self, eps=1, min_pts=5, metric='minkowski', p=2): 53 | self.eps = eps 54 | self.min_pts = min_pts 55 | self.metric = metric 56 | self.p = p 57 | 58 | self._objects = Objects(self.eps, self.min_pts, self.metric, self.p) 59 | self._inserter = Inserter(self.eps, self.min_pts, self._objects) 60 | self._deleter = Deleter(self.eps, self.min_pts, self._objects) 61 | 62 | def insert(self, X): 63 | """Insert objects into the object set, then update clustering. 64 | 65 | Parameters 66 | ---------- 67 | X : array-like of shape (n_samples, n_features) 68 | The data objects to be inserted into the object set. 69 | 70 | Returns 71 | ------- 72 | self 73 | 74 | """ 75 | X = input_check(X) 76 | 77 | for value in X: 78 | self._inserter.insert(value) 79 | 80 | return self 81 | 82 | def delete(self, X): 83 | """Delete objects from object set, then update clustering. 84 | 85 | Parameters 86 | ---------- 87 | X : array-like of shape (n_samples, n_features) 88 | The data objects to be deleted from the object set. 89 | 90 | Returns 91 | ------- 92 | self 93 | 94 | """ 95 | X = input_check(X) 96 | 97 | for ix, value in enumerate(X): 98 | obj = self._objects.get_object(value) 99 | 100 | if obj: 101 | self._deleter.delete(obj) 102 | 103 | else: 104 | warnings.warn( 105 | IncrementalDBSCANWarning( 106 | f'Object at position {ix} was not deleted because ' 107 | 'there is no such object in the object set.' 108 | ) 109 | ) 110 | 111 | return self 112 | 113 | def get_cluster_labels(self, X): 114 | """Get cluster labels of objects. 115 | 116 | Parameters 117 | ---------- 118 | X : array-like of shape (n_samples, n_features) 119 | The data objects to get labels for. 120 | 121 | Returns 122 | ------- 123 | labels : ndarray of shape (n_samples,) 124 | Cluster labels. Effective labels start from 0. -1 means the 125 | object is noise. numpy.nan means the object was not in the 126 | object set. 127 | 128 | """ 129 | X = input_check(X) 130 | 131 | labels = np.zeros(len(X)) 132 | 133 | for ix, value in enumerate(X): 134 | obj = self._objects.get_object(value) 135 | 136 | if obj: 137 | label = self._objects.get_label(obj) 138 | 139 | else: 140 | label = np.nan 141 | warnings.warn( 142 | IncrementalDBSCANWarning( 143 | f'No label was retrieved for object at position {ix} ' 144 | 'because there is no such object in the object set.' 145 | ) 146 | ) 147 | 148 | labels[ix] = label 149 | 150 | return labels 151 | 152 | 153 | class IncrementalDBSCANWarning(Warning): 154 | pass 155 | -------------------------------------------------------------------------------- /incdbscan/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.datasets import make_blobs 4 | 5 | from incdbscan import IncrementalDBSCAN 6 | 7 | 8 | EPS = 1.5 9 | 10 | 11 | @pytest.fixture 12 | def incdbscan3(): 13 | return IncrementalDBSCAN(eps=EPS, min_pts=3) 14 | 15 | 16 | @pytest.fixture 17 | def incdbscan4(): 18 | return IncrementalDBSCAN(eps=EPS, min_pts=4) 19 | 20 | 21 | @pytest.fixture 22 | def blob_in_middle(): 23 | # pylint: disable=unbalanced-tuple-unpacking 24 | blob, _ = make_blobs( 25 | n_samples=10, 26 | centers=[[0, 0]], 27 | n_features=2, 28 | cluster_std=0.4, 29 | random_state=123, 30 | return_centers=False 31 | ) 32 | return blob 33 | 34 | 35 | @pytest.fixture 36 | def object_far_away(): 37 | return np.array([[10., 10.]]) 38 | 39 | 40 | @pytest.fixture 41 | def point_at_origin(): 42 | return np.array([[0., 0.]]) 43 | 44 | 45 | @pytest.fixture 46 | def three_points_on_the_left(): 47 | return np.array([ 48 | [-EPS, 0], 49 | [-EPS * 2, 0], 50 | [-EPS * 3, 0], 51 | ]) 52 | 53 | 54 | @pytest.fixture 55 | def three_points_on_the_top(): 56 | return np.array([ 57 | [0, EPS], 58 | [0, EPS * 2], 59 | [0, EPS * 3], 60 | ]) 61 | 62 | 63 | @pytest.fixture 64 | def three_points_at_the_bottom(): 65 | return np.array([ 66 | [0, -EPS], 67 | [0, -EPS * 2], 68 | [0, -EPS * 3], 69 | ]) 70 | 71 | 72 | @pytest.fixture 73 | def hourglass_on_the_right(): 74 | return np.array([ 75 | [EPS, EPS * 2], 76 | [EPS, EPS * 2], 77 | [EPS, EPS], 78 | [EPS, 0], 79 | [EPS, -EPS], 80 | [EPS, -EPS * 2], 81 | [EPS, -EPS * 2], 82 | ]) 83 | -------------------------------------------------------------------------------- /incdbscan/tests/test_deleter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from conftest import EPS 3 | 4 | from testutils import ( 5 | CLUSTER_LABEL_FIRST_CLUSTER, 6 | CLUSTER_LABEL_NOISE, 7 | assert_cluster_labels, 8 | assert_label_of_object_is_among_possible_ones, 9 | assert_split_creates_new_labels_for_new_clusters, 10 | insert_objects_then_assert_cluster_labels, 11 | reflect_horizontally 12 | ) 13 | 14 | 15 | def test_after_deleting_enough_objects_only_noise_remain( 16 | incdbscan4, 17 | blob_in_middle): 18 | 19 | incdbscan4.insert(blob_in_middle) 20 | 21 | for i in range(len(blob_in_middle) - 1): 22 | object_to_delete = blob_in_middle[[i]] 23 | 24 | incdbscan4.delete(object_to_delete) 25 | 26 | expected_label = ( 27 | CLUSTER_LABEL_NOISE 28 | if i > incdbscan4.min_pts + 1 29 | else CLUSTER_LABEL_FIRST_CLUSTER 30 | ) 31 | 32 | assert_cluster_labels(incdbscan4, blob_in_middle[i+1:], expected_label) 33 | 34 | 35 | def test_deleting_cores_only_makes_borders_noise(incdbscan4, point_at_origin): 36 | point_to_delete = point_at_origin 37 | incdbscan4.insert(point_to_delete) 38 | 39 | border = np.array([ 40 | [EPS, 0], 41 | [0, EPS], 42 | [0, -EPS], 43 | ]) 44 | 45 | incdbscan4.insert(border) 46 | incdbscan4.delete(point_to_delete) 47 | 48 | assert_cluster_labels(incdbscan4, border, CLUSTER_LABEL_NOISE) 49 | 50 | 51 | def test_objects_losing_core_property_can_keep_cluster_id( 52 | incdbscan3, 53 | point_at_origin): 54 | 55 | point_to_delete = point_at_origin 56 | 57 | core_points = np.array([ 58 | [EPS, 0], 59 | [0, EPS], 60 | [EPS, EPS], 61 | ]) 62 | 63 | all_points = np.vstack([point_to_delete, core_points]) 64 | 65 | insert_objects_then_assert_cluster_labels( 66 | incdbscan3, all_points, CLUSTER_LABEL_FIRST_CLUSTER) 67 | 68 | incdbscan3.delete(point_to_delete) 69 | assert_cluster_labels(incdbscan3, core_points, CLUSTER_LABEL_FIRST_CLUSTER) 70 | 71 | 72 | def test_border_object_can_switch_to_other_cluster( 73 | incdbscan4, 74 | point_at_origin): 75 | 76 | border = point_at_origin 77 | incdbscan4.insert(border) 78 | 79 | cluster_1 = np.array([ 80 | [EPS, 0], 81 | [EPS, EPS], 82 | [EPS, -EPS], 83 | ]) 84 | cluster_1_expected_label = CLUSTER_LABEL_FIRST_CLUSTER 85 | 86 | cluster_2 = reflect_horizontally(cluster_1) 87 | cluster_2_expected_label = cluster_1_expected_label + 1 88 | 89 | insert_objects_then_assert_cluster_labels( 90 | incdbscan4, cluster_1, cluster_1_expected_label) 91 | 92 | insert_objects_then_assert_cluster_labels( 93 | incdbscan4, cluster_2, cluster_2_expected_label 94 | ) 95 | 96 | assert_cluster_labels(incdbscan4, border, cluster_2_expected_label) 97 | 98 | incdbscan4.delete(cluster_2[[0]]) 99 | 100 | assert_cluster_labels(incdbscan4, border, cluster_1_expected_label) 101 | 102 | 103 | def test_borders_around_point_losing_core_property_can_become_noise( 104 | incdbscan4, 105 | point_at_origin): 106 | 107 | point_to_delete = point_at_origin 108 | 109 | core = np.array([[0, EPS]]) 110 | 111 | border = np.array([ 112 | [0, EPS * 2], 113 | [EPS, EPS] 114 | ]) 115 | 116 | all_points = np.vstack([point_to_delete, core, border]) 117 | all_points_but_point_to_delete = np.vstack([core, border]) 118 | 119 | insert_objects_then_assert_cluster_labels( 120 | incdbscan4, all_points, CLUSTER_LABEL_FIRST_CLUSTER) 121 | 122 | incdbscan4.delete(point_to_delete) 123 | 124 | assert_cluster_labels( 125 | incdbscan4, all_points_but_point_to_delete, CLUSTER_LABEL_NOISE) 126 | 127 | 128 | def test_core_property_of_singleton_update_seed_is_kept_after_deletion( 129 | incdbscan3, 130 | point_at_origin): 131 | 132 | point_to_delete = point_at_origin 133 | 134 | cores = np.array([ 135 | [EPS, 0], 136 | [2 * EPS, 0], 137 | [2 * EPS, 0], 138 | ]) 139 | 140 | lonely = np.array([[-EPS, 0]]) 141 | 142 | all_points = np.vstack([point_to_delete, cores, lonely]) 143 | 144 | insert_objects_then_assert_cluster_labels( 145 | incdbscan3, all_points, CLUSTER_LABEL_FIRST_CLUSTER) 146 | 147 | incdbscan3.delete(point_to_delete) 148 | 149 | assert_cluster_labels(incdbscan3, cores, CLUSTER_LABEL_FIRST_CLUSTER) 150 | assert_cluster_labels(incdbscan3, lonely, CLUSTER_LABEL_NOISE) 151 | 152 | 153 | def test_cluster_id_of_single_component_update_seeds_is_kept_after_deletion( 154 | incdbscan3, 155 | point_at_origin): 156 | 157 | point_to_delete = point_at_origin 158 | 159 | cores = np.array([ 160 | [EPS, 0], 161 | [EPS, 0], 162 | [2 * EPS, 0], 163 | ]) 164 | 165 | lonely = np.array([[-EPS, 0]]) 166 | 167 | all_points = np.vstack([point_to_delete, cores, lonely]) 168 | 169 | insert_objects_then_assert_cluster_labels( 170 | incdbscan3, all_points, CLUSTER_LABEL_FIRST_CLUSTER) 171 | 172 | incdbscan3.delete(point_to_delete) 173 | 174 | assert_cluster_labels(incdbscan3, cores, CLUSTER_LABEL_FIRST_CLUSTER) 175 | assert_cluster_labels(incdbscan3, lonely, CLUSTER_LABEL_NOISE) 176 | 177 | 178 | def test_cluster_id_of_single_component_objects_is_kept_after_deletion( 179 | incdbscan3, 180 | point_at_origin): 181 | 182 | point_to_delete = point_at_origin 183 | 184 | cores = np.array([ 185 | [EPS, 0], 186 | [0, EPS], 187 | [EPS, EPS], 188 | [EPS, EPS], 189 | ]) 190 | 191 | all_points = np.vstack([point_to_delete, cores]) 192 | 193 | insert_objects_then_assert_cluster_labels( 194 | incdbscan3, all_points, CLUSTER_LABEL_FIRST_CLUSTER) 195 | 196 | incdbscan3.delete(point_to_delete) 197 | 198 | assert_cluster_labels(incdbscan3, cores, CLUSTER_LABEL_FIRST_CLUSTER) 199 | 200 | 201 | def test_simple_two_way_split( 202 | incdbscan3, 203 | point_at_origin, 204 | three_points_on_the_left): 205 | 206 | point_to_delete = point_at_origin 207 | points_left = three_points_on_the_left 208 | points_right = reflect_horizontally(points_left) 209 | 210 | all_points = np.vstack([point_to_delete, points_left, points_right]) 211 | 212 | insert_objects_then_assert_cluster_labels( 213 | incdbscan3, all_points, CLUSTER_LABEL_FIRST_CLUSTER) 214 | 215 | incdbscan3.delete(point_to_delete) 216 | 217 | assert_split_creates_new_labels_for_new_clusters( 218 | incdbscan3, [points_left, points_right], CLUSTER_LABEL_FIRST_CLUSTER) 219 | 220 | 221 | def test_simple_two_way_split_with_noise( 222 | incdbscan3, 223 | point_at_origin, 224 | three_points_on_the_left, 225 | three_points_on_the_top, 226 | three_points_at_the_bottom): 227 | 228 | point_to_delete = point_at_origin 229 | points_left = three_points_on_the_left 230 | points_top = three_points_on_the_top 231 | points_bottom = three_points_at_the_bottom[:-1] 232 | 233 | all_points = np.vstack([ 234 | point_to_delete, 235 | points_left, 236 | points_top, 237 | points_bottom 238 | ]) 239 | 240 | insert_objects_then_assert_cluster_labels( 241 | incdbscan3, all_points, CLUSTER_LABEL_FIRST_CLUSTER) 242 | 243 | incdbscan3.delete(point_to_delete) 244 | 245 | assert_split_creates_new_labels_for_new_clusters( 246 | incdbscan3, [points_left, points_top], CLUSTER_LABEL_FIRST_CLUSTER) 247 | 248 | assert_cluster_labels(incdbscan3, points_bottom, CLUSTER_LABEL_NOISE) 249 | 250 | 251 | def test_three_way_split( 252 | incdbscan3, 253 | point_at_origin, 254 | three_points_on_the_left, 255 | three_points_on_the_top, 256 | three_points_at_the_bottom): 257 | 258 | point_to_delete = point_at_origin 259 | points_left = three_points_on_the_left 260 | points_top = three_points_on_the_top 261 | points_bottom = three_points_at_the_bottom 262 | 263 | all_points = np.vstack([ 264 | point_to_delete, 265 | points_left, 266 | points_top, 267 | points_bottom 268 | ]) 269 | 270 | insert_objects_then_assert_cluster_labels( 271 | incdbscan3, all_points, CLUSTER_LABEL_FIRST_CLUSTER) 272 | 273 | incdbscan3.delete(point_to_delete) 274 | 275 | assert_split_creates_new_labels_for_new_clusters( 276 | incdbscan3, 277 | [points_left, points_top, points_bottom], 278 | CLUSTER_LABEL_FIRST_CLUSTER 279 | ) 280 | 281 | 282 | def test_simultaneous_split_and_non_split( 283 | incdbscan3, 284 | point_at_origin, 285 | three_points_on_the_left): 286 | 287 | point_to_delete = point_at_origin 288 | points_left = three_points_on_the_left 289 | 290 | points_right = np.array([ 291 | [0, EPS], 292 | [0, -EPS], 293 | [EPS, 0], 294 | [EPS, EPS], 295 | [EPS, -EPS], 296 | ]) 297 | 298 | all_points = np.vstack([point_to_delete, points_left, points_right]) 299 | 300 | insert_objects_then_assert_cluster_labels( 301 | incdbscan3, all_points, CLUSTER_LABEL_FIRST_CLUSTER) 302 | 303 | incdbscan3.delete(point_to_delete) 304 | 305 | assert_split_creates_new_labels_for_new_clusters( 306 | incdbscan3, [points_left, points_right], CLUSTER_LABEL_FIRST_CLUSTER) 307 | 308 | 309 | def test_two_way_split_with_non_dense_bridge(incdbscan4, point_at_origin): 310 | point_to_delete = bridge_point = point_at_origin 311 | 312 | points_left = np.array([ 313 | [0, -EPS], 314 | [0, -EPS * 2], 315 | [0, -EPS * 2], 316 | [0, -EPS * 3], 317 | [0, -EPS * 3], 318 | ]) 319 | 320 | points_right = np.array([ 321 | [0, EPS], 322 | [0, EPS * 2], 323 | [0, EPS * 2], 324 | [0, EPS * 3], 325 | [0, EPS * 3], 326 | ]) 327 | 328 | all_points = np.vstack([ 329 | bridge_point, point_to_delete, points_left, points_right 330 | ]) 331 | 332 | insert_objects_then_assert_cluster_labels( 333 | incdbscan4, all_points, CLUSTER_LABEL_FIRST_CLUSTER) 334 | 335 | incdbscan4.delete(point_to_delete) 336 | 337 | assert_split_creates_new_labels_for_new_clusters( 338 | incdbscan4, [points_left, points_right], CLUSTER_LABEL_FIRST_CLUSTER) 339 | 340 | assert_label_of_object_is_among_possible_ones( 341 | incdbscan4, 342 | bridge_point, 343 | {CLUSTER_LABEL_FIRST_CLUSTER, CLUSTER_LABEL_FIRST_CLUSTER + 1} 344 | ) 345 | 346 | 347 | def test_simultaneous_splits_within_two_clusters( 348 | incdbscan4, 349 | point_at_origin, 350 | hourglass_on_the_right): 351 | 352 | point_to_delete = point_at_origin 353 | points_right = hourglass_on_the_right 354 | points_left = reflect_horizontally(points_right) 355 | 356 | incdbscan4.insert(point_to_delete) 357 | 358 | cluster_1_expected_label = CLUSTER_LABEL_FIRST_CLUSTER 359 | insert_objects_then_assert_cluster_labels( 360 | incdbscan4, points_left, cluster_1_expected_label) 361 | 362 | cluster_2_expected_label = CLUSTER_LABEL_FIRST_CLUSTER + 1 363 | insert_objects_then_assert_cluster_labels( 364 | incdbscan4, points_right, cluster_2_expected_label) 365 | 366 | incdbscan4.delete(point_to_delete) 367 | 368 | expected_clusters = [ 369 | points_left[:3], points_left[-3:], points_right[:3], points_right[-3:] 370 | ] 371 | 372 | assert_split_creates_new_labels_for_new_clusters( 373 | incdbscan4, expected_clusters, CLUSTER_LABEL_FIRST_CLUSTER) 374 | 375 | expected_cluster_labels_left = { 376 | incdbscan4.get_cluster_labels(points_left[[2]])[0], 377 | incdbscan4.get_cluster_labels(points_left[[4]])[0], 378 | } 379 | 380 | assert_label_of_object_is_among_possible_ones( 381 | incdbscan4, points_left[[3]], expected_cluster_labels_left) 382 | 383 | expected_cluster_labels_right = { 384 | incdbscan4.get_cluster_labels(points_right[[2]])[0], 385 | incdbscan4.get_cluster_labels(points_right[[4]])[0] 386 | } 387 | 388 | assert_label_of_object_is_among_possible_ones( 389 | incdbscan4, points_right[[3]], expected_cluster_labels_right) 390 | 391 | 392 | def test_two_non_dense_bridges(incdbscan4, point_at_origin): 393 | point_to_delete = point_at_origin 394 | 395 | points_left = np.array([ 396 | [-EPS, 0], 397 | [-EPS, 0], 398 | [-EPS, -EPS], 399 | [-EPS, -EPS], 400 | [-EPS, -EPS * 2], 401 | ]) 402 | points_right = reflect_horizontally(points_left) 403 | 404 | points_top = np.array([ 405 | [0, EPS], 406 | [0, EPS], 407 | [0, EPS * 2], 408 | [0, EPS * 2], 409 | [0, EPS * 3], 410 | [0, EPS * 3], 411 | [0, EPS * 4], 412 | [0, EPS * 4], 413 | ]) 414 | 415 | bottom_bridge = np.array([[0, -EPS * 2]]) 416 | 417 | all_points = np.vstack([ 418 | point_to_delete, points_left, points_right, points_top, bottom_bridge 419 | ]) 420 | 421 | insert_objects_then_assert_cluster_labels( 422 | incdbscan4, all_points, CLUSTER_LABEL_FIRST_CLUSTER) 423 | 424 | incdbscan4.delete(point_to_delete) 425 | 426 | expected_clusters = [points_left, points_right, points_top] 427 | 428 | assert_split_creates_new_labels_for_new_clusters( 429 | incdbscan4, expected_clusters, CLUSTER_LABEL_FIRST_CLUSTER) 430 | -------------------------------------------------------------------------------- /incdbscan/tests/test_incrementaldbscan.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from incdbscan import ( 4 | IncrementalDBSCAN, 5 | IncrementalDBSCANWarning 6 | ) 7 | from testutils import ( 8 | CLUSTER_LABEL_NOISE, 9 | delete_object_and_assert_error, 10 | delete_object_and_assert_no_warning, 11 | delete_object_and_assert_warning, 12 | get_label_and_assert_error, 13 | get_label_and_assert_no_warning, 14 | get_label_and_assert_warning, 15 | insert_object_and_assert_error, 16 | insert_objects_then_assert_cluster_labels 17 | ) 18 | 19 | 20 | def test_error_when_input_is_non_numeric(incdbscan3): 21 | inputs_not_welcomed = np.array([ 22 | [1, 2, 'x'], 23 | [1, 2, None], 24 | [1, 2, np.nan], 25 | [1, 2, np.inf], 26 | ]) 27 | 28 | for i in range(len(inputs_not_welcomed)): 29 | input_ = inputs_not_welcomed[[i]] 30 | 31 | insert_object_and_assert_error(incdbscan3, input_, ValueError) 32 | delete_object_and_assert_error(incdbscan3, input_, ValueError) 33 | get_label_and_assert_error(incdbscan3, input_, ValueError) 34 | 35 | 36 | def test_handling_of_same_object_with_different_dtype(incdbscan3): 37 | object_as_int = np.array([[1, 2]]) 38 | object_as_float = np.array([[1., 2.]]) 39 | 40 | incdbscan3.insert(object_as_int) 41 | 42 | assert incdbscan3.get_cluster_labels(object_as_int) == \ 43 | incdbscan3.get_cluster_labels(object_as_float) 44 | 45 | delete_object_and_assert_no_warning(incdbscan3, object_as_float) 46 | 47 | 48 | def test_handling_of_more_than_2d_arrays(incdbscan3, incdbscan4): 49 | object_3d = np.array([[1, 2, 3]]) 50 | 51 | incdbscan3.insert(object_3d) 52 | incdbscan3.insert(object_3d) 53 | incdbscan3.delete(object_3d) 54 | 55 | assert incdbscan3.get_cluster_labels(object_3d) == CLUSTER_LABEL_NOISE 56 | 57 | object_100d = np.random.random(100).reshape(1, -1) 58 | 59 | incdbscan4.insert(object_100d) 60 | incdbscan4.insert(object_100d) 61 | incdbscan4.delete(object_100d) 62 | 63 | assert incdbscan4.get_cluster_labels(object_100d) == CLUSTER_LABEL_NOISE 64 | 65 | 66 | def test_no_warning_when_a_known_object_is_deleted( 67 | incdbscan3, 68 | point_at_origin): 69 | 70 | incdbscan3.insert(point_at_origin) 71 | delete_object_and_assert_no_warning(incdbscan3, point_at_origin) 72 | 73 | incdbscan3.insert(point_at_origin) 74 | incdbscan3.insert(point_at_origin) 75 | delete_object_and_assert_no_warning(incdbscan3, point_at_origin) 76 | delete_object_and_assert_no_warning(incdbscan3, point_at_origin) 77 | 78 | 79 | def test_warning_when_unknown_object_is_deleted( 80 | incdbscan3, 81 | point_at_origin): 82 | 83 | delete_object_and_assert_warning( 84 | incdbscan3, point_at_origin, IncrementalDBSCANWarning) 85 | 86 | incdbscan3.insert(point_at_origin) 87 | 88 | incdbscan3.delete(point_at_origin) 89 | 90 | delete_object_and_assert_warning( 91 | incdbscan3, point_at_origin, IncrementalDBSCANWarning) 92 | 93 | 94 | def test_no_warning_when_cluster_label_is_gotten_for_known_object( 95 | incdbscan3, 96 | point_at_origin): 97 | 98 | expected_label = np.array([CLUSTER_LABEL_NOISE]) 99 | 100 | incdbscan3.insert(point_at_origin) 101 | label = get_label_and_assert_no_warning(incdbscan3, point_at_origin) 102 | assert label == expected_label 103 | 104 | incdbscan3.insert(point_at_origin) 105 | incdbscan3.delete(point_at_origin) 106 | label = get_label_and_assert_no_warning(incdbscan3, point_at_origin) 107 | assert label == expected_label 108 | 109 | 110 | def test_warning_when_cluster_label_is_gotten_for_unknown_object( 111 | incdbscan3, 112 | point_at_origin): 113 | 114 | label = get_label_and_assert_warning( 115 | incdbscan3, point_at_origin, IncrementalDBSCANWarning) 116 | assert np.isnan(label) 117 | 118 | incdbscan3.insert(point_at_origin) 119 | incdbscan3.delete(point_at_origin) 120 | 121 | label = get_label_and_assert_warning( 122 | incdbscan3, point_at_origin, IncrementalDBSCANWarning) 123 | assert np.isnan(label) 124 | 125 | 126 | def test_different_metrics_are_available(): 127 | incdbscan_euclidean = \ 128 | IncrementalDBSCAN(eps=1.5, min_pts=3, metric='euclidean') 129 | incdbscan_manhattan = \ 130 | IncrementalDBSCAN(eps=1.5, min_pts=3, metric='manhattan') 131 | 132 | diagonal = np.array([ 133 | [0, 0], 134 | [1, 1], 135 | [2, 2], 136 | ]) 137 | 138 | expected_label_euclidean = CLUSTER_LABEL_NOISE + 1 139 | insert_objects_then_assert_cluster_labels( 140 | incdbscan_euclidean, diagonal, expected_label_euclidean) 141 | 142 | expected_label_manhattan = CLUSTER_LABEL_NOISE 143 | insert_objects_then_assert_cluster_labels( 144 | incdbscan_manhattan, diagonal, expected_label_manhattan) 145 | -------------------------------------------------------------------------------- /incdbscan/tests/test_inserter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from conftest import EPS 3 | 4 | from testutils import ( 5 | CLUSTER_LABEL_FIRST_CLUSTER, 6 | CLUSTER_LABEL_NOISE, 7 | assert_cluster_labels, 8 | assert_label_of_object_is_among_possible_ones, 9 | assert_two_objects_are_in_same_cluster, 10 | insert_objects_then_assert_cluster_labels, 11 | reflect_horizontally 12 | ) 13 | 14 | 15 | def test_new_single_object_is_labeled_as_noise(incdbscan4, object_far_away): 16 | incdbscan4.insert(object_far_away) 17 | assert_cluster_labels(incdbscan4, object_far_away, CLUSTER_LABEL_NOISE) 18 | 19 | 20 | def test_new_object_far_from_cluster_is_labeled_as_noise( 21 | incdbscan4, 22 | blob_in_middle, 23 | object_far_away): 24 | 25 | incdbscan4.insert(blob_in_middle) 26 | incdbscan4.insert(object_far_away) 27 | 28 | assert_cluster_labels(incdbscan4, object_far_away, CLUSTER_LABEL_NOISE) 29 | 30 | 31 | def test_new_border_object_gets_label_from_core(incdbscan4): 32 | cluster = np.array([ 33 | [1., 1.], 34 | [0., 1.], 35 | [1., 0.], 36 | [0., 0.], 37 | ]) 38 | 39 | new_border_object = np.array([[1 + EPS, 1]]) 40 | 41 | incdbscan4.insert(cluster) 42 | incdbscan4.insert(new_border_object) 43 | 44 | print(incdbscan4.get_cluster_labels(cluster[[0]])) 45 | print(incdbscan4.get_cluster_labels(new_border_object)) 46 | 47 | assert_two_objects_are_in_same_cluster( 48 | incdbscan4, cluster[[0]], new_border_object) 49 | 50 | 51 | def test_labels_are_noise_only_until_not_enough_objects_in_cluster( 52 | incdbscan4, 53 | blob_in_middle): 54 | 55 | for i in range(len(blob_in_middle)): 56 | incdbscan4.insert(blob_in_middle[[i]]) 57 | 58 | expected_label = ( 59 | CLUSTER_LABEL_NOISE if i + 1 < incdbscan4.min_pts 60 | else CLUSTER_LABEL_FIRST_CLUSTER 61 | ) 62 | 63 | assert_cluster_labels(incdbscan4, blob_in_middle[:i+1], expected_label) 64 | 65 | 66 | def test_more_than_two_clusters_can_be_created(incdbscan4, blob_in_middle): 67 | cluster_1 = blob_in_middle 68 | cluster_1_expected_label = CLUSTER_LABEL_FIRST_CLUSTER 69 | 70 | insert_objects_then_assert_cluster_labels( 71 | incdbscan4, cluster_1, cluster_1_expected_label) 72 | 73 | cluster_2 = cluster_1 + 10 74 | cluster_2_expected_label = cluster_1_expected_label + 1 75 | 76 | insert_objects_then_assert_cluster_labels( 77 | incdbscan4, cluster_2, cluster_2_expected_label) 78 | 79 | cluster_3 = cluster_2 + 10 80 | cluster_3_expected_label = cluster_2_expected_label + 1 81 | 82 | insert_objects_then_assert_cluster_labels( 83 | incdbscan4, cluster_3, cluster_3_expected_label) 84 | 85 | 86 | def test_two_clusters_can_be_born_at_the_same_time( 87 | incdbscan4, 88 | point_at_origin): 89 | 90 | cluster_1 = np.array([ 91 | [EPS * 1, 0], 92 | [EPS * 2, 0], 93 | [EPS * 2, 0], 94 | ]) 95 | 96 | cluster_2 = reflect_horizontally(cluster_1) 97 | 98 | incdbscan4.insert(cluster_1) 99 | incdbscan4.insert(cluster_2) 100 | 101 | assert_cluster_labels(incdbscan4, cluster_1, CLUSTER_LABEL_NOISE) 102 | assert_cluster_labels(incdbscan4, cluster_2, CLUSTER_LABEL_NOISE) 103 | 104 | new_object = point_at_origin 105 | incdbscan4.insert(new_object) 106 | 107 | cluster_1_label_expected = incdbscan4.get_cluster_labels(cluster_1[[0]])[0] 108 | assert_cluster_labels(incdbscan4, cluster_1, cluster_1_label_expected) 109 | 110 | cluster_2_label_expected = \ 111 | CLUSTER_LABEL_FIRST_CLUSTER + 1 - cluster_1_label_expected 112 | assert_cluster_labels(incdbscan4, cluster_2, cluster_2_label_expected) 113 | 114 | assert_label_of_object_is_among_possible_ones( 115 | incdbscan4, 116 | new_object, 117 | {cluster_1_label_expected, cluster_2_label_expected} 118 | ) 119 | 120 | 121 | def test_absorption_with_noise(incdbscan3, point_at_origin): 122 | expected_cluster_label = CLUSTER_LABEL_FIRST_CLUSTER 123 | 124 | cluster_values = np.array([ 125 | [EPS, 0], 126 | [EPS * 2, 0], 127 | [EPS * 3, 0], 128 | ]) 129 | 130 | insert_objects_then_assert_cluster_labels( 131 | incdbscan3, cluster_values, expected_cluster_label) 132 | 133 | noise = np.array([[0, EPS]]) 134 | 135 | insert_objects_then_assert_cluster_labels( 136 | incdbscan3, noise, CLUSTER_LABEL_NOISE) 137 | 138 | new_object_value = point_at_origin 139 | 140 | insert_objects_then_assert_cluster_labels( 141 | incdbscan3, new_object_value, expected_cluster_label) 142 | 143 | assert_cluster_labels(incdbscan3, noise, expected_cluster_label) 144 | 145 | 146 | def test_merge_two_clusters(incdbscan3, point_at_origin): 147 | cluster_1 = np.array([ 148 | [EPS, 0], 149 | [EPS * 2, 0], 150 | [EPS * 3, 0], 151 | [EPS * 4, 0], 152 | ]) 153 | cluster_1_expected_label = CLUSTER_LABEL_FIRST_CLUSTER 154 | 155 | insert_objects_then_assert_cluster_labels( 156 | incdbscan3, cluster_1, cluster_1_expected_label) 157 | 158 | cluster_2 = reflect_horizontally(cluster_1) 159 | cluster_2_expected_label = cluster_1_expected_label + 1 160 | 161 | insert_objects_then_assert_cluster_labels( 162 | incdbscan3, cluster_2, cluster_2_expected_label) 163 | 164 | new_object = point_at_origin 165 | merged_cluster_expected_label = \ 166 | max([cluster_1_expected_label, cluster_2_expected_label]) 167 | 168 | insert_objects_then_assert_cluster_labels( 169 | incdbscan3, new_object, merged_cluster_expected_label) 170 | 171 | assert_cluster_labels(incdbscan3, cluster_1, merged_cluster_expected_label) 172 | assert_cluster_labels(incdbscan3, cluster_2, merged_cluster_expected_label) 173 | 174 | 175 | def test_merger_and_creation_can_happen_at_the_same_time( 176 | incdbscan4, 177 | point_at_origin, 178 | hourglass_on_the_right): 179 | 180 | # Insert objects to the right 181 | hourglass = hourglass_on_the_right 182 | 183 | top_right = hourglass[:3] 184 | top_right_expected_label = CLUSTER_LABEL_FIRST_CLUSTER 185 | 186 | bottom_right = hourglass[-3:] 187 | bottom_right_expected_label = top_right_expected_label + 1 188 | 189 | bridge_point = hourglass[[3]] 190 | 191 | incdbscan4.insert(top_right) 192 | incdbscan4.insert(bridge_point) 193 | incdbscan4.insert(bottom_right) 194 | 195 | assert_cluster_labels(incdbscan4, top_right, top_right_expected_label) 196 | assert_cluster_labels( 197 | incdbscan4, bottom_right, bottom_right_expected_label) 198 | 199 | assert_label_of_object_is_among_possible_ones( 200 | incdbscan4, 201 | bridge_point, 202 | {bottom_right_expected_label, bottom_right_expected_label} 203 | ) 204 | 205 | merged_cluster_expected_label = \ 206 | incdbscan4.get_cluster_labels(bridge_point)[0] 207 | 208 | # Insert objects to the left 209 | left_pre_cluster = np.array([ 210 | [-EPS, 0], 211 | [-EPS * 2, 0], 212 | [-EPS * 2, 0], 213 | ]) 214 | left_cluster_expected_label = bottom_right_expected_label + 1 215 | 216 | insert_objects_then_assert_cluster_labels( 217 | incdbscan4, 218 | left_pre_cluster, 219 | CLUSTER_LABEL_NOISE 220 | ) 221 | 222 | # Insert object to the center 223 | new_object = point_at_origin 224 | incdbscan4.insert(new_object) 225 | 226 | assert_cluster_labels( 227 | incdbscan4, top_right, merged_cluster_expected_label) 228 | assert_cluster_labels( 229 | incdbscan4, bottom_right, merged_cluster_expected_label) 230 | assert_cluster_labels( 231 | incdbscan4, bridge_point, merged_cluster_expected_label) 232 | assert_cluster_labels( 233 | incdbscan4, left_pre_cluster, left_cluster_expected_label) 234 | 235 | assert_label_of_object_is_among_possible_ones( 236 | incdbscan4, 237 | new_object, 238 | {merged_cluster_expected_label, left_cluster_expected_label} 239 | ) 240 | 241 | 242 | def test_two_mergers_can_happen_at_the_same_time( 243 | incdbscan4, 244 | point_at_origin, 245 | hourglass_on_the_right): 246 | 247 | # Insert objects to the right 248 | top_right = hourglass_on_the_right[:3] 249 | top_right_expected_label = CLUSTER_LABEL_FIRST_CLUSTER 250 | 251 | bottom_right = hourglass_on_the_right[-3:] 252 | bottom_right_expected_label = top_right_expected_label + 1 253 | 254 | bridge_point_right = hourglass_on_the_right[[3]] 255 | 256 | incdbscan4.insert(top_right) 257 | incdbscan4.insert(bridge_point_right) 258 | incdbscan4.insert(bottom_right) 259 | 260 | assert_cluster_labels(incdbscan4, top_right, top_right_expected_label) 261 | assert_cluster_labels( 262 | incdbscan4, bottom_right, bottom_right_expected_label) 263 | 264 | assert_label_of_object_is_among_possible_ones( 265 | incdbscan4, 266 | bridge_point_right, 267 | {bottom_right_expected_label, bottom_right_expected_label} 268 | ) 269 | 270 | # Insert objects to the left 271 | hourglass_on_the_left = reflect_horizontally(hourglass_on_the_right) 272 | 273 | top_left = hourglass_on_the_left[:3] 274 | top_left_expected_label = bottom_right_expected_label + 1 275 | 276 | bottom_left = hourglass_on_the_left[-3:] 277 | bottom_left_expected_label = top_left_expected_label + 1 278 | 279 | bridge_point_left = hourglass_on_the_left[[3]] 280 | 281 | incdbscan4.insert(top_left) 282 | incdbscan4.insert(bridge_point_left) 283 | incdbscan4.insert(bottom_left) 284 | 285 | assert_cluster_labels(incdbscan4, top_left, top_left_expected_label) 286 | assert_cluster_labels(incdbscan4, bottom_left, bottom_left_expected_label) 287 | 288 | assert_label_of_object_is_among_possible_ones( 289 | incdbscan4, 290 | bridge_point_left, 291 | {top_left_expected_label, bottom_left_expected_label} 292 | ) 293 | 294 | # Insert object to the center 295 | new_object = point_at_origin 296 | incdbscan4.insert(new_object) 297 | 298 | assert_cluster_labels( 299 | incdbscan4, 300 | np.vstack([top_right, bottom_right]), 301 | bottom_right_expected_label 302 | ) 303 | 304 | assert_cluster_labels( 305 | incdbscan4, 306 | np.vstack([top_left, bottom_left]), 307 | bottom_left_expected_label 308 | ) 309 | 310 | assert_label_of_object_is_among_possible_ones( 311 | incdbscan4, 312 | bridge_point_right, 313 | {bottom_left_expected_label, bottom_right_expected_label} 314 | ) 315 | 316 | assert_label_of_object_is_among_possible_ones( 317 | incdbscan4, 318 | bridge_point_left, 319 | {top_left_expected_label, bottom_left_expected_label} 320 | ) 321 | 322 | 323 | def test_object_is_core_if_it_has_more_than_enough_neigbhors( 324 | incdbscan3, 325 | point_at_origin): 326 | 327 | neighbors = np.array([ 328 | [0, EPS], 329 | [0, -EPS], 330 | [EPS, 0], 331 | [-EPS, 0], 332 | ]) 333 | expected_label = CLUSTER_LABEL_FIRST_CLUSTER 334 | 335 | incdbscan3.insert(neighbors) 336 | incdbscan3.insert(point_at_origin) 337 | 338 | assert_cluster_labels(incdbscan3, neighbors, expected_label) 339 | assert_cluster_labels(incdbscan3, point_at_origin, expected_label) 340 | -------------------------------------------------------------------------------- /incdbscan/tests/test_testutils.py: -------------------------------------------------------------------------------- 1 | from testutils import are_lists_isomorphic 2 | 3 | 4 | def test_isomorphism_check_fails_when_different_length(): 5 | assert not are_lists_isomorphic([0, 0], [0, 0, 0]) 6 | 7 | 8 | def test_isomorphism_check_fails_when_different_size_of_value_sets(): 9 | assert not are_lists_isomorphic([0, 0], [0, 1]) 10 | 11 | 12 | def test_isomorphism_check_fails_when_there_is_no_isomorphism(): 13 | assert not are_lists_isomorphic([1, 2, 3, 1], [1, 1, 3, 2]) 14 | 15 | 16 | def test_isomorphism_check_succeeds_when_there_is_isomorphism(): 17 | assert are_lists_isomorphic([1, 1, 2, 2, 3], [2, 2, 3, 3, 4]) 18 | -------------------------------------------------------------------------------- /incdbscan/tests/test_with_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from sklearn.cluster import DBSCAN 4 | 5 | from incdbscan import IncrementalDBSCAN 6 | from testutils import ( 7 | are_lists_isomorphic, 8 | read_handl_data 9 | ) 10 | 11 | 12 | @pytest.mark.slow 13 | def test_same_results_as_sklearn_dbscan(): 14 | EPS = 1 15 | MIN_PTS = 5 16 | 17 | data = read_handl_data() 18 | dbscan = DBSCAN(eps=EPS, min_samples=MIN_PTS) 19 | labels_dbscan = dbscan.fit_predict(data) 20 | 21 | incdbscan = IncrementalDBSCAN(eps=EPS, min_pts=MIN_PTS) 22 | labels_incdbscan_1 = incdbscan.insert(data).get_cluster_labels(data) 23 | assert are_lists_isomorphic(labels_dbscan, labels_incdbscan_1) 24 | 25 | labels_incdbscan_2 = \ 26 | incdbscan.insert(data).delete(data).get_cluster_labels(data) 27 | assert are_lists_isomorphic(labels_dbscan, labels_incdbscan_2) 28 | 29 | np.random.seed(123) 30 | noise = np.random.uniform(-14, 14, (1000, 2)) 31 | labels_incdbscan_3 = \ 32 | incdbscan.insert(noise).delete(noise).get_cluster_labels(data) 33 | assert are_lists_isomorphic(labels_dbscan, labels_incdbscan_3) 34 | -------------------------------------------------------------------------------- /incdbscan/tests/testutils.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from io import StringIO 3 | from typing import Iterable 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | import requests 9 | 10 | 11 | CLUSTER_LABEL_NOISE = -1 12 | CLUSTER_LABEL_FIRST_CLUSTER = 0 13 | 14 | 15 | def assert_cluster_labels(incdbscan_fit, objects: Iterable, label): 16 | assert np.all( 17 | incdbscan_fit.get_cluster_labels(objects) == label 18 | ) 19 | 20 | 21 | def assert_two_objects_are_in_same_cluster(incdbscan_fit, object1, object2): 22 | assert incdbscan_fit.get_cluster_labels(object1) == \ 23 | incdbscan_fit.get_cluster_labels(object2) 24 | 25 | 26 | def assert_label_of_object_is_among_possible_ones( 27 | incdbscan_fit, 28 | obj, 29 | possible_labels): 30 | 31 | assert incdbscan_fit.get_cluster_labels(obj)[0] in possible_labels 32 | 33 | 34 | def insert_objects_then_assert_cluster_labels( 35 | incdbscan, 36 | values: Iterable, 37 | expected_label): 38 | 39 | incdbscan.insert(values) 40 | assert_cluster_labels(incdbscan, values, expected_label) 41 | 42 | 43 | def assert_split_creates_new_labels_for_new_clusters( 44 | incdbscan_fit, 45 | clusters: Iterable[Iterable], 46 | previous_common_label): 47 | 48 | all_labels = set() 49 | 50 | for cluster in clusters: 51 | labels_within_cluster = set() 52 | 53 | for obj in cluster: 54 | label_of_object = incdbscan_fit.get_cluster_labels([obj])[0] 55 | labels_within_cluster.add(label_of_object) 56 | 57 | assert len(labels_within_cluster) == 1 58 | all_labels.update(labels_within_cluster) 59 | 60 | assert previous_common_label in all_labels 61 | assert len(all_labels) == len(clusters) 62 | assert CLUSTER_LABEL_NOISE not in all_labels 63 | 64 | 65 | def reflect_horizontally(points): 66 | new_points = np.copy(points) 67 | new_points[:, 0] = np.negative(new_points[:, 0]) 68 | return new_points 69 | 70 | 71 | def delete_object_and_assert_error(incdbscan_fit, obj, error): 72 | with pytest.raises(error): 73 | incdbscan_fit.delete(obj) 74 | 75 | 76 | def delete_object_and_assert_no_warning(incdbscan_fit, obj): 77 | with warnings.catch_warnings(): 78 | warnings.simplefilter("error") 79 | incdbscan_fit.delete(obj) 80 | 81 | 82 | def delete_object_and_assert_warning(incdbscan_fit, obj, warning): 83 | with pytest.warns(warning): 84 | incdbscan_fit.delete(obj) 85 | 86 | 87 | def get_label_and_assert_error(incdbscan_fit, obj, error): 88 | with pytest.raises(error): 89 | incdbscan_fit.get_cluster_labels(obj) 90 | 91 | 92 | def get_label_and_assert_no_warning(incdbscan_fit, obj): 93 | with warnings.catch_warnings(): 94 | warnings.simplefilter("error") 95 | incdbscan_fit.get_cluster_labels(obj) 96 | 97 | return incdbscan_fit.get_cluster_labels(obj) 98 | 99 | 100 | def get_label_and_assert_warning(incdbscan_fit, obj, warning): 101 | with pytest.warns(warning): 102 | return incdbscan_fit.get_cluster_labels(obj) 103 | 104 | 105 | def insert_object_and_assert_error(incdbscan_fit, obj, error): 106 | with pytest.raises(error): 107 | incdbscan_fit.insert(obj) 108 | 109 | 110 | def are_lists_isomorphic(list_1, list_2): 111 | if len(list_1) != len(list_2): 112 | return False 113 | 114 | distinct_elements_1 = set(list_1) 115 | distinct_elements_2 = set(list_2) 116 | 117 | if len(distinct_elements_1) != len(distinct_elements_2): 118 | return False 119 | 120 | mappings = list(zip(list_1, list_2)) 121 | distinct_mappings = set(mappings) 122 | 123 | return len(distinct_elements_1) == len(distinct_mappings) 124 | 125 | 126 | def read_text_data_file_from_url(url): 127 | content = requests.get(url) 128 | data = np.loadtxt(StringIO(content.text)) 129 | return data 130 | 131 | 132 | def read_arff_data_file_from_url(url): 133 | content = requests.get(url) 134 | data = pd.read_csv(StringIO(content.text), skiprows=10) 135 | data = data.to_numpy() 136 | return data 137 | 138 | 139 | def read_handl_data(): 140 | # This is equivalent to the 2d-20c-no0 data set by Handl, J. 141 | # Also available from: 142 | # https://personalpages.manchester.ac.uk/staff/Julia.Handl/generators.html 143 | 144 | url = ( 145 | 'https://raw.githubusercontent.com/deric/clustering-benchmark/' 146 | 'master/src/main/resources/datasets/artificial/2d-20c-no0.arff' 147 | ) 148 | data = read_arff_data_file_from_url(url)[:, 0:2] 149 | return data 150 | 151 | 152 | def read_chameleon_data(): 153 | # This is equivalent to the t4.8k data set from the Chameleon collection 154 | # by Karypis, G. et al. Also available from: 155 | # http://glaros.dtc.umn.edu/gkhome/cluto/cluto/download 156 | 157 | url = ( 158 | 'https://raw.githubusercontent.com/yeahia2508/ml-examples/' 159 | 'master/Data/clustering/chameleon/t4.8k.txt' 160 | ) 161 | data = read_text_data_file_from_url(url)[:2000] 162 | return data 163 | -------------------------------------------------------------------------------- /notebooks/incdbscan-usage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# `incdbscan` meets Iris" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This is an introduction to the API of the `incdbscan` package. The logic of DBSCAN and its hyperparameters will not be covered here. Those can be checked at scikit-learn's [documentation about DBSCAN](https://scikit-learn.org/stable/modules/clustering.html#dbscan)." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "/home/xyz\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "cd .." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "%matplotlib inline\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "from sklearn.datasets import load_iris\n", 43 | "\n", 44 | "def plot_iris(array, labels=None):\n", 45 | " colors = ['blue', 'green', 'yellow', 'black', 'orange', 'purple', 'red', 'silver'] # quite hacky...\n", 46 | " plt.figure(figsize=(10, 10))\n", 47 | " \n", 48 | " if labels is not None:\n", 49 | " for label in set(labels):\n", 50 | " rows = label == labels\n", 51 | " plt.scatter(array[rows, 0], array[rows, 1], c=colors[int(label)], label=str(int(label)))\n", 52 | " plt.legend(title='Cluster labels')\n", 53 | " else:\n", 54 | " plt.scatter(array[:, 0], array[:, 1], c=labels, cmap='Set1')\n", 55 | " \n", 56 | " plt.xlim([4.0, 8.5])\n", 57 | " plt.ylim([1.75, 4.5])\n", 58 | " plt.xlabel('sepal length (cm)')\n", 59 | " plt.ylabel('sepal width (cm)')\n", 60 | " plt.show()\n" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "The data set we are going to cluster is 2-D variant of the Iris dataset (although `incdbscan` is capable of dealing with higher dimensions). The data set is split into two batches of points. We take a peek to see all data points first, then get to clustering." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stderr", 77 | "output_type": "stream", 78 | "text": [ 79 | "/tmp/ipykernel_64753/3386316966.py:15: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored\n", 80 | " plt.scatter(array[:, 0], array[:, 1], c=labels, cmap='Set1')\n" 81 | ] 82 | }, 83 | { 84 | "data": { 85 | "image/png": "\n", 86 | "text/plain": [ 87 | "
" 88 | ] 89 | }, 90 | "metadata": { 91 | "needs_background": "light" 92 | }, 93 | "output_type": "display_data" 94 | } 95 | ], 96 | "source": [ 97 | "X = load_iris()['data'][:, :2]\n", 98 | "X_1, X_2 = X[:75], X[75:]\n", 99 | "\n", 100 | "plot_iris(X)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 4, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "from incdbscan import IncrementalDBSCAN\n", 110 | "clusterer = IncrementalDBSCAN(eps=0.25, min_pts=5)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Input data shape\n", 118 | "\n", 119 | "All 3 main methods - `insert`, `delete` and `get_cluster_labels` - expect data points to be in a 2-D `ndarray` of shape `(n_samples, n_features)`, but also can be in a pandas `DataFrame` or in a list of data points stored in 1-D `ndarray`s of shape `(n_features,)`." 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "## Insert data points with `insert`\n", 127 | "First insert a batch of points into the clusterer. Note that `insert` returns with the `IncrementalDBSCAN` instance itself." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "" 139 | ] 140 | }, 141 | "execution_count": 5, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "clusterer.insert(X_1)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "The same points can be inserted again if wanted. Let's insert the first 10 data points again into the clusterer. Now these points will be there two times." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 6, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "data": { 164 | "text/plain": [ 165 | "" 166 | ] 167 | }, 168 | "execution_count": 6, 169 | "metadata": {}, 170 | "output_type": "execute_result" 171 | } 172 | ], 173 | "source": [ 174 | "clusterer.insert(X_1[:10])" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## Obtain cluster labels with `get_cluster_labels`\n", 182 | "This method helps us gain the cluster labels. In contrast with scikit-learn's `DBSCAN`, the labels are obtained not for all objects but for objects we specifically ask cluster labels for. Let's get the cluster labels for the whole 1st batch and see them on a plot!" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 7, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "labels_1 = clusterer.get_cluster_labels(X_1)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 8, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/plain": [ 202 | "array([ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,\n", 203 | " 1., -1., -1., 1., 1., -1., 1., 1., 1., 1., 1., 1., 1.,\n", 204 | " 1., 1., 1., 1., 1., 1., -1., -1., 1., 1., 1., 1., 1.,\n", 205 | " 1., 1., -1., 1., 1., 1., 1., 1., 1., 1., 1., -1., -1.,\n", 206 | " -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,\n", 207 | " -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.])" 208 | ] 209 | }, 210 | "execution_count": 8, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "labels_1" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 9, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "image/png": "\n", 227 | "text/plain": [ 228 | "
" 229 | ] 230 | }, 231 | "metadata": { 232 | "needs_background": "light" 233 | }, 234 | "output_type": "display_data" 235 | } 236 | ], 237 | "source": [ 238 | "plot_iris(X_1, labels_1)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "There is one cluster according to IncrementalDBSCAN -- the points shown in green. But so far many of the points are found to be noise -- these are the grey points. The amount of noise is quite big because of the relatively low value of `eps`." 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "If we pass such data points to `get_cluster_labels` that are missing from the clustering (i.e., were not inserted so far, or were inserted but also deleted), two things will happen. A warning will show up, and the labels for these missing objects will have a value of `numpy.nan`." 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 10, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "name": "stderr", 262 | "output_type": "stream", 263 | "text": [ 264 | "/home/xyz/incdbscan/incrementaldbscan.py:141: IncrementalDBSCANWarning: No label was retrieved for object at position 5 because there is no such object in the object set.\n", 265 | " warnings.warn(\n", 266 | "/home/xyz/incdbscan/incrementaldbscan.py:141: IncrementalDBSCANWarning: No label was retrieved for object at position 6 because there is no such object in the object set.\n", 267 | " warnings.warn(\n", 268 | "/home/xyz/incdbscan/incrementaldbscan.py:141: IncrementalDBSCANWarning: No label was retrieved for object at position 7 because there is no such object in the object set.\n", 269 | " warnings.warn(\n", 270 | "/home/xyz/incdbscan/incrementaldbscan.py:141: IncrementalDBSCANWarning: No label was retrieved for object at position 8 because there is no such object in the object set.\n", 271 | " warnings.warn(\n", 272 | "/home/xyz/incdbscan/incrementaldbscan.py:141: IncrementalDBSCANWarning: No label was retrieved for object at position 9 because there is no such object in the object set.\n", 273 | " warnings.warn(\n" 274 | ] 275 | } 276 | ], 277 | "source": [ 278 | "labels_with_missing = clusterer.get_cluster_labels(X[70:80])" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 11, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/plain": [ 289 | "array([-1., -1., -1., -1., -1., nan, nan, nan, nan, nan])" 290 | ] 291 | }, 292 | "execution_count": 11, 293 | "metadata": {}, 294 | "output_type": "execute_result" 295 | } 296 | ], 297 | "source": [ 298 | "labels_with_missing" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "Let's insert the 2nd batch of data points and see the clustering of all data points." 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 12, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "labels_all = clusterer.insert(X_2).get_cluster_labels(X)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 13, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "image/png": "\n", 325 | "text/plain": [ 326 | "
" 327 | ] 328 | }, 329 | "metadata": { 330 | "needs_background": "light" 331 | }, 332 | "output_type": "display_data" 333 | } 334 | ], 335 | "source": [ 336 | "plot_iris(X, labels_all)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "Thanks to the new points, two clusters emerged from the set of points that were previously noise: the purple and the red cluster. " 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "## Delete data points with `delete`\n", 351 | "\n", 352 | "One just has to pass the batch of data points to delete. Let's try it out by deleting the first batch of data points, and get the labels of the second batch in one line." 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 14, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "labels_2 = clusterer.delete(X_1).get_cluster_labels(X_2)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 15, 367 | "metadata": {}, 368 | "outputs": [ 369 | { 370 | "data": { 371 | "image/png": "\n", 372 | "text/plain": [ 373 | "
" 374 | ] 375 | }, 376 | "metadata": { 377 | "needs_background": "light" 378 | }, 379 | "output_type": "display_data" 380 | } 381 | ], 382 | "source": [ 383 | "plot_iris(X_2, labels_2)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "Only the purple cluster remained.\n", 391 | "\n", 392 | "But remember that we inserted the first 10 data points from the first batch two times? When we invoked `delete(X_1)`, one of the duplicated points remained. So we still can get the labels of the first 10 points. Just by peeking into the values we can see that they became noise." 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 16, 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "labels_10 = clusterer.get_cluster_labels(X_1[:10])" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 17, 407 | "metadata": {}, 408 | "outputs": [ 409 | { 410 | "data": { 411 | "text/plain": [ 412 | "array([-1., -1., -1., -1., -1., -1., -1., -1., -1., -1.])" 413 | ] 414 | }, 415 | "execution_count": 17, 416 | "metadata": {}, 417 | "output_type": "execute_result" 418 | } 419 | ], 420 | "source": [ 421 | "labels_10" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "One thing left to note. If we try to delete such points that are missing from the clustering, we will not succeed but have to face a warning. Here first we check what happens if we try to delete the first 10 points (which currently are still in the clustering), and then what happens if we try to delete them again." 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 18, 434 | "metadata": {}, 435 | "outputs": [ 436 | { 437 | "data": { 438 | "text/plain": [ 439 | "" 440 | ] 441 | }, 442 | "execution_count": 18, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | } 446 | ], 447 | "source": [ 448 | "clusterer.delete(X_1[:10])" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 19, 454 | "metadata": {}, 455 | "outputs": [ 456 | { 457 | "name": "stderr", 458 | "output_type": "stream", 459 | "text": [ 460 | "/home/xyz/incdbscan/incrementaldbscan.py:104: IncrementalDBSCANWarning: Object at position 0 was not deleted because there is no such object in the object set.\n", 461 | " warnings.warn(\n", 462 | "/home/xyz/incdbscan/incrementaldbscan.py:104: IncrementalDBSCANWarning: Object at position 1 was not deleted because there is no such object in the object set.\n", 463 | " warnings.warn(\n", 464 | "/home/xyz/incdbscan/incrementaldbscan.py:104: IncrementalDBSCANWarning: Object at position 2 was not deleted because there is no such object in the object set.\n", 465 | " warnings.warn(\n", 466 | "/home/xyz/incdbscan/incrementaldbscan.py:104: IncrementalDBSCANWarning: Object at position 3 was not deleted because there is no such object in the object set.\n", 467 | " warnings.warn(\n", 468 | "/home/xyz/incdbscan/incrementaldbscan.py:104: IncrementalDBSCANWarning: Object at position 4 was not deleted because there is no such object in the object set.\n", 469 | " warnings.warn(\n", 470 | "/home/xyz/incdbscan/incrementaldbscan.py:104: IncrementalDBSCANWarning: Object at position 5 was not deleted because there is no such object in the object set.\n", 471 | " warnings.warn(\n", 472 | "/home/xyz/incdbscan/incrementaldbscan.py:104: IncrementalDBSCANWarning: Object at position 6 was not deleted because there is no such object in the object set.\n", 473 | " warnings.warn(\n", 474 | "/home/xyz/incdbscan/incrementaldbscan.py:104: IncrementalDBSCANWarning: Object at position 7 was not deleted because there is no such object in the object set.\n", 475 | " warnings.warn(\n", 476 | "/home/xyz/incdbscan/incrementaldbscan.py:104: IncrementalDBSCANWarning: Object at position 8 was not deleted because there is no such object in the object set.\n", 477 | " warnings.warn(\n", 478 | "/home/xyz/incdbscan/incrementaldbscan.py:104: IncrementalDBSCANWarning: Object at position 9 was not deleted because there is no such object in the object set.\n", 479 | " warnings.warn(\n" 480 | ] 481 | }, 482 | { 483 | "data": { 484 | "text/plain": [ 485 | "" 486 | ] 487 | }, 488 | "execution_count": 19, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "clusterer.delete(X_1[:10])" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": {}, 501 | "outputs": [], 502 | "source": [] 503 | } 504 | ], 505 | "metadata": { 506 | "kernelspec": { 507 | "display_name": "Python 3 (ipykernel)", 508 | "language": "python", 509 | "name": "python3" 510 | }, 511 | "language_info": { 512 | "codemirror_mode": { 513 | "name": "ipython", 514 | "version": 3 515 | }, 516 | "file_extension": ".py", 517 | "mimetype": "text/x-python", 518 | "name": "python", 519 | "nbconvert_exporter": "python", 520 | "pygments_lexer": "ipython3", 521 | "version": "3.10.12" 522 | } 523 | }, 524 | "nbformat": 4, 525 | "nbformat_minor": 4 526 | } 527 | -------------------------------------------------------------------------------- /notes/notes-on-paper.md: -------------------------------------------------------------------------------- 1 | # Notes on the IncrementalDBSCAN paper 2 | The work by Ester et al. 1998 lays the groundwork for this implementation of IncrementalDBSCAN. However, some parts of the algorithm are not covered in the paper. Here, these holes will be identified, and solutions are proposed to fill them. 3 | 4 | Notations used: 5 | - *D*: the set of data objects. 6 | - *NEps(p)*: the set of all objects that are in the *Eps*-neighborhood of *p*. 7 | - *UpdSeedIns*, the set of update seeds after insertion, is defined in *Definition 7* as the set of core objects in the *Eps*-neighborhood of those objects that gain their core object property as a result of the insertion into *D*. 8 | - *UpdSeedDel*, the set of update seeds after deletion, is defined in *Definition 7* as the set of core objects in the *Eps*-neighborhood of those objects that lose their core object property as a result of the deletion from *D*. 9 | 10 | ## Absorption when *UpdSeedIns* is empty 11 | Let's suppose that cluster *C* is already established, and a new object *p* is inserted in the *Eps*-neighborhood of a core object *c* of cluster *C*. Additionally, suppose that there are not enough objects in *NEps(p)* for *p* to become a core object and that no other objects become core objects due to the insertion. 12 | 13 | Now, how should *p* be handled? 14 | 15 | Since there are no new core objects after the insertion, *UpdSeedIns* is empty. According to *Section 4.2*, _"if *UpdSeedIns* is empty [...] then *p* is a noise object."_ But we also know that *p* is in *NEps(c)*, and according to *Definition 4* this means that it should be assigned to cluster *C*. There is clearly a contradiction here. 16 | 17 | **Solution**: In this implementation, even if *UpdSeedIns* is empty, *p* is assigned to cluster *C* if *c* is a core object of cluster *C* and *p* is in *NEps(c)*. 18 | 19 | ## Simultaneous creations, absorptions and merges 20 | In *Section 4.2*, cases of creation, absorption and merge are presented. These are indeed essential building blocks of IncrementalDBSCAN. However, the paper fails to mention that these events can happen simultaneously. 21 | 22 | Let's see an example. Suppose we have a 1 dimensional data set with 6 objects (*a*, *b*, *c*, *x*, *y*, *z*) as illustrated in the following block. The coordinates of the objects are as noted below their names. 23 |
24 | - - - c - b - a - - - - - - - x - y - z - - -
25 |      -4  -3  -2               2   3   4      
26 | 
27 | 28 | If we apply IncrementalDBSCAN to the data set with *Eps*=2 and *MinPts*=4, no clusters are created since none of the objects have an *Eps*-neighborhood that contain at least 4 objects. That is, all objects are noise objects. 29 | 30 | We now insert object *p* at position 0. 31 |
32 | - - - c - b - a - - - p - - - x - y - z - - -
33 |      -4  -3  -2       0       2   3   4      
34 | 
35 | 36 | After the insertion, both *NEps(a)* and *NEps(x)* contain 4 objects, so *a* and *x* become core objects. *UpdSeedIns* then contains the new core objects, *a* and *x*. According to the paper if _"UpdSeedIns contains only core objects which did not belong to a cluster before the insertion of p, i.e. they were noise objects or equal to p, [...] a new cluster containing these noise objects as well as p is created."_ 37 | 38 | Here *UpdSeedIns* contains only new core objects (*a* and *x*) but all 7 of the objects cannot be part of one cluster, since not all objects would be density-reachable from any other object in the cluster (because, e.g., *a* is not directly density-reachable from *p*). Thus, the definition of a cluster (*Definition 4*) wouldn't hold. This is contradictory to the above quote from *Section 4.2*. 39 | 40 | Analogous examples can be constructed for absorptions and merges. E.g., a creation and an absorption can happen at the same time, or even two merges can. But the paper doesn't cover these cases. 41 | 42 | **Solution**: *UpdSeedIns* should be broken down to components in which each object is density-connected to any other object in the component. The rules of creation, absorption and merge should be applied not to *UpdSeedIns* as a whole but to each component individually. 43 | 44 | ## Extended definition of *UpdSeedDel* 45 | 46 | The point of defining *UpdSeedDel* is to take the first step towards finding all objects in the whole object set that eventually might be affected by a deletion. *UpdSeedDel* contains the _"seed objects for the update"_. 47 | 48 | Let's take the following object set *D* of 7 one dimensional objects (*a*, *b*, *c*, *p*, *x*, *y*, *z*). The coordinates of the objects are as noted below their names. 49 |
50 | - - - c - b - a - - - p - - - x - y - z - - -
51 |      -4  -3  -2       0       2   3   4      
52 | 
53 | 54 | Suppose we apply IncrementalDBSCAN to the objects with *MinPts*=3 and *Eps*=2. As a result, all objects belong to a single cluster. 55 | 56 | Now suppose we delete *p*. Following *Definition 7* in the paper, *UpdSeedDel* would be empty, since there is no object that is core in *D* but not in *D* \ {*p*}. Thus, according to the definition, there are no seed objects for the update. 57 | 58 | This is in conflict with the results of the deletion, in which there are now two clusters of objects, as can be seen below. Thus, there was indeed a need for cluster membership update. 59 |
60 | - - - c - b - a - - - - - - - x - y - z - - -
61 |      -4  -3  -2               2   3   4      
62 | 
63 | 64 | **Solution**: in this implementation, the definition of *UpdSeedDel* is extended to cover such cases. It is (informally) the set of core objects in the *Eps*-neighborhood of either (1) those objects that lose their core object property as a result of the deletion of *p* or (2) *p* itself. 65 | 66 | ## Updates needed when *UpdSeedDel* is empty 67 | 68 | According to *Section 4.3* of the paper, when during the deletion of an object *p* if _"UpdSeedDel is empty [...] then p is deleted from D and eventually other objects in NEps(p) change from a former cluster C to noise"._ 69 | 70 | However, consider there are two core objects in *D*, *p* and *q*, not in the *Eps*-neighborhood of each other. They are of different clusters, *C1* and *C2*, respectively. And suppose there is an object *b* that is not core and is in both *NEps(p)* and *NEps(q)* (but not in *NEps(r)* for of any other object *r*). In such cases *b* is either in cluster *C1* or *C2*. In this example assume it is in *C1*. 71 | 72 | We now delete *p* from *D*. *UpdSeedDel* is empty because there are no core objects in the *Eps*-neighborhood of objects that lost their core property. *b* is then no longer in *C1* (as there is no object to keep it there) but does not become noise. Instead, because it is in *NEps(q)* it should be assigned to *C2*, which goes against the description in the paper. 73 | 74 | **Solution**: in this implementation whenever an object loses its cluster membership it is checked first if it should be reassigned to another cluster. Only if it is not in the *Eps*-neighborhood of any other core objects it becomes noise. 75 | 76 | ## Simultaneous splits 77 | 78 | When the paper, in *Section 4.3* (_"potential Split"_), describes the splitting logic that happens after an object *p* is deleted, it says this is when *UpdSeedDel* is not empty and the objects in it _"belonged to exactly one cluster [...] before the deletion of p."_ 79 | 80 | Take the following two dimensional object set *D* as example. There are several objects, most of them marked with a star, and 3 of them with a letter: *p*, *b*, and *q*. With the left and bottom axes one can see the coordinates of the objects. 81 | 82 |
83 |  1   *  *  *     q     *  *  *
84 | 
85 |  0               b
86 | 
87 | -1   *  *  *     p     *  *  *
88 |     -2    -1     0     1     2
89 | 
90 | 91 | When we cluster these objects according to DBSCAN with *MinPts*=4 and *Eps*=1, two clusters emerge. The first cluster consists of the objects on the *y*=-1 line, while the second one with objects on the line *y*=1. Object *b*, since there are less than *MinPts* objects in *NEps(b)*, is not a core object itself, but it belongs to either one of the clusters as a border object. 92 | 93 | What happens when we delete *b* from *D*? As a result, both *q* and *p* lose their core property. According to the definition of *UpdSeedDel*, the core objects in the neighborhood of *p* and *q*, that is, the objects marked with stars next to them, will be in *UpdSeedDel*. These objects belonged to two clusters, not _"exactly one cluster [...] before the deletion of p"_, as the paper states. The paper misses a point here. 94 | 95 | **Solution**: In this case, this implementation follows the logic of DBSCAN and reaches the conclusion of what would happen if DBSCAN was applied to *D* after the deletion of *p*. That is, four clusters are formed, two at the top and two at the bottom. So two splits need happen at the same time: both of the bottom and the top cluster break down into two smaller clusters. 96 | -------------------------------------------------------------------------------- /profiling.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from datetime import datetime 3 | from pathlib import Path 4 | 5 | from line_profiler import LineProfiler 6 | 7 | from incdbscan import IncrementalDBSCAN 8 | from incdbscan._bfscomponentfinder import BFSComponentFinder 9 | from incdbscan._deleter import Deleter 10 | from incdbscan._inserter import Inserter 11 | from incdbscan._labels import LabelHandler 12 | from incdbscan._neighbor_searcher import NeighborSearcher 13 | from incdbscan._object import Object 14 | from incdbscan._objects import Objects 15 | from incdbscan.tests.testutils import ( 16 | read_chameleon_data, 17 | read_handl_data 18 | ) 19 | 20 | 21 | BASE_PATH = Path(__file__).parent 22 | DATA_PATH = BASE_PATH / 'incdbscan' / 'tests' / 'data' 23 | 24 | 25 | def test1(): 26 | data = read_handl_data() 27 | 28 | algo = IncrementalDBSCAN(eps=1) 29 | algo.insert(data) 30 | algo.delete(data) 31 | 32 | 33 | def test2(): 34 | data = read_chameleon_data()[:2000] 35 | 36 | algo = IncrementalDBSCAN(eps=10) 37 | algo.insert(data) 38 | algo.delete(data) 39 | 40 | 41 | def print_profile(test, tag=''): 42 | profiler = LineProfiler() 43 | profiler.add_module(Inserter) 44 | profiler.add_module(Deleter) 45 | # profiler.add_module(BFSComponentFinder) 46 | # profiler.add_module(Object) 47 | # profiler.add_module(Objects) 48 | # profiler.add_module(LabelHandler) 49 | # profiler.add_module(NeighborSearcher) 50 | 51 | wrapper = profiler(test) 52 | wrapper() 53 | 54 | timestamp = str(datetime.now())[:19] 55 | filename = f'{timestamp}_{test.__name__}{tag}.txt' 56 | profile_path = BASE_PATH / 'profiling' / filename 57 | 58 | with open(profile_path, 'w') as f: 59 | profiler.print_stats(stream=f) 60 | 61 | 62 | if __name__ == "__main__": 63 | tag = '_' + sys.argv[1] if len(sys.argv) > 1 else '' 64 | for test in [test1, test2]: 65 | print(f'{datetime.now()} Creating profile for {test.__name__} ...') 66 | print_profile(test, tag) 67 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | authors = ["Arpad Fulop "] 3 | description = "Implementation of IncrementalDBSCAN clustering." 4 | license = "BSD-3-Clause" 5 | name = "incdbscan" 6 | version = "0.3.0" 7 | readme = "README.md" 8 | homepage = "https://github.com/DataOmbudsman/incdbscan" 9 | repository = "https://github.com/DataOmbudsman/incdbscan" 10 | keywords = [ 11 | "clustering", 12 | "incremental clustering" 13 | ] 14 | classifiers = [ 15 | "Development Status :: 4 - Beta", 16 | "Intended Audience :: Developers", 17 | "Intended Audience :: Science/Research", 18 | "License :: OSI Approved :: BSD License", 19 | 'Operating System :: OS Independent', 20 | 'Programming Language :: Python :: 3.8', 21 | 'Programming Language :: Python :: 3.9', 22 | 'Programming Language :: Python :: 3.10', 23 | 'Programming Language :: Python :: 3.11', 24 | 'Programming Language :: Python :: 3.12', 25 | 'Programming Language :: Python :: 3.13', 26 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 27 | ] 28 | 29 | [tool.poetry.dependencies] 30 | numpy = "^2.0.0" 31 | python = ">=3.9.0,<4.0" 32 | rustworkx = "^0.15.0" 33 | scikit-learn = "^1.5.0" 34 | sortedcontainers = "^2.4.0" 35 | xxhash = "^3.5.0" 36 | 37 | [tool.poetry.group.dev.dependencies] 38 | isort = "^6.0.1" 39 | jupyterlab = "^4.4.0" 40 | line-profiler = "^4.2.0" 41 | matplotlib = "^3.9.4" 42 | pandas = "^2.2.3" 43 | pylint = "^3.3.6" 44 | pytest = "^8.3.5" 45 | requests = "^2.32.3" 46 | tqdm = "^4.67.1" 47 | 48 | [build-system] 49 | requires = ["poetry_core >= 1.8.1"] 50 | build-backend = "poetry.core.masonry.api" 51 | 52 | [tool.isort] 53 | known_first_party = ["testutils"] 54 | profile = "pycharm" 55 | 56 | [tool.pytest.ini_options] 57 | markers = [ 58 | "slow: marks tests as slow (deselect with '-m \"not slow\"')" 59 | ] --------------------------------------------------------------------------------