├── .github └── workflows │ ├── ci_checks.yml │ └── upload_pypi.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── denstream ├── __init__.py ├── den_stream.py ├── micro_cluster.py ├── preprocessing.py ├── typing.py └── utils.py ├── examples └── user_guide.ipynb ├── pyproject.toml ├── scripts └── ci_checks.sh └── tests ├── __init__.py ├── test_den_stream_core.py ├── test_den_stream_fitting.py ├── test_helpers.py ├── test_micro_cluster.py ├── test_preprocessing.py └── test_utils.py /.github/workflows/ci_checks.yml: -------------------------------------------------------------------------------- 1 | name: CI checks 2 | 3 | on: 4 | pull_request: 5 | branches: ["*"] 6 | push: 7 | branches: [ master ] 8 | 9 | jobs: 10 | code-coverage: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - uses: actions/setup-python@v5 15 | with: 16 | python-version: '3.12' 17 | - name: Install 18 | run: | 19 | pip install .[dev] 20 | - name: Unit-tests with coverage 21 | run: | 22 | pytest --cov=./ --cov-report=xml 23 | - name: Upload coverage to Codecov 24 | uses: codecov/codecov-action@v4 25 | with: 26 | token: ${{ secrets.CODECOV_TOKEN }} 27 | 28 | pre-commit: 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v4 32 | - uses: actions/setup-python@v5 33 | with: 34 | python-version: '3.12' 35 | - name: Install dev dependencies 36 | run: | 37 | pip install .[dev] 38 | - uses: pre-commit/action@v3.0.1 39 | 40 | run_lint: 41 | runs-on: ubuntu-latest 42 | steps: 43 | - uses: actions/checkout@v4 44 | - uses: actions/setup-python@v5 45 | with: 46 | python-version: '3.12' 47 | - name: Install dev dependencies 48 | run: | 49 | pip install .[dev] 50 | - name: Run linting 51 | shell: bash 52 | run: | 53 | bash scripts/ci_checks.sh 54 | 55 | unit-tests: 56 | runs-on: ${{ matrix.os }} 57 | strategy: 58 | matrix: 59 | python-version: ['3.9', '3.10', '3.11', '3.12'] 60 | os: [ubuntu-latest, windows-latest, macos-latest] 61 | steps: 
62 | - uses: actions/checkout@v4 63 | - name: Set up Python ${{ matrix.python-version }} 64 | uses: actions/setup-python@v5 65 | with: 66 | python-version: ${{ matrix.python-version }} 67 | - name: Install 68 | run: | 69 | pip install .[dev] 70 | - name: Unit-tests with coverage 71 | run: | 72 | pytest tests/ 73 | -------------------------------------------------------------------------------- /.github/workflows/upload_pypi.yml: -------------------------------------------------------------------------------- 1 | name: Upload to pypi 2 | 3 | on: workflow_dispatch 4 | 5 | jobs: 6 | pypi-publish: 7 | name: Upload release to PyPI 8 | runs-on: ubuntu-latest 9 | environment: 10 | name: pypi 11 | url: https://pypi.org/p/denstream 12 | permissions: 13 | id-token: write 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.12' 19 | - name: Install 20 | run: | 21 | pip install .[dev] 22 | - name: Create package 23 | run: | 24 | python3 -m build 25 | - name: Publish package distributions to PyPI 26 | uses: pypa/gh-action-pypi-publish@release/v1 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | **/__pycache__/ 3 | denstream.egg-info/ 4 | examples/.ipynb_checkpoints/ 5 | .venv/ 6 | build/ 7 | 8 | .coverage 9 | coverage.xml 10 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/astral-sh/ruff-pre-commit 3 | rev: v0.7.3 4 | hooks: 5 | - id: ruff 6 | args: [ --fix ] 7 | 8 | - repo: https://github.com/psf/black 9 | rev: 24.10.0 10 | hooks: 11 | - id: black 12 | 13 | - repo: https://github.com/PyCQA/isort 14 | rev: 5.13.2 15 | hooks: 16 | - id: isort 17 | 18 | - repo: local 19 | hooks: 20 | - id: 
mypy 21 | # note: assumes python env is setup and activated 22 | name: mypy 23 | entry: mypy . 24 | language: system 25 | pass_filenames: false 26 | types: [python] 27 | 28 | - repo: https://github.com/pre-commit/pre-commit-hooks 29 | rev: v5.0.0 30 | hooks: 31 | - id: check-yaml 32 | - id: end-of-file-fixer 33 | - id: trailing-whitespace 34 | - id: requirements-txt-fixer 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 MrParosk 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyDenStream 2 | 3 | ![master](https://github.com/MrParosk/pyDenStream/workflows/master/badge.svg?branch=master) [![codecov](https://codecov.io/gh/MrParosk/pyDenStream/branch/master/graph/badge.svg?token=HEKMVIH5WO)](https://codecov.io/gh/MrParosk/pyDenStream) 4 | 5 | Implementation of the algorithm [Density-Based Clustering over an Evolving Data Stream with Noise](https://archive.siam.org/meetings/sdm06/proceedings/030caof.pdf) in Python. 6 | 7 | ## Installation 8 | 9 | ```Shell 10 | pip install denstream 11 | ``` 12 | 13 | ## Example usage 14 | 15 | ```python 16 | import numpy as np 17 | from denstream import DenStream 18 | 19 | # Model parameters 20 | eps = 0.3 21 | lambd = 0.1 22 | beta = 0.2 23 | mu = 10 24 | min_samples = 1 25 | 26 | model = DenStream(eps, beta, mu, lambd, min_samples) 27 | 28 | x = np.array([[1, 2]]) 29 | t = 0 30 | 31 | model.partial_fit(x, t) 32 | ``` 33 | 34 | ## In depth example 35 | 36 | A more in depth example of how to use this package is included in *examples/user_guide.ipynb*. 
def __init__(
    self,
    epsilon: float,
    beta: float,
    mu: int,
    lambd: float,
    min_samples: int,
    label_metrics_list: Optional[List[Callable[[IntArrayType, IntArrayType], float]]] = None,
    no_label_metrics_list: Optional[List[Callable[[FloatArrayType, IntArrayType], float]]] = None,
    distance_measure: Union[None, float, Literal["fro", "nuc"]] = None,
):
    """
    Store the hyper-parameters, validate them and set up the empty model state.

    :param epsilon: The radius used by the micro-cluster and DBScan.
    :param beta: Potential factor.
    :param mu: Weight factor used by the micro-clusters.
    :param lambd: Fading factor.
    :param min_samples: Minimum number of samples used in DBScan.
    :param label_metrics_list: List of function used to evaluate the cluster quality with labels, e.g.
        sklearn.metrics.v_measure_score. The functions requires the format function(true_labels, predicted_labels).
        None (the default) means "no label metrics".
    :param no_label_metrics_list: List of functions used to evaluate the cluster quality without labels, e.g.
        sklearn.metrics.silhouette_score. The functions requires the format function(features, predicted_labels).
        None (the default) means "no no-label metrics".
    :param distance_measure: Type of distance measure used for finding the closest p-micro-cluster.
        Need to be compatible with numpy.linalg.norm's parameter "ord".
    :raises ValueError: If self._validate_init_input() rejects any of the arguments.
    :return
    """

    self.epsilon = epsilon
    self.beta = beta
    self.mu = mu
    self.lambd = lambd
    self.min_samples = min_samples
    self.distance_measure = distance_measure

    # A literal ``[]`` default is created once and shared by every instance that omits the argument,
    # so a fresh list is created per instance instead.
    self.label_metrics_list = [] if label_metrics_list is None else label_metrics_list
    self.no_label_metrics_list = [] if no_label_metrics_list is None else no_label_metrics_list
    self.metrics_results: List[Dict[str, Optional[Any]]] = []
    self._validate_init_input()

    # Minimal time span for a p-micro-cluster to fade into an outlier (paper, section 4.3):
    #     Tp = (1 / lambda) * log(beta * mu / (beta * mu - 1))
    # The ratio goes inside the logarithm; validation above guarantees beta * mu > 1.
    # NOTE(review): the paper derives Tp from 2^(-lambda * Tp) * beta * mu + 1 = beta * mu, which suggests
    # a base-2 logarithm. The natural logarithm is kept here to limit the behavioural change - confirm.
    beta_mu = self.beta * self.mu
    self.Tp = (1.0 / self.lambd) * np.log(beta_mu / (beta_mu - 1))

    # Currently active outlier / potential micro-clusters.
    self.o_micro_clusters: List[micro_cluster.MicroCluster] = []
    self.p_micro_clusters: List[micro_cluster.MicroCluster] = []

    # Micro-clusters that have been pruned away.
    self.completed_o_clusters: List[micro_cluster.MicroCluster] = []
    self.completed_p_clusters: List[micro_cluster.MicroCluster] = []

    # Number of data points seen by partial_fit.
    self.iterations = 0

    # Offline clustering model that is run on the p-micro-cluster centers.
    self.model = sklearn.cluster.DBSCAN(
        eps=self.epsilon,
        min_samples=self.min_samples,
        metric="euclidean",
        algorithm="auto",
        n_jobs=-1,
    )
85 | """ 86 | 87 | cluster_centers = np.concatenate([c.center for c in cluster_list], axis=0) 88 | dist = np.linalg.norm(feature_array - cluster_centers, axis=1, ord=self.distance_measure) 89 | closest_cluster_index = np.argmin(dist) 90 | return int(closest_cluster_index) 91 | 92 | def _calculate_xi(self, time: int, creation_time: int) -> float: 93 | """ 94 | Function for calculating the xi-value (see the paper for further context). 95 | 96 | :param time: Specifying the current time. 97 | :param creation_time: Specifying the creation time of a cluster. 98 | :return: The xi value. 99 | """ 100 | 101 | xi: float = (np.power(2, -self.lambd * (time - creation_time + self.Tp)) - 1) / (np.power(2, -self.lambd * self.Tp) - 1) 102 | return xi 103 | 104 | def _merging(self, current_time: int, feature_array: FloatArrayType, label: Optional[int] = None) -> None: 105 | """ 106 | The merging step of a point p (feature_array) as described in the paper. 107 | 108 | :param current_time: The current time. 109 | :param feature_array: Array for a given data point p. 110 | :param label: Specifying the true label of a data point. None indicates that it is not provided. 
111 | :return 112 | """ 113 | 114 | if len(self.p_micro_clusters) > 0: 115 | closest_p_index = self._find_closest_cluster(self.p_micro_clusters, feature_array) 116 | closest_p_cluster = self.p_micro_clusters[closest_p_index] 117 | 118 | closest_p_cluster.append(current_time, feature_array, label) 119 | r_p, weight, cf1 = closest_p_cluster.calculate_radius(current_time) 120 | 121 | if r_p <= self.epsilon: 122 | closest_p_cluster.update_parameters(cf1_score=cf1, weight=weight) 123 | return 124 | else: 125 | closest_p_cluster.pop() 126 | 127 | if len(self.o_micro_clusters) > 0: 128 | closest_o_index = self._find_closest_cluster(self.o_micro_clusters, feature_array) 129 | closest_o_cluster = self.o_micro_clusters[closest_o_index] 130 | 131 | closest_o_cluster.append(current_time, feature_array, label) 132 | r_o, weight, cf1 = closest_o_cluster.calculate_radius(current_time) 133 | 134 | if r_o <= self.epsilon: 135 | closest_o_cluster.update_parameters(cf1_score=cf1, weight=weight) 136 | 137 | if closest_o_cluster.weight > self.beta * self.mu: 138 | self.p_micro_clusters.append(closest_o_cluster) 139 | self.o_micro_clusters.pop(closest_o_index) 140 | return 141 | else: 142 | # The clusters is not compact enough, therefore removing the newly added point. 143 | closest_o_cluster.pop() 144 | 145 | new_o_cluster = micro_cluster.MicroCluster(current_time, self.lambd) 146 | new_o_cluster.append(current_time, feature_array, label) 147 | new_o_cluster.update_parameters(time=current_time) 148 | self.o_micro_clusters.append(new_o_cluster) 149 | 150 | def _prune_p_clusters(self, time: int) -> None: 151 | """ 152 | Pruning the potential activate clusters. 153 | 154 | :param time: The current time. 
155 | :return 156 | """ 157 | 158 | for idx in range(len(self.p_micro_clusters) - 1, -1, -1): 159 | p_cluster = self.p_micro_clusters[idx] 160 | p_cluster.update_parameters(time=time) 161 | 162 | if p_cluster.weight < self.beta * self.mu: 163 | self.completed_p_clusters.append(p_cluster) 164 | self.p_micro_clusters.pop(idx) 165 | 166 | def _prune_o_clusters(self, time: int) -> None: 167 | """ 168 | Pruning the outlier activate clusters. 169 | 170 | :param time: The current time. 171 | :return 172 | """ 173 | 174 | for idx in range(len(self.o_micro_clusters) - 1, -1, -1): 175 | o_cluster = self.o_micro_clusters[idx] 176 | o_cluster.update_parameters(time=time) 177 | xi = self._calculate_xi(time, o_cluster.creation_time) 178 | 179 | if o_cluster.weight < xi: 180 | self.completed_o_clusters.append(o_cluster) 181 | self.o_micro_clusters.pop(idx) 182 | 183 | def partial_fit( 184 | self, 185 | feature_array: FloatArrayType, 186 | time: int, 187 | label: Optional[int] = None, 188 | request_period: Optional[Any] = None, 189 | ) -> None: 190 | """ 191 | :param feature_array: Array for a given data point p. 192 | :param time: The current time. 193 | :param label: Specifying the true label of a data point. None indicates that it is not provided. 194 | :param request_period: Specifying when (in terms of #data-points) we should compute the clusters. 195 | It can have the types: 196 | - An integer, i.e. do the clustering every request_period. 197 | - List of integers, i.e. cluster if the iteration number is request_period[idx]. 198 | - None, i.e. do no cluster with self.model. 
def fit_generator(
    self,
    generator: Iterator[InputDict],
    normalize: bool = False,
    request_period: Optional[Any] = None,
    warmup_period: int = 1,
) -> None:
    """
    Fitting DenStream to a stream of data-points (i.e. python generator).
    It will run until the generator does not have any data points left.

    :param generator: used to stream data-points to the model. It must yield a python dictionary with the keys:
        time [int]: integer for the time when the data point arrived.
        feature_array [np.ndarray]: numpy array for the data point. Must have the shape (1, num_features).
        label [Optional[int]]: the true label of the data point. Needed for self.label_metrics_list.
    :param normalize: Whether to normalize the features to zero mean and unit variance.
        The normalization is done with rolling statistics, i.e. the mean and variance are updated iteratively.
    :param request_period: Specifying when (in terms of #data-points) we should compute the clusters.
        It can have the types:
            - An integer, i.e. do the clustering every request_period.
            - List of integers, i.e. cluster if the iteration number is request_period[idx].
            - None, i.e. do no cluster with self.model.
    :param warmup_period: The number of samples used to "warm-up" the rolling mean and variance, if normalize=True.
        The warm-up samples only feed the statistics; they are not passed to partial_fit.
    :raises RuntimeError: If the model has already seen data, or if the generator is exhausted before
        warmup_period samples have been read.
    :return
    """

    if self.iterations > 0:
        raise RuntimeError("Seems like the method as already been fitted, try to re-create it.")

    # One statistics object for the whole stream. It is created lazily from the first sample's shape.
    # Re-creating it per warm-up sample would discard everything but the last warm-up sample.
    rs: Optional[preprocessing.RollingStats] = None

    if normalize:
        for _ in range(warmup_period):
            try:
                gen_dict = next(generator)
            except StopIteration:
                # The StopIteration itself carries no information, so it is not chained.
                raise RuntimeError(
                    f"Not enough samples where given for the warmup-period, warmup_period={warmup_period}"
                ) from None

            feature_array = gen_dict["feature_array"]
            if rs is None:
                rs = preprocessing.RollingStats(feature_array.shape)
            rs.update_statistics(feature_array)

    while True:
        try:
            gen_dict = next(generator)
        except StopIteration:
            break

        time = gen_dict["time"]
        feature_array = gen_dict["feature_array"]
        label = gen_dict.get("label")

        if normalize:
            # Validate before the raw sample is folded into the running statistics.
            self._validate_fit_input(time, feature_array, label)
            if rs is None:
                # Reached only when warmup_period <= 0: there was no warm-up sample to size the statistics.
                rs = preprocessing.RollingStats(feature_array.shape)
            rs.update_statistics(feature_array)
            feature_array = rs.normalize(feature_array)

        self.partial_fit(feature_array, time, label, request_period=request_period)
def _compute_label_metrics(self, predicted_labels: FloatArrayType) -> List[MetricsDict]:
    """
    Evaluate every metric in self.label_metrics_list against the stored true labels.

    The prediction made for a p-micro-cluster is repeated once per true label stored in that cluster,
    so the true and predicted arrays handed to the metrics line up element by element.

    :param predicted_labels: Array of the predicted labels for each p-micro-cluster.
    :return: List of dictionaries with the values for each label metrics.
        It has the key name (i.e. name of the metric) and value (i.e. the value of the metric).
    """
    clusters = self.p_micro_clusters

    # True labels per cluster; predicted_labels[i] belongs to clusters[i].
    true_chunks = [np.array(clusters[idx].labels_array) for idx, _ in enumerate(predicted_labels)]
    predicted_chunks = [np.repeat(prediction, len(chunk)) for prediction, chunk in zip(predicted_labels, true_chunks)]

    true_array = np.concatenate(true_chunks, axis=0)
    predicted_array = np.concatenate(predicted_chunks, axis=0)

    def evaluate(metric: Any) -> MetricsDict:
        # The metric is called before its name is read.
        score = metric(true_array, predicted_array)
        return MetricsDict(name=metric.__name__, value=score)

    return [evaluate(metric) for metric in self.label_metrics_list]
372 | """ 373 | predicted_list, feature_list = [], [] 374 | 375 | for idx, predicted_label in enumerate(predicted_labels): 376 | features = self.p_micro_clusters[idx].features_array 377 | feature_list.append(np.array(features)) 378 | 379 | repeated_prediction = np.repeat(predicted_label, len(features)) 380 | predicted_list.append(repeated_prediction) 381 | 382 | combined_feature_array = np.concatenate(feature_list, axis=0) 383 | predicted_array = np.concatenate(predicted_list, axis=0) 384 | 385 | results = [] 386 | for metric in self.no_label_metrics_list: 387 | val = metric(combined_feature_array, predicted_array) 388 | result_dict = MetricsDict(name=metric.__name__, value=val) 389 | results.append(result_dict) 390 | return results 391 | 392 | def _validate_init_input(self) -> None: 393 | """ 394 | Checking that the input to init is valid. 395 | :return 396 | """ 397 | 398 | if isinstance(self.epsilon, int) or isinstance(self.epsilon, float): 399 | if self.epsilon <= 0: 400 | raise ValueError("epsilon must be positive.") 401 | else: 402 | raise ValueError("epsion must be of type float or integer.") 403 | 404 | if isinstance(self.beta, float): 405 | if not 0.0 < self.beta <= 1.0: 406 | raise ValueError("beta must be between 0.0 and 1.0.") 407 | else: 408 | raise ValueError("beta must be of type float") 409 | 410 | if isinstance(self.mu, int): 411 | if self.mu <= 0: 412 | raise ValueError("mu must be positive.") 413 | else: 414 | raise ValueError("mu must be of type integer.") 415 | 416 | if isinstance(self.min_samples, int): 417 | if self.min_samples <= 0: 418 | raise ValueError("min_samples must be positive.") 419 | else: 420 | raise ValueError("min_samples must be of type integer.") 421 | 422 | if isinstance(self.lambd, int) or isinstance(self.lambd, float): 423 | if self.min_samples <= 0.0: 424 | raise ValueError("lambd must be positive.") 425 | else: 426 | raise ValueError("lambd must be of type float or integer.") 427 | 428 | if self.beta * self.mu <= 1.0: 429 
| raise ValueError("beta * mu <= 1.0 which will cause problems when computing Tp.") 430 | 431 | for label_metric in self.label_metrics_list: 432 | if not isfunction(label_metric): 433 | raise ValueError("The label metric input(s) must be a function.") 434 | 435 | for no_label_metric in self.no_label_metrics_list: 436 | if not isfunction(no_label_metric): 437 | raise ValueError("The no-label metric input(s) must be a function.") 438 | 439 | @staticmethod 440 | def _validate_fit_input(time: int, feature_array: FloatArrayType, label: Optional[int] = None) -> None: 441 | """ 442 | Validate the fit_generator's input parameters. 443 | 444 | :param time: The current time. 445 | :param feature_array: Array for a given data point p. 446 | :param label: Specifying the true label of a data point. None indicates that the label is not provided. 447 | :return 448 | """ 449 | 450 | if not isinstance(feature_array, np.ndarray): 451 | raise ValueError(f"Provided x is not an numpy.ndarray, type(x)={type(feature_array)}") 452 | elif len(feature_array.shape) != 2: 453 | raise ValueError(f"feature_array need to have the shape (1, num_features), " f"given shape={feature_array.shape}") 454 | 455 | if not isinstance(time, int): 456 | raise ValueError(f"Provided time is not an int. type(time)={type(time)}") 457 | elif time < 0: 458 | raise ValueError(f"Time needs to be positive. time={time}") 459 | 460 | if not isinstance(label, int) and label is not None: 461 | raise ValueError(f"Provided label is not an int or None. 
def __init__(self, creation_time: int, lambd: float):
    """
    Create an empty micro-cluster.

    :param creation_time: The creation time (i.e. "now") for this micro-cluster.
    :param lambd: Fading factor for this cluster.
    :return
    """

    self.creation_time = creation_time
    self.lambd = lambd

    # Functions used to compute the weighted linear sum (CF1) and weighted squared sum (CF2).
    self.cf1_func, self.cf2_func = utils.numba_cf1, utils.numba_cf2

    # Per-point history; all empty until the first call to append().
    self.features_array = np.empty(0, dtype=np.float32)
    self.time_array = np.empty(0, dtype=np.int32)
    self.labels_array = np.empty(0, dtype=np.int32)

    # Summary statistics; the weight is NaN and the center is empty until update_parameters() is called.
    self.weight = np.array(np.nan, dtype=np.float32)
    self.center = np.empty(0, dtype=np.float32)
def pop(self) -> None:
    """
    Drop the most recently appended data-point (i.e. the last row of the features and time arrays).

    The last entry of the labels array is dropped as well whenever the labels array is non-empty.
    Calling this on an empty micro-cluster does nothing.

    :return
    """

    assert len(self.features_array) == len(self.time_array)

    if len(self.features_array) != 0:
        # np.delete returns a new array, so the arrays held before the call are left untouched.
        self.features_array = np.delete(self.features_array, -1, axis=0)
        self.time_array = np.delete(self.time_array, -1, axis=0)

    if len(self.labels_array) != 0:
        self.labels_array = np.delete(self.labels_array, -1, axis=0)
class RollingStats:
    """
    Rolling statistics: the mean and variance are refreshed one data-point at a time,
    without keeping the data-points themselves.
    """

    def __init__(self, dim: Tuple[int, ...], eps: float = 1e-10):
        """
        Set up all-zero statistics of the given shape.

        :param dim: The shape of the input data, e.g. (1, 5).
        :param eps: Constant added to the standard deviation so that normalize never divides by zero.
        :return
        """

        self.dim = dim
        self.eps = eps
        self.num_data_points = 0

        self.mean = np.zeros(self.dim)
        self.sse = np.zeros(self.dim)  # running sum of squared deviations from the mean
        self.variance = np.zeros(self.dim)

    def update_statistics(self, x: FloatArrayType) -> None:
        """
        Fold one data-point into the running mean and variance. The update equations can be found here:
        https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance.

        :param x: Array with dimension equal to self.dim.
        :return:
        """

        count = self.num_data_points + 1
        self.num_data_points = count

        # Deviation from the mean as it was before this data-point.
        deviation_before = x - self.mean
        self.mean = self.mean + deviation_before / count
        self.mean = self.mean.reshape(self.dim)

        # Deviation from the mean after this data-point has been included.
        deviation_after = x - self.mean
        self.sse = self.sse + deviation_before * deviation_after
        self.sse = self.sse.reshape(self.dim)

        # Divides by the number of points (not by the number of points minus one).
        self.variance = self.sse / count

    def normalize(self, x: FloatArrayType) -> FloatArrayType:
        """
        Center x on the running mean and scale it by the running standard deviation (plus self.eps).

        :param x: Input array.
        :return: Normalized input array.
        """

        centered = x - self.mean
        scale = np.sqrt(self.variance) + self.eps
        return centered / scale
def numpy_cf2(x: FloatArrayType, fading_array: FloatArrayType) -> FloatArrayType:
    """
    Calculating the CF2 according to the paper https://archive.siam.org/meetings/sdm06/proceedings/030caof.pdf,
    using numpy.

    Every data point is squared element-wise, scaled by its fading value and the result is summed over the rows.

    :param x: Array containing the data points, one point per row.
    :param fading_array: Array containing the calculated fading values, broadcast against the rows of x.
    :return: Array of shape (1, x.shape[1]) with the calculated CF2 values.
    """

    faded_squares = np.square(x) * fading_array
    cf2: FloatArrayType = faded_squares.sum(axis=0)
    return cf2.reshape((1, x.shape[1]))
80 | """ 81 | 82 | return_array = np.zeros((1, x.shape[1]), dtype=x.dtype) 83 | 84 | for i in range(x.shape[0]): 85 | for j in range(x.shape[1]): 86 | return_array[0, j] += x[i, j] * x[i, j] * fading_array[i, 0] 87 | 88 | return return_array.reshape((1, x.shape[1])) 89 | -------------------------------------------------------------------------------- /examples/user_guide.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example usage of the pyDenStream package" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Make sure you have install the package\n", 17 | "from denstream import DenStream" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import numpy as np\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "from sklearn import metrics" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "num_samples = 100\n", 38 | "num_features = 2\n", 39 | "\n", 40 | "sigma = 0.1" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 4, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# Generating test data\n", 50 | "\n", 51 | "np.random.seed(42)\n", 52 | "\n", 53 | "# Generating data for cluster 1.\n", 54 | "center_1 = np.array([1.0, 1.0]).reshape((1, num_features))\n", 55 | "x_1 = center_1 + np.random.normal(0.0, sigma, [num_samples, num_features])\n", 56 | "y_1 = np.repeat(0, num_samples).reshape((num_samples, 1))\n", 57 | "t_1 = np.linspace(1, 100, num=num_samples).reshape((num_samples, 1))\n", 58 | "\n", 59 | "# Generating data for cluster 2.\n", 60 | "center_2 = np.array([1.0, -1.0]).reshape((1, num_features))\n", 61 | "x_2 = center_2 + 
np.random.normal(0.0, sigma, [num_samples, num_features])\n", 62 | "y_2 = np.repeat(1, num_samples).reshape((num_samples, 1))\n", 63 | "t_2 = np.linspace(101, 200, num=num_samples).reshape((num_samples, 1))\n", 64 | "\n", 65 | "# Generating data for cluster 3.\n", 66 | "center_3 = np.array([-1.0, -1.0]).reshape((1, num_features))\n", 67 | "x_3 = center_3 + np.random.normal(0.0, sigma, [num_samples, num_features])\n", 68 | "y_3 = np.repeat(2, num_samples).reshape((num_samples, 1))\n", 69 | "t_3 = np.linspace(51, 150, num=num_samples).reshape((num_samples, 1))\n", 70 | "\n", 71 | "# Generating data for cluster 4.\n", 72 | "center_4 = np.array([-1.0, 1.0]).reshape((1, num_features))\n", 73 | "x_4 = center_4 + np.random.normal(0.0, sigma, [num_samples, num_features])\n", 74 | "y_4 = np.repeat(3, num_samples).reshape((num_samples, 1))\n", 75 | "t_4 = np.linspace(51, 150, num=num_samples).reshape((num_samples, 1))\n", 76 | "\n", 77 | "X = np.concatenate([x_1, x_2, x_3, x_4], axis=0).astype(float)\n", 78 | "Y = np.concatenate([y_1, y_2, y_3, y_4], axis=0).astype(int)\n", 79 | "T = np.concatenate([t_1, t_2, t_3, t_4], axis=0).astype(int)\n", 80 | "\n", 81 | "# Sorting data s.t. they come in time order.\n", 82 | "idx = np.argsort(T, axis=0).reshape(T.shape[0],)\n", 83 | "X = X[idx, :]\n", 84 | "Y = Y[idx, :]\n", 85 | "T = T[idx, :]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "image/png": "\n", 96 | "text/plain": [ 97 | "
" 98 | ] 99 | }, 100 | "metadata": { 101 | "needs_background": "light" 102 | }, 103 | "output_type": "display_data" 104 | } 105 | ], 106 | "source": [ 107 | "plt.figure(figsize=(8, 8))\n", 108 | "plt.scatter(X[:, 0], X[:, 1], c=Y[:, 0])\n", 109 | "plt.show()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "eps = 0.3\n", 119 | "lambd = 0.1\n", 120 | "beta = 0.2\n", 121 | "mu = 10\n", 122 | "min_samples = 1\n", 123 | "\n", 124 | "label_metrics_list = [metrics.homogeneity_score, metrics.completeness_score]" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## Fit with a generator" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "[{'iteration': 100,\n", 143 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 144 | " {'name': 'completeness_score', 'value': 1.0}]},\n", 145 | " {'iteration': 200,\n", 146 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 147 | " {'name': 'completeness_score', 'value': 1.0}]},\n", 148 | " {'iteration': 300,\n", 149 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 150 | " {'name': 'completeness_score', 'value': 1.0}]},\n", 151 | " {'iteration': 400,\n", 152 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 153 | " {'name': 'completeness_score', 'value': 1.0}]}]" 154 | ] 155 | }, 156 | "execution_count": 7, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "def generator(X, Y, T):\n", 163 | " for i in range(0, X.shape[0]):\n", 164 | " yield {\n", 165 | " \"time\": int(T[i, :]),\n", 166 | " \"feature_array\": X[i, :].reshape((1, X.shape[1])),\n", 167 | " \"label\": int(Y[i, :])\n", 168 | " }\n", 169 | "\n", 170 | "gen = generator(X, Y, T)\n", 171 | "\n", 172 | 
"ds_generator = DenStream(eps, beta, mu, lambd, min_samples, label_metrics_list)\n", 173 | "ds_generator.fit_generator(gen, request_period=100)\n", 174 | "\n", 175 | "ds_generator.metrics_results" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "## Partial fit" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 8, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "[{'iteration': 100,\n", 194 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 195 | " {'name': 'completeness_score', 'value': 1.0}]},\n", 196 | " {'iteration': 200,\n", 197 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 198 | " {'name': 'completeness_score', 'value': 1.0}]},\n", 199 | " {'iteration': 300,\n", 200 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 201 | " {'name': 'completeness_score', 'value': 1.0}]},\n", 202 | " {'iteration': 400,\n", 203 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 204 | " {'name': 'completeness_score', 'value': 1.0}]}]" 205 | ] 206 | }, 207 | "execution_count": 8, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "ds_partial = DenStream(eps, beta, mu, lambd, min_samples, label_metrics_list)\n", 214 | "\n", 215 | "for i in range(len(X)):\n", 216 | " t = int(T[i, :])\n", 217 | " x = X[i, :][np.newaxis, :]\n", 218 | " y = int(Y[i, :])\n", 219 | "\n", 220 | " ds_partial.partial_fit(x, t, y, request_period=100)\n", 221 | "\n", 222 | "ds_partial.metrics_results" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "## Normalize the input\n", 230 | "\n", 231 | "pyDenStream supports normalizing the input to zero mean and unit variance.\n", 232 | "\n", 233 | "Note that this is implemented by calculating the rolling mean and variance, therefore the results in the beginning might be worse since 
the estimated mean and variance is not \"set in\" yet." 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 9, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "[{'iteration': 100,\n", 245 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 246 | " {'name': 'completeness_score', 'value': 0.7716970557081937}]},\n", 247 | " {'iteration': 200,\n", 248 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 249 | " {'name': 'completeness_score', 'value': 1.0}]},\n", 250 | " {'iteration': 300,\n", 251 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 252 | " {'name': 'completeness_score', 'value': 1.0}]}]" 253 | ] 254 | }, 255 | "execution_count": 9, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "gen_norm = generator(X, Y, T)\n", 262 | "\n", 263 | "ds_norm = DenStream(eps, beta, mu, lambd, min_samples, label_metrics_list)\n", 264 | "ds_norm.fit_generator(gen_norm, request_period=100, normalize=True)\n", 265 | "\n", 266 | "ds_norm.metrics_results" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "# Changing the cluster algorithm\n", 274 | "\n", 275 | "By default, the clustering algorithm used in DenStream is DBScan. However, one can overwrite this." 
276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 10, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "[{'iteration': 100,\n", 287 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 288 | " {'name': 'completeness_score', 'value': 1.0}]},\n", 289 | " {'iteration': 200,\n", 290 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 291 | " {'name': 'completeness_score', 'value': 1.0}]},\n", 292 | " {'iteration': 300,\n", 293 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 294 | " {'name': 'completeness_score', 'value': 1.0}]},\n", 295 | " {'iteration': 400,\n", 296 | " 'metrics': [{'name': 'homogeneity_score', 'value': 1.0},\n", 297 | " {'name': 'completeness_score', 'value': 1.0}]}]" 298 | ] 299 | }, 300 | "execution_count": 10, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "from sklearn.cluster import MeanShift\n", 307 | "\n", 308 | "cluster_model = MeanShift()\n", 309 | "\n", 310 | "gen_mean = generator(X, Y, T)\n", 311 | "ds_mean = DenStream(eps, beta, mu, lambd, min_samples, label_metrics_list)\n", 312 | "ds_mean.set_clustering_model(cluster_model)\n", 313 | "ds_mean.fit_generator(gen_mean, request_period=100)\n", 314 | "\n", 315 | "ds_mean.metrics_results" 316 | ] 317 | } 318 | ], 319 | "metadata": { 320 | "kernelspec": { 321 | "display_name": "Python 3", 322 | "language": "python", 323 | "name": "python3" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 328 | "version": 3 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | "nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython3", 335 | "version": "3.8.5" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 4 340 | } 341 | -------------------------------------------------------------------------------- /pyproject.toml: 
-------------------------------------------------------------------------------- 1 | [build-system] 2 | build-backend = "setuptools.build_meta" 3 | requires = ["setuptools>=61.0", "wheel>=0.37.1"] 4 | 5 | [project] 6 | name = "denstream" 7 | version = "0.1.1" 8 | description = "Implementation of the DenStream algorithm" 9 | readme = "README.md" 10 | requires-python = ">=3.9, <3.13" 11 | authors = [ 12 | {name = "MrParosk"} 13 | ] 14 | 15 | dependencies = [ 16 | "numba>=0.56.2", 17 | "numpy>=1.23,<3.0", 18 | "scikit-learn>=1.0", 19 | ] 20 | 21 | [project.optional-dependencies] 22 | dev = [ 23 | "black==24.10.0", 24 | "isort==5.13.2", 25 | "mypy==1.13.0", 26 | "pre-commit==4.0.1", 27 | "pytest==8.3.3", 28 | "pytest-cov==6.0.0", 29 | "ruff==0.7.3", 30 | "build==1.2.2", 31 | "twine==5.1.1", 32 | ] 33 | 34 | [tool.mypy] 35 | ignore_missing_imports = true 36 | strict = true 37 | exclude = [ 38 | "build", 39 | "tests" 40 | ] 41 | 42 | [tool.black] 43 | line-length = 128 44 | 45 | [tool.isort] 46 | line_length = 128 47 | profile = "black" 48 | 49 | [tool.ruff] 50 | line-length = 128 51 | -------------------------------------------------------------------------------- /scripts/ci_checks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo 'running ruff' 6 | ruff check . 7 | 8 | echo 'running isort' 9 | isort . --check 10 | 11 | echo 'running black' 12 | black . --check 13 | 14 | echo 'running mypy' 15 | mypy . 
class TestDenStreamCore(unittest.TestCase):
    """
    Unit-tests for the core DenStream steps: finding the closest micro-cluster and merging points.
    """

    def setUp(self):
        self.TOL = 1e-6
        self.eps = 1
        self.lambd = 1
        self.beta = 0.5
        self.mu = 3
        self.min_samples = 3

    def _make_cluster(self, points):
        """
        Build a micro-cluster from 2-d points that are all appended at time 1, then update its parameters at time 1.

        :param points: Iterable of [x, y] pairs; each one is appended as a (1, 2) array.
        :return: The populated MicroCluster.
        """

        cluster = MicroCluster(1, self.lambd)
        for point in points:
            cluster.append(1, np.array(point).reshape((1, 2)))
        cluster.update_parameters(time=1)
        return cluster

    def test_closest_cluster(self):
        """
        This test is designed to check that we find the expected closest cluster (i.e. index) for a given data point.
        """

        # Two potential clusters and one outlier cluster.
        c1 = self._make_cluster([[3.9, 3.9], [3.9, 3.9]])
        c2 = self._make_cluster([[4.0, 4.0], [4.0, 4.0]])
        c3 = self._make_cluster([[0.0, 0.0], [0.0, 0.0]])

        # DenStream setup.
        ds = DenStream(self.eps, self.beta, self.mu, self.lambd, self.min_samples)
        ds.p_micro_clusters.append(c1)
        ds.p_micro_clusters.append(c2)
        ds.o_micro_clusters.append(c3)

        feature_array = np.array([4.1, 4.1]).reshape((1, 2))
        closest_index = ds._find_closest_cluster(ds.p_micro_clusters, feature_array)
        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(closest_index, 1)

    def test_merging_points(self):
        """
        This test is designed to check that merging a data point into a micro-cluster works as expected.
        The merging is done for two points: one should go to the p-micro-cluster indexed 0 and the other one
        should end up in a newly created o-micro-cluster.
        """

        # Two potential clusters and one outlier cluster.
        c1 = self._make_cluster([[4.1, 3.9], [4.0, 4.0]])
        c2 = self._make_cluster([[-4.1, -3.9], [-4.2, -4.1]])
        c3 = self._make_cluster([[0.0, 0.0], [0.0, 0.0]])

        # DenStream setup.
        ds = DenStream(self.eps, self.beta, self.mu, self.lambd, self.min_samples)
        ds.p_micro_clusters.append(c1)
        ds.p_micro_clusters.append(c2)
        ds.o_micro_clusters.append(c3)

        # Defining test data points.
        p_potential = np.array([4.1, 3.9]).reshape((1, 2))
        ds._merging(1, p_potential)

        p_outlier = np.array([7.0, 7.0]).reshape((1, 2))
        ds._merging(1, p_outlier)

        # Testing that the data points get to the correct cluster,
        # i.e. p_potential to c1 and p_outlier to a new outlier cluster.
        self.assertEqual(len(ds.p_micro_clusters), 2)
        self.assertEqual(len(ds.p_micro_clusters[0].features_array), 3)
        self.assertEqual(len(ds.p_micro_clusters[1].features_array), 2)

        self.assertEqual(len(ds.o_micro_clusters), 2)
        self.assertEqual(len(ds.o_micro_clusters[0].features_array), 2)
        self.assertEqual(len(ds.o_micro_clusters[1].features_array), 1)

    def test_no_clusters(self):
        """
        This test is designed to check that merging a point when we have no p-micro-clusters and no o-micro-clusters
        works as expected.
        The expected outcome is that it will create a new o-micro-cluster.
        """

        beta = 1.0

        ds = DenStream(self.eps, beta, self.mu, self.lambd, self.min_samples)
        self.assertEqual(len(ds.p_micro_clusters), 0)
        self.assertEqual(len(ds.o_micro_clusters), 0)

        feature_array_1 = np.array([100.0, 100.0]).reshape((1, 2))
        ds._merging(1, feature_array_1)

        self.assertEqual(len(ds.p_micro_clusters), 0)
        self.assertEqual(len(ds.o_micro_clusters), 1)

        feature_array_2 = np.array([-100.0, -100.0]).reshape((1, 2))
        ds._merging(1, feature_array_2)

        self.assertEqual(len(ds.p_micro_clusters), 0)
        self.assertEqual(len(ds.o_micro_clusters), 2)

    def test_moving_o_to_p_cluster(self):
        """
        This test is designed to check that an o-micro-cluster is moved to the p-micro-clusters after a point
        is merged into it. Per the DenStream paper this promotion happens when the weight exceeds beta * mu.
        """

        # One potential cluster and two outlier clusters.
        c1 = self._make_cluster([[4.1, 3.9], [4.0, 4.0]])
        c2 = self._make_cluster([[0.0, 0.0], [0.0, 0.0]])
        c3 = self._make_cluster([[-4.0, -4.0], [-4.0, -4.1]])

        # DenStream setup.
        ds = DenStream(self.eps, self.beta, self.mu, self.lambd, self.min_samples)
        ds.p_micro_clusters.append(c1)
        ds.o_micro_clusters.append(c2)
        ds.o_micro_clusters.append(c3)

        # Defining test data point.
        o_potential = np.array([0.1, 0.1]).reshape((1, 2))
        ds._merging(1, o_potential)

        # Testing that the p and o micro-clusters have the expected number of clusters.
        self.assertEqual(len(ds.o_micro_clusters), 1)
        self.assertEqual(len(ds.p_micro_clusters), 2)

        # Testing that the p and o clusters contain the expected number of points.
        self.assertEqual(len(ds.p_micro_clusters[0].time_array), 2)
        self.assertEqual(len(ds.p_micro_clusters[0].features_array), 2)
        self.assertEqual(len(ds.p_micro_clusters[1].time_array), 3)
        self.assertEqual(len(ds.p_micro_clusters[1].features_array), 3)

        self.assertEqual(len(ds.o_micro_clusters[0].time_array), 2)
        self.assertEqual(len(ds.o_micro_clusters[0].features_array), 2)
21 | """ 22 | 23 | x_inputs = np.array( 24 | [ 25 | [4.0, 4.0], 26 | [-4.0, -4.0], 27 | [3.99, 3.99], 28 | [-10.0, -10.0], 29 | [4.01, 4.01], 30 | [-4.01, -4.01], 31 | [300.0, 300.0], 32 | [10.0, -10.0], 33 | ] 34 | ) 35 | 36 | time_input = [1, 1, 1, 1, 1, 1, 1, 1] 37 | 38 | def generator(feature_arrays, time_list): 39 | for i in range(0, len(time_input)): 40 | yield { 41 | "time": time_list[i], 42 | "feature_array": feature_arrays[i, :].reshape((1, 2)), 43 | } 44 | 45 | eps = 1 46 | lambd = 1 47 | beta = 0.5 48 | mu = 3 49 | min_samples = 3 50 | 51 | ds = DenStream(eps, beta, mu, lambd, min_samples) 52 | gen = generator(x_inputs, time_input) 53 | 54 | ds.fit_generator(gen) 55 | 56 | self.assertEqual(len(ds.o_micro_clusters), 3) 57 | self.assertEqual(len(ds.p_micro_clusters), 2) 58 | self.assertEqual(len(ds.completed_o_clusters), 0) 59 | self.assertEqual(len(ds.completed_p_clusters), 0) 60 | 61 | def test_fit_generator_fading(self): 62 | """ 63 | This test is designed to check that the micro-clusters are fading, i.e. the activate micro-clusters are moved 64 | to the completed ones. 65 | """ 66 | 67 | x_inputs = np.array( 68 | [ 69 | [-4.0, -4.0], 70 | [4.0, 4.0], 71 | [3.99, 3.99], 72 | [-10.0, 10.0], 73 | [4.01, 4.01], 74 | [-4.01, -4.01], 75 | [300.0, 300.0], 76 | [10.0, -10.0], 77 | ] 78 | ) 79 | 80 | time_input = [4, 1, 1, 1, 1, 4, 1, 4] 81 | 82 | def generator(feature_arrays, time_list): 83 | for i in range(0, len(time_input)): 84 | print(i) 85 | yield { 86 | "time": time_list[i], 87 | "feature_array": feature_arrays[i, :].reshape((1, 2)), 88 | } 89 | 90 | eps = 1 91 | lambd = 1 92 | beta = 0.9 93 | mu = 2 94 | min_samples = 2 95 | 96 | ds = DenStream(eps, beta, mu, lambd, min_samples) 97 | gen = generator(x_inputs, time_input) 98 | 99 | ds.fit_generator(gen) 100 | 101 | # Asserting that the activate p/o-micro-clusters have the expected size/number of data points. 
102 | self.assertEqual(len(ds.o_micro_clusters), 1) 103 | self.assertEqual(len(ds.o_micro_clusters[0].features_array), 1) 104 | self.assertEqual(len(ds.o_micro_clusters[0].time_array), 1) 105 | 106 | self.assertEqual(len(ds.p_micro_clusters), 1) 107 | self.assertEqual(len(ds.p_micro_clusters[0].features_array), 2) 108 | self.assertEqual(len(ds.p_micro_clusters[0].time_array), 2) 109 | 110 | # Checking that the inactivate p/o-micro-clusters have the expected size / number of data-points. 111 | self.assertEqual(len(ds.completed_p_clusters), 1) 112 | self.assertEqual(len(ds.completed_p_clusters[0].features_array), 3) 113 | self.assertEqual(len(ds.completed_p_clusters[0].time_array), 3) 114 | 115 | self.assertEqual(len(ds.completed_o_clusters), 2) 116 | self.assertEqual(len(ds.completed_o_clusters[0].features_array), 1) 117 | self.assertEqual(len(ds.completed_o_clusters[0].time_array), 1) 118 | self.assertEqual(len(ds.completed_o_clusters[1].features_array), 1) 119 | self.assertEqual(len(ds.completed_o_clusters[1].time_array), 1) 120 | 121 | def test_request_clustering(self): 122 | """ 123 | This test is designed to check that the request_clustering works as expected, i.e. that the DBScan creates 124 | the expected clusters based on p-micro-clusters' centers. 125 | """ 126 | 127 | eps = 1 128 | lambd = 1 129 | beta = 0.9 130 | mu = 2 131 | min_samples = 2 132 | 133 | # Potential cluster 1-4. 
134 | c1 = MicroCluster(1, lambd) 135 | x1_c1 = np.array([4.0, 4.0]).reshape((1, 2)) 136 | c1.append(1, x1_c1) 137 | c1.update_parameters(time=1) 138 | 139 | c2 = MicroCluster(1, lambd) 140 | x1_c2 = np.array([3.9, 3.9]).reshape((1, 2)) 141 | c2.append(1, x1_c2) 142 | c2.update_parameters(time=1) 143 | 144 | c3 = MicroCluster(1, lambd) 145 | x1_c3 = np.array([-4.0, -4.0]).reshape((1, 2)) 146 | c3.append(1, x1_c3) 147 | c3.update_parameters(time=1) 148 | 149 | c4 = MicroCluster(1, lambd) 150 | x1_c4 = np.array([-3.9, -3.9]).reshape((1, 2)) 151 | c4.append(1, x1_c4) 152 | c4.update_parameters(time=1) 153 | 154 | # Outlier cluster 1-2. 155 | c5 = MicroCluster(1, lambd) 156 | x1_c5 = np.array([4.0, -4.0]).reshape((1, 2)) 157 | c5.append(1, x1_c5) 158 | c5.update_parameters(time=1) 159 | 160 | c6 = MicroCluster(1, lambd) 161 | x1_c6 = np.array([-3.9, 3.9]).reshape((1, 2)) 162 | c6.append(1, x1_c6) 163 | c6.update_parameters(time=1) 164 | 165 | # Creating DenStream and appending the micro-cluster. 166 | ds = DenStream(eps, beta, mu, lambd, min_samples) 167 | ds.p_micro_clusters.append(c1) 168 | ds.p_micro_clusters.append(c2) 169 | ds.p_micro_clusters.append(c3) 170 | ds.p_micro_clusters.append(c4) 171 | ds.p_micro_clusters.append(c5) 172 | ds.p_micro_clusters.append(c6) 173 | 174 | labels = ds._request_clustering() 175 | expected_labels = np.array([0, 0, 1, 1, -1, -1]) 176 | self.assertTrue(np.array_equal(labels, expected_labels)) 177 | 178 | def test_compute_metrics(self): 179 | """ 180 | This test is designed to check that the metrics computation is done correctly. 181 | It uses sklearn.metrics.homogeneity_score. 182 | """ 183 | 184 | eps = 1 185 | lambd = 1 186 | beta = 0.9 187 | mu = 2 188 | min_samples = 2 189 | 190 | # Creating potential cluster 1. 
191 | x1_c1 = np.array([4.1, 3.9]).reshape((1, 2)) 192 | x2_c1 = np.array([4.0, 4.0]).reshape((1, 2)) 193 | 194 | c1 = MicroCluster(1, lambd) 195 | c1.append(1, x1_c1, 1) 196 | c1.append(1, x2_c1, 1) 197 | c1.update_parameters(time=1) 198 | 199 | # Creating potential cluster 2. 200 | x1_c2 = np.array([-4.1, -3.9]).reshape((1, 2)) 201 | x2_c2 = np.array([-4.2, -4.1]).reshape((1, 2)) 202 | 203 | c2 = MicroCluster(1, lambd) 204 | c2.append(1, x1_c2, 2) 205 | c2.append(1, x2_c2, 2) 206 | c2.update_parameters(time=1) 207 | 208 | # Creating potential cluster 3. 209 | x1_c3 = np.array([4.1, 3.9]).reshape((1, 2)) 210 | 211 | c3 = MicroCluster(1, lambd) 212 | c3.append(1, x1_c3, 1) 213 | c3.update_parameters(time=1) 214 | 215 | # Creating potential cluster 4. 216 | x1_c4 = np.array([-4.2, 4.1]).reshape((1, 2)) 217 | 218 | c4 = MicroCluster(1, lambd) 219 | c4.append(1, x1_c4, 1) 220 | c4.update_parameters(time=1) 221 | 222 | # Creating DenStream and appending the micro-clusters. 223 | ds = DenStream( 224 | eps, 225 | beta, 226 | mu, 227 | lambd, 228 | min_samples, 229 | label_metrics_list=[metrics.homogeneity_score], 230 | ) 231 | ds.p_micro_clusters.append(c1) 232 | ds.p_micro_clusters.append(c2) 233 | ds.p_micro_clusters.append(c3) 234 | ds.p_micro_clusters.append(c4) 235 | 236 | pred_labels = ds._request_clustering() 237 | computed_value = ds._compute_label_metrics(pred_labels)[0]["value"] 238 | self.assertTrue(np.abs(computed_value - 0.5) < self.TOL) 239 | 240 | def test_int_list_request_period(self): 241 | """ 242 | This test checks that we get the same evaluation metrics for request_period with int and lists. 
243 | """ 244 | 245 | eps = 0.3 246 | lambd = 0.1 247 | beta = 0.2 248 | mu = 10 249 | min_samples = 1 250 | label_metrics_list = [metrics.homogeneity_score, metrics.completeness_score] 251 | no_label_metrics_list = [ 252 | metrics.silhouette_score, 253 | metrics.calinski_harabasz_score, 254 | ] 255 | 256 | gen_int = generate_test_data() 257 | ds_int = DenStream(eps, beta, mu, lambd, min_samples, label_metrics_list, no_label_metrics_list) 258 | ds_int.fit_generator(gen_int, request_period=100, normalize=True) 259 | 260 | gen_list = generate_test_data() 261 | ds_list = DenStream(eps, beta, mu, lambd, min_samples, label_metrics_list, no_label_metrics_list) 262 | ds_list.fit_generator(gen_list, request_period=[100, 200, 300, 400], normalize=True) 263 | 264 | for i in range(len(ds_int.metrics_results)): 265 | int_metrics_i = ds_int.metrics_results[i]["metrics"] 266 | list_metrics_i = ds_list.metrics_results[i]["metrics"] 267 | 268 | for j in range(len(int_metrics_i)): 269 | self.assertTrue(np.abs(int_metrics_i[j]["value"] - list_metrics_i[j]["value"]) < self.TOL) 270 | 271 | def test_set_cluster_method(self): 272 | """ 273 | This test checks that setting a new clustering method works. 
def generator(X, Y, T):
    """
    Create the sample generator used for fit_generator.

    Yields one dict per row of X with the keys:

    * "time": the single value in row i of T, as a Python int.
    * "feature_array": row i of X reshaped to (1, X.shape[1]).
    * "label": the single value in row i of Y, as a Python int.

    Y and T must be 2-D with exactly one element per row; ``.item()`` raises
    ValueError otherwise.
    """

    for i in range(X.shape[0]):
        yield {
            # .item() pulls the lone element out as a Python scalar. Calling
            # int() directly on a 1-element, 1-D array is deprecated since
            # NumPy 1.25.
            "time": int(T[i, :].item()),
            "feature_array": X[i, :].reshape((1, X.shape[1])),
            "label": int(Y[i, :].item()),
        }


def generate_test_data():
    """
    Generate the deterministic test stream.

    Four 2-D Gaussian clusters (sigma 0.1, 100 samples each) are drawn around
    the centers (1, 1), (1, -1), (-1, -1) and (-1, 1) with labels 0..3. Their
    integer time stamps cover 1-100, 101-200, 51-150 and 51-150 respectively,
    and the samples are returned in time order.

    Note: this seeds NumPy's global random state with 42 on every call.

    Returns:
        A generator of dicts with keys "time", "feature_array" and "label".
    """

    np.random.seed(42)

    num_samples = 100
    num_features = 2

    sigma = 0.1

    # (label, center, first time stamp, last time stamp) for each cluster.
    # The order matters: samples are drawn cluster by cluster from the seeded
    # global random state.
    cluster_specs = [
        (0, [1.0, 1.0], 1, 100),
        (1, [1.0, -1.0], 101, 200),
        (2, [-1.0, -1.0], 51, 150),
        (3, [-1.0, 1.0], 51, 150),
    ]

    features, labels, times = [], [], []
    for label, center, t_first, t_last in cluster_specs:
        center_row = np.array(center).reshape((1, num_features))
        features.append(center_row + np.random.normal(0.0, sigma, [num_samples, num_features]))
        labels.append(np.repeat(label, num_samples).reshape((num_samples, 1)))
        times.append(np.linspace(t_first, t_last, num=num_samples).reshape((num_samples, 1)))

    X = np.concatenate(features, axis=0).astype(float)
    Y = np.concatenate(labels, axis=0).astype(int)
    T = np.concatenate(times, axis=0).astype(int)

    # Sorting data s.t. they come in time order.
    # NOTE: argsort's default kind is not a stable sort, so the relative order
    # of samples sharing a time stamp is whatever NumPy's default produces.
    # It is kept as-is so the generated stream does not change.
    idx = np.argsort(T, axis=0).reshape(
        T.shape[0],
    )
    X = X[idx, :]
    Y = Y[idx, :]
    T = T[idx, :]

    return generator(X, Y, T)
def test_adding_updating(self):
    """
    Verify that appended points are stored in arrival order and that
    update_parameters yields the expected weight and center.
    """

    creation_time = 0.5
    decay = 0.5
    cluster = micro_cluster.MicroCluster(creation_time, decay)

    # Three (time, feature row) pairs; each feature row has shape (1, 2).
    first_time, first_point = 1.0, np.array([[1, 2]])
    second_time, second_point = 2.0, np.array([[4, 5]])
    third_time, third_point = 3.0, np.array([[0, 0]])

    # A fresh micro-cluster holds no points.
    self.assertEqual(len(cluster.features_array), 0)
    self.assertEqual(len(cluster.time_array), 0)

    # One append grows both arrays by one entry.
    cluster.append(first_time, first_point)
    self.assertEqual(len(cluster.features_array), 1)
    self.assertEqual(len(cluster.time_array), 1)

    # A second append must keep the time stamps in insertion order.
    cluster.append(second_time, second_point)
    wanted_times = np.array([[first_time], [second_time]])
    time_error = np.linalg.norm(cluster.time_array - wanted_times)
    self.assertTrue(time_error < self.TOL)

    # Add the last point and refresh the cached weight/center at time 3.
    cluster.append(third_time, third_point)
    cluster.update_parameters(time=3)

    root_two = np.sqrt(2)
    # Expected weight 0.5 + 1/sqrt(2) + 1, i.e. one term per point.
    wanted_weight = 1.5 + 1 / root_two
    # Expected center: the feature rows weighted by the same three terms,
    # divided by the total weight.
    weighted_sum = np.array([[0.5 + 4 / root_two, 1 + 5 / root_two]])
    wanted_center = weighted_sum / wanted_weight

    weight_error = np.abs(cluster.weight - wanted_weight)
    self.assertTrue(weight_error < self.TOL)
    center_error = np.linalg.norm(cluster.center - wanted_center)
    self.assertTrue(center_error < self.TOL)
class TestPreprocessing(unittest.TestCase):
    """Unit tests for the denstream preprocessing module."""

    def setUp(self):
        # Tolerance applied to the norm of (actual - expected).
        self.TOL = 1e-6

    def test_rolling_stats(self):
        """
        Check that RollingStats reports the expected mean and variance after
        being updated with two samples.
        """

        samples = [np.array([[100, 1]]), np.array([[200, 2]])]

        stats = preprocessing.RollingStats((1, 2))
        for sample in samples:
            stats.update_statistics(sample)

        # Per-feature mean of the two samples.
        wanted_mean = np.array([[150, 1.5]])
        mean_error = np.linalg.norm(stats.mean - wanted_mean)
        self.assertTrue(mean_error < self.TOL)

        # Per-feature variance of the two samples (dividing by n, not n - 1).
        wanted_variance = np.array([[2.5e3, 2.5e-1]])
        variance_error = np.linalg.norm(stats.variance - wanted_variance)
        self.assertTrue(variance_error < self.TOL)
def test_cf2_calculations(self):
    """
    Check that the numpy and the numba implementations of the CF2-score
    agree on the same random input.
    """

    # Seed the global random state so the inputs are reproducible.
    np.random.seed(self.SEED)
    points = np.random.uniform(0, 1, size=(100, 2))
    fading = np.random.uniform(0, 1, size=(100, 1))

    reference = utils.numpy_cf2(points, fading)
    candidate = utils.numba_cf2(points, fading)

    # The two results must coincide up to the configured tolerance.
    gap = np.linalg.norm(reference - candidate)
    self.assertTrue(gap < self.TOL)