├── requirements.txt ├── .gitignore ├── diameter_clustering ├── approx │ ├── __init__.py │ ├── hnsw.py │ └── leader.py ├── __init__.py ├── timer.py ├── dist_matrix.py ├── mixins.py ├── leader.py ├── qt.py └── diameter.py ├── tests ├── test_dist_matrix.py ├── approx │ └── test_approx_leader.py ├── test_qt.py ├── test_greedy.py └── test_leader.py ├── setup.py ├── LICENSE ├── README.md └── .pylintrc /requirements.txt: -------------------------------------------------------------------------------- 1 | hnswlib 2 | numpy 3 | numpy_groupies 4 | scikit_learn 5 | scipy 6 | tqdm 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .vscode 3 | .pytest_cache 4 | .DS_Store 5 | *.egg-info 6 | dist/ 7 | build/ -------------------------------------------------------------------------------- /diameter_clustering/approx/__init__.py: -------------------------------------------------------------------------------- 1 | from .hnsw import HNSWIndex 2 | from .leader import ApproxLeaderClustering -------------------------------------------------------------------------------- /diameter_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | from .diameter import MaxDiameterClustering 2 | from .leader import LeaderClustering 3 | from .qt import QTClustering 4 | -------------------------------------------------------------------------------- /tests/test_dist_matrix.py: -------------------------------------------------------------------------------- 1 | """Tests for distance matrix computation.""" 2 | 3 | import numpy as np 4 | import scipy 5 | from sklearn.datasets import make_blobs 6 | 7 | from diameter_clustering.dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix 8 | 9 | 10 | X, y = make_blobs(n_samples=100, n_features=50, random_state=42) 11 | 12 | 13 | def test_dist_matrix(): 
14 | 15 | dist_matrix = compute_dist_matrix(X) 16 | assert np.all(np.isfinite(dist_matrix)) 17 | 18 | dist_matrix = compute_dist_matrix(X, metric='inner_product') 19 | assert np.all(np.isfinite(dist_matrix)) 20 | 21 | dist_matrix = compute_dist_matrix(X, fill_diagonal=True) 22 | assert np.all(np.diagonal(dist_matrix) == np.inf) 23 | 24 | dist_matrix = compute_dist_matrix(X[0]) 25 | 26 | 27 | def test_sparse_dist_matrix(): 28 | 29 | dist_matrix = compute_sparse_dist_matrix(X, metric='cosine', max_distance=0.5) 30 | assert isinstance(dist_matrix, scipy.sparse.csr_matrix) 31 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as readme_file: 4 | long_description = readme_file.read() 5 | 6 | setuptools.setup( 7 | name="diameter-clustering", 8 | version="0.1.0", 9 | author="Anton Klenitskiy", 10 | author_email="ant-klen@yandex.ru", 11 | description="Clustering with maximum distance between points inside clusters", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/antklen/diameter-clustering", 15 | packages=['diameter_clustering', 'diameter_clustering.approx'], 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | "Development Status :: 3 - Alpha", 21 | ], 22 | install_requires=[ 23 | 'hnswlib', 24 | 'numpy', 25 | 'numpy_groupies', 26 | 'scikit_learn', 27 | 'scipy', 28 | 'tqdm' 29 | ], 30 | ) 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Anton Klenitskiy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a 
copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /tests/approx/test_approx_leader.py: -------------------------------------------------------------------------------- 1 | """Tests for ApproxLeaderClustering.""" 2 | 3 | import numpy as np 4 | from scipy.spatial.distance import pdist 5 | from sklearn.datasets import make_blobs 6 | 7 | from diameter_clustering.approx import ApproxLeaderClustering, HNSWIndex 8 | from diameter_clustering.dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix 9 | 10 | 11 | MAX_RADIUS = 0.25 12 | MAX_RADIUS_EUCLIDEAN = 30 13 | 14 | X, y = make_blobs(n_samples=100, n_features=50, centers=3, 15 | cluster_std=3, random_state=42) 16 | 17 | 18 | def compute_max_dist(X, labels, metric='cosine'): 19 | """Compute maximum distance between points inside clusters.""" 20 | 21 | max_dist = [] 22 | 23 | for cluster in np.unique(labels): 24 | x_cluster = X[labels == cluster] 25 | dist = pdist(x_cluster, metric=metric) 26 | if len(dist) == 0: 27 | max_dist.append(0) 28 | else: 29 | max_dist.append(dist.max()) 30 | 31 | return np.max(max_dist) 32 | 33 | 34 | def test_approx_leader(): 35 | 36 | hnsw_index = HNSWIndex(max_elements=len(X), space='l2', dim=50, 37 | ef=100, ef_construction=200, M=16) 38 | model = ApproxLeaderClustering(hnsw_index, max_radius=MAX_RADIUS_EUCLIDEAN) 39 | labels = model.fit_predict(X) 40 | assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2 41 | 42 | hnsw_index = HNSWIndex(max_elements=len(X), space='cosine', dim=50, 43 | ef=100, ef_construction=200, M=16) 44 | model = ApproxLeaderClustering(hnsw_index, max_radius=MAX_RADIUS) 45 | labels = model.fit_predict(X) 46 | assert len(labels) == len(X) 47 | 48 | 49 | def test_deterministic(): 50 | 51 | hnsw_index1 = HNSWIndex(max_elements=len(X), space='cosine', dim=50, 52 | ef=100, ef_construction=200, M=16) 53 | model1 = ApproxLeaderClustering(hnsw_index1, max_radius=0.2, deterministic=True) 54 | labels1 = 
model1.fit_predict(X) 55 | 56 | hnsw_index2 = HNSWIndex(max_elements=len(X), space='cosine', dim=50, 57 | ef=100, ef_construction=200, M=16) 58 | model2 = ApproxLeaderClustering(hnsw_index2, max_radius=0.2, deterministic=True) 59 | labels2 = model2.fit_predict(X) 60 | assert np.array_equal(labels1, labels2) 61 | -------------------------------------------------------------------------------- /diameter_clustering/timer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Timer which saves history of runs. 3 | """ 4 | 5 | import logging 6 | import time 7 | from contextlib import contextmanager 8 | 9 | 10 | @contextmanager 11 | def timer(name: str, disable: bool = False): 12 | """Simple timer as context manager.""" 13 | 14 | start = time.time() 15 | yield 16 | if not disable: 17 | logging.info(f'[{name}] done in {(time.time() - start)*1000:.1f} ms') 18 | 19 | 20 | class TimerWithHistory: 21 | """Timer as context mamager which saves history. 22 | 23 | This timer should be initialized and then used as context manager. 24 | After each run it appends execution time to list with history. 25 | Different runs could have different names and history is saved as dict 26 | with separate key for each name. 27 | 28 | Args: 29 | default_name (str): Default name for given run. 30 | disable (bool): If True then disable timer. 
31 | 32 | Example: 33 | timer = TimerWithHistory() 34 | with timer(): 35 | time.sleep(1) 36 | with timer(name='first'): 37 | time.sleep(2) 38 | # get history 39 | hist1, hist2 = timer.history['default'], timer.history['first'] 40 | """ 41 | 42 | def __init__(self, default_name: str = 'default', disable: bool = False): 43 | 44 | self._start = None 45 | self.history = {} 46 | self.name = default_name 47 | self.default_name = default_name 48 | self.disable = disable 49 | 50 | def start(self): 51 | """Start timer.""" 52 | 53 | if self._start is not None: 54 | raise RuntimeError('Timer already started...') 55 | self._start = time.perf_counter() 56 | 57 | def stop(self): 58 | """Stop timer and save result to history.""" 59 | 60 | if self._start is None: 61 | raise RuntimeError('Timer not yet started...') 62 | elapsed = time.perf_counter() - self._start 63 | if self.history.get(self.name): 64 | self.history[self.name].append(elapsed) 65 | else: 66 | self.history[self.name] = [elapsed] 67 | self._start = None 68 | 69 | def __enter__(self): 70 | if not self.disable: 71 | self.start() 72 | return self 73 | 74 | def __exit__(self, *args): 75 | if not self.disable: 76 | self.stop() 77 | 78 | def __call__(self, name=None): 79 | if not self.disable: 80 | self.name = name or self.default_name 81 | return self 82 | -------------------------------------------------------------------------------- /diameter_clustering/dist_matrix.py: -------------------------------------------------------------------------------- 1 | """ 2 | Computation of distance matrix. 
3 | """ 4 | 5 | import numpy as np 6 | from scipy.sparse import csr_matrix 7 | from scipy.spatial.distance import pdist, squareform 8 | from sklearn.neighbors import RadiusNeighborsTransformer 9 | 10 | 11 | def compute_dist_matrix(X: np.ndarray, metric: str = 'inner_product', 12 | fill_diagonal: bool = False) -> np.ndarray: 13 | """ 14 | Compute distance matrix between points and optionally fill diagonal elements 15 | with np.inf (may be convenient in some situation). 16 | 17 | Args: 18 | X (np.ndarray): 2D array with data points. 19 | metric (str): Distance metric. Possible options are 'inner_product' or one of metrics 20 | available in scipy.spatial.distance.pdist. If 'inner_product' then use np.inner 21 | which is much faster than pdist. 'inner_product' could be used instead of cosine 22 | distance for normalized vectors. 23 | fill_diagonal (bool): If True then fill diagonal with np.inf. 24 | 25 | Returns: 26 | Array with shape (len(X), len(X)). 27 | """ 28 | 29 | if X.ndim == 1: 30 | X = X[None, :] # for correct work of distance computation 31 | 32 | if metric == 'inner_product': 33 | dist_matrix = 1 - np.inner(X, X) 34 | else: 35 | dist_matrix = pdist(X, metric=metric) 36 | # squareform converts emmpty dist_matrix array([]) to array([[0.]]) 37 | # this behavior could break the code later 38 | dist_matrix = squareform(dist_matrix) if len(dist_matrix) > 0 else np.empty((0, 0)) 39 | 40 | if fill_diagonal: 41 | np.fill_diagonal(dist_matrix, np.inf) 42 | 43 | return dist_matrix 44 | 45 | 46 | def compute_sparse_dist_matrix(X: np.ndarray, metric: str = 'cosine', 47 | max_distance: float = 0.2) -> csr_matrix: 48 | """ 49 | Compute distance matrix in sparse csr format using sklearn RadiusNeighborsTransformer. 50 | Zero elements of matrix are elements for which distance is greater than max_distance. 51 | 52 | Args: 53 | X (np.ndarray): 2D array with data points. 54 | metric (str): Distance metric 55 | (possible options in sklearn.neighbors.VALID_METRICS['brute']). 
56 | max_distance (float): Maximum distance threshold. 57 | 58 | Returns: 59 | scipy.sparse.csr_matrix with shape (len(X), len(X)). 60 | """ 61 | 62 | transformer = RadiusNeighborsTransformer(mode='distance', algorithm='brute', 63 | metric=metric, radius=max_distance) 64 | 65 | return transformer.fit_transform(X) 66 | -------------------------------------------------------------------------------- /diameter_clustering/approx/hnsw.py: -------------------------------------------------------------------------------- 1 | """ 2 | Wrapper for approximate nearest neighbors search using hnswlib library. 3 | """ 4 | 5 | from typing import Optional 6 | 7 | import hnswlib 8 | import numpy as np 9 | 10 | 11 | class HNSWIndex: 12 | """ 13 | Approximate nearest neighbors search using hnswlib library. 14 | 15 | Args: 16 | max_elements: Maximum number of elements that can be stored in index (hnswlib parameter). 17 | path: Path to previously saved index. If not None, load it. If None, initialize empty index. 18 | space: Distance metric (hnswlib parameter). Possible values: 19 | 'l2', 'ip' (inner product), 'cosine. 20 | dim: Dimensionality of vectors in index (hnswlib parameter). 21 | ef: hnswlib parameter, defines query time accuracy/speed trade-off. 22 | ef_construction: hnswlib parameter, defines construction time/accuracy trade-off. 23 | M: hnswlib parameter, defines maximum number of outgoing connections in the graph. 24 | 25 | Attributes: 26 | index: Instance of hnswlib.Index. 
27 | """ 28 | 29 | def __init__(self, max_elements: int, path: Optional[str] = None, 30 | space: str = 'ip', dim: int = 512, ef: int = 100, 31 | ef_construction: int = 250, M: int = 16): 32 | 33 | self.index = hnswlib.Index(space=space, dim=dim) 34 | 35 | if path is not None: 36 | self.index.load_index(path, max_elements=max_elements) 37 | else: 38 | self.index.init_index(max_elements=max_elements, ef_construction=ef_construction, M=M) 39 | 40 | self.index.set_ef(ef) 41 | 42 | def add_item(self, vector: np.ndarray, label: Optional[int] = None): 43 | """Add one element to index. 44 | 45 | Args: 46 | vector: Numpy array with vector for one element. 47 | label: Optional integer label for this element. 48 | """ 49 | 50 | self.index.add_items(vector, ids=label) 51 | 52 | def add_items(self, vectors: np.ndarray, labels: Optional[int] = None): 53 | """Add batch of elements to index. 54 | 55 | Args: 56 | vectors: Numpy array with vectors for given elements. 57 | label: Optional integer labels for this elements. 58 | """ 59 | 60 | self.index.add_items(vectors, ids=labels) 61 | 62 | def find_nearest_point(self, vector: np.ndarray): 63 | """Find nearest point from index for given vector. 64 | 65 | Args: 66 | vector: Numpy array. 67 | 68 | Returns: 69 | Label of nearest point and distance to it. 70 | """ 71 | 72 | labels, distances = self.index.knn_query(vector, k=1) 73 | return labels[0, 0], distances[0, 0] 74 | 75 | def find_nearest_point_batch(self, vectors: np.ndarray): 76 | """Find nearest point from index for batch of vectors. 77 | 78 | Args: 79 | vectors: Numpy array. 80 | 81 | Returns: 82 | Labels of nearest points and corresponding distances to it. 83 | """ 84 | 85 | labels, distances = self.index.knn_query(vectors, k=1) 86 | return labels[:, 0], distances[:, 0] 87 | 88 | def save(self, path: str): 89 | """Save index to disk. 90 | 91 | Args: 92 | path: Save index to this path. 
93 | """ 94 | 95 | self.index.save_index(path) 96 | -------------------------------------------------------------------------------- /diameter_clustering/mixins.py: -------------------------------------------------------------------------------- 1 | """ 2 | Mixins for clustering algorithms. 3 | """ 4 | 5 | import logging 6 | from typing import Union 7 | 8 | import numpy as np 9 | from scipy.sparse import csr_matrix 10 | 11 | from .dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix 12 | from .timer import timer 13 | 14 | 15 | class FitPredictMixin: 16 | """Mixin with fit_predict method.""" 17 | 18 | def fit_predict(self, X: Union[np.ndarray, csr_matrix]) -> np.ndarray: 19 | """Fit clustering from features or distance matrix and return cluster labels. 20 | 21 | Args: 22 | X (np.ndarray or scipy.sparse.csr_matrix): Array with features 23 | or precomputed distance matrix, which could be in sparse matrix format. 24 | 25 | Returns: 26 | Array with cluster labels. 27 | """ 28 | 29 | self.fit(X) 30 | 31 | return self.labels_ 32 | 33 | 34 | class DistanceMatrixMixin: 35 | """Mixin with methods for working with distance matrix.""" 36 | 37 | def _prepare_distance_matrix(self, X: Union[np.ndarray, csr_matrix]): 38 | """Prepare distance matrix. 39 | 40 | If self.precomputed_dist is True then do nothing, only check for correctness of X. 41 | Otherwise compute distance matrix regarding X as array of features. 
If self.sparse_dist 42 | is True then compute matrix in sparse format.""" 43 | 44 | if not self.precomputed_dist: 45 | if self.sparse_dist: 46 | if self.verbose: 47 | logging.info('computing distance matrix in sparse format...') 48 | with timer('compute_sparse_dist_matrix', disable=not self.verbose): 49 | return compute_sparse_dist_matrix(X, metric=self.metric, 50 | max_distance=self.max_distance) 51 | else: 52 | if self.verbose: 53 | logging.info('computing distance matrix in dense format...') 54 | with timer('compute_dist_matrix', disable=not self.verbose): 55 | return compute_dist_matrix(X, metric=self.metric) 56 | 57 | if X.shape[0] != X.shape[1]: 58 | raise ValueError(f'Distance matrix should be square. Got matrix of shape {X.shape}.') 59 | 60 | if self.sparse_dist: 61 | if not isinstance(X, csr_matrix): 62 | raise TypeError('Sparse distance matrix should be in ' 63 | 'scipy.sparse.csr_matrix format.') 64 | elif not isinstance(X, np.ndarray): 65 | raise TypeError('Dense distance matrix should be ' 66 | 'an instance of np.ndarray.') 67 | 68 | return X 69 | 70 | def _slice_distance_matrix(self, dist_matrix: Union[np.ndarray, csr_matrix], 71 | idx: int, indexes: np.ndarray): 72 | """Get one row of distance matrix. 73 | Get distance between given point and several other points. 74 | 75 | Args: 76 | dist (np.ndarray or scipy.sparse.csr_matrix): Distance matrix. 77 | idx (int): Index of given point. 78 | indexes (np.ndarray): Indexes of other points. 
79 | """ 80 | 81 | if isinstance(dist_matrix, csr_matrix): 82 | current_dist = dist_matrix[idx, indexes].toarray()[0, :] 83 | current_dist[current_dist == 0] = np.inf 84 | else: 85 | current_dist = dist_matrix[idx, indexes] 86 | 87 | return current_dist 88 | -------------------------------------------------------------------------------- /tests/test_qt.py: -------------------------------------------------------------------------------- 1 | """Tests for QTClustering.""" 2 | 3 | import numpy as np 4 | from scipy.spatial.distance import pdist 5 | from sklearn.datasets import make_blobs 6 | 7 | from diameter_clustering import QTClustering 8 | from diameter_clustering.dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix 9 | 10 | 11 | MAX_RADIUS = 0.25 12 | MAX_RADIUS_EUCLIDEAN = 30 13 | 14 | X, y = make_blobs(n_samples=100, n_features=50, centers=3, 15 | cluster_std=3, random_state=42) 16 | 17 | 18 | def compute_max_dist(X, labels, metric='cosine'): 19 | """Compute maximum distance between points inside clusters.""" 20 | 21 | max_dist = [] 22 | 23 | for cluster in np.unique(labels): 24 | x_cluster = X[labels == cluster] 25 | dist = pdist(x_cluster, metric=metric) 26 | if len(dist) == 0: 27 | max_dist.append(0) 28 | else: 29 | max_dist.append(dist.max()) 30 | 31 | return np.max(max_dist) 32 | 33 | 34 | def test_qt(): 35 | 36 | model = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean', 37 | min_cluster_size=1, sparse_dist=False) 38 | labels = model.fit_predict(X) 39 | assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2 40 | 41 | model = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean', 42 | min_cluster_size=3, sparse_dist=False) 43 | labels = model.fit_predict(X) 44 | assert compute_max_dist(X[labels != -1], labels[labels != -1], 45 | metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2 46 | 47 | model = QTClustering(max_radius=MAX_RADIUS, metric='cosine', sparse_dist=False) 48 | labels = 
model.fit_predict(X) 49 | assert len(labels) == len(X) 50 | 51 | 52 | def test_inner_product(): 53 | 54 | x_normalized = X/(np.linalg.norm(X, axis=-1, keepdims=True) + 1e-16) 55 | model = QTClustering(max_radius=MAX_RADIUS, metric='inner_product', sparse_dist=False) 56 | labels = model.fit_predict(x_normalized) 57 | assert len(labels) == len(X) 58 | 59 | model2 = QTClustering(max_radius=MAX_RADIUS, metric='cosine', sparse_dist=False) 60 | labels2 = model2.fit_predict(X) 61 | assert np.array_equal(labels, labels2) 62 | 63 | 64 | def test_precomputed(): 65 | 66 | model = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean', 67 | min_cluster_size=1, precomputed_dist=True, sparse_dist=False) 68 | dist_matrix = compute_dist_matrix(X, metric='euclidean') 69 | labels = model.fit_predict(dist_matrix) 70 | assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2 71 | 72 | 73 | def test_sparse(): 74 | 75 | model = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean', 76 | min_cluster_size=1, sparse_dist=True) 77 | labels = model.fit_predict(X) 78 | assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2 79 | 80 | model = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean', 81 | min_cluster_size=1, sparse_dist=True, precomputed_dist=True) 82 | dist_matrix = compute_sparse_dist_matrix(X, metric='euclidean', 83 | max_distance=MAX_RADIUS_EUCLIDEAN) 84 | labels = model.fit_predict(dist_matrix) 85 | assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2 86 | 87 | 88 | def test_sparse_dense_equivalence(): 89 | 90 | model1 = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean', 91 | min_cluster_size=1, sparse_dist=False) 92 | labels1 = model1.fit_predict(X) 93 | 94 | model2 = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean', 95 | min_cluster_size=1, sparse_dist=True) 96 | labels2 = model2.fit_predict(X) 97 | 98 | assert 
np.array_equal(labels1, labels2) 99 | -------------------------------------------------------------------------------- /tests/test_greedy.py: -------------------------------------------------------------------------------- 1 | """Tests for MaxDiameterClustering.""" 2 | 3 | import numpy as np 4 | from scipy.spatial.distance import pdist 5 | from sklearn.datasets import make_blobs 6 | 7 | from diameter_clustering import MaxDiameterClustering 8 | from diameter_clustering.dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix 9 | 10 | 11 | MAX_DISTANCE = 0.5 12 | 13 | X, y = make_blobs(n_samples=100, n_features=50, centers=3, 14 | cluster_std=5, random_state=42) 15 | 16 | 17 | def compute_max_dist(X, labels, metric='cosine'): 18 | """Compute maximum distance between points inside clusters.""" 19 | 20 | max_dist = [] 21 | 22 | for cluster in np.unique(labels): 23 | x_cluster = X[labels == cluster] 24 | dist = pdist(x_cluster, metric=metric) 25 | if len(dist) == 0: 26 | max_dist.append(0) 27 | else: 28 | max_dist.append(dist.max()) 29 | 30 | return np.max(max_dist) 31 | 32 | 33 | def test_max_diameter(): 34 | 35 | model = MaxDiameterClustering(max_distance=MAX_DISTANCE, criterion='distance', 36 | metric='cosine', sparse_dist=False, use_timer=True) 37 | labels = model.fit_predict(X) 38 | assert compute_max_dist(X, labels) < MAX_DISTANCE 39 | 40 | model = MaxDiameterClustering(max_distance=MAX_DISTANCE, criterion='size', 41 | metric='cosine', sparse_dist=False, use_timer=True) 42 | labels = model.fit_predict(X) 43 | assert compute_max_dist(X, labels) < MAX_DISTANCE 44 | 45 | def test_inner_product(): 46 | 47 | x_normalized = X/(np.linalg.norm(X, axis=-1, keepdims=True) + 1e-16) 48 | model = MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='inner_product', 49 | sparse_dist=False, deterministic=True) 50 | labels = model.fit_predict(x_normalized) 51 | assert compute_max_dist(x_normalized, labels) < MAX_DISTANCE 52 | 53 | model2 = 
MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='cosine', 54 | sparse_dist=False, deterministic=True) 55 | labels2 = model2.fit_predict(X) 56 | assert np.array_equal(labels, labels2) 57 | 58 | 59 | def test_precomputed(): 60 | 61 | model = MaxDiameterClustering(max_distance=MAX_DISTANCE, precomputed_dist=True, 62 | sparse_dist=False) 63 | dist_matrix = compute_dist_matrix(X, metric='cosine') 64 | labels = model.fit_predict(dist_matrix) 65 | assert compute_max_dist(X, labels) < MAX_DISTANCE 66 | 67 | 68 | def test_sparse(): 69 | 70 | model = MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='cosine', 71 | sparse_dist=True) 72 | labels = model.fit_predict(X) 73 | assert compute_max_dist(X, labels) < MAX_DISTANCE 74 | 75 | model = MaxDiameterClustering(max_distance=MAX_DISTANCE, 76 | sparse_dist=True, precomputed_dist=True) 77 | dist_matrix = compute_sparse_dist_matrix(X, max_distance=MAX_DISTANCE) 78 | labels = model.fit_predict(dist_matrix) 79 | assert compute_max_dist(X, labels) < MAX_DISTANCE 80 | 81 | 82 | def test_deterministic(): 83 | 84 | model = MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='cosine', 85 | deterministic=True) 86 | labels1 = model.fit_predict(X) 87 | labels2 = model.fit_predict(X) 88 | assert np.array_equal(labels1, labels2) 89 | 90 | 91 | def test_sparse_dense_equivalence(): 92 | 93 | model1 = MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='cosine', 94 | sparse_dist=False, deterministic=True) 95 | labels1 = model1.fit_predict(X) 96 | 97 | model2 = MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='cosine', 98 | sparse_dist=True, deterministic=True) 99 | labels2 = model2.fit_predict(X) 100 | 101 | assert np.array_equal(labels1, labels2) 102 | -------------------------------------------------------------------------------- /tests/test_leader.py: -------------------------------------------------------------------------------- 1 | """Tests for LeaderClustering.""" 2 | 3 | import numpy as np 4 | from 
scipy.spatial.distance import pdist 5 | from sklearn.datasets import make_blobs 6 | 7 | from diameter_clustering import LeaderClustering 8 | from diameter_clustering.dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix 9 | 10 | 11 | MAX_RADIUS = 0.25 12 | MAX_RADIUS_EUCLIDEAN = 30 13 | 14 | X, y = make_blobs(n_samples=100, n_features=50, centers=3, 15 | cluster_std=3, random_state=42) 16 | 17 | 18 | def compute_max_dist(X, labels, metric='cosine'): 19 | """Compute maximum distance between points inside clusters.""" 20 | 21 | max_dist = [] 22 | 23 | for cluster in np.unique(labels): 24 | x_cluster = X[labels == cluster] 25 | dist = pdist(x_cluster, metric=metric) 26 | if len(dist) == 0: 27 | max_dist.append(0) 28 | else: 29 | max_dist.append(dist.max()) 30 | 31 | return np.max(max_dist) 32 | 33 | 34 | def test_leader(): 35 | 36 | model = LeaderClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean', 37 | sparse_dist=False) 38 | labels = model.fit_predict(X) 39 | assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2 40 | 41 | model = LeaderClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean', 42 | change_leaders=True, sparse_dist=False) 43 | labels = model.fit_predict(X) 44 | assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2 45 | 46 | model = LeaderClustering(max_radius=MAX_RADIUS, metric='cosine', sparse_dist=False) 47 | labels = model.fit_predict(X) 48 | assert len(labels) == len(X) 49 | 50 | 51 | def test_inner_product(): 52 | 53 | x_normalized = X/(np.linalg.norm(X, axis=-1, keepdims=True) + 1e-16) 54 | model = LeaderClustering(max_radius=MAX_RADIUS, metric='inner_product', 55 | sparse_dist=False, deterministic=True) 56 | labels = model.fit_predict(x_normalized) 57 | assert len(labels) == len(X) 58 | 59 | model2 = LeaderClustering(max_radius=MAX_RADIUS, metric='cosine', 60 | sparse_dist=False, deterministic=True) 61 | labels2 = model2.fit_predict(X) 62 | assert 
np.array_equal(labels, labels2) 63 | 64 | def test_precomputed(): 65 | 66 | model = LeaderClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean', 67 | precomputed_dist=True, sparse_dist=False) 68 | dist_matrix = compute_dist_matrix(X, metric='euclidean') 69 | labels = model.fit_predict(dist_matrix) 70 | assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2 71 | 72 | 73 | def test_sparse(): 74 | 75 | model = LeaderClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean', 76 | sparse_dist=True) 77 | labels = model.fit_predict(X) 78 | assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2 79 | 80 | model = LeaderClustering(max_radius=MAX_RADIUS_EUCLIDEAN, 81 | sparse_dist=True, precomputed_dist=True) 82 | dist_matrix = compute_sparse_dist_matrix(X, metric='euclidean', 83 | max_distance=MAX_RADIUS_EUCLIDEAN) 84 | labels = model.fit_predict(dist_matrix) 85 | assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2 86 | 87 | 88 | def test_deterministic(): 89 | 90 | model = LeaderClustering(max_radius=MAX_RADIUS, metric='cosine', 91 | deterministic=True) 92 | labels1 = model.fit_predict(X) 93 | labels2 = model.fit_predict(X) 94 | assert np.array_equal(labels1, labels2) 95 | 96 | 97 | def test_sparse_dense_equivalence(): 98 | 99 | model1 = LeaderClustering(max_radius=MAX_RADIUS, metric='cosine', 100 | sparse_dist=False, deterministic=True) 101 | labels1 = model1.fit_predict(X) 102 | 103 | model2 = LeaderClustering(max_radius=MAX_RADIUS, metric='cosine', 104 | sparse_dist=True, deterministic=True) 105 | labels2 = model2.fit_predict(X) 106 | 107 | assert np.array_equal(labels1, labels2) 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Clustering with maximum diameter 2 | 3 | Collection of clustering algorithms with maximum distance between points 
inside clusters. 4 | 5 | When we have an interpretable metric like cosine distance, it is useful to have clusters with a bounded maximum distance between points. Then we can find a good threshold for the maximum distance and be confident that points inside clusters are really similar. Also, we don't need to specify the number of clusters with such an approach. 6 | 7 | Unfortunately, most popular clustering algorithms don't have such behavior. 8 | 9 | Possible applications: 10 | - Embeddings of text data with cosine distance. 11 | - Geo data with haversine distance. 12 | 13 | ## Algorithms 14 | 15 | ### MaxDiameterClustering 16 | 17 | A simple greedy algorithm, in which we add points one by one. If there is a cluster with all points close enough to the new point, then we add the new point to this cluster. If there is no such cluster, this point starts a new cluster. 18 | 19 | ### Quality Threshold Clustering 20 | 21 | [Explanation](https://sites.google.com/site/dataclusteringalgorithms/quality-threshold-clustering-algorithm-1). 22 | 23 | Inspired by this [repository](https://github.com/melvrl13/python-quality-threshold). 24 | ### Leader Clustering 25 | 26 | [Explanation on stackoverflow](https://stackoverflow.com/questions/36928654/leader-clustering-algorithm-explanation) 27 | 28 | [R package](https://cran.r-project.org/web/packages/leaderCluster/index.html) 29 | 30 | ### Approximate Leader Clustering 31 | 32 | Use approximate nearest neighbors search (currently hnswlib) to speed up Leader Clustering. 33 | 34 | 35 | ## Installation 36 | 37 | Install from PyPI 38 | ```sh 39 | pip install diameter-clustering 40 | ``` 41 | 42 | Install from source 43 | ```sh 44 | pip install git+https://github.com/antklen/diameter-clustering.git 45 | # or 46 | git clone git@github.com:antklen/diameter-clustering.git 47 | cd diameter-clustering 48 | pip install .
49 | ``` 50 | 51 | ## Usage 52 | 53 | ### MaxDiameterClustering 54 | 55 | Basic usage of MaxDiameterClustering: 56 | ```python 57 | from sklearn.datasets import make_blobs 58 | from diameter_clustering import MaxDiameterClustering 59 | 60 | X, y = make_blobs(n_samples=100, n_features=50) 61 | 62 | model = MaxDiameterClustering(max_distance=0.3, metric='cosine') 63 | labels = model.fit_predict(X) 64 | ``` 65 | 66 | Instead of using feature matrix `X` we can pass precomputed distance matrix: 67 | ```python 68 | from diameter_clustering.dist_matrix import compute_sparse_dist_matrix 69 | 70 | dist_matrix = compute_sparse_dist_matrix(X, metric='cosine') 71 | 72 | model = MaxDiameterClustering(max_distance=0.3, precomputed_dist=True) 73 | labels = model.fit_predict(dist_matrix) 74 | ``` 75 | 76 | By default computation of distance matrix in sparse format is used (`sparse_dist=True`), because calculation of distance matrix between all points in dense format is expensive. But when dataset is not so big (roughly less than 20k-30k points) `sparse_dist=False` mode can be used. It could be faster for small datasets or useful when you already have precomputed distance matrix in dense format. 
77 | ```python 78 | model = MaxDiameterClustering(max_distance=0.3, metric='cosine', sparse_dist=False) 79 | labels = model.fit_predict(X) 80 | 81 | 82 | from diameter_clustering.dist_matrix import compute_dist_matrix 83 | 84 | dist_matrix = compute_dist_matrix(X, max_distance=0.3, metric='cosine') 85 | 86 | model = MaxDiameterClustering(max_distance=0.3, sparse_dist=False, precomputed_dist=True) 87 | labels = model.fit_predict(dist_matrix) 88 | ``` 89 | 90 | When we want to compute cosine distance in dense format and our vectors are normalized, it is better to use 91 | `inner_product` as metric because it is much faster: 92 | ```python 93 | X_normalized = X/(np.linalg.norm(X, axis=-1, keepdims=True) + 1e-16) 94 | 95 | model = MaxDiameterClustering(max_distance=0.3, metric='inner_product', sparse_dist=False) 96 | labels = model.fit_predict(X_normalized) 97 | ``` 98 | 99 | With `deterministic=True` we can get reproducible results: 100 | ```python 101 | model = MaxDiameterClustering(max_distance=0.3, metric='cosine', deterministic=True) 102 | labels = model.fit_predict(X) 103 | ``` 104 | 105 | ### Quality Threshold Clustering 106 | 107 | ```python 108 | from diameter_clustering import QTClustering 109 | 110 | model = QTClustering(max_radius=0.15, metric='cosine', min_cluster_size=5) 111 | labels = model.fit_predict(X) 112 | ``` 113 | 114 | `precomputed_dist`, `sparse_dist`, and `inner_product` 115 | can be used as in MaxDiameterClustering. This algorithm is deterministic by design. 116 | 117 | ### Leader Clustering 118 | 119 | ```python 120 | from diameter_clustering import LeaderClustering 121 | 122 | model = LeaderClustering(max_radius=0.15, metric='cosine') 123 | labels = model.fit_predict(X) 124 | ``` 125 | 126 | `precomputed_dist`, `sparse_dist`, `deterministic` and `inner_product` 127 | can be used as in MaxDiameterClustering. 
128 | 129 | ### Approximate Leader Clustering 130 | 131 | ```python 132 | from diameter_clustering.approx import HNSWIndex 133 | from diameter_clustering.approx import ApproxLeaderClustering 134 | 135 | # fit model 136 | hnsw_index = HNSWIndex(max_elements=len(X), space='cosine', dim=50, 137 | ef=100, ef_construction=200, M=16) 138 | model = ApproxLeaderClustering(hnsw_index, max_radius=0.15, deterministic=True) 139 | labels = model.fit_predict(X) 140 | 141 | # save index for later usage 142 | hnsw_index.save('hnsw_index.bin') 143 | 144 | # predict clusters for new data later 145 | hnsw_index = HNSWIndex(max_elements=len(X_new), path='hnsw_index.bin', 146 | space='cosine', dim=50, ef=100) 147 | model = ApproxLeaderClustering(hnsw_index, max_radius=0.15, deterministic=True) 148 | new_labels = model.predict(X_new) 149 | ``` 150 | -------------------------------------------------------------------------------- /diameter_clustering/leader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Leader clustering. 3 | """ 4 | 5 | from typing import Union 6 | 7 | import numpy as np 8 | from scipy.sparse import csr_matrix 9 | from tqdm import tqdm 10 | 11 | from .mixins import FitPredictMixin, DistanceMatrixMixin 12 | 13 | 14 | class LeaderClustering(FitPredictMixin, DistanceMatrixMixin): 15 | """Leader clustering algorithm. 16 | 17 | Args: 18 | max_radius (float): Maximum radius of cluster 19 | (maximum distance between leader and all other points in cluster). 20 | change_leaders (bool): if True then change cluster leader if there is a point with smaller 21 | average distance to all points in cluster. 22 | metric (str): Distance metric. 23 | For sparse_dist=True possible options are in sklearn.neighbors.VALID_METRICS['brute']. 24 | For sparse_dist=False possible options are 'inner_product' or one of metrics 25 | available in scipy.spatial.distance.pdist. 
If 'inner_product' then use np.inner 26 | which is much faster than pdist. 'inner_product' could be used instead 27 | of cosine distance for normalized vectors. 28 | precomputed_dist (bool): If True, then input should be precomputed distance matrix, 29 | if False then input is array with features. 30 | sparse_dist (bool): If True, then use distance matrix in sparse format (zero elements 31 | are elements for which distance between points is greater than max_distance). 32 | If False, then consider distance matrix as ordinary numpy array. 33 | deterministic (bool): If True then take points one by one to get deterministic behavior. 34 | If False then select points at random, so results would be different for each run. 35 | verbose (bool): If True then output progress info, otherwise be silent. 36 | 37 | Attributes: 38 | labels_ (np.ndarray): Array with cluster labels after fitting model. 39 | n_clusters_ (int): Number of clusters after fitting model. 40 | centers_ (np.ndarray): Array with indexes of cluster centers. 41 | leaders_ (np.ndarray): Array with 1 for cluster centers and with 0 for all other points. 42 | """ 43 | 44 | def __init__(self, max_radius: float = 0.1, change_leaders: bool = False, 45 | metric: str = 'cosine', precomputed_dist: bool = False, 46 | sparse_dist: bool = True, deterministic: bool = False, 47 | verbose: bool = True): 48 | 49 | self.max_radius = max_radius 50 | self.change_leaders = change_leaders 51 | self.metric = metric 52 | self.precomputed_dist = precomputed_dist 53 | self.sparse_dist = sparse_dist 54 | self.deterministic = deterministic 55 | self.verbose = verbose 56 | 57 | self.max_distance = max_radius # is needed for computation of sparse distance matrix 58 | self.labels_ = None 59 | self.n_clusters_ = None 60 | self.centers_ = None 61 | self.leaders_ = None 62 | 63 | def fit(self, X: Union[np.ndarray, csr_matrix]): 64 | """Fit clustering from features or distance matrix. 
65 | 66 | Args: 67 | X (np.ndarray or scipy.sparse.csr_matrix): Array with features or 68 | precomputed distance matrix, could be in sparse matrix format. 69 | """ 70 | 71 | dist_matrix = self._prepare_distance_matrix(X) 72 | 73 | # create arrays for labels, leaders and centers 74 | labels = np.empty(dist_matrix.shape[0]) 75 | labels.fill(np.nan) 76 | leaders = np.zeros(dist_matrix.shape[0]) 77 | centers = [] 78 | 79 | # choose first point and assign label to it 80 | idx = 0 if self.deterministic else np.random.choice(range(len(labels))) 81 | labels[idx] = 0 82 | next_cluster = 1 83 | leaders[idx] = 1 84 | centers.append(idx) 85 | 86 | for _ in tqdm(range(len(labels)-1), desc='LeaderClustering fit', disable=not self.verbose): 87 | 88 | # choose next point 89 | indexes = np.where(np.isnan(labels))[0] 90 | idx = indexes[0] if self.deterministic else np.random.choice(indexes) 91 | # find indices of current leaders 92 | current_leaders_idx = np.where(leaders == 1)[0] 93 | current_leaders_labels = labels[current_leaders_idx] 94 | 95 | # find distances to current leaders 96 | leaders_dist = self._slice_distance_matrix(dist_matrix, idx, current_leaders_idx) 97 | 98 | if np.min(leaders_dist) <= self.max_radius: 99 | # assign cluster with nearest leader as label 100 | labels[idx] = current_leaders_labels[leaders_dist.argmin()] 101 | 102 | # change leader in cluster if there is better candidate for it 103 | if self.change_leaders: 104 | cluster_idx = np.where(labels == labels[idx])[0] 105 | dist_inside = dist_matrix[cluster_idx][:, cluster_idx].mean(axis=1) 106 | min_idx = cluster_idx[dist_inside.argmin()] 107 | nearest_leader_idx = current_leaders_idx[leaders_dist.argmin()] 108 | if min_idx != nearest_leader_idx: 109 | leaders[nearest_leader_idx] = 0 110 | leaders[min_idx] = 1 111 | 112 | else: 113 | # assign new cluster label 114 | labels[idx] = next_cluster 115 | leaders[idx] = 1 116 | centers.append(idx) 117 | next_cluster += 1 118 | 119 | self.labels_ = 
labels.astype(int) 120 | self.n_clusters_ = int(labels.max() + 1) 121 | self.centers_ = np.array(centers) 122 | self.leaders_ = leaders.astype(int) 123 | -------------------------------------------------------------------------------- /diameter_clustering/qt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implementation of Quality threshold clustering. 3 | """ 4 | 5 | from typing import Union 6 | 7 | import numpy as np 8 | from scipy.sparse import csr_matrix, lil_matrix 9 | from tqdm import tqdm 10 | 11 | from .mixins import FitPredictMixin, DistanceMatrixMixin 12 | 13 | 14 | class QTClustering(FitPredictMixin, DistanceMatrixMixin): 15 | """Quality threshold clustering. 16 | 17 | Args: 18 | max_radius (float): Maximum radius of cluster 19 | (maximum distance between center of cluster and all other points). 20 | min_cluster_size (int): Minimum size of clusters, stop iterations at this cluster size. 21 | metric (str): Distance metric. 22 | For sparse_dist=True possible options are in sklearn.neighbors.VALID_METRICS['brute']. 23 | For sparse_dist=False possible options are 'inner_product' or one of metrics 24 | available in scipy.spatial.distance.pdist. If 'inner_product' then use np.inner 25 | which is much faster than pdist. 'inner_product' could be used instead 26 | of cosine distance for normalized vectors. 27 | precomputed_dist (bool): If True, then input should be precomputed distance matrix, 28 | if False then input is array with features. 29 | sparse_dist (bool): If True, then use distance matrix in sparse format (zero elements 30 | are elements for which distance between points is greater than max_distance). 31 | If False, then consider distance matrix as ordinary numpy array. 32 | verbose (bool): If True then output progress info, otherwise be silent. 33 | 34 | Attributes: 35 | labels_ (np.ndarray): Array with cluster labels after fitting model. 36 | n_clusters_ (int): Number of clusters after fitting model. 
37 | centers_ (np.ndarray): Array with indexes of cluster centers. 38 | """ 39 | 40 | def __init__(self, max_radius: float = 0.1, min_cluster_size: int = 2, 41 | metric: str = 'cosine', precomputed_dist: bool = False, 42 | sparse_dist: bool = True, verbose: bool = True): 43 | 44 | self.max_radius = max_radius 45 | self.min_cluster_size = min_cluster_size 46 | self.metric = metric 47 | self.precomputed_dist = precomputed_dist 48 | self.sparse_dist = sparse_dist 49 | self.verbose = verbose 50 | 51 | self.max_distance = max_radius # is needed for computation of sparse distance matrix 52 | self.labels_ = None 53 | self.n_clusters_ = None 54 | self.centers_ = None 55 | 56 | def fit(self, X: Union[np.ndarray, csr_matrix]): 57 | """Fit clustering from features or distance matrix. 58 | 59 | Args: 60 | X (np.ndarray or scipy.sparse.csr_matrix): Array with features or 61 | precomputed distance matrix, could be in sparse matrix format. 62 | """ 63 | 64 | dist_matrix = self._prepare_distance_matrix(X) 65 | 66 | if self.sparse_dist: 67 | dist_mask = lil_matrix(dist_matrix) 68 | dist_mask[dist_mask > 0] = 1 69 | dist_mask.setdiag(1) 70 | labels, centers = self.fit_sparse(dist_mask) 71 | else: 72 | dist_mask = dist_matrix < self.max_radius 73 | np.fill_diagonal(dist_mask, True) 74 | labels, centers = self.fit_dense(dist_mask) 75 | 76 | self.labels_ = labels.astype(int) 77 | self.n_clusters_ = int(labels.max() + 1) 78 | self.centers_ = np.array(centers) 79 | 80 | def fit_dense(self, dist_mask: np.ndarray): 81 | """Fit clustering from distance matrix mask when it is dense matrix.""" 82 | 83 | labels = np.empty(dist_mask.shape[0]) 84 | labels.fill(np.nan) 85 | centers = [] 86 | cluster_number = 0 87 | total_count = 0 88 | 89 | with tqdm(total=dist_mask.shape[0], disable=not self.verbose) as pbar: 90 | while dist_mask.any(): 91 | 92 | # find size of candidate clusters for each point 93 | candidate_size = dist_mask.sum(axis=1) 94 | 95 | if np.max(candidate_size) < 
self.min_cluster_size: 96 | labels[np.where(np.isnan(labels))] = -1 97 | break 98 | 99 | # pick the biggest possible cluster from candidates 100 | center_idx = np.argmax(candidate_size) 101 | cluster_points_idx = np.where(dist_mask[center_idx])[0] 102 | # assign labels 103 | labels[cluster_points_idx] = cluster_number 104 | centers.append(center_idx) 105 | # remove labeled data from further calculations 106 | dist_mask[cluster_points_idx, :] = False 107 | dist_mask[:, cluster_points_idx] = False 108 | 109 | # finalize iteration 110 | cluster_number += 1 111 | size = np.max(candidate_size) 112 | total_count += size 113 | 114 | pbar.update(size) 115 | pbar.set_description( 116 | f"QTClustering fit. Current cluster size {size}, total count {total_count}") 117 | 118 | return labels, centers 119 | 120 | def fit_sparse(self, dist_mask: csr_matrix): 121 | """Fit clustering from distance matrix mask when it is sparse matrix.""" 122 | 123 | labels = np.empty(dist_mask.shape[0]) 124 | labels.fill(np.nan) 125 | centers = [] 126 | cluster_number = 0 127 | total_count = 0 128 | 129 | with tqdm(total=dist_mask.shape[0], disable=not self.verbose) as pbar: 130 | while dist_mask.sum() > 0: 131 | 132 | # find size of candidate clusters for each point 133 | candidate_size = dist_mask.sum(axis=1) 134 | 135 | if np.max(candidate_size) < self.min_cluster_size: 136 | labels[np.where(np.isnan(labels))] = -1 137 | break 138 | 139 | # pick the biggest possible cluster from candidates 140 | center_idx = np.argmax(candidate_size) 141 | cluster_points_idx = dist_mask[center_idx].nonzero()[1] 142 | # assign labels 143 | labels[cluster_points_idx] = cluster_number 144 | centers.append(center_idx) 145 | 146 | # remove labeled data from further calculations 147 | dist_mask[cluster_points_idx, :] = 0 148 | dist_mask[:, cluster_points_idx] = 0 149 | 150 | # finalize iteration 151 | cluster_number += 1 152 | size = np.max(candidate_size) 153 | total_count += size 154 | 155 | pbar.update(size) 156 | 
pbar.set_description( 157 | f"QTClustering fit. Current cluster size {size}, total count {total_count}") 158 | 159 | return labels, centers 160 | -------------------------------------------------------------------------------- /diameter_clustering/approx/leader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Version of Leader clustering using approximate nearest neighbors search. 3 | """ 4 | 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | from .hnsw import HNSWIndex 9 | from ..timer import timer 10 | 11 | 12 | class ApproxLeaderClustering: 13 | """Leader clustering algorithm with approximate nearest neighbors search. 14 | 15 | Approximate nearest neighbors index is used to store leaders of clusters 16 | and to find nearest leader for new points. 17 | 18 | Args: 19 | ann_index: instance of HNSWIndex. 20 | max_radius: Maximum radius of each cluster 21 | (maximum distance between the leader and all other points in cluster). 22 | deterministic: If True then take points one by one to get deterministic behavior. 23 | If False then select points at random, so results would be different for each run. 24 | verbose: If True then output progress info, otherwise be silent. 25 | 26 | Attributes: 27 | labels_ (np.ndarray): Array with cluster labels after fitting model. 28 | n_clusters_ (int): Number of clusters after fitting model. 29 | centers_ (np.ndarray): Array with indexes of cluster centers. 30 | leaders_ (np.ndarray): Array with 1 for cluster centers and with 0 for all other points. 
31 | 32 | Examples: 33 | import numpy as np 34 | from diameter_clustering.approx import HNSWIndex 35 | from diameter_clustering.approx import ApproxLeaderClustering 36 | 37 | # fit model 38 | data = np.random.rand(1000, 50) 39 | hnsw_index = HNSWIndex(max_elements=len(data), space='cosine', dim=50, 40 | ef=100, ef_construction=200, M=16) 41 | model = ApproxLeaderClustering(hnsw_index, max_radius=0.2, deterministic=True) 42 | labels = model.fit_predict(data) 43 | 44 | # save index for later usage 45 | hnsw_index.save('hnsw_index.bin') 46 | 47 | # predict clusters for new data later 48 | new_data = np.random.rand(100, 50) 49 | hnsw_index = HNSWIndex(max_elements=len(new_data), path='hnsw_index.bin', 50 | space='cosine', dim=50, ef=100) 51 | model = ApproxLeaderClustering(hnsw_index, max_radius=0.2, deterministic=True) 52 | new_labels = model.predict(new_data) 53 | """ 54 | 55 | def __init__(self, ann_index: HNSWIndex, max_radius: float = 0.1, 56 | deterministic: bool = True, verbose: bool = True): 57 | 58 | self.ann_index = ann_index 59 | self.max_radius = max_radius 60 | self.deterministic = deterministic 61 | self.verbose = verbose 62 | 63 | self.labels_ = None 64 | self.n_clusters_ = None 65 | self.centers_ = None 66 | self.leaders_ = None 67 | 68 | def fit(self, X: np.ndarray): 69 | """Fit clustering. 70 | 71 | Args: 72 | X: Array with features. 
73 | """ 74 | 75 | # create arrays for labels and leaders 76 | labels = np.empty(len(X)) 77 | labels.fill(np.nan) 78 | leaders = np.zeros(len(X)) 79 | centers = [] 80 | 81 | # handle case when empty input data is passed 82 | if len(labels) == 0: 83 | self.labels_ = labels 84 | self.leaders_ = leaders 85 | self.n_clusters_ = 0 86 | return 87 | 88 | # choose first point and assign label to it 89 | idx = 0 if self.deterministic else np.random.choice(range(len(labels))) 90 | labels[idx] = 0 91 | next_cluster = 1 92 | leaders[idx] = 1 93 | centers.append(idx) 94 | self.ann_index.add_item(X[idx]) 95 | 96 | for _ in tqdm(range(len(labels)-1), desc='ApproxLeaderClustering fit', 97 | disable=not self.verbose): 98 | 99 | # choose next point 100 | indexes = np.where(np.isnan(labels))[0] 101 | idx = indexes[0] if self.deterministic else np.random.choice(indexes) 102 | 103 | # find nearest leader 104 | nearest_leader_idx, nearest_leader_dist = self.ann_index.find_nearest_point(X[idx]) 105 | 106 | if nearest_leader_dist <= self.max_radius: 107 | # assign cluster with nearest leader as label 108 | labels[idx] = nearest_leader_idx 109 | else: 110 | # assign new cluster label 111 | labels[idx] = next_cluster 112 | leaders[idx] = 1 113 | centers.append(idx) 114 | next_cluster += 1 115 | self.ann_index.add_item(X[idx]) 116 | 117 | self.labels_ = labels.astype(int) 118 | self.n_clusters_ = int(labels.max() + 1) 119 | self.centers_ = np.array(centers) 120 | self.leaders_ = leaders.astype(int) 121 | 122 | def fit_predict(self, X: np.ndarray) -> np.ndarray: 123 | """Fit clustering and return cluster labels. 124 | 125 | Args: 126 | X: Array with features. 127 | 128 | Returns: 129 | Numpy array with labels for data points in X. 130 | """ 131 | 132 | self.fit(X) 133 | 134 | return self.labels_ 135 | 136 | def predict(self, X: np.ndarray) -> np.ndarray: 137 | """Assigning new points to existent clusters without making new clusters. 
138 | 139 | Returning -1 for points which can't be assigned to any cluster. 140 | Finding nearest leaders for points one by one. 141 | 142 | Args: 143 | X: Array with features for new points. 144 | 145 | Returns: 146 | Numpy array with labels for data points in X. 147 | """ 148 | 149 | # create array for new labels 150 | labels = np.empty(len(X)) 151 | 152 | for idx in tqdm(range(len(X)), desc='ApproxLeaderClustering assign points to clusters', 153 | disable=not self.verbose): 154 | 155 | # find nearest leader 156 | nearest_leader_idx, nearest_leader_dist = self.ann_index.find_nearest_point(X[idx]) 157 | 158 | if nearest_leader_dist <= self.max_radius: 159 | # assign cluster with nearest leader as label 160 | labels[idx] = nearest_leader_idx 161 | else: 162 | # assign -1 for point which is not close enough to any leader 163 | labels[idx] = -1 164 | 165 | return labels 166 | 167 | 168 | def predict_batch(self, X: np.ndarray) -> np.ndarray: 169 | """Assigning new points to existent clusters without making new clusters. 170 | 171 | Returning -1 for points which can't be assigned to any cluster. 172 | Finding nearest leaders for all points at once. 173 | 174 | Args: 175 | X: Array with features for new points. 176 | 177 | Returns: 178 | Numpy array with labels for data points in X. 
179 | """ 180 | 181 | # create array for new labels 182 | labels = np.empty(len(X)) 183 | 184 | # find nearest leaders 185 | with timer('find_nearest_point_batch', disable=not self.verbose): 186 | nearest_leaders_idx, nearest_leaders_dist = \ 187 | self.ann_index.find_nearest_point_batch(X) 188 | 189 | for idx in range(len(X)): 190 | 191 | if nearest_leaders_dist[idx] <= self.max_radius: 192 | # assign cluster with nearest leader as label 193 | labels[idx] = nearest_leaders_idx[idx] 194 | else: 195 | # assign -1 for point which is not close enough to any leader 196 | labels[idx] = -1 197 | 198 | return labels 199 | -------------------------------------------------------------------------------- /diameter_clustering/diameter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple greedy algorithm for clustering with maximum distance between points inside clusters. 3 | """ 4 | 5 | from typing import Union 6 | 7 | import numpy as np 8 | import numpy_groupies as npg 9 | from scipy.sparse import csr_matrix 10 | from tqdm import tqdm 11 | 12 | from .mixins import FitPredictMixin, DistanceMatrixMixin 13 | from .timer import TimerWithHistory 14 | 15 | 16 | class MaxDiameterClustering(FitPredictMixin, DistanceMatrixMixin): 17 | """Clustering with maximum diameter (maximum distance between points) inside clusters. 18 | 19 | Args: 20 | max_distance (float): Maximum distance between points in clusters. 21 | criterion (str): Criterion for choosing cluster from several candidates. 22 | If 'distance' then choose cluster with minimum average distance to given point. 23 | If 'size' then choose cluster with maximum current size. 24 | metric (str): Distance metric. 25 | For sparse_dist=True possible options are in sklearn.neighbors.VALID_METRICS['brute']. 26 | For sparse_dist=False possible options are 'inner_product' or one of metrics 27 | available in scipy.spatial.distance.pdist. 
If 'inner_product' then use np.inner 28 | which is much faster than pdist. 'inner_product' could be used instead 29 | of cosine distance for normalized vectors. 30 | precomputed_dist (bool): If True, then input should be precomputed distance matrix, 31 | if False then input is array with features. 32 | sparse_dist (bool): If True, then use distance matrix in sparse format (zero elements 33 | are elements for which distance between points is greater than max_distance). 34 | If False, then distance matrix is ordinary numpy array. 35 | deterministic (bool): If True then take points one by one to get deterministic behavior. 36 | If False then select points at random, so results would be different for each run. 37 | use_timer (bool): If True then use TimerWithHistory in fit method, which can be accessed 38 | via self.timer. Can be useful for debugging. 39 | verbose (bool): If True then output progress info, otherwise be silent. 40 | 41 | Attributes: 42 | labels_ (np.ndarray): Array with cluster labels after fitting model. 43 | n_clusters_ (int): Number of clusters after fitting model. 44 | timer: Timer with history of execution time (access history via self.timer.history). 
45 | """ 46 | 47 | def __init__(self, max_distance: float = 0.2, criterion: str = 'distance', 48 | metric: str = 'cosine', precomputed_dist: bool = False, 49 | sparse_dist: bool = True, deterministic: bool = False, 50 | use_timer: bool = False, verbose: bool = True): 51 | 52 | if criterion not in ['size', 'distance']: 53 | raise ValueError('Wrong criterion value, should be "size" or "distance".') 54 | 55 | self.max_distance = max_distance 56 | self.criterion = criterion 57 | self.metric = metric 58 | self.precomputed_dist = precomputed_dist 59 | self.sparse_dist = sparse_dist 60 | self.deterministic = deterministic 61 | self.use_timer = use_timer 62 | self.verbose = verbose 63 | 64 | self.labels_ = None 65 | self.n_clusters_ = None 66 | self.timer = None 67 | 68 | def fit(self, X: Union[np.ndarray, csr_matrix]): 69 | """Fit clustering from features or distance matrix. 70 | 71 | Args: 72 | X (np.ndarray or scipy.sparse.csr_matrix): Array with features or 73 | precomputed distance matrix, could be in sparse format. 
74 | """ 75 | 76 | dist_matrix = self._prepare_distance_matrix(X) 77 | 78 | # create array for labels 79 | labels = np.empty(dist_matrix.shape[0]) 80 | labels.fill(np.nan) 81 | 82 | # handle case when empty input data is passed 83 | if len(labels) == 0: 84 | self.labels_ = labels 85 | self.n_clusters_ = 0 86 | return 87 | 88 | # choose first point and assign label to it 89 | idx = 0 if self.deterministic else np.random.choice(range(len(labels))) 90 | labels[idx] = 0 91 | next_cluster = 1 92 | 93 | self.timer = TimerWithHistory(disable=not self.use_timer) 94 | 95 | for _ in tqdm(range(len(labels)-1), desc='MaxDiameterClustering fit', 96 | disable=not self.verbose): 97 | 98 | # choose next point 99 | with self.timer(name='choose_next_point'): 100 | indexes = np.where(np.isnan(labels))[0] 101 | idx = indexes[0] if self.deterministic else np.random.choice(indexes) 102 | # find indices of already labeled points 103 | with self.timer(name='find_labeled_points'): 104 | current_cluster_idx = np.where(~np.isnan(labels))[0] 105 | current_cluster_labels = labels[current_cluster_idx].astype(int) 106 | # find distances to already labeled points 107 | with self.timer(name='get_distances'): 108 | current_dist = self._slice_distance_matrix(dist_matrix, idx, current_cluster_idx) 109 | 110 | # find max distance to each existent cluster 111 | with self.timer(name='max_distance_to_clusters'): 112 | cluster_dist_max = npg.aggregate(current_cluster_labels, current_dist, 113 | func='max', fill_value=np.inf) 114 | 115 | if np.min(cluster_dist_max) <= self.max_distance: 116 | # find existent clusters with max dist < threshold 117 | with self.timer(name='candidate_clusters'): 118 | candidate_clusters = np.where(cluster_dist_max <= self.max_distance)[0] 119 | # directly get label if there is only one such cluster 120 | if len(candidate_clusters) == 1: 121 | labels[idx] = candidate_clusters[0] 122 | continue 123 | # otherwise we need to choose between candidate clusters 124 | 
candidate_clusters_idx = np.isin(current_cluster_labels, candidate_clusters) 125 | candidate_clusters_labels = current_cluster_labels[candidate_clusters_idx] 126 | 127 | if self.criterion == 'distance': 128 | candidate_clusters_dist = current_dist[candidate_clusters_idx] 129 | labels[idx] = self._best_candidate_distance(candidate_clusters_labels, 130 | candidate_clusters_dist) 131 | elif self.criterion == 'size': 132 | labels[idx] = self._best_candidate_size(candidate_clusters_labels) 133 | else: 134 | # assign new cluster label 135 | with self.timer(name='assign_new_label'): 136 | labels[idx] = next_cluster 137 | next_cluster += 1 138 | 139 | self.labels_ = labels.astype(int) 140 | self.n_clusters_ = labels.max() + 1 141 | 142 | def _best_candidate_distance(self, candidate_clusters_labels: np.ndarray, 143 | candidate_clusters_dist: np.ndarray) -> int: 144 | """Find best candidate cluster based on average distance to clusters.""" 145 | 146 | # find average distance to clusters 147 | with self.timer(name='average_distance_to_clusters'): 148 | cluster_dist_mean = npg.aggregate(candidate_clusters_labels, 149 | candidate_clusters_dist, 150 | func='mean', fill_value=np.inf) 151 | 152 | # assign cluster with min average distance as label 153 | with self.timer(name='distance_argmin'): 154 | label = cluster_dist_mean.argmin() 155 | 156 | return label 157 | 158 | def _best_candidate_size(self, candidate_clusters_labels: np.ndarray) -> int: 159 | """Find best candidate cluster based on size of clusters.""" 160 | 161 | # find size of clusters 162 | with self.timer(name='average_size_of_clusters'): 163 | cluster_size = npg.aggregate(candidate_clusters_labels, 164 | candidate_clusters_labels, 165 | func='count') 166 | # assign cluster with max size as label 167 | with self.timer(name='size_argmax'): 168 | label = cluster_size.argmax() 169 | 170 | return label 171 | -------------------------------------------------------------------------------- /.pylintrc: 
-------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Specify a score threshold to be exceeded before program exits with error. 9 | fail-under=10 10 | 11 | # Add files or directories to the blacklist. They should be base names, not 12 | # paths. 13 | ignore=CVS 14 | 15 | # Add files or directories matching the regex patterns to the blacklist. The 16 | # regex matches against base names, not paths. 17 | ignore-patterns= 18 | 19 | # Python code to execute, usually for sys.path manipulation such as 20 | # pygtk.require(). 21 | #init-hook= 22 | 23 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 24 | # number of processors available to use. 25 | jobs=1 26 | 27 | # Control the amount of potential inferred values when inferring a single 28 | # object. This can help the performance when dealing with large functions or 29 | # complex, nested conditions. 30 | limit-inference-results=100 31 | 32 | # List of plugins (as comma separated values of python module names) to load, 33 | # usually to register additional checkers. 34 | load-plugins= 35 | 36 | # Pickle collected data for later comparisons. 37 | persistent=yes 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 
52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 63 | disable=print-statement, 64 | parameter-unpacking, 65 | unpacking-in-except, 66 | old-raise-syntax, 67 | backtick, 68 | long-suffix, 69 | old-ne-operator, 70 | old-octal-literal, 71 | import-star-module-level, 72 | non-ascii-bytes-literal, 73 | raw-checker-failed, 74 | bad-inline-option, 75 | locally-disabled, 76 | file-ignored, 77 | suppressed-message, 78 | useless-suppression, 79 | deprecated-pragma, 80 | use-symbolic-message-instead, 81 | apply-builtin, 82 | basestring-builtin, 83 | buffer-builtin, 84 | cmp-builtin, 85 | coerce-builtin, 86 | execfile-builtin, 87 | file-builtin, 88 | long-builtin, 89 | raw_input-builtin, 90 | reduce-builtin, 91 | standarderror-builtin, 92 | unicode-builtin, 93 | xrange-builtin, 94 | coerce-method, 95 | delslice-method, 96 | getslice-method, 97 | setslice-method, 98 | no-absolute-import, 99 | old-division, 100 | dict-iter-method, 101 | dict-view-method, 102 | next-method-called, 103 | metaclass-assignment, 104 | indexing-exception, 105 | raising-string, 106 | reload-builtin, 107 | oct-method, 108 | hex-method, 109 | nonzero-method, 110 | cmp-method, 111 | input-builtin, 112 | round-builtin, 113 | intern-builtin, 114 | unichr-builtin, 115 | map-builtin-not-iterating, 116 | zip-builtin-not-iterating, 117 | range-builtin-not-iterating, 118 | 
filter-builtin-not-iterating, 119 | using-cmp-argument, 120 | eq-without-hash, 121 | div-method, 122 | idiv-method, 123 | rdiv-method, 124 | exception-message-attribute, 125 | invalid-str-codec, 126 | sys-max-int, 127 | bad-python3-import, 128 | deprecated-string-function, 129 | deprecated-str-translate-call, 130 | deprecated-itertools-function, 131 | deprecated-types-field, 132 | next-method-defined, 133 | dict-items-not-iterating, 134 | dict-keys-not-iterating, 135 | dict-values-not-iterating, 136 | deprecated-operator-function, 137 | deprecated-urllib-function, 138 | xreadlines-attribute, 139 | deprecated-sys-function, 140 | exception-escape, 141 | comprehension-escape, 142 | too-many-instance-attributes, # start of user defined messages 143 | too-many-arguments, 144 | redefined-outer-name 145 | 146 | 147 | # Enable the message, report, category or checker with the given id(s). You can 148 | # either give multiple identifiers separated by comma (,) or put this option 149 | # multiple times (only on the command line, not in the configuration file where 150 | # it should appear only once). See also the "--disable" option for examples. 151 | enable=c-extension-no-member 152 | 153 | 154 | [REPORTS] 155 | 156 | # Python expression which should return a score less than or equal to 10. You 157 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 158 | # which contain the number of messages in each category, as well as 'statement' 159 | # which is the total number of statements analyzed. This score is used by the 160 | # global evaluation report (RP0004). 161 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 162 | 163 | # Template used to display messages. This is a python new-style format string 164 | # used to format the message information. See doc for all details. 165 | #msg-template= 166 | 167 | # Set the output format. 
Available formats are text, parseable, colorized, json 168 | # and msvs (visual studio). You can also give a reporter class, e.g. 169 | # mypackage.mymodule.MyReporterClass. 170 | output-format=text 171 | 172 | # Tells whether to display a full report or only the messages. 173 | reports=no 174 | 175 | # Activate the evaluation score. 176 | score=yes 177 | 178 | 179 | [REFACTORING] 180 | 181 | # Maximum number of nested blocks for function / method body. 182 | max-nested-blocks=5 183 | 184 | # Complete names of functions that never return. When checking for 185 | # inconsistent-return-statements, if a never-returning function is called then 186 | # it will be considered as an explicit return statement and no message will be 187 | # printed. 188 | never-returning-functions=sys.exit 189 | 190 | 191 | [LOGGING] 192 | 193 | # The type of string formatting that logging methods do. `old` means using % 194 | # formatting, `new` is for `{}` formatting. 195 | logging-format-style=old 196 | 197 | # Logging modules to check that the string format arguments are in logging 198 | # function parameter format. 199 | logging-modules=logging 200 | 201 | 202 | [SPELLING] 203 | 204 | # Limits count of emitted suggestions for spelling mistakes. 205 | max-spelling-suggestions=4 206 | 207 | # Spelling dictionary name. Available dictionaries: none. To make it work, 208 | # install the python-enchant package. 209 | spelling-dict= 210 | 211 | # List of comma separated words that should not be checked. 212 | spelling-ignore-words= 213 | 214 | # A path to a file that contains the private dictionary; one word per line. 215 | spelling-private-dict-file= 216 | 217 | # Tells whether to store unknown words to the private dictionary (see the 218 | # --spelling-private-dict-file option) instead of raising a message. 219 | spelling-store-unknown-words=no 220 | 221 | 222 | [MISCELLANEOUS] 223 | 224 | # List of note tags to take into consideration, separated by a comma. 
225 | notes=FIXME, 226 | XXX, 227 | TODO 228 | 229 | # Regular expression of note tags to take in consideration. 230 | #notes-rgx= 231 | 232 | 233 | [TYPECHECK] 234 | 235 | # List of decorators that produce context managers, such as 236 | # contextlib.contextmanager. Add to this list to register other decorators that 237 | # produce valid context managers. 238 | contextmanager-decorators=contextlib.contextmanager 239 | 240 | # List of members which are set dynamically and missed by pylint inference 241 | # system, and so shouldn't trigger E1101 when accessed. Python regular 242 | # expressions are accepted. 243 | generated-members= 244 | 245 | # Tells whether missing members accessed in mixin class should be ignored. A 246 | # mixin class is detected if its name ends with "mixin" (case insensitive). 247 | ignore-mixin-members=yes 248 | 249 | # Tells whether to warn about missing members when the owner of the attribute 250 | # is inferred to be None. 251 | ignore-none=yes 252 | 253 | # This flag controls whether pylint should warn about no-member and similar 254 | # checks whenever an opaque object is returned when inferring. The inference 255 | # can return multiple potential results while evaluating a Python object, but 256 | # some branches might not be evaluated, which results in partial inference. In 257 | # that case, it might be useful to still emit no-member and other checks for 258 | # the rest of the inferred objects. 259 | ignore-on-opaque-inference=yes 260 | 261 | # List of class names for which member attributes should not be checked (useful 262 | # for classes with dynamically set attributes). This supports the use of 263 | # qualified names. 
264 | ignored-classes=optparse.Values,thread._local,_thread._local 265 | 266 | # List of module names for which member attributes should not be checked 267 | # (useful for modules/projects where namespaces are manipulated during runtime 268 | # and thus existing member attributes cannot be deduced by static analysis). It 269 | # supports qualified module names, as well as Unix pattern matching. 270 | ignored-modules= 271 | 272 | # Show a hint with possible names when a member name was not found. The aspect 273 | # of finding the hint is based on edit distance. 274 | missing-member-hint=yes 275 | 276 | # The minimum edit distance a name should have in order to be considered a 277 | # similar match for a missing member name. 278 | missing-member-hint-distance=1 279 | 280 | # The total number of similar names that should be taken in consideration when 281 | # showing a hint for a missing member. 282 | missing-member-max-choices=1 283 | 284 | # List of decorators that change the signature of a decorated function. 285 | signature-mutators= 286 | 287 | 288 | [VARIABLES] 289 | 290 | # List of additional names supposed to be defined in builtins. Remember that 291 | # you should avoid defining new builtins when possible. 292 | additional-builtins= 293 | 294 | # Tells whether unused global variables should be treated as a violation. 295 | allow-global-unused-variables=yes 296 | 297 | # List of strings which can identify a callback function by name. A callback 298 | # name must start or end with one of those strings. 299 | callbacks=cb_, 300 | _cb 301 | 302 | # A regular expression matching the name of dummy variables (i.e. expected to 303 | # not be used). 304 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 305 | 306 | # Argument names that match this expression will be ignored. Default to name 307 | # with leading underscore. 
308 | ignored-argument-names=_.*|^ignored_|^unused_ 309 | 310 | # Tells whether we should check for unused imports in __init__ files. 311 | init-import=no 312 | 313 | # List of qualified module names which can have objects that can redefine 314 | # builtins. 315 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 316 | 317 | 318 | [FORMAT] 319 | 320 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 321 | expected-line-ending-format= 322 | 323 | # Regexp for a line that is allowed to be longer than the limit. 324 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$ 325 | 326 | # Number of spaces of indent required inside a hanging or continued line. 327 | indent-after-paren=4 328 | 329 | # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1 330 | # tab). 331 | indent-string='    ' 332 | 333 | # Maximum number of characters on a single line. 334 | max-line-length=100 335 | 336 | # Maximum number of lines in a module. 337 | max-module-lines=1000 338 | 339 | # List of optional constructs for which whitespace checking is disabled. `dict- 340 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 341 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 342 | # `empty-line` allows space-only lines. 343 | no-space-check=trailing-comma, 344 | dict-separator 345 | 346 | # Allow the body of a class to be on the same line as the declaration if body 347 | # contains single statement. 348 | single-line-class-stmt=no 349 | 350 | # Allow the body of an if to be on the same line as the test if there is no 351 | # else. 352 | single-line-if-stmt=no 353 | 354 | 355 | [SIMILARITIES] 356 | 357 | # Ignore comments when computing similarities. 358 | ignore-comments=yes 359 | 360 | # Ignore docstrings when computing similarities. 361 | ignore-docstrings=yes 362 | 363 | # Ignore imports when computing similarities. 
364 | ignore-imports=no 365 | 366 | # Minimum number of lines of a similarity. 367 | min-similarity-lines=4 368 | 369 | 370 | [BASIC] 371 | 372 | # Naming style matching correct argument names. 373 | argument-naming-style=snake_case 374 | 375 | # Regular expression matching correct argument names. Overrides argument- 376 | # naming-style. 377 | #argument-rgx= 378 | 379 | # Naming style matching correct attribute names. 380 | attr-naming-style=snake_case 381 | 382 | # Regular expression matching correct attribute names. Overrides attr-naming- 383 | # style. 384 | #attr-rgx= 385 | 386 | # Bad variable names which should always be refused, separated by a comma. 387 | bad-names=foo, 388 | bar, 389 | baz, 390 | toto, 391 | tutu, 392 | tata 393 | 394 | # Bad variable names regexes, separated by a comma. If names match any regex, 395 | # they will always be refused. 396 | bad-names-rgxs= 397 | 398 | # Naming style matching correct class attribute names. 399 | class-attribute-naming-style=any 400 | 401 | # Regular expression matching correct class attribute names. Overrides class- 402 | # attribute-naming-style. 403 | #class-attribute-rgx= 404 | 405 | # Naming style matching correct class names. 406 | class-naming-style=PascalCase 407 | 408 | # Regular expression matching correct class names. Overrides class-naming- 409 | # style. 410 | #class-rgx= 411 | 412 | # Naming style matching correct constant names. 413 | const-naming-style=UPPER_CASE 414 | 415 | # Regular expression matching correct constant names. Overrides const-naming- 416 | # style. 417 | #const-rgx= 418 | 419 | # Minimum line length for functions/classes that require docstrings; shorter 420 | # ones are exempt. 421 | docstring-min-length=-1 422 | 423 | # Naming style matching correct function names. 424 | function-naming-style=snake_case 425 | 426 | # Regular expression matching correct function names. Overrides function- 427 | # naming-style. 
428 | #function-rgx= 429 | 430 | # Good variable names which should always be accepted, separated by a comma. 431 | good-names=i, 432 | j, 433 | k, 434 | ex, 435 | Run, 436 | _, 437 | df, 438 | X, 439 | y, 440 | 441 | # Good variable names regexes, separated by a comma. If names match any regex, 442 | # they will always be accepted 443 | good-names-rgxs= 444 | 445 | # Include a hint for the correct naming format with invalid-name. 446 | include-naming-hint=no 447 | 448 | # Naming style matching correct inline iteration names. 449 | inlinevar-naming-style=any 450 | 451 | # Regular expression matching correct inline iteration names. Overrides 452 | # inlinevar-naming-style. 453 | #inlinevar-rgx= 454 | 455 | # Naming style matching correct method names. 456 | method-naming-style=snake_case 457 | 458 | # Regular expression matching correct method names. Overrides method-naming- 459 | # style. 460 | #method-rgx= 461 | 462 | # Naming style matching correct module names. 463 | module-naming-style=snake_case 464 | 465 | # Regular expression matching correct module names. Overrides module-naming- 466 | # style. 467 | #module-rgx= 468 | 469 | # Colon-delimited sets of names that determine each other's naming style when 470 | # the name regexes allow several styles. 471 | name-group= 472 | 473 | # Regular expression which should only match function or class names that do 474 | # not require a docstring. 475 | no-docstring-rgx=^_ 476 | 477 | # List of decorators that produce properties, such as abc.abstractproperty. Add 478 | # to this list to register other decorators that produce valid properties. 479 | # These decorators are taken in consideration only for invalid-name. 480 | property-classes=abc.abstractproperty 481 | 482 | # Naming style matching correct variable names. 483 | variable-naming-style=snake_case 484 | 485 | # Regular expression matching correct variable names. Overrides variable- 486 | # naming-style. 
487 | #variable-rgx= 488 | 489 | 490 | [STRING] 491 | 492 | # This flag controls whether inconsistent-quotes generates a warning when the 493 | # character used as a quote delimiter is used inconsistently within a module. 494 | check-quote-consistency=no 495 | 496 | # This flag controls whether the implicit-str-concat should generate a warning 497 | # on implicit string concatenation in sequences defined over several lines. 498 | check-str-concat-over-line-jumps=no 499 | 500 | 501 | [IMPORTS] 502 | 503 | # List of modules that can be imported at any level, not just the top level 504 | # one. 505 | allow-any-import-level= 506 | 507 | # Allow wildcard imports from modules that define __all__. 508 | allow-wildcard-with-all=no 509 | 510 | # Analyse import fallback blocks. This can be used to support both Python 2 and 511 | # 3 compatible code, which means that the block might have code that exists 512 | # only in one or another interpreter, leading to false positives when analysed. 513 | analyse-fallback-blocks=no 514 | 515 | # Deprecated modules which should not be used, separated by a comma. 516 | deprecated-modules=optparse,tkinter.tix 517 | 518 | # Create a graph of external dependencies in the given file (report RP0402 must 519 | # not be disabled). 520 | ext-import-graph= 521 | 522 | # Create a graph of every (i.e. internal and external) dependencies in the 523 | # given file (report RP0402 must not be disabled). 524 | import-graph= 525 | 526 | # Create a graph of internal dependencies in the given file (report RP0402 must 527 | # not be disabled). 528 | int-import-graph= 529 | 530 | # Force import order to recognize a module as part of the standard 531 | # compatibility libraries. 532 | known-standard-library= 533 | 534 | # Force import order to recognize a module as part of a third party library. 535 | known-third-party=enchant 536 | 537 | # Couples of modules and preferred modules, separated by a comma. 
538 | preferred-modules= 539 | 540 | 541 | [CLASSES] 542 | 543 | # List of method names used to declare (i.e. assign) instance attributes. 544 | defining-attr-methods=__init__, 545 | __new__, 546 | setUp, 547 | __post_init__ 548 | 549 | # List of member names which should be excluded from the protected access 550 | # warning. 551 | exclude-protected=_asdict, 552 | _fields, 553 | _replace, 554 | _source, 555 | _make 556 | 557 | # List of valid names for the first argument in a class method. 558 | valid-classmethod-first-arg=cls 559 | 560 | # List of valid names for the first argument in a metaclass class method. 561 | valid-metaclass-classmethod-first-arg=cls 562 | 563 | 564 | [DESIGN] 565 | 566 | # Maximum number of arguments for function / method. 567 | max-args=5 568 | 569 | # Maximum number of attributes for a class (see R0902). 570 | max-attributes=7 571 | 572 | # Maximum number of boolean expressions in an if statement (see R0916). 573 | max-bool-expr=5 574 | 575 | # Maximum number of branches for function / method body. 576 | max-branches=12 577 | 578 | # Maximum number of locals for function / method body. 579 | max-locals=15 580 | 581 | # Maximum number of parents for a class (see R0901). 582 | max-parents=7 583 | 584 | # Maximum number of public methods for a class (see R0904). 585 | max-public-methods=20 586 | 587 | # Maximum number of return / yield for function / method body. 588 | max-returns=6 589 | 590 | # Maximum number of statements in function / method body. 591 | max-statements=50 592 | 593 | # Minimum number of public methods for a class (see R0903). 594 | min-public-methods=2 595 | 596 | 597 | [EXCEPTIONS] 598 | 599 | # Exceptions that will emit a warning when being caught. Defaults to 600 | # "BaseException, Exception". 601 | overgeneral-exceptions=BaseException, 602 | Exception 603 | --------------------------------------------------------------------------------