├── requirements.txt
├── .gitignore
├── diameter_clustering
    ├── approx
    │   ├── __init__.py
    │   ├── hnsw.py
    │   └── leader.py
    ├── __init__.py
    ├── timer.py
    ├── dist_matrix.py
    ├── mixins.py
    ├── leader.py
    ├── qt.py
    └── diameter.py
├── tests
    ├── test_dist_matrix.py
    ├── approx
    │   └── test_approx_leader.py
    ├── test_qt.py
    ├── test_greedy.py
    └── test_leader.py
├── setup.py
├── LICENSE
├── README.md
└── .pylintrc


/requirements.txt:
--------------------------------------------------------------------------------
1 | hnswlib
2 | numpy
3 | numpy_groupies
4 | scikit_learn
5 | scipy
6 | tqdm
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .vscode
3 | .pytest_cache
4 | .DS_Store
5 | *.egg-info
6 | dist/
7 | build/


--------------------------------------------------------------------------------
/diameter_clustering/approx/__init__.py:
--------------------------------------------------------------------------------
1 | from .hnsw import HNSWIndex
2 | from .leader import ApproxLeaderClustering


--------------------------------------------------------------------------------
/diameter_clustering/__init__.py:
--------------------------------------------------------------------------------
1 | from .diameter import MaxDiameterClustering
2 | from .leader import LeaderClustering
3 | from .qt import QTClustering
4 | 


--------------------------------------------------------------------------------
/tests/test_dist_matrix.py:
--------------------------------------------------------------------------------
 1 | """Tests for distance matrix computation."""
 2 | 
 3 | import numpy as np
 4 | import scipy
 5 | from sklearn.datasets import make_blobs
 6 | 
 7 | from diameter_clustering.dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix
 8 | 
 9 | 
10 | X, y = make_blobs(n_samples=100, n_features=50, random_state=42)
11 | 
12 | 
13 | def test_dist_matrix():
14 |     """Dense matrix: finite values, inf diagonal when requested, 1D input accepted."""
15 |     dist_matrix = compute_dist_matrix(X)
16 |     assert np.all(np.isfinite(dist_matrix))
17 | 
18 |     dist_matrix = compute_dist_matrix(X, metric='inner_product')
19 |     assert np.all(np.isfinite(dist_matrix))
20 | 
21 |     dist_matrix = compute_dist_matrix(X, fill_diagonal=True)
22 |     assert np.all(np.diagonal(dist_matrix) == np.inf)
23 | 
24 |     dist_matrix = compute_dist_matrix(X[0])  # 1D input: only checks that the call does not raise
25 | 
26 | 
27 | def test_sparse_dist_matrix():
28 |     """Sparse distance matrix is returned as scipy.sparse.csr_matrix."""
29 |     dist_matrix = compute_sparse_dist_matrix(X, metric='cosine', max_distance=0.5)
30 |     assert isinstance(dist_matrix, scipy.sparse.csr_matrix)
31 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r", encoding="utf-8") as readme_file:  # README is the long description
 4 |     long_description = readme_file.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="diameter-clustering",
 8 |     version="0.1.0",
 9 |     author="Anton Klenitskiy",
10 |     author_email="ant-klen@yandex.ru",
11 |     description="Clustering with maximum distance between points inside clusters",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/antklen/diameter-clustering",
15 |     packages=['diameter_clustering', 'diameter_clustering.approx'],
16 |     classifiers=[
17 |         "Programming Language :: Python :: 3",
18 |         "License :: OSI Approved :: MIT License",
19 |         "Operating System :: OS Independent",
20 |         "Development Status :: 3 - Alpha",
21 |     ],
22 |     install_requires=[  # same package names as listed in requirements.txt
23 |         'hnswlib',
24 |         'numpy',
25 |         'numpy_groupies',
26 |         'scikit_learn',
27 |         'scipy',
28 |         'tqdm'
29 |     ],
30 | )
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Anton Klenitskiy
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/tests/approx/test_approx_leader.py:
--------------------------------------------------------------------------------
 1 | """Tests for ApproxLeaderClustering."""
 2 | 
 3 | import numpy as np
 4 | from scipy.spatial.distance import pdist
 5 | from sklearn.datasets import make_blobs
 6 | 
 7 | from diameter_clustering.approx import ApproxLeaderClustering, HNSWIndex
 8 | from diameter_clustering.dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix
 9 | 
10 | 
11 | MAX_RADIUS = 0.25
12 | MAX_RADIUS_EUCLIDEAN = 30
13 | 
14 | X, y = make_blobs(n_samples=100, n_features=50, centers=3,
15 |                   cluster_std=3, random_state=42)
16 | 
17 | 
18 | def compute_max_dist(X, labels, metric='cosine'):
19 |     """Return the largest pairwise distance between points sharing a cluster label."""
20 | 
21 |     max_dist = []
22 | 
23 |     for cluster in np.unique(labels):
24 |         x_cluster = X[labels == cluster]
25 |         dist = pdist(x_cluster, metric=metric)
26 |         if len(dist) == 0:  # no pairs in this cluster, count its diameter as 0
27 |             max_dist.append(0)
28 |         else:
29 |             max_dist.append(dist.max())
30 | 
31 |     return np.max(max_dist)
32 | 
33 | 
34 | def test_approx_leader():
35 |     """'l2' index: euclidean diameter below 2 * max_radius; 'cosine' index: one label per point."""
36 |     hnsw_index = HNSWIndex(max_elements=len(X), space='l2', dim=50,
37 |                            ef=100, ef_construction=200, M=16)
38 |     model = ApproxLeaderClustering(hnsw_index, max_radius=MAX_RADIUS_EUCLIDEAN)
39 |     labels = model.fit_predict(X)
40 |     assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2
41 | 
42 |     hnsw_index = HNSWIndex(max_elements=len(X), space='cosine', dim=50,
43 |                            ef=100, ef_construction=200, M=16)
44 |     model = ApproxLeaderClustering(hnsw_index, max_radius=MAX_RADIUS)
45 |     labels = model.fit_predict(X)
46 |     assert len(labels) == len(X)
47 | 
48 | 
49 | def test_deterministic():
50 |     """Two deterministic=True models built with identical settings give identical labels."""
51 |     hnsw_index1 = HNSWIndex(max_elements=len(X), space='cosine', dim=50,
52 |                         ef=100, ef_construction=200, M=16)
53 |     model1 = ApproxLeaderClustering(hnsw_index1, max_radius=0.2, deterministic=True)
54 |     labels1 = model1.fit_predict(X)
55 | 
56 |     hnsw_index2 = HNSWIndex(max_elements=len(X), space='cosine', dim=50,
57 |                             ef=100, ef_construction=200, M=16)
58 |     model2 = ApproxLeaderClustering(hnsw_index2, max_radius=0.2, deterministic=True)
59 |     labels2 = model2.fit_predict(X)
60 |     assert np.array_equal(labels1, labels2)
61 | 


--------------------------------------------------------------------------------
/diameter_clustering/timer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Simple logging timer and timer which saves history of runs.
 3 | """
 4 | 
 5 | import logging
 6 | import time
 7 | from contextlib import contextmanager
 8 | 
 9 | 
10 | @contextmanager
11 | def timer(name: str, disable: bool = False):
12 |     """Context manager which logs (INFO level) elapsed time of the block in ms."""
13 |     # Nothing is logged if disable is True or if the block raises (no try/finally).
14 |     start = time.time()
15 |     yield
16 |     if not disable:
17 |         logging.info(f'[{name}] done in {(time.time() - start)*1000:.1f} ms')
18 | 
19 | 
20 | class TimerWithHistory:
21 |     """Timer as context manager which saves history.
22 | 
23 |     This timer should be initialized and then used as context manager.
24 |     After each run it appends execution time to list with history.
25 |     Different runs could have different names and history is saved as dict
26 |     with separate key for each name.
27 | 
28 |     Args:
29 |         default_name (str): Name used for runs started without explicit name.
30 |         disable (bool): If True then using timer as context manager measures nothing.
31 | 
32 |     Example:
33 |         timer = TimerWithHistory()
34 |         with timer():
35 |             time.sleep(1)
36 |         with timer(name='first'):
37 |             time.sleep(2)
38 |         # get history
39 |         hist1, hist2 = timer.history['default'], timer.history['first']
40 |     """
41 | 
42 |     def __init__(self, default_name: str = 'default', disable: bool = False):
43 |         # _start is None while timer is idle; history maps run name to list of durations.
44 |         self._start = None
45 |         self.history = {}
46 |         self.name = default_name
47 |         self.default_name = default_name
48 |         self.disable = disable
49 | 
50 |     def start(self):
51 |         """Start timer. Raises RuntimeError if timer is already running."""
52 | 
53 |         if self._start is not None:
54 |             raise RuntimeError('Timer already started...')
55 |         self._start = time.perf_counter()
56 | 
57 |     def stop(self):
58 |         """Stop timer and save result to history. Raises RuntimeError if not started."""
59 | 
60 |         if self._start is None:
61 |             raise RuntimeError('Timer not yet started...')
62 |         elapsed = time.perf_counter() - self._start
63 |         if self.history.get(self.name):
64 |             self.history[self.name].append(elapsed)
65 |         else:
66 |             self.history[self.name] = [elapsed]
67 |         self._start = None
68 | 
69 |     def __enter__(self):
70 |         if not self.disable:
71 |             self.start()
72 |         return self
73 | 
74 |     def __exit__(self, *args):  # returns None, so exceptions from the block propagate
75 |         if not self.disable:
76 |             self.stop()
77 | 
78 |     def __call__(self, name=None):  # choose name for the next run; falsy -> default_name
79 |         if not self.disable:
80 |             self.name = name or self.default_name
81 |         return self
82 | 


--------------------------------------------------------------------------------
/diameter_clustering/dist_matrix.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Computation of distance matrix.
 3 | """
 4 | 
 5 | import numpy as np
 6 | from scipy.sparse import csr_matrix
 7 | from scipy.spatial.distance import pdist, squareform
 8 | from sklearn.neighbors import RadiusNeighborsTransformer
 9 | 
10 | 
11 | def compute_dist_matrix(X: np.ndarray, metric: str = 'inner_product',
12 |                         fill_diagonal: bool = False) -> np.ndarray:
13 |     """
14 |     Compute distance matrix between points and optionally fill diagonal elements
15 |     with np.inf (may be convenient in some situations).
16 | 
17 |     Args:
18 |         X (np.ndarray): 2D array with data points (1D array is treated as one point).
19 |         metric (str): Distance metric. Possible options are 'inner_product' or one of metrics
20 |             available in scipy.spatial.distance.pdist. If 'inner_product' then use
21 |             1 - np.inner(X, X) which is much faster than pdist. 'inner_product' could be
22 |             used instead of cosine distance for normalized vectors.
23 |         fill_diagonal (bool): If True then fill diagonal with np.inf.
24 | 
25 |     Returns:
26 |         Array with shape (len(X), len(X)), or (0, 0) if pdist returns no distances.
27 |     """
28 | 
29 |     if X.ndim == 1:
30 |         X = X[None, :]  # make single point a 2D array with one row
31 | 
32 |     if metric == 'inner_product':
33 |         dist_matrix = 1 - np.inner(X, X)
34 |     else:
35 |         dist_matrix = pdist(X, metric=metric)
36 |         # squareform converts empty dist_matrix array([]) to array([[0.]])
37 |         # this behavior could break the code later
38 |         dist_matrix = squareform(dist_matrix) if len(dist_matrix) > 0 else np.empty((0, 0))
39 | 
40 |     if fill_diagonal:
41 |         np.fill_diagonal(dist_matrix, np.inf)
42 | 
43 |     return dist_matrix
44 | 
45 | 
46 | def compute_sparse_dist_matrix(X: np.ndarray, metric: str = 'cosine',
47 |                                max_distance: float = 0.2) -> csr_matrix:
48 |     """
49 |     Compute distance matrix in sparse csr format using sklearn RadiusNeighborsTransformer.
50 |     Zero elements of matrix are elements for which distance is greater than max_distance.
51 | 
52 |     Args:
53 |         X (np.ndarray): 2D array with data points.
54 |         metric (str): Distance metric
55 |             (possible options in sklearn.neighbors.VALID_METRICS['brute']).
56 |         max_distance (float): Maximum distance threshold (radius of the neighbors search).
57 | 
58 |     Returns:
59 |         scipy.sparse.csr_matrix with shape (len(X), len(X)).
60 |     """
61 | 
62 |     transformer = RadiusNeighborsTransformer(mode='distance', algorithm='brute',
63 |                                              metric=metric, radius=max_distance)
64 | 
65 |     return transformer.fit_transform(X)
66 | 


--------------------------------------------------------------------------------
/diameter_clustering/approx/hnsw.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Wrapper for approximate nearest neighbors search using hnswlib library.
 3 | """
 4 | 
 5 | from typing import Optional
 6 | 
 7 | import hnswlib
 8 | import numpy as np
 9 | 
10 | 
11 | class HNSWIndex:
12 |     """
13 |     Approximate nearest neighbors search using hnswlib library.
14 | 
15 |     Args:
16 |         max_elements: Maximum number of elements that can be stored in index (hnswlib parameter).
17 |         path: Path to previously saved index. If not None, load it. If None, initialize empty index.
18 |         space: Distance metric (hnswlib parameter). Possible values:
19 |             'l2', 'ip' (inner product), 'cosine'.
20 |         dim: Dimensionality of vectors in index (hnswlib parameter).
21 |         ef: hnswlib parameter, defines query time accuracy/speed trade-off.
22 |         ef_construction: hnswlib parameter, defines construction time/accuracy trade-off.
23 |         M: hnswlib parameter, defines maximum number of outgoing connections in the graph.
24 | 
25 |     Attributes:
26 |         index: Instance of hnswlib.Index.
27 |     """
28 | 
29 |     def __init__(self, max_elements: int, path: Optional[str] = None,
30 |                  space: str = 'ip', dim: int = 512, ef: int = 100,
31 |                  ef_construction: int = 250, M: int = 16):
32 |         # ef_construction and M are passed to hnswlib only when path is None (new index).
33 |         self.index = hnswlib.Index(space=space, dim=dim)
34 | 
35 |         if path is not None:
36 |             self.index.load_index(path, max_elements=max_elements)
37 |         else:
38 |             self.index.init_index(max_elements=max_elements, ef_construction=ef_construction, M=M)
39 | 
40 |         self.index.set_ef(ef)
41 | 
42 |     def add_item(self, vector: np.ndarray, label: Optional[int] = None):
43 |         """Add one element to index.
44 | 
45 |         Args:
46 |             vector: Numpy array with vector for one element.
47 |             label: Optional integer label for this element.
48 |         """
49 | 
50 |         self.index.add_items(vector, ids=label)
51 | 
52 |     def add_items(self, vectors: np.ndarray, labels: Optional[int] = None):
53 |         """Add batch of elements to index.
54 | 
55 |         Args:
56 |             vectors: Numpy array with vectors for given elements.
57 |             labels: Optional integer labels for these elements.
58 |         """
59 |         # NOTE(review): labels is annotated Optional[int] but is passed as ids for a batch - confirm.
60 |         self.index.add_items(vectors, ids=labels)
61 | 
62 |     def find_nearest_point(self, vector: np.ndarray):
63 |         """Find nearest point from index for given vector.
64 | 
65 |         Args:
66 |             vector: Numpy array.
67 | 
68 |         Returns:
69 |             Label of nearest point and distance to it.
70 |         """
71 | 
72 |         labels, distances = self.index.knn_query(vector, k=1)
73 |         return labels[0, 0], distances[0, 0]
74 | 
75 |     def find_nearest_point_batch(self, vectors: np.ndarray):
76 |         """Find nearest point from index for batch of vectors.
77 | 
78 |         Args:
79 |             vectors: Numpy array.
80 | 
81 |         Returns:
82 |             Labels of nearest points and corresponding distances to them.
83 |         """
84 | 
85 |         labels, distances = self.index.knn_query(vectors, k=1)
86 |         return labels[:, 0], distances[:, 0]
87 | 
88 |     def save(self, path: str):
89 |         """Save index to disk.
90 | 
91 |         Args:
92 |             path: Save index to this path.
93 |         """
94 | 
95 |         self.index.save_index(path)
96 | 


--------------------------------------------------------------------------------
/diameter_clustering/mixins.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Mixins for clustering algorithms.
 3 | """
 4 | 
 5 | import logging
 6 | from typing import Union
 7 | 
 8 | import numpy as np
 9 | from scipy.sparse import csr_matrix
10 | 
11 | from .dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix
12 | from .timer import timer
13 | 
14 | 
15 | class FitPredictMixin:
16 |     """Mixin with fit_predict method (calls self.fit and returns self.labels_)."""
17 | 
18 |     def fit_predict(self, X: Union[np.ndarray, csr_matrix]) -> np.ndarray:
19 |         """Fit clustering from features or distance matrix and return cluster labels.
20 | 
21 |         Args:
22 |             X (np.ndarray or scipy.sparse.csr_matrix): Array with features
23 |                 or precomputed distance matrix, which could be in sparse matrix format.
24 | 
25 |         Returns:
26 |             Array with cluster labels.
27 |         """
28 |         # fit() and labels_ are not defined here; the class using this mixin provides them.
29 |         self.fit(X)
30 | 
31 |         return self.labels_
32 | 
33 | 
34 | class DistanceMatrixMixin:
35 |     """Mixin with methods for working with distance matrix (settings are read from self)."""
36 | 
37 |     def _prepare_distance_matrix(self, X: Union[np.ndarray, csr_matrix]):
38 |         """Prepare distance matrix.
39 | 
40 |         If self.precomputed_dist is True then return X as is after checking its shape and type.
41 |         Otherwise compute distance matrix treating X as array of features. If self.sparse_dist
42 |         is True then compute matrix in sparse format."""
43 | 
44 |         if not self.precomputed_dist:
45 |             if self.sparse_dist:
46 |                 if self.verbose:
47 |                     logging.info('computing distance matrix in sparse format...')
48 |                 with timer('compute_sparse_dist_matrix', disable=not self.verbose):
49 |                     return compute_sparse_dist_matrix(X, metric=self.metric,
50 |                                                       max_distance=self.max_distance)
51 |             else:
52 |                 if self.verbose:
53 |                     logging.info('computing distance matrix in dense format...')
54 |                 with timer('compute_dist_matrix', disable=not self.verbose):
55 |                     return compute_dist_matrix(X, metric=self.metric)
56 |         # From here on X is a precomputed distance matrix: validate it and return unchanged.
57 |         if X.shape[0] != X.shape[1]:
58 |             raise ValueError(f'Distance matrix should be square. Got matrix of shape {X.shape}.')
59 | 
60 |         if self.sparse_dist:
61 |             if not isinstance(X, csr_matrix):
62 |                 raise TypeError('Sparse distance matrix should be in '
63 |                                 'scipy.sparse.csr_matrix format.')
64 |         elif not isinstance(X, np.ndarray):
65 |             raise TypeError('Dense distance matrix should be '
66 |                             'an instance of np.ndarray.')
67 | 
68 |         return X
69 | 
70 |     def _slice_distance_matrix(self, dist_matrix: Union[np.ndarray, csr_matrix],
71 |                                idx: int, indexes: np.ndarray):
72 |         """Get part of one row of distance matrix.
73 |         Get distance between given point and several other points.
74 | 
75 |         Args:
76 |             dist_matrix (np.ndarray or scipy.sparse.csr_matrix): Distance matrix.
77 |             idx (int): Index of given point.
78 |             indexes (np.ndarray): Indexes of other points.
79 |         """
80 |         # Result: distances from point idx to the points listed in indexes.
81 |         if isinstance(dist_matrix, csr_matrix):
82 |             current_dist = dist_matrix[idx, indexes].toarray()[0, :]
83 |             current_dist[current_dist == 0] = np.inf  # every zero, stored or not, becomes inf
84 |         else:
85 |             current_dist = dist_matrix[idx, indexes]
86 | 
87 |         return current_dist
88 | 


--------------------------------------------------------------------------------
/tests/test_qt.py:
--------------------------------------------------------------------------------
 1 | """Tests for QTClustering."""
 2 | 
 3 | import numpy as np
 4 | from scipy.spatial.distance import pdist
 5 | from sklearn.datasets import make_blobs
 6 | 
 7 | from diameter_clustering import QTClustering
 8 | from diameter_clustering.dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix
 9 | 
10 | 
11 | MAX_RADIUS = 0.25
12 | MAX_RADIUS_EUCLIDEAN = 30
13 | 
14 | X, y = make_blobs(n_samples=100, n_features=50, centers=3,
15 |                   cluster_std=3, random_state=42)
16 | 
17 | 
18 | def compute_max_dist(X, labels, metric='cosine'):
19 |     """Return the largest pairwise distance between points sharing a cluster label."""
20 | 
21 |     max_dist = []
22 | 
23 |     for cluster in np.unique(labels):
24 |         x_cluster = X[labels == cluster]
25 |         dist = pdist(x_cluster, metric=metric)
26 |         if len(dist) == 0:  # no pairs in this cluster, count its diameter as 0
27 |             max_dist.append(0)
28 |         else:
29 |             max_dist.append(dist.max())
30 | 
31 |     return np.max(max_dist)
32 | 
33 | 
34 | def test_qt():
35 |     """Euclidean cluster diameter stays below 2 * max_radius; cosine run labels every point."""
36 |     model = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean',
37 |                          min_cluster_size=1, sparse_dist=False)
38 |     labels = model.fit_predict(X)
39 |     assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2
40 | 
41 |     model = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean',
42 |                          min_cluster_size=3, sparse_dist=False)
43 |     labels = model.fit_predict(X)  # points labeled -1 are excluded from the check below
44 |     assert compute_max_dist(X[labels != -1], labels[labels != -1],
45 |                             metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2
46 | 
47 |     model = QTClustering(max_radius=MAX_RADIUS, metric='cosine', sparse_dist=False)
48 |     labels = model.fit_predict(X)
49 |     assert len(labels) == len(X)
50 | 
51 | 
52 | def test_inner_product():
53 |     """'inner_product' on normalized vectors gives the same labels as 'cosine' on raw X."""
54 |     x_normalized = X/(np.linalg.norm(X, axis=-1, keepdims=True) + 1e-16)
55 |     model = QTClustering(max_radius=MAX_RADIUS, metric='inner_product', sparse_dist=False)
56 |     labels = model.fit_predict(x_normalized)
57 |     assert len(labels) == len(X)
58 | 
59 |     model2 = QTClustering(max_radius=MAX_RADIUS, metric='cosine', sparse_dist=False)
60 |     labels2 = model2.fit_predict(X)
61 |     assert np.array_equal(labels, labels2)
62 | 
63 | 
64 | def test_precomputed():
65 |     """Clustering from a precomputed dense euclidean matrix keeps diameter below 2 * max_radius."""
66 |     model = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean',
67 |                          min_cluster_size=1, precomputed_dist=True, sparse_dist=False)
68 |     dist_matrix = compute_dist_matrix(X, metric='euclidean')
69 |     labels = model.fit_predict(dist_matrix)
70 |     assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2
71 | 
72 | 
73 | def test_sparse():
74 |     """Sparse mode, computed internally or precomputed, keeps diameter below 2 * max_radius."""
75 |     model = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean',
76 |                          min_cluster_size=1, sparse_dist=True)
77 |     labels = model.fit_predict(X)
78 |     assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2
79 | 
80 |     model = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean',
81 |                          min_cluster_size=1, sparse_dist=True, precomputed_dist=True)
82 |     dist_matrix = compute_sparse_dist_matrix(X, metric='euclidean',
83 |                                              max_distance=MAX_RADIUS_EUCLIDEAN)
84 |     labels = model.fit_predict(dist_matrix)
85 |     assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2
86 | 
87 | 
88 | def test_sparse_dense_equivalence():
89 |     """Dense and sparse distance matrices lead to identical labels."""
90 |     model1 = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean',
91 |                           min_cluster_size=1, sparse_dist=False)
92 |     labels1 = model1.fit_predict(X)
93 | 
94 |     model2 = QTClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean',
95 |                           min_cluster_size=1, sparse_dist=True)
96 |     labels2 = model2.fit_predict(X)
97 | 
98 |     assert np.array_equal(labels1, labels2)
99 | 


--------------------------------------------------------------------------------
/tests/test_greedy.py:
--------------------------------------------------------------------------------
  1 | """Tests for MaxDiameterClustering."""
  2 | 
  3 | import numpy as np
  4 | from scipy.spatial.distance import pdist
  5 | from sklearn.datasets import make_blobs
  6 | 
  7 | from diameter_clustering import MaxDiameterClustering
  8 | from diameter_clustering.dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix
  9 | 
 10 | 
 11 | MAX_DISTANCE = 0.5
 12 | 
 13 | X, y = make_blobs(n_samples=100, n_features=50, centers=3,
 14 |                   cluster_std=5, random_state=42)
 15 | 
 16 | 
 17 | def compute_max_dist(X, labels, metric='cosine'):
 18 |     """Return the largest pairwise distance between points sharing a cluster label."""
 19 | 
 20 |     max_dist = []
 21 | 
 22 |     for cluster in np.unique(labels):
 23 |         x_cluster = X[labels == cluster]
 24 |         dist = pdist(x_cluster, metric=metric)
 25 |         if len(dist) == 0:  # no pairs in this cluster, count its diameter as 0
 26 |             max_dist.append(0)
 27 |         else:
 28 |             max_dist.append(dist.max())
 29 | 
 30 |     return np.max(max_dist)
 31 | 
 32 | 
 33 | def test_max_diameter():
 34 |     """Both criteria ('distance', 'size') keep cosine cluster diameter below max_distance."""
 35 |     model = MaxDiameterClustering(max_distance=MAX_DISTANCE, criterion='distance',
 36 |                                   metric='cosine', sparse_dist=False, use_timer=True)
 37 |     labels = model.fit_predict(X)
 38 |     assert compute_max_dist(X, labels) < MAX_DISTANCE
 39 | 
 40 |     model = MaxDiameterClustering(max_distance=MAX_DISTANCE, criterion='size',
 41 |                                   metric='cosine', sparse_dist=False, use_timer=True)
 42 |     labels = model.fit_predict(X)
 43 |     assert compute_max_dist(X, labels) < MAX_DISTANCE
 44 | 
 45 | def test_inner_product():
 46 |     """'inner_product' on normalized vectors gives the same labels as 'cosine' on raw X."""
 47 |     x_normalized = X/(np.linalg.norm(X, axis=-1, keepdims=True) + 1e-16)
 48 |     model = MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='inner_product',
 49 |                                   sparse_dist=False, deterministic=True)
 50 |     labels = model.fit_predict(x_normalized)
 51 |     assert compute_max_dist(x_normalized, labels) < MAX_DISTANCE
 52 | 
 53 |     model2 = MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='cosine',
 54 |                                    sparse_dist=False, deterministic=True)
 55 |     labels2 = model2.fit_predict(X)
 56 |     assert np.array_equal(labels, labels2)
 57 | 
 58 | 
 59 | def test_precomputed():
 60 |     """Clustering from a precomputed dense cosine matrix keeps diameter below max_distance."""
 61 |     model = MaxDiameterClustering(max_distance=MAX_DISTANCE, precomputed_dist=True,
 62 |                                   sparse_dist=False)
 63 |     dist_matrix = compute_dist_matrix(X, metric='cosine')
 64 |     labels = model.fit_predict(dist_matrix)
 65 |     assert compute_max_dist(X, labels) < MAX_DISTANCE
 66 | 
 67 | 
 68 | def test_sparse():
 69 |     """Sparse mode, computed internally or precomputed, keeps diameter below max_distance."""
 70 |     model = MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='cosine',
 71 |                                   sparse_dist=True)
 72 |     labels = model.fit_predict(X)
 73 |     assert compute_max_dist(X, labels) < MAX_DISTANCE
 74 | 
 75 |     model = MaxDiameterClustering(max_distance=MAX_DISTANCE,
 76 |                                   sparse_dist=True, precomputed_dist=True)
 77 |     dist_matrix = compute_sparse_dist_matrix(X, max_distance=MAX_DISTANCE)
 78 |     labels = model.fit_predict(dist_matrix)
 79 |     assert compute_max_dist(X, labels) < MAX_DISTANCE
 80 | 
 81 | 
 82 | def test_deterministic():
 83 |     """Fitting the same deterministic=True model twice gives identical labels."""
 84 |     model = MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='cosine',
 85 |                                   deterministic=True)
 86 |     labels1 = model.fit_predict(X)
 87 |     labels2 = model.fit_predict(X)
 88 |     assert np.array_equal(labels1, labels2)
 89 | 
 90 | 
 91 | def test_sparse_dense_equivalence():
 92 |     """Dense and sparse distance matrices lead to identical labels (deterministic=True)."""
 93 |     model1 = MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='cosine',
 94 |                                    sparse_dist=False, deterministic=True)
 95 |     labels1 = model1.fit_predict(X)
 96 | 
 97 |     model2 = MaxDiameterClustering(max_distance=MAX_DISTANCE, metric='cosine',
 98 |                                    sparse_dist=True, deterministic=True)
 99 |     labels2 = model2.fit_predict(X)
100 | 
101 |     assert np.array_equal(labels1, labels2)
102 | 


--------------------------------------------------------------------------------
/tests/test_leader.py:
--------------------------------------------------------------------------------
  1 | """Tests for LeaderClustering."""
  2 | 
  3 | import numpy as np
  4 | from scipy.spatial.distance import pdist
  5 | from sklearn.datasets import make_blobs
  6 | 
  7 | from diameter_clustering import LeaderClustering
  8 | from diameter_clustering.dist_matrix import compute_dist_matrix, compute_sparse_dist_matrix
  9 | 
 10 | 
 11 | MAX_RADIUS = 0.25
 12 | MAX_RADIUS_EUCLIDEAN = 30
 13 | 
 14 | X, y = make_blobs(n_samples=100, n_features=50, centers=3,
 15 |                   cluster_std=3, random_state=42)
 16 | 
 17 | 
 18 | def compute_max_dist(X, labels, metric='cosine'):
 19 |     """Return the largest pairwise distance between points sharing a cluster label."""
 20 | 
 21 |     max_dist = []
 22 | 
 23 |     for cluster in np.unique(labels):
 24 |         x_cluster = X[labels == cluster]
 25 |         dist = pdist(x_cluster, metric=metric)
 26 |         if len(dist) == 0:  # no pairs in this cluster, count its diameter as 0
 27 |             max_dist.append(0)
 28 |         else:
 29 |             max_dist.append(dist.max())
 30 | 
 31 |     return np.max(max_dist)
 32 | 
 33 | 
 34 | def test_leader():
 35 |     """Euclidean cluster diameter stays below 2 * max_radius; cosine run labels every point."""
 36 |     model = LeaderClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean',
 37 |                              sparse_dist=False)
 38 |     labels = model.fit_predict(X)
 39 |     assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2
 40 | 
 41 |     model = LeaderClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean',
 42 |                              change_leaders=True, sparse_dist=False)
 43 |     labels = model.fit_predict(X)
 44 |     assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2
 45 | 
 46 |     model = LeaderClustering(max_radius=MAX_RADIUS, metric='cosine', sparse_dist=False)
 47 |     labels = model.fit_predict(X)
 48 |     assert len(labels) == len(X)
 49 | 
 50 | 
 51 | def test_inner_product():
 52 |     """'inner_product' on normalized vectors gives the same labels as 'cosine' on raw X."""
 53 |     x_normalized = X/(np.linalg.norm(X, axis=-1, keepdims=True) + 1e-16)
 54 |     model = LeaderClustering(max_radius=MAX_RADIUS, metric='inner_product',
 55 |                              sparse_dist=False, deterministic=True)
 56 |     labels = model.fit_predict(x_normalized)
 57 |     assert len(labels) == len(X)
 58 | 
 59 |     model2 = LeaderClustering(max_radius=MAX_RADIUS, metric='cosine',
 60 |                               sparse_dist=False, deterministic=True)
 61 |     labels2 = model2.fit_predict(X)
 62 |     assert np.array_equal(labels, labels2)
 63 | 
 64 | def test_precomputed():
 65 | 
 66 |     model = LeaderClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean',
 67 |                              precomputed_dist=True, sparse_dist=False)
 68 |     dist_matrix = compute_dist_matrix(X, metric='euclidean')
 69 |     labels = model.fit_predict(dist_matrix)
 70 |     assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2
 71 | 
 72 | 
 73 | def test_sparse():
 74 | 
 75 |     model = LeaderClustering(max_radius=MAX_RADIUS_EUCLIDEAN, metric='euclidean',
 76 |                              sparse_dist=True)
 77 |     labels = model.fit_predict(X)
 78 |     assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2
 79 | 
 80 |     model = LeaderClustering(max_radius=MAX_RADIUS_EUCLIDEAN,
 81 |                              sparse_dist=True, precomputed_dist=True)
 82 |     dist_matrix = compute_sparse_dist_matrix(X, metric='euclidean',
 83 |                                              max_distance=MAX_RADIUS_EUCLIDEAN)
 84 |     labels = model.fit_predict(dist_matrix)
 85 |     assert compute_max_dist(X, labels, metric='euclidean') < MAX_RADIUS_EUCLIDEAN * 2
 86 | 
 87 | 
 88 | def test_deterministic():
 89 | 
 90 |     model = LeaderClustering(max_radius=MAX_RADIUS, metric='cosine',
 91 |                              deterministic=True)
 92 |     labels1 = model.fit_predict(X)
 93 |     labels2 = model.fit_predict(X)
 94 |     assert np.array_equal(labels1, labels2)
 95 | 
 96 | 
 97 | def test_sparse_dense_equivalence():
 98 | 
 99 |     model1 = LeaderClustering(max_radius=MAX_RADIUS, metric='cosine',
100 |                               sparse_dist=False, deterministic=True)
101 |     labels1 = model1.fit_predict(X)
102 | 
103 |     model2 = LeaderClustering(max_radius=MAX_RADIUS, metric='cosine',
104 |                               sparse_dist=True, deterministic=True)
105 |     labels2 = model2.fit_predict(X)
106 | 
107 |     assert np.array_equal(labels1, labels2)
108 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Clustering with maximum diameter
  2 | 
  3 | Collection of clustering algorithms with maximum distance between points inside clusters.
  4 | 
  5 | When we have an interpretable metric like cosine distance, it is useful to have clusters with a bounded maximum distance between points. We can then pick a good threshold for the maximum distance and be confident that points inside each cluster are really similar. With this approach we also don't need to specify the number of clusters.
  6 | 
  7 | Unfortunately, most popular clustering algorithms don't have this property.
  8 | 
  9 | Possible applications:
 10 | - Embeddings of text data with cosine distance.
 11 | - Geo data with haversine distance.
 12 | 
 13 | ## Algorithms
 14 | 
 15 | ### MaxDiameterClustering
 16 | 
 17 | A simple greedy algorithm in which we add points one by one. If there is a cluster whose points are all close enough to the new point, we add the new point to this cluster. If there is no such cluster, the point starts a new cluster.
 18 | 
 19 | ### Quality Threshold Clustering
 20 | 
 21 | [Explanation](https://sites.google.com/site/dataclusteringalgorithms/quality-threshold-clustering-algorithm-1).
 22 | 
 23 | Inspired by this [repository](https://github.com/melvrl13/python-quality-threshold).
 24 | ### Leader Clustering
 25 | 
 26 | [Explanation on stackoverflow](https://stackoverflow.com/questions/36928654/leader-clustering-algorithm-explanation)
 27 | 
 28 | [R package](https://cran.r-project.org/web/packages/leaderCluster/index.html)
 29 | 
 30 | ### Approximate Leader Clustering
 31 | 
 32 | Use approximate nearest neighbors search (currently hnswlib) to speed up Leader Clustering.
 33 | 
 34 | 
 35 | ## Installation
 36 | 
 37 | Install from PyPI
 38 | ```sh
 39 | pip install diameter-clustering
 40 | ```
 41 | 
 42 | Install from source
 43 | ```sh
 44 | pip install git+https://github.com/antklen/diameter-clustering.git
 45 | # or
 46 | git clone git@github.com:antklen/diameter-clustering.git
 47 | cd diameter-clustering
 48 | pip install .
 49 | ```
 50 | 
 51 | ## Usage
 52 | 
 53 | ### MaxDiameterClustering
 54 | 
 55 | Basic usage of MaxDiameterClustering:
 56 | ```python
 57 | from sklearn.datasets import make_blobs
 58 | from diameter_clustering import MaxDiameterClustering
 59 | 
 60 | X, y = make_blobs(n_samples=100, n_features=50)
 61 | 
 62 | model = MaxDiameterClustering(max_distance=0.3, metric='cosine')
 63 | labels = model.fit_predict(X)
 64 | ```
 65 | 
 66 | Instead of using feature matrix `X` we can pass precomputed distance matrix:
 67 | ```python
 68 | from diameter_clustering.dist_matrix import compute_sparse_dist_matrix
 69 | 
 70 | dist_matrix = compute_sparse_dist_matrix(X, metric='cosine')
 71 | 
 72 | model = MaxDiameterClustering(max_distance=0.3, precomputed_dist=True)
 73 | labels = model.fit_predict(dist_matrix)
 74 | ```
 75 | 
 76 | By default the distance matrix is computed in sparse format (`sparse_dist=True`), because computing a dense distance matrix between all points is expensive. When the dataset is not too big (roughly fewer than 20k-30k points), `sparse_dist=False` mode can be used instead. It can be faster for small datasets, and it is useful when you already have a precomputed distance matrix in dense format.
 77 | ```python
 78 | model = MaxDiameterClustering(max_distance=0.3, metric='cosine', sparse_dist=False)
 79 | labels = model.fit_predict(X)
 80 | 
 81 | 
 82 | from diameter_clustering.dist_matrix import compute_dist_matrix
 83 | 
 84 | dist_matrix = compute_dist_matrix(X, max_distance=0.3, metric='cosine')
 85 | 
 86 | model = MaxDiameterClustering(max_distance=0.3, sparse_dist=False, precomputed_dist=True)
 87 | labels = model.fit_predict(dist_matrix)
 88 | ```
 89 | 
 90 | When we want to compute cosine distance in dense format and our vectors are normalized, it is better to use
 91 | `inner_product` as metric because it is much faster:
 92 | ```python
 93 | X_normalized = X/(np.linalg.norm(X, axis=-1, keepdims=True) + 1e-16)
 94 | 
 95 | model = MaxDiameterClustering(max_distance=0.3, metric='inner_product', sparse_dist=False)
 96 | labels = model.fit_predict(X_normalized)
 97 | ```
 98 | 
 99 | With `deterministic=True` we can get reproducible results:
100 | ```python
101 | model = MaxDiameterClustering(max_distance=0.3, metric='cosine', deterministic=True)
102 | labels = model.fit_predict(X)
103 | ```
104 | 
105 | ### Quality Threshold Clustering
106 | 
107 | ```python
108 | from diameter_clustering import QTClustering
109 | 
110 | model = QTClustering(max_radius=0.15, metric='cosine', min_cluster_size=5)
111 | labels = model.fit_predict(X)
112 | ```
113 | 
114 | `precomputed_dist`, `sparse_dist`, and `inner_product`
115 | can be used as in MaxDiameterClustering. This algorithm is deterministic by design.
116 | 
117 | ### Leader Clustering
118 | 
119 | ```python
120 | from diameter_clustering import LeaderClustering
121 | 
122 | model = LeaderClustering(max_radius=0.15, metric='cosine')
123 | labels = model.fit_predict(X)
124 | ```
125 | 
126 | `precomputed_dist`, `sparse_dist`, `deterministic` and `inner_product`
127 | can be used as in MaxDiameterClustering.
128 | 
129 | ### Approximate Leader Clustering
130 | 
131 | ```python
132 | from diameter_clustering.approx import HNSWIndex
133 | from diameter_clustering.approx import ApproxLeaderClustering
134 | 
135 | # fit model
136 | hnsw_index = HNSWIndex(max_elements=len(X), space='cosine', dim=50,
137 |                        ef=100, ef_construction=200, M=16)
138 | model = ApproxLeaderClustering(hnsw_index, max_radius=0.15, deterministic=True)
139 | labels = model.fit_predict(X)
140 | 
141 | # save index for later usage
142 | hnsw_index.save('hnsw_index.bin')
143 | 
144 | # predict clusters for new data later
145 | hnsw_index = HNSWIndex(max_elements=len(X_new), path='hnsw_index.bin',
146 |                         space='cosine', dim=50, ef=100)
147 | model = ApproxLeaderClustering(hnsw_index, max_radius=0.15, deterministic=True)
148 | new_labels = model.predict(X_new)
149 | ```
150 | 


--------------------------------------------------------------------------------
/diameter_clustering/leader.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Implementation of Leader clustering.
  3 | """
  4 | 
  5 | from typing import Union
  6 | 
  7 | import numpy as np
  8 | from scipy.sparse import csr_matrix
  9 | from tqdm import tqdm
 10 | 
 11 | from .mixins import FitPredictMixin, DistanceMatrixMixin
 12 | 
 13 | 
 14 | class LeaderClustering(FitPredictMixin, DistanceMatrixMixin):
 15 |     """Leader clustering algorithm.
 16 | 
 17 |     Args:
 18 |         max_radius (float): Maximum radius of cluster
 19 |             (maximum distance between leader and all other points in cluster).
 20 |         change_leaders (bool): if True then change cluster leader if there is a point with smaller
 21 |             average distance to all points in cluster.
 22 |         metric (str): Distance metric.
 23 |             For sparse_dist=True possible options are in sklearn.neighbors.VALID_METRICS['brute'].
 24 |             For sparse_dist=False possible options are 'inner_product' or one of metrics
 25 |             available in scipy.spatial.distance.pdist. If 'inner_product' then use np.inner
 26 |             which is much faster than pdist. 'inner_product' could be used instead
 27 |             of cosine distance for normalized vectors.
 28 |         precomputed_dist (bool): If True, then input should be precomputed distance matrix,
 29 |             if False then input is array with features.
 30 |         sparse_dist (bool): If True, then use distance matrix in sparse format (zero elements
 31 |             are elements for which distance between points is greater than max_distance).
 32 |             If False, then consider distance matrix as ordinary numpy array.
 33 |         deterministic (bool): If True then take points one by one to get deterministic behavior.
 34 |             If False then select points at random, so results would be different for each run.
 35 |         verbose (bool): If True then output progress info, otherwise be silent.
 36 | 
 37 |     Attributes:
 38 |         labels_ (np.ndarray): Array with cluster labels after fitting model.
 39 |         n_clusters_ (int): Number of clusters after fitting model.
 40 |         centers_ (np.ndarray): Array with indexes of cluster centers.
 41 |         leaders_ (np.ndarray): Array with 1 for cluster centers and with 0 for all other points.
 42 |     """
 43 | 
 44 |     def __init__(self, max_radius: float = 0.1, change_leaders: bool = False,
 45 |                  metric: str = 'cosine', precomputed_dist: bool = False,
 46 |                  sparse_dist: bool = True, deterministic: bool = False,
 47 |                  verbose: bool = True):
 48 | 
 49 |         self.max_radius = max_radius
 50 |         self.change_leaders = change_leaders
 51 |         self.metric = metric
 52 |         self.precomputed_dist = precomputed_dist
 53 |         self.sparse_dist = sparse_dist
 54 |         self.deterministic = deterministic
 55 |         self.verbose = verbose
 56 | 
 57 |         self.max_distance = max_radius  # is needed for computation of sparse distance matrix
 58 |         self.labels_ = None
 59 |         self.n_clusters_ = None
 60 |         self.centers_ = None
 61 |         self.leaders_ = None
 62 | 
 63 |     def fit(self, X: Union[np.ndarray, csr_matrix]):
 64 |         """Fit clustering from features or distance matrix.
 65 | 
 66 |         Args:
 67 |             X (np.ndarray or scipy.sparse.csr_matrix): Array with features or
 68 |                 precomputed distance matrix, could be in sparse matrix format.
 69 |         """
 70 | 
 71 |         dist_matrix = self._prepare_distance_matrix(X)
 72 | 
 73 |         # create arrays for labels, leaders and centers
 74 |         labels = np.empty(dist_matrix.shape[0])
 75 |         labels.fill(np.nan)
 76 |         leaders = np.zeros(dist_matrix.shape[0])
 77 |         centers = []
 78 | 
 79 |         # choose first point and assign label to it
 80 |         idx = 0 if self.deterministic else np.random.choice(range(len(labels)))
 81 |         labels[idx] = 0
 82 |         next_cluster = 1
 83 |         leaders[idx] = 1
 84 |         centers.append(idx)
 85 | 
 86 |         for _ in tqdm(range(len(labels)-1), desc='LeaderClustering fit', disable=not self.verbose):
 87 | 
 88 |             # choose next point
 89 |             indexes = np.where(np.isnan(labels))[0]
 90 |             idx = indexes[0] if self.deterministic else np.random.choice(indexes)
 91 |             # find indices of current leaders
 92 |             current_leaders_idx = np.where(leaders == 1)[0]
 93 |             current_leaders_labels = labels[current_leaders_idx]
 94 | 
 95 |             # find distances to current leaders
 96 |             leaders_dist = self._slice_distance_matrix(dist_matrix, idx, current_leaders_idx)
 97 | 
 98 |             if np.min(leaders_dist) <= self.max_radius:
 99 |                 # assign cluster with nearest leader as label
100 |                 labels[idx] = current_leaders_labels[leaders_dist.argmin()]
101 | 
102 |                 # change leader in cluster if there is better candidate for it
103 |                 if self.change_leaders:
104 |                     cluster_idx = np.where(labels == labels[idx])[0]
105 |                     dist_inside = dist_matrix[cluster_idx][:, cluster_idx].mean(axis=1)
106 |                     min_idx = cluster_idx[dist_inside.argmin()]
107 |                     nearest_leader_idx = current_leaders_idx[leaders_dist.argmin()]
108 |                     if min_idx != nearest_leader_idx:
109 |                         leaders[nearest_leader_idx] = 0
110 |                         leaders[min_idx] = 1
111 | 
112 |             else:
113 |                 # assign new cluster label
114 |                 labels[idx] = next_cluster
115 |                 leaders[idx] = 1
116 |                 centers.append(idx)
117 |                 next_cluster += 1
118 | 
119 |         self.labels_ = labels.astype(int)
120 |         self.n_clusters_ = int(labels.max() + 1)
121 |         self.centers_ = np.array(centers)
122 |         self.leaders_ = leaders.astype(int)
123 | 


--------------------------------------------------------------------------------
/diameter_clustering/qt.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Implementation of Quality threshold clustering.
  3 | """
  4 | 
  5 | from typing import Union
  6 | 
  7 | import numpy as np
  8 | from scipy.sparse import csr_matrix, lil_matrix
  9 | from tqdm import tqdm
 10 | 
 11 | from .mixins import FitPredictMixin, DistanceMatrixMixin
 12 | 
 13 | 
 14 | class QTClustering(FitPredictMixin, DistanceMatrixMixin):
 15 |     """Quality threshold clustering.
 16 | 
 17 |     Args:
 18 |         max_radius (float): Maximum radius of cluster
 19 |             (maximum distance between center of cluster and all other points).
 20 |         min_cluster_size (int): Minimum size of clusters, stop iterations at this cluster size.
 21 |         metric (str): Distance metric.
 22 |             For sparse_dist=True possible options are in sklearn.neighbors.VALID_METRICS['brute'].
 23 |             For sparse_dist=False possible options are 'inner_product' or one of metrics
 24 |             available in scipy.spatial.distance.pdist. If 'inner_product' then use np.inner
 25 |             which is much faster than pdist. 'inner_product' could be used instead
 26 |             of cosine distance for normalized vectors.
 27 |         precomputed_dist (bool): If True, then input should be precomputed distance matrix,
 28 |             if False then input is array with features.
 29 |         sparse_dist (bool): If True, then use distance matrix in sparse format (zero elements
 30 |             are elements for which distance between points is greater than max_distance).
 31 |             If False, then consider distance matrix as ordinary numpy array.
 32 |         verbose (bool): If True then output progress info, otherwise be silent.
 33 | 
 34 |     Attributes:
 35 |         labels_ (np.ndarray): Array with cluster labels after fitting model.
 36 |         n_clusters_ (int): Number of clusters after fitting model.
 37 |         centers_ (np.ndarray): Array with indexes of cluster centers.
 38 |     """
 39 | 
 40 |     def __init__(self, max_radius: float = 0.1, min_cluster_size: int = 2,
 41 |                  metric: str = 'cosine', precomputed_dist: bool = False,
 42 |                  sparse_dist: bool = True, verbose: bool = True):
 43 | 
 44 |         self.max_radius = max_radius
 45 |         self.min_cluster_size = min_cluster_size
 46 |         self.metric = metric
 47 |         self.precomputed_dist = precomputed_dist
 48 |         self.sparse_dist = sparse_dist
 49 |         self.verbose = verbose
 50 | 
 51 |         self.max_distance = max_radius  # is needed for computation of sparse distance matrix
 52 |         self.labels_ = None
 53 |         self.n_clusters_ = None
 54 |         self.centers_ = None
 55 | 
 56 |     def fit(self, X: Union[np.ndarray, csr_matrix]):
 57 |         """Fit clustering from features or distance matrix.
 58 | 
 59 |         Args:
 60 |             X (np.ndarray or scipy.sparse.csr_matrix): Array with features or
 61 |                 precomputed distance matrix, could be in sparse matrix format.
 62 |         """
 63 | 
 64 |         dist_matrix = self._prepare_distance_matrix(X)
 65 | 
 66 |         if self.sparse_dist:
 67 |             dist_mask = lil_matrix(dist_matrix)
 68 |             dist_mask[dist_mask > 0] = 1
 69 |             dist_mask.setdiag(1)
 70 |             labels, centers = self.fit_sparse(dist_mask)
 71 |         else:
 72 |             dist_mask = dist_matrix < self.max_radius
 73 |             np.fill_diagonal(dist_mask, True)
 74 |             labels, centers = self.fit_dense(dist_mask)
 75 | 
 76 |         self.labels_ = labels.astype(int)
 77 |         self.n_clusters_ = int(labels.max() + 1)
 78 |         self.centers_ = np.array(centers)
 79 | 
 80 |     def fit_dense(self, dist_mask: np.ndarray):
 81 |         """Fit clustering from distance matrix mask when it is dense matrix."""
 82 | 
 83 |         labels = np.empty(dist_mask.shape[0])
 84 |         labels.fill(np.nan)
 85 |         centers = []
 86 |         cluster_number = 0
 87 |         total_count = 0
 88 | 
 89 |         with tqdm(total=dist_mask.shape[0], disable=not self.verbose) as pbar:
 90 |             while dist_mask.any():
 91 | 
 92 |                 # find size of candidate clusters for each point
 93 |                 candidate_size = dist_mask.sum(axis=1)
 94 | 
 95 |                 if np.max(candidate_size) < self.min_cluster_size:
 96 |                     labels[np.where(np.isnan(labels))] = -1
 97 |                     break
 98 | 
 99 |                 # pick the biggest possible cluster from candidates
100 |                 center_idx = np.argmax(candidate_size)
101 |                 cluster_points_idx = np.where(dist_mask[center_idx])[0]
102 |                 # assign labels
103 |                 labels[cluster_points_idx] = cluster_number
104 |                 centers.append(center_idx)
105 |                 # remove labeled data from further calculations
106 |                 dist_mask[cluster_points_idx, :] = False
107 |                 dist_mask[:, cluster_points_idx] = False
108 | 
109 |                 # finalize iteration
110 |                 cluster_number += 1
111 |                 size = np.max(candidate_size)
112 |                 total_count += size
113 | 
114 |                 pbar.update(size)
115 |                 pbar.set_description(
116 |                     f"QTClustering fit. Current cluster size {size}, total count {total_count}")
117 | 
118 |         return labels, centers
119 | 
120 |     def fit_sparse(self, dist_mask: csr_matrix):
121 |         """Fit clustering from distance matrix mask when it is sparse matrix."""
122 | 
123 |         labels = np.empty(dist_mask.shape[0])
124 |         labels.fill(np.nan)
125 |         centers = []
126 |         cluster_number = 0
127 |         total_count = 0
128 | 
129 |         with tqdm(total=dist_mask.shape[0], disable=not self.verbose) as pbar:
130 |             while dist_mask.sum() > 0:
131 | 
132 |                 # find size of candidate clusters for each point
133 |                 candidate_size = dist_mask.sum(axis=1)
134 | 
135 |                 if np.max(candidate_size) < self.min_cluster_size:
136 |                     labels[np.where(np.isnan(labels))] = -1
137 |                     break
138 | 
139 |                 # pick the biggest possible cluster from candidates
140 |                 center_idx = np.argmax(candidate_size)
141 |                 cluster_points_idx = dist_mask[center_idx].nonzero()[1]
142 |                 # assign labels
143 |                 labels[cluster_points_idx] = cluster_number
144 |                 centers.append(center_idx)
145 | 
146 |                 # remove labeled data from further calculations
147 |                 dist_mask[cluster_points_idx, :] = 0
148 |                 dist_mask[:, cluster_points_idx] = 0
149 | 
150 |                 # finalize iteration
151 |                 cluster_number += 1
152 |                 size = np.max(candidate_size)
153 |                 total_count += size
154 | 
155 |                 pbar.update(size)
156 |                 pbar.set_description(
157 |                     f"QTClustering fit. Current cluster size {size}, total count {total_count}")
158 | 
159 |         return labels, centers
160 | 


--------------------------------------------------------------------------------
/diameter_clustering/approx/leader.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Version of Leader clustering using approximate nearest neighbors search.
  3 | """
  4 | 
  5 | import numpy as np
  6 | from tqdm import tqdm
  7 | 
  8 | from .hnsw import HNSWIndex
  9 | from ..timer import timer
 10 | 
 11 | 
 12 | class ApproxLeaderClustering:
 13 |     """Leader clustering algorithm with approximate nearest neighbors search.
 14 | 
 15 |     Approximate nearest neighbors index is used to store leaders of clusters
 16 |     and to find nearest leader for new points.
 17 | 
 18 |     Args:
 19 |         ann_index: instance of HNSWIndex.
 20 |         max_radius: Maximum radius of each cluster
 21 |             (maximum distance between the leader and all other points in cluster).
 22 |         deterministic: If True then take points one by one to get deterministic behavior.
 23 |             If False then select points at random, so results would be different for each run.
 24 |         verbose: If True then output progress info, otherwise be silent.
 25 | 
 26 |     Attributes:
 27 |         labels_ (np.ndarray): Array with cluster labels after fitting model.
 28 |         n_clusters_ (int): Number of clusters after fitting model.
 29 |         centers_ (np.ndarray): Array with indexes of cluster centers.
 30 |         leaders_ (np.ndarray): Array with 1 for cluster centers and with 0 for all other points.
 31 | 
 32 |     Examples:
 33 |         import numpy as np
 34 |         from diameter_clustering.approx import HNSWIndex
 35 |         from diameter_clustering.approx import ApproxLeaderClustering
 36 | 
 37 |         # fit model
 38 |         data = np.random.rand(1000, 50)
 39 |         hnsw_index = HNSWIndex(max_elements=len(data), space='cosine', dim=50,
 40 |                                ef=100, ef_construction=200, M=16)
 41 |         model = ApproxLeaderClustering(hnsw_index, max_radius=0.2, deterministic=True)
 42 |         labels = model.fit_predict(data)
 43 | 
 44 |         # save index for later usage
 45 |         hnsw_index.save('hnsw_index.bin')
 46 | 
 47 |         # predict clusters for new data later
 48 |         new_data = np.random.rand(100, 50)
 49 |         hnsw_index = HNSWIndex(max_elements=len(new_data), path='hnsw_index.bin',
 50 |                                space='cosine', dim=50, ef=100)
 51 |         model = ApproxLeaderClustering(hnsw_index, max_radius=0.2, deterministic=True)
 52 |         new_labels = model.predict(new_data)
 53 |     """
 54 | 
 55 |     def __init__(self, ann_index: HNSWIndex, max_radius: float = 0.1,
 56 |                  deterministic: bool = True, verbose: bool = True):
 57 | 
 58 |         self.ann_index = ann_index
 59 |         self.max_radius = max_radius
 60 |         self.deterministic = deterministic
 61 |         self.verbose = verbose
 62 | 
 63 |         self.labels_ = None
 64 |         self.n_clusters_ = None
 65 |         self.centers_ = None
 66 |         self.leaders_ = None
 67 | 
 68 |     def fit(self, X: np.ndarray):
 69 |         """Fit clustering.
 70 | 
 71 |         Args:
 72 |             X: Array with features.
 73 |         """
 74 | 
 75 |         # create arrays for labels and leaders
 76 |         labels = np.empty(len(X))
 77 |         labels.fill(np.nan)
 78 |         leaders = np.zeros(len(X))
 79 |         centers = []
 80 | 
 81 |         # handle case when empty input data is passed
 82 |         if len(labels) == 0:
 83 |             self.labels_ = labels
 84 |             self.leaders_ = leaders
 85 |             self.n_clusters_ = 0
 86 |             return
 87 | 
 88 |         # choose first point and assign label to it
 89 |         idx = 0 if self.deterministic else np.random.choice(range(len(labels)))
 90 |         labels[idx] = 0
 91 |         next_cluster = 1
 92 |         leaders[idx] = 1
 93 |         centers.append(idx)
 94 |         self.ann_index.add_item(X[idx])
 95 | 
 96 |         for _ in tqdm(range(len(labels)-1), desc='ApproxLeaderClustering fit',
 97 |                       disable=not self.verbose):
 98 | 
 99 |             # choose next point
100 |             indexes = np.where(np.isnan(labels))[0]
101 |             idx = indexes[0] if self.deterministic else np.random.choice(indexes)
102 | 
103 |             # find nearest leader
104 |             nearest_leader_idx, nearest_leader_dist = self.ann_index.find_nearest_point(X[idx])
105 | 
106 |             if nearest_leader_dist <= self.max_radius:
107 |                 # assign cluster with nearest leader as label
108 |                 labels[idx] = nearest_leader_idx
109 |             else:
110 |                 # assign new cluster label
111 |                 labels[idx] = next_cluster
112 |                 leaders[idx] = 1
113 |                 centers.append(idx)
114 |                 next_cluster += 1
115 |                 self.ann_index.add_item(X[idx])
116 | 
117 |         self.labels_ = labels.astype(int)
118 |         self.n_clusters_ = int(labels.max() + 1)
119 |         self.centers_ = np.array(centers)
120 |         self.leaders_ = leaders.astype(int)
121 | 
122 |     def fit_predict(self, X: np.ndarray) -> np.ndarray:
123 |         """Fit clustering and return cluster labels.
124 | 
125 |         Args:
126 |             X: Array with features.
127 | 
128 |         Returns:
129 |             Numpy array with labels for data points in X.
130 |         """
131 | 
132 |         self.fit(X)
133 | 
134 |         return self.labels_
135 | 
136 |     def predict(self, X: np.ndarray) -> np.ndarray:
137 |         """Assigning new points to existent clusters without making new clusters.
138 | 
139 |         Returning -1 for points which can't be assigned to any cluster.
140 |         Finding nearest leaders for points one by one.
141 | 
142 |         Args:
143 |             X: Array with features for new points.
144 | 
145 |         Returns:
146 |             Numpy array with labels for data points in X.
147 |         """
148 | 
149 |         # create array for new labels
150 |         labels = np.empty(len(X))
151 | 
152 |         for idx in tqdm(range(len(X)), desc='ApproxLeaderClustering assign points to clusters',
153 |                         disable=not self.verbose):
154 | 
155 |             # find nearest leader
156 |             nearest_leader_idx, nearest_leader_dist = self.ann_index.find_nearest_point(X[idx])
157 | 
158 |             if nearest_leader_dist <= self.max_radius:
159 |                 # assign cluster with nearest leader as label
160 |                 labels[idx] = nearest_leader_idx
161 |             else:
162 |                 # assign -1 for point which is not close enough to any leader
163 |                 labels[idx] = -1
164 | 
165 |         return labels
166 | 
167 | 
168 |     def predict_batch(self, X: np.ndarray) -> np.ndarray:
169 |         """Assigning new points to existent clusters without making new clusters.
170 | 
171 |         Returning -1 for points which can't be assigned to any cluster.
172 |         Finding nearest leaders for all points at once.
173 | 
174 |         Args:
175 |             X: Array with features for new points.
176 | 
177 |         Returns:
178 |             Numpy array with labels for data points in X.
179 |         """
180 | 
181 |         # create array for new labels
182 |         labels = np.empty(len(X))
183 | 
184 |         # find nearest leaders
185 |         with timer('find_nearest_point_batch', disable=not self.verbose):
186 |             nearest_leaders_idx, nearest_leaders_dist = \
187 |                 self.ann_index.find_nearest_point_batch(X)
188 | 
189 |         for idx in range(len(X)):
190 | 
191 |             if nearest_leaders_dist[idx] <= self.max_radius:
192 |                 # assign cluster with nearest leader as label
193 |                 labels[idx] = nearest_leaders_idx[idx]
194 |             else:
195 |                 # assign -1 for point which is not close enough to any leader
196 |                 labels[idx] = -1
197 | 
198 |         return labels
199 | 


--------------------------------------------------------------------------------
/diameter_clustering/diameter.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Simple greedy algorithm for clustering with maximum distance between points inside clusters.
  3 | """
  4 | 
  5 | from typing import Union
  6 | 
  7 | import numpy as np
  8 | import numpy_groupies as npg
  9 | from scipy.sparse import csr_matrix
 10 | from tqdm import tqdm
 11 | 
 12 | from .mixins import FitPredictMixin, DistanceMatrixMixin
 13 | from .timer import TimerWithHistory
 14 | 
 15 | 
 16 | class MaxDiameterClustering(FitPredictMixin, DistanceMatrixMixin):
 17 |     """Clustering with maximum diameter (maximum distance between points) inside clusters.
 18 | 
 19 |     Args:
 20 |         max_distance (float): Maximum distance between points in clusters.
 21 |         criterion (str): Criterion for choosing cluster from several candidates.
 22 |             If 'distance' then choose cluster with minimum average distance to given point.
 23 |             If 'size' then choose cluster with maximum current size.
 24 |         metric (str): Distance metric.
 25 |             For sparse_dist=True possible options are in sklearn.neighbors.VALID_METRICS['brute'].
 26 |             For sparse_dist=False possible options are 'inner_product' or one of metrics
 27 |             available in scipy.spatial.distance.pdist. If 'inner_product' then use np.inner
 28 |             which is much faster than pdist. 'inner_product' could be used instead
 29 |             of cosine distance for normalized vectors.
 30 |         precomputed_dist (bool): If True, then input should be precomputed distance matrix,
 31 |             if False then input is array with features.
 32 |         sparse_dist (bool): If True, then use distance matrix in sparse format (zero elements
 33 |             are elements for which distance between points is greater than max_distance).
 34 |             If False, then distance matrix is ordinary numpy array.
 35 |         deterministic (bool): If True then take points one by one to get determenistic behavior.
 36 |             If False then select points at random, so results would be different for each run.
 37 |         use_timer (bool): If True then use TimerWithHistory in fit method, which can be accessed
 38 |             via self.timer. Can be useful for debugging.
 39 |         verbose (bool): If True then output progress info, otherwise be silent.
 40 | 
 41 |     Attributes:
 42 |         labels_ (np.array): Array with cluster labels after fitting model.
 43 |         n_clusters_ (int): Number of clusters after fitting model.
 44 |         timer: Timer with history of execution time (access history via self.timer.history).
 45 |     """
 46 | 
 47 |     def __init__(self, max_distance: float = 0.2, criterion: str = 'distance',
 48 |                  metric: str = 'cosine', precomputed_dist: bool = False,
 49 |                  sparse_dist: bool = True, deterministic: bool = False,
 50 |                  use_timer: bool = False, verbose: bool = True):
 51 | 
 52 |         if criterion not in ['size', 'distance']:
 53 |             raise ValueError('Wrong criterion value, should be "size" or "distance".')
 54 | 
 55 |         self.max_distance = max_distance
 56 |         self.criterion = criterion
 57 |         self.metric = metric
 58 |         self.precomputed_dist = precomputed_dist
 59 |         self.sparse_dist = sparse_dist
 60 |         self.deterministic = deterministic
 61 |         self.use_timer = use_timer
 62 |         self.verbose = verbose
 63 | 
 64 |         self.labels_ = None
 65 |         self.n_clusters_ = None
 66 |         self.timer = None
 67 | 
 68 |     def fit(self, X: Union[np.ndarray, csr_matrix]):
 69 |         """Fit clustering from features or distance matrix.
 70 | 
 71 |         Args:
 72 |             X (np.ndarray or scipy.sparse.csr_matrix): Array with features or
 73 |                 precomputed distance matrix, could be in sparse format.
 74 |         """
 75 | 
 76 |         dist_matrix = self._prepare_distance_matrix(X)
 77 | 
 78 |         # create array for labels
 79 |         labels = np.empty(dist_matrix.shape[0])
 80 |         labels.fill(np.nan)
 81 | 
 82 |         # handle case when empty input data is passed
 83 |         if len(labels) == 0:
 84 |             self.labels_ = labels
 85 |             self.n_clusters_ = 0
 86 |             return
 87 | 
 88 |         # choose first point and assign label to it
 89 |         idx = 0 if self.deterministic else np.random.choice(range(len(labels)))
 90 |         labels[idx] = 0
 91 |         next_cluster = 1
 92 | 
 93 |         self.timer = TimerWithHistory(disable=not self.use_timer)
 94 | 
 95 |         for _ in tqdm(range(len(labels)-1), desc='MaxDiameterClustering fit',
 96 |                       disable=not self.verbose):
 97 | 
 98 |             # choose next point
 99 |             with self.timer(name='choose_next_point'):
100 |                 indexes = np.where(np.isnan(labels))[0]
101 |                 idx = indexes[0] if self.deterministic else np.random.choice(indexes)
102 |             # find indices of already labeled points
103 |             with self.timer(name='find_labeled_points'):
104 |                 current_cluster_idx = np.where(~np.isnan(labels))[0]
105 |                 current_cluster_labels = labels[current_cluster_idx].astype(int)
106 |             # find distances to already labeled points
107 |             with self.timer(name='get_distances'):
108 |                 current_dist = self._slice_distance_matrix(dist_matrix, idx, current_cluster_idx)
109 | 
110 |             # find max distance to each existent cluster
111 |             with self.timer(name='max_distance_to_clusters'):
112 |                 cluster_dist_max = npg.aggregate(current_cluster_labels, current_dist,
113 |                                                  func='max', fill_value=np.inf)
114 | 
115 |             if np.min(cluster_dist_max) <= self.max_distance:
116 |                 # find existent clusters with max dist < threshold
117 |                 with self.timer(name='candidate_clusters'):
118 |                     candidate_clusters = np.where(cluster_dist_max <= self.max_distance)[0]
119 |                     # directly get label if there is only one such cluster
120 |                     if len(candidate_clusters) == 1:
121 |                         labels[idx] = candidate_clusters[0]
122 |                         continue
123 |                     # otherwise we need to choose between candidate clusters
124 |                     candidate_clusters_idx = np.isin(current_cluster_labels, candidate_clusters)
125 |                     candidate_clusters_labels = current_cluster_labels[candidate_clusters_idx]
126 | 
127 |                 if self.criterion == 'distance':
128 |                     candidate_clusters_dist = current_dist[candidate_clusters_idx]
129 |                     labels[idx] = self._best_candidate_distance(candidate_clusters_labels,
130 |                                                                 candidate_clusters_dist)
131 |                 elif self.criterion == 'size':
132 |                     labels[idx] = self._best_candidate_size(candidate_clusters_labels)
133 |             else:
134 |                 # assign new cluster label
135 |                 with self.timer(name='assign_new_label'):
136 |                     labels[idx] = next_cluster
137 |                     next_cluster += 1
138 | 
139 |         self.labels_ = labels.astype(int)
140 |         self.n_clusters_ = labels.max() + 1
141 | 
142 |     def _best_candidate_distance(self, candidate_clusters_labels: np.ndarray,
143 |                                  candidate_clusters_dist: np.ndarray) -> int:
144 |         """Find best candidate cluster based on average distance to clusters."""
145 | 
146 |         # find average distance to clusters
147 |         with self.timer(name='average_distance_to_clusters'):
148 |             cluster_dist_mean = npg.aggregate(candidate_clusters_labels,
149 |                                               candidate_clusters_dist,
150 |                                               func='mean', fill_value=np.inf)
151 | 
152 |         # assign cluster with min average distance as label
153 |         with self.timer(name='distance_argmin'):
154 |             label = cluster_dist_mean.argmin()
155 | 
156 |         return label
157 | 
158 |     def _best_candidate_size(self, candidate_clusters_labels: np.ndarray) -> int:
159 |         """Find best candidate cluster based on size of clusters."""
160 | 
161 |         # find size of clusters
162 |         with self.timer(name='average_size_of_clusters'):
163 |             cluster_size = npg.aggregate(candidate_clusters_labels,
164 |                                          candidate_clusters_labels,
165 |                                          func='count')
166 |         # assign cluster with max size as label
167 |         with self.timer(name='size_argmax'):
168 |             label = cluster_size.argmax()
169 | 
170 |         return label
171 | 


--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
  1 | [MASTER]
  2 | 
  3 | # A comma-separated list of package or module names from where C extensions may
  4 | # be loaded. Extensions are loaded into the active Python interpreter and may
  5 | # run arbitrary code.
  6 | extension-pkg-whitelist=
  7 | 
  8 | # Specify a score threshold to be exceeded before program exits with error.
  9 | fail-under=10
 10 | 
 11 | # Add files or directories to the blacklist. They should be base names, not
 12 | # paths.
 13 | ignore=CVS
 14 | 
 15 | # Add files or directories matching the regex patterns to the blacklist. The
 16 | # regex matches against base names, not paths.
 17 | ignore-patterns=
 18 | 
 19 | # Python code to execute, usually for sys.path manipulation such as
 20 | # pygtk.require().
 21 | #init-hook=
 22 | 
 23 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
 24 | # number of processors available to use.
 25 | jobs=1
 26 | 
 27 | # Control the amount of potential inferred values when inferring a single
 28 | # object. This can help the performance when dealing with large functions or
 29 | # complex, nested conditions.
 30 | limit-inference-results=100
 31 | 
 32 | # List of plugins (as comma separated values of python module names) to load,
 33 | # usually to register additional checkers.
 34 | load-plugins=
 35 | 
 36 | # Pickle collected data for later comparisons.
 37 | persistent=yes
 38 | 
 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit
 40 | # user-friendly hints instead of false-positive error messages.
 41 | suggestion-mode=yes
 42 | 
 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the
 44 | # active Python interpreter and may run arbitrary code.
 45 | unsafe-load-any-extension=no
 46 | 
 47 | 
 48 | [MESSAGES CONTROL]
 49 | 
 50 | # Only show warnings with the listed confidence levels. Leave empty to show
 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
 52 | confidence=
 53 | 
 54 | # Disable the message, report, category or checker with the given id(s). You
 55 | # can either give multiple identifiers separated by comma (,) or put this
 56 | # option multiple times (only on the command line, not in the configuration
 57 | # file where it should appear only once). You can also use "--disable=all" to
 58 | # disable everything first and then reenable specific checks. For example, if
 59 | # you want to run only the similarities checker, you can use "--disable=all
 60 | # --enable=similarities". If you want to run only the classes checker, but have
 61 | # no Warning level messages displayed, use "--disable=all --enable=classes
 62 | # --disable=W".
 63 | disable=print-statement,
 64 |         parameter-unpacking,
 65 |         unpacking-in-except,
 66 |         old-raise-syntax,
 67 |         backtick,
 68 |         long-suffix,
 69 |         old-ne-operator,
 70 |         old-octal-literal,
 71 |         import-star-module-level,
 72 |         non-ascii-bytes-literal,
 73 |         raw-checker-failed,
 74 |         bad-inline-option,
 75 |         locally-disabled,
 76 |         file-ignored,
 77 |         suppressed-message,
 78 |         useless-suppression,
 79 |         deprecated-pragma,
 80 |         use-symbolic-message-instead,
 81 |         apply-builtin,
 82 |         basestring-builtin,
 83 |         buffer-builtin,
 84 |         cmp-builtin,
 85 |         coerce-builtin,
 86 |         execfile-builtin,
 87 |         file-builtin,
 88 |         long-builtin,
 89 |         raw_input-builtin,
 90 |         reduce-builtin,
 91 |         standarderror-builtin,
 92 |         unicode-builtin,
 93 |         xrange-builtin,
 94 |         coerce-method,
 95 |         delslice-method,
 96 |         getslice-method,
 97 |         setslice-method,
 98 |         no-absolute-import,
 99 |         old-division,
100 |         dict-iter-method,
101 |         dict-view-method,
102 |         next-method-called,
103 |         metaclass-assignment,
104 |         indexing-exception,
105 |         raising-string,
106 |         reload-builtin,
107 |         oct-method,
108 |         hex-method,
109 |         nonzero-method,
110 |         cmp-method,
111 |         input-builtin,
112 |         round-builtin,
113 |         intern-builtin,
114 |         unichr-builtin,
115 |         map-builtin-not-iterating,
116 |         zip-builtin-not-iterating,
117 |         range-builtin-not-iterating,
118 |         filter-builtin-not-iterating,
119 |         using-cmp-argument,
120 |         eq-without-hash,
121 |         div-method,
122 |         idiv-method,
123 |         rdiv-method,
124 |         exception-message-attribute,
125 |         invalid-str-codec,
126 |         sys-max-int,
127 |         bad-python3-import,
128 |         deprecated-string-function,
129 |         deprecated-str-translate-call,
130 |         deprecated-itertools-function,
131 |         deprecated-types-field,
132 |         next-method-defined,
133 |         dict-items-not-iterating,
134 |         dict-keys-not-iterating,
135 |         dict-values-not-iterating,
136 |         deprecated-operator-function,
137 |         deprecated-urllib-function,
138 |         xreadlines-attribute,
139 |         deprecated-sys-function,
140 |         exception-escape,
141 |         comprehension-escape,
142 |         too-many-instance-attributes,  # start of user defined messages
143 |         too-many-arguments,
144 |         redefined-outer-name
145 | 
146 | 
147 | # Enable the message, report, category or checker with the given id(s). You can
148 | # either give multiple identifiers separated by comma (,) or put this option
149 | # multiple times (only on the command line, not in the configuration file where
150 | # it should appear only once). See also the "--disable" option for examples.
151 | enable=c-extension-no-member
152 | 
153 | 
154 | [REPORTS]
155 | 
156 | # Python expression which should return a score less than or equal to 10. You
157 | # have access to the variables 'error', 'warning', 'refactor', and 'convention'
158 | # which contain the number of messages in each category, as well as 'statement'
159 | # which is the total number of statements analyzed. This score is used by the
160 | # global evaluation report (RP0004).
161 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
162 | 
163 | # Template used to display messages. This is a python new-style format string
164 | # used to format the message information. See doc for all details.
165 | #msg-template=
166 | 
167 | # Set the output format. Available formats are text, parseable, colorized, json
168 | # and msvs (visual studio). You can also give a reporter class, e.g.
169 | # mypackage.mymodule.MyReporterClass.
170 | output-format=text
171 | 
172 | # Tells whether to display a full report or only the messages.
173 | reports=no
174 | 
175 | # Activate the evaluation score.
176 | score=yes
177 | 
178 | 
179 | [REFACTORING]
180 | 
181 | # Maximum number of nested blocks for function / method body
182 | max-nested-blocks=5
183 | 
184 | # Complete names of functions that never return. When checking for
185 | # inconsistent-return-statements if a never returning function is called then
186 | # it will be considered as an explicit return statement and no message will be
187 | # printed.
188 | never-returning-functions=sys.exit
189 | 
190 | 
191 | [LOGGING]
192 | 
193 | # The type of string formatting that logging methods do. `old` means using %
194 | # formatting, `new` is for `{}` formatting.
195 | logging-format-style=old
196 | 
197 | # Logging modules to check that the string format arguments are in logging
198 | # function parameter format.
199 | logging-modules=logging
200 | 
201 | 
202 | [SPELLING]
203 | 
204 | # Limits count of emitted suggestions for spelling mistakes.
205 | max-spelling-suggestions=4
206 | 
207 | # Spelling dictionary name. Available dictionaries: none. To make it work,
208 | # install the python-enchant package.
209 | spelling-dict=
210 | 
211 | # List of comma separated words that should not be checked.
212 | spelling-ignore-words=
213 | 
214 | # A path to a file that contains the private dictionary; one word per line.
215 | spelling-private-dict-file=
216 | 
217 | # Tells whether to store unknown words to the private dictionary (see the
218 | # --spelling-private-dict-file option) instead of raising a message.
219 | spelling-store-unknown-words=no
220 | 
221 | 
222 | [MISCELLANEOUS]
223 | 
224 | # List of note tags to take in consideration, separated by a comma.
225 | notes=FIXME,
226 |       XXX,
227 |       TODO
228 | 
229 | # Regular expression of note tags to take in consideration.
230 | #notes-rgx=
231 | 
232 | 
233 | [TYPECHECK]
234 | 
235 | # List of decorators that produce context managers, such as
236 | # contextlib.contextmanager. Add to this list to register other decorators that
237 | # produce valid context managers.
238 | contextmanager-decorators=contextlib.contextmanager
239 | 
240 | # List of members which are set dynamically and missed by pylint inference
241 | # system, and so shouldn't trigger E1101 when accessed. Python regular
242 | # expressions are accepted.
243 | generated-members=
244 | 
245 | # Tells whether missing members accessed in mixin class should be ignored. A
246 | # mixin class is detected if its name ends with "mixin" (case insensitive).
247 | ignore-mixin-members=yes
248 | 
249 | # Tells whether to warn about missing members when the owner of the attribute
250 | # is inferred to be None.
251 | ignore-none=yes
252 | 
253 | # This flag controls whether pylint should warn about no-member and similar
254 | # checks whenever an opaque object is returned when inferring. The inference
255 | # can return multiple potential results while evaluating a Python object, but
256 | # some branches might not be evaluated, which results in partial inference. In
257 | # that case, it might be useful to still emit no-member and other checks for
258 | # the rest of the inferred objects.
259 | ignore-on-opaque-inference=yes
260 | 
261 | # List of class names for which member attributes should not be checked (useful
262 | # for classes with dynamically set attributes). This supports the use of
263 | # qualified names.
264 | ignored-classes=optparse.Values,thread._local,_thread._local
265 | 
266 | # List of module names for which member attributes should not be checked
267 | # (useful for modules/projects where namespaces are manipulated during runtime
268 | # and thus existing member attributes cannot be deduced by static analysis). It
269 | # supports qualified module names, as well as Unix pattern matching.
270 | ignored-modules=
271 | 
272 | # Show a hint with possible names when a member name was not found. The aspect
273 | # of finding the hint is based on edit distance.
274 | missing-member-hint=yes
275 | 
276 | # The minimum edit distance a name should have in order to be considered a
277 | # similar match for a missing member name.
278 | missing-member-hint-distance=1
279 | 
280 | # The total number of similar names that should be taken in consideration when
281 | # showing a hint for a missing member.
282 | missing-member-max-choices=1
283 | 
284 | # List of decorators that change the signature of a decorated function.
285 | signature-mutators=
286 | 
287 | 
288 | [VARIABLES]
289 | 
290 | # List of additional names supposed to be defined in builtins. Remember that
291 | # you should avoid defining new builtins when possible.
292 | additional-builtins=
293 | 
294 | # Tells whether unused global variables should be treated as a violation.
295 | allow-global-unused-variables=yes
296 | 
297 | # List of strings which can identify a callback function by name. A callback
298 | # name must start or end with one of those strings.
299 | callbacks=cb_,
300 |           _cb
301 | 
302 | # A regular expression matching the name of dummy variables (i.e. expected to
303 | # not be used).
304 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
305 | 
306 | # Argument names that match this expression will be ignored. Default to name
307 | # with leading underscore.
308 | ignored-argument-names=_.*|^ignored_|^unused_
309 | 
310 | # Tells whether we should check for unused import in __init__ files.
311 | init-import=no
312 | 
313 | # List of qualified module names which can have objects that can redefine
314 | # builtins.
315 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
316 | 
317 | 
318 | [FORMAT]
319 | 
320 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
321 | expected-line-ending-format=
322 | 
323 | # Regexp for a line that is allowed to be longer than the limit.
324 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
325 | 
326 | # Number of spaces of indent required inside a hanging or continued line.
327 | indent-after-paren=4
328 | 
329 | # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
330 | # tab).
331 | indent-string='    '
332 | 
333 | # Maximum number of characters on a single line.
334 | max-line-length=100
335 | 
336 | # Maximum number of lines in a module.
337 | max-module-lines=1000
338 | 
339 | # List of optional constructs for which whitespace checking is disabled. `dict-
340 | # separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
341 | # `trailing-comma` allows a space between comma and closing bracket: (a, ).
342 | # `empty-line` allows space-only lines.
343 | no-space-check=trailing-comma,
344 |                dict-separator
345 | 
346 | # Allow the body of a class to be on the same line as the declaration if body
347 | # contains single statement.
348 | single-line-class-stmt=no
349 | 
350 | # Allow the body of an if to be on the same line as the test if there is no
351 | # else.
352 | single-line-if-stmt=no
353 | 
354 | 
355 | [SIMILARITIES]
356 | 
357 | # Ignore comments when computing similarities.
358 | ignore-comments=yes
359 | 
360 | # Ignore docstrings when computing similarities.
361 | ignore-docstrings=yes
362 | 
363 | # Ignore imports when computing similarities.
364 | ignore-imports=no
365 | 
366 | # Minimum number of lines of a similarity.
367 | min-similarity-lines=4
368 | 
369 | 
370 | [BASIC]
371 | 
372 | # Naming style matching correct argument names.
373 | argument-naming-style=snake_case
374 | 
375 | # Regular expression matching correct argument names. Overrides argument-
376 | # naming-style.
377 | #argument-rgx=
378 | 
379 | # Naming style matching correct attribute names.
380 | attr-naming-style=snake_case
381 | 
382 | # Regular expression matching correct attribute names. Overrides attr-naming-
383 | # style.
384 | #attr-rgx=
385 | 
386 | # Bad variable names which should always be refused, separated by a comma.
387 | bad-names=foo,
388 |           bar,
389 |           baz,
390 |           toto,
391 |           tutu,
392 |           tata
393 | 
394 | # Bad variable names regexes, separated by a comma. If names match any regex,
395 | # they will always be refused
396 | bad-names-rgxs=
397 | 
398 | # Naming style matching correct class attribute names.
399 | class-attribute-naming-style=any
400 | 
401 | # Regular expression matching correct class attribute names. Overrides class-
402 | # attribute-naming-style.
403 | #class-attribute-rgx=
404 | 
405 | # Naming style matching correct class names.
406 | class-naming-style=PascalCase
407 | 
408 | # Regular expression matching correct class names. Overrides class-naming-
409 | # style.
410 | #class-rgx=
411 | 
412 | # Naming style matching correct constant names.
413 | const-naming-style=UPPER_CASE
414 | 
415 | # Regular expression matching correct constant names. Overrides const-naming-
416 | # style.
417 | #const-rgx=
418 | 
419 | # Minimum line length for functions/classes that require docstrings, shorter
420 | # ones are exempt.
421 | docstring-min-length=-1
422 | 
423 | # Naming style matching correct function names.
424 | function-naming-style=snake_case
425 | 
426 | # Regular expression matching correct function names. Overrides function-
427 | # naming-style.
428 | #function-rgx=
429 | 
430 | # Good variable names which should always be accepted, separated by a comma.
431 | good-names=i,
432 |            j,
433 |            k,
434 |            ex,
435 |            Run,
436 |            _,
437 |            df,
438 |            X,
439 |            y,
440 | 
441 | # Good variable names regexes, separated by a comma. If names match any regex,
442 | # they will always be accepted
443 | good-names-rgxs=
444 | 
445 | # Include a hint for the correct naming format with invalid-name.
446 | include-naming-hint=no
447 | 
448 | # Naming style matching correct inline iteration names.
449 | inlinevar-naming-style=any
450 | 
451 | # Regular expression matching correct inline iteration names. Overrides
452 | # inlinevar-naming-style.
453 | #inlinevar-rgx=
454 | 
455 | # Naming style matching correct method names.
456 | method-naming-style=snake_case
457 | 
458 | # Regular expression matching correct method names. Overrides method-naming-
459 | # style.
460 | #method-rgx=
461 | 
462 | # Naming style matching correct module names.
463 | module-naming-style=snake_case
464 | 
465 | # Regular expression matching correct module names. Overrides module-naming-
466 | # style.
467 | #module-rgx=
468 | 
469 | # Colon-delimited sets of names that determine each other's naming style when
470 | # the name regexes allow several styles.
471 | name-group=
472 | 
473 | # Regular expression which should only match function or class names that do
474 | # not require a docstring.
475 | no-docstring-rgx=^_
476 | 
477 | # List of decorators that produce properties, such as abc.abstractproperty. Add
478 | # to this list to register other decorators that produce valid properties.
479 | # These decorators are taken in consideration only for invalid-name.
480 | property-classes=abc.abstractproperty
481 | 
482 | # Naming style matching correct variable names.
483 | variable-naming-style=snake_case
484 | 
485 | # Regular expression matching correct variable names. Overrides variable-
486 | # naming-style.
487 | #variable-rgx=
488 | 
489 | 
490 | [STRING]
491 | 
492 | # This flag controls whether inconsistent-quotes generates a warning when the
493 | # character used as a quote delimiter is used inconsistently within a module.
494 | check-quote-consistency=no
495 | 
496 | # This flag controls whether the implicit-str-concat should generate a warning
497 | # on implicit string concatenation in sequences defined over several lines.
498 | check-str-concat-over-line-jumps=no
499 | 
500 | 
501 | [IMPORTS]
502 | 
503 | # List of modules that can be imported at any level, not just the top level
504 | # one.
505 | allow-any-import-level=
506 | 
507 | # Allow wildcard imports from modules that define __all__.
508 | allow-wildcard-with-all=no
509 | 
510 | # Analyse import fallback blocks. This can be used to support both Python 2 and
511 | # 3 compatible code, which means that the block might have code that exists
512 | # only in one or another interpreter, leading to false positives when analysed.
513 | analyse-fallback-blocks=no
514 | 
515 | # Deprecated modules which should not be used, separated by a comma.
516 | deprecated-modules=optparse,tkinter.tix
517 | 
518 | # Create a graph of external dependencies in the given file (report RP0402 must
519 | # not be disabled).
520 | ext-import-graph=
521 | 
522 | # Create a graph of every (i.e. internal and external) dependencies in the
523 | # given file (report RP0402 must not be disabled).
524 | import-graph=
525 | 
526 | # Create a graph of internal dependencies in the given file (report RP0402 must
527 | # not be disabled).
528 | int-import-graph=
529 | 
530 | # Force import order to recognize a module as part of the standard
531 | # compatibility libraries.
532 | known-standard-library=
533 | 
534 | # Force import order to recognize a module as part of a third party library.
535 | known-third-party=enchant
536 | 
537 | # Couples of modules and preferred modules, separated by a comma.
538 | preferred-modules=
539 | 
540 | 
541 | [CLASSES]
542 | 
543 | # List of method names used to declare (i.e. assign) instance attributes.
544 | defining-attr-methods=__init__,
545 |                       __new__,
546 |                       setUp,
547 |                       __post_init__
548 | 
549 | # List of member names, which should be excluded from the protected access
550 | # warning.
551 | exclude-protected=_asdict,
552 |                   _fields,
553 |                   _replace,
554 |                   _source,
555 |                   _make
556 | 
557 | # List of valid names for the first argument in a class method.
558 | valid-classmethod-first-arg=cls
559 | 
560 | # List of valid names for the first argument in a metaclass class method.
561 | valid-metaclass-classmethod-first-arg=cls
562 | 
563 | 
564 | [DESIGN]
565 | 
566 | # Maximum number of arguments for function / method.
567 | max-args=5
568 | 
569 | # Maximum number of attributes for a class (see R0902).
570 | max-attributes=7
571 | 
572 | # Maximum number of boolean expressions in an if statement (see R0916).
573 | max-bool-expr=5
574 | 
575 | # Maximum number of branches for function / method body.
576 | max-branches=12
577 | 
578 | # Maximum number of locals for function / method body.
579 | max-locals=15
580 | 
581 | # Maximum number of parents for a class (see R0901).
582 | max-parents=7
583 | 
584 | # Maximum number of public methods for a class (see R0904).
585 | max-public-methods=20
586 | 
587 | # Maximum number of return / yield for function / method body.
588 | max-returns=6
589 | 
590 | # Maximum number of statements in function / method body.
591 | max-statements=50
592 | 
593 | # Minimum number of public methods for a class (see R0903).
594 | min-public-methods=2
595 | 
596 | 
597 | [EXCEPTIONS]
598 | 
599 | # Exceptions that will emit a warning when being caught. Defaults to
600 | # "BaseException, Exception".
601 | overgeneral-exceptions=BaseException,
602 |                        Exception
603 | 


--------------------------------------------------------------------------------