├── size_constrained_clustering ├── sklearn_import │ ├── cluster │ │ ├── __init__.py │ │ ├── _k_means.pyx │ │ └── k_means_.py │ ├── metrics │ │ ├── __init__.py │ │ ├── pairwise_fast.pyx │ │ └── pairwise.py │ ├── externals │ │ └── __init__.py │ ├── preprocessing │ │ ├── __init__.py │ │ └── data.py │ ├── utils │ │ ├── fixes.py │ │ ├── __init__.py │ │ ├── sparsefuncs.py │ │ ├── extmath.py │ │ ├── sparsefuncs_fast.pyx │ │ └── validation.py │ ├── fixes.py │ ├── __init__.py │ ├── exceptions.py │ ├── funcsigs.py │ └── base.py ├── k_means_constrained │ ├── __init__.py │ ├── mincostflow_vectorized.py │ ├── mincostflow_vectorized_.pyx │ └── k_means_constrained_.py ├── __init__.py ├── fcm.py ├── minmax.py ├── shrinkage.py ├── base.py ├── da.py └── equal.py ├── pic ├── da.png ├── fcm.png ├── equal.png ├── minmax.png ├── shrinkage.png └── equal_heuristics.png ├── __init__.py ├── requirements.txt ├── .travis.yml ├── tests ├── test_pypi.py ├── memory_test.py ├── memory_monitor.py ├── test_da.py ├── test_equal.py ├── test_fcm.py └── test_minmax.py ├── LICENSE ├── examples └── examples.py ├── setup.py └── README.md /size_constrained_clustering/sklearn_import/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/externals/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pic/da.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/da.png -------------------------------------------------------------------------------- /pic/fcm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/fcm.png -------------------------------------------------------------------------------- /pic/equal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/equal.png -------------------------------------------------------------------------------- /pic/minmax.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/minmax.png -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .size_constrained_clustering import base, da, equal, fcm, minmax, shrinkage 3 | -------------------------------------------------------------------------------- /pic/shrinkage.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/shrinkage.png -------------------------------------------------------------------------------- /pic/equal_heuristics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/equal_heuristics.png -------------------------------------------------------------------------------- /size_constrained_clustering/k_means_constrained/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __all__ = ['KMeansConstrained'] 3 | 4 | from .k_means_constrained_ import KMeansConstrained 5 | 6 | -------------------------------------------------------------------------------- /size_constrained_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | path = os.path.dirname(os.path.abspath(__file__)) 4 | import sys 5 | sys.path.append(path) 6 | import base, da, equal, fcm, minmax, shrinkage 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib==0.15.1 2 | psutil>=5.6.6 3 | numpy>=1.16.5 4 | scipy==1.6.0 5 | ortools>=6.7 6 | six==1.12.0 7 | matplotlib==3.1.0 8 | seaborn==0.10.1 9 | Cython==0.29.20 10 | scikit_learn==0.24.1 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.7' 4 | before_install: 5 | - pip install -U pytest pytest-cov codecov 6 | install: 7 | - python setup.py build_ext --inplace 8 | - pip install -r requirements.txt 9 | script: 10 | - pytest --cov=./tests 11 | after_success: 12 | codecov 13 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/fixes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn_import.fixes import _parse_version 3 | 4 | np_version = _parse_version(np.__version__) 5 | 6 | 7 | def sparse_min_max(X, axis): 8 | return (X.min(axis=axis).toarray().ravel(), 9 | X.max(axis=axis).toarray().ravel()) 10 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/fixes.py: -------------------------------------------------------------------------------- 1 | def _parse_version(version_string): 2 | version = [] 3 | for x in version_string.split('.'): 4 | try: 5 | version.append(int(x)) 6 | except ValueError: 7 | # x may be of the form dev-1ea1592 8 | version.append(x) 9 | return tuple(version) -------------------------------------------------------------------------------- /tests/test_pypi.py: -------------------------------------------------------------------------------- 1 | 2 | from size_constrained_clustering import fcm, equal, minmax, shrinkage 3 | import numpy as np 4 | n_samples = 2000 5 | n_clusters = 3 6 | X = np.random.rand(n_samples, 2) 7 | # solve using the minmax flow approach 8 | model = equal.SameSizeKMeansMinCostFlow(n_clusters) 9 | # solve using the heuristics approach 10 | model.fit(X) 11 | centers = model.cluster_centers_ 12 | labels = model.labels_ 13 | -------------------------------------------------------------------------------- /tests/memory_test.py: 
-------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import collections 4 | import os 5 | import sys 6 | 7 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(path) 9 | from size_constrained_clustering import equal, da 10 | 11 | n_samples = 10000 12 | n_clusters = 4 13 | X = np.random.rand(n_samples, 2) 14 | distribution = [0.25] * n_clusters 15 | model = da.DeterministicAnnealing(n_clusters, distribution) 16 | model.fit(X) 17 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def get_config(): 5 | """Retrieve current values for configuration set by :func:`set_config` 6 | 7 | Returns 8 | ------- 9 | config : dict 10 | Keys are parameter names that can be passed to :func:`set_config`. 11 | """ 12 | return {'assume_finite': _ASSUME_FINITE} 13 | 14 | 15 | __version__ = '0.19.2' 16 | _ASSUME_FINITE = bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)) -------------------------------------------------------------------------------- /size_constrained_clustering/k_means_constrained/mincostflow_vectorized.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | from ortools.graph.pywrapgraph import SimpleMinCostFlow 5 | 6 | 7 | # Cython paths must be fully qualified 8 | from k_means_constrained.mincostflow_vectorized_ import \ 9 | SimpleMinCostFlow_AddArcWithCapacityAndUnitCostVectorized, \ 10 | SimpleMinCostFlow_SetNodeSupplyVectorized, \ 11 | SimpleMinCostFlow_FlowVectorized 12 | 13 | 14 | class SimpleMinCostFlowVectorized(SimpleMinCostFlow): 15 | 16 | def AddArcWithCapacityAndUnitCostVectorized(self, tail, head, capacity, unit_cost): 17 | return SimpleMinCostFlow_AddArcWithCapacityAndUnitCostVectorized(self, tail, head, capacity, unit_cost) 18 | 19 | def SetNodeSupplyVectorized(self, node, supply): 20 | return SimpleMinCostFlow_SetNodeSupplyVectorized(self, node, supply) 21 | 22 | def FlowVectorized(self, arc): 23 | return SimpleMinCostFlow_FlowVectorized(self, arc) 24 | 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /tests/memory_monitor.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | import subprocess 5 | import psutil 6 | import matplotlib.pyplot as plt 7 | import time 8 | 9 | cmd = "python memory_test.py" 10 | process = subprocess.Popen(cmd.split(" ")) 11 | 12 | pid = process.pid 13 | print("process id: ", pid) 14 | 15 | def get_memory_list(): 16 | process = psutil.Process(pid) 17 | memory_list = [] 18 | while process_running(process): 19 | try: 20 | memo = process.memory_info().rss / 1024 / 1024 #MB 21 | except: 22 | break 23 | memory_list.append(memo) 24 | time.sleep(1) 25 | return memory_list 26 | 27 | def process_running(process): 28 | try: 29 | memo = process.memory_info().rss / 1024 / 1024 30 | return True 31 | except: 32 | return False 33 | 34 | def plot(): 35 | start = time.time() 36 | memory_list = get_memory_list() 37 | end = time.time() 38 | print("Time spent to run {}s".format(round(end-start, 2))) 39 | plt.plot([x for x in range(len(memory_list))], memory_list) 40 | plt.xlabel("record point") 41 | plt.ylabel("memory (MB)") 42 | plt.show() 43 | 44 | if __name__ == "__main__": 45 | plot() 46 | -------------------------------------------------------------------------------- /tests/test_da.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | import pytest 5 | import sys 6 | import os 7 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(path) 9 | from size_constrained_clustering import da 10 | 11 | class TestDA: 12 | 13 | def test_input(self): 14 | with pytest.raises(AssertionError): 15 | da.DeterministicAnnealing(n_clusters=2, distribution=[0.25, 0.3]) 16 | with pytest.raises(AssertionError): 17 | da.DeterministicAnnealing(n_clusters=1, distribution=[0.25, 0.3]) 18 | with pytest.raises(AssertionError): 19 | da.DeterministicAnnealing(n_clusters=2, distribution=[0.25, 0.75], T=0.1) 20 | 21 | def test_output(self): 22 | import collections 23 | import random 24 | import numpy as np 25 | n_samples = 1000 26 | random_state = 42 27 | random.seed(random_state) 28 | np.random.seed(random_state) 29 | X = np.random.rand(n_samples, 2) 30 | n_clusters = 4 31 | distribution = [0.25] * n_clusters 32 | 33 | model = da.DeterministicAnnealing(n_clusters, distribution) 34 | model.fit(X) 35 | 36 | labels = model.labels_ 37 | label_counter = collections.Counter(labels) 38 | label_dist = list(label_counter.values()) 39 | label_dist = [d / np.sum(label_dist) for d in label_dist] 40 | 41 | assert np.sum(np.array(label_dist) - np.array(distribution)) <= 1e-6 42 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/exceptions.py: -------------------------------------------------------------------------------- 1 | class DataConversionWarning(UserWarning): 2 | """Warning used to notify implicit data conversions happening in the code. 
3 | 4 | This warning occurs when some input data needs to be converted or 5 | interpreted in a way that may not match the user's expectations. 6 | 7 | For example, this warning may occur when the user 8 | - passes an integer array to a function which expects float input and 9 | will convert the input 10 | - requests a non-copying operation, but a copy is required to meet the 11 | implementation's data-type expectations; 12 | - passes an input whose shape can be interpreted ambiguously. 13 | 14 | .. versionchanged:: 0.18 15 | Moved from sklearn.utils.validation. 16 | """ 17 | 18 | 19 | class NotFittedError(ValueError, AttributeError): 20 | """Exception class to raise if estimator is used before fitting. 21 | 22 | This class inherits from both ValueError and AttributeError to help with 23 | exception handling and backward compatibility. 24 | 25 | Examples 26 | -------- 27 | >>> from sklearn.svm import LinearSVC 28 | >>> from sklearn.exceptions import NotFittedError 29 | >>> try: 30 | ... LinearSVC().predict([[1, 2], [2, 3], [3, 4]]) 31 | ... except NotFittedError as e: 32 | ... print(repr(e)) 33 | ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS 34 | NotFittedError('This LinearSVC instance is not fitted yet'...) 35 | 36 | .. versionchanged:: 0.18 37 | Moved from sklearn.utils.validation. 38 | """ -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/metrics/pairwise_fast.pyx: -------------------------------------------------------------------------------- 1 | #cython: boundscheck=False 2 | #cython: cdivision=True 3 | #cython: wraparound=False 4 | 5 | # Author: Andreas Mueller 6 | # Lars Buitinck 7 | # 8 | # License: BSD 3 clause 9 | 10 | from libc.string cimport memset 11 | import numpy as np 12 | cimport numpy as np 13 | 14 | ctypedef float [:, :] float_array_2d_t 15 | ctypedef double [:, :] double_array_2d_t 16 | 17 | cdef fused floating1d: 18 | float[::1] 19 | double[::1] 20 | 21 | cdef fused floating_array_2d_t: 22 | float_array_2d_t 23 | double_array_2d_t 24 | 25 | 26 | np.import_array() 27 | 28 | 29 | def _sparse_manhattan(floating1d X_data, int[:] X_indices, int[:] X_indptr, 30 | floating1d Y_data, int[:] Y_indices, int[:] Y_indptr, 31 | np.npy_intp n_features, double[:, ::1] D): 32 | """Pairwise L1 distances for CSR matrices. 33 | 34 | Usage: 35 | 36 | >>> D = np.zeros(X.shape[0], Y.shape[0]) 37 | >>> sparse_manhattan(X.data, X.indices, X.indptr, 38 | ... Y.data, Y.indices, Y.indptr, 39 | ... X.shape[1], D) 40 | """ 41 | cdef double[::1] row = np.empty(n_features) 42 | cdef np.npy_intp ix, iy, j 43 | 44 | with nogil: 45 | for ix in range(D.shape[0]): 46 | for iy in range(D.shape[1]): 47 | # Simple strategy: densify current row of X, then subtract the 48 | # corresponding row of Y. 
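# After the two loops below, `row` holds X[ix] - Y[iy] on the union of
# their nonzero columns (and zeros elsewhere), so the L1 distance is the
# sum of absolute values over `row`.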
49 | memset(&row[0], 0, n_features * sizeof(double)) 50 | for j in range(X_indptr[ix], X_indptr[ix + 1]): 51 | row[X_indices[j]] = X_data[j] 52 | for j in range(Y_indptr[iy], Y_indptr[iy + 1]): 53 | row[Y_indices[j]] -= Y_data[j] 54 | 55 | with gil: 56 | D[ix, iy] = np.abs(np.asarray(row)).sum() -------------------------------------------------------------------------------- /tests/test_equal.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.6 2 | #-*-coding:utf-8-*- 3 | 4 | import pytest 5 | import sys 6 | import os 7 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(path) 9 | from size_constrained_clustering import equal 10 | 11 | class TestEqual: 12 | 13 | def test_input(self): 14 | with pytest.raises(AssertionError): 15 | equal.SameSizeKMeansHeuristics(n_clusters=-1) 16 | with pytest.raises(AssertionError): 17 | equal.SameSizeKMeansMinCostFlow(n_clusters=-1) 18 | with pytest.raises(AssertionError): 19 | equal.SameSizeKMeansHeuristics(n_clusters=0) 20 | with pytest.raises(AssertionError): 21 | equal.SameSizeKMeansMinCostFlow(n_clusters=0) 22 | with pytest.raises(AssertionError): 23 | equal.SameSizeKMeansHeuristics(n_clusters=1, max_iters=1.2) 24 | with pytest.raises(AssertionError): 25 | equal.SameSizeKMeansMinCostFlow(n_clusters=1, max_iters=1.2) 26 | with pytest.raises(Exception): 27 | equal.SameSizeKMeansHeuristics(n_clusters=1, distance_func="a") 28 | with pytest.raises(Exception): 29 | equal.SameSizeKMeansMinCostFlow(n_clusters=1, distance_func="a") 30 | 31 | def test_output(self): 32 | import numpy as np 33 | import collections 34 | n_samples = 2000 35 | n_clusters = 4 36 | X = np.random.rand(n_samples, 2) 37 | model = equal.SameSizeKMeansHeuristics(n_clusters) 38 | model.fit(X) 39 | labels = model.labels_ 40 | label_counts = collections.Counter(labels) 41 | assert_cluster_equal(label_counts) 42 | 43 | model = equal.SameSizeKMeansMinCostFlow(n_clusters) 44 | model.fit(X) 45 | labels = model.labels_ 46 | label_counts = collections.Counter(labels) 47 | assert_cluster_equal(label_counts) 48 | 49 | def assert_cluster_equal(label_counts): 50 | size = label_counts[0] 51 | for i in range(1, len(label_counts)): 52 | assert label_counts[i] == size 53 | 54 | -------------------------------------------------------------------------------- /tests/test_fcm.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.6 2 | #-*-coding:utf-8-*- 3 | 4 | import pytest 5 | import sys 6 | import os 7 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(path) 9 | from size_constrained_clustering import fcm 10 | 11 | class TestFCM: 12 | 13 | def test_input(self): 14 | with pytest.raises(AssertionError): 15 | fcm.FCM(n_clusters=-1) 16 | with pytest.raises(AssertionError): 17 | fcm.FCM(n_clusters=0) 18 | with pytest.raises(AssertionError): 19 | fcm.FCM(n_clusters=3, m=1) 20 | with pytest.raises(AssertionError): 21 | fcm.FCM(n_clusters=3, max_iters=1.0) 22 | with pytest.raises(AssertionError): 23 | fcm.FCM(n_clusters=3, epsilon=-1) 24 | with pytest.raises(Exception): 25 | fcm.FCM(n_clusters=3, distance_func="a") 26 | 27 | def test_output(self): 28 | from sklearn.datasets import make_blobs 29 | import numpy as np 30 | import collections 31 | n_samples = 5000 32 | n_bins = 4 # fit 4 clusters to match the 4 blob centers below 33 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 34 | 35 | X, _ = make_blobs(n_samples=n_samples,
n_features=2, cluster_std=1.0, 36 | centers=centers, shuffle=False, random_state=42) 37 | 38 | model = fcm.FCM(n_bins) 39 | model.fit(X) 40 | fcm_centers = model.cluster_centers_ 41 | fcm_labels = model.labels_ 42 | 43 | target_centers = np.array([[-0.020799, -0.03094044], 44 | [-4.99797698, -4.96240717], 45 | [7.01237337, 10.03848252], 46 | [4.97931177, 4.94258691]]) 47 | # within tolerance 48 | fcm_centers = np.round(fcm_centers, 3) 49 | target_centers = np.round(target_centers, 3) 50 | label_counts = dict(collections.Counter(fcm_labels)) 51 | assert label_counts == {2: 1252, 0: 1250, 1: 1249, 3: 1249} 52 | assert np.array_equal(fcm_centers, target_centers) 53 | 54 | if __name__ == "__main__": 55 | pass 56 | -------------------------------------------------------------------------------- /size_constrained_clustering/k_means_constrained/mincostflow_vectorized_.pyx: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | cimport numpy as np 4 | cimport cython 5 | 6 | from ortools.graph._pywrapgraph import \ 7 | SimpleMinCostFlow_AddArcWithCapacityAndUnitCost,\ 8 | SimpleMinCostFlow_SetNodeSupply,\ 9 | SimpleMinCostFlow_Flow 10 | 11 | DTYPE = np.int32 12 | ctypedef np.int32_t DTYPE_t 13 | 14 | 15 | @cython.boundscheck(False) 16 | @cython.wraparound(False) 17 | def SimpleMinCostFlow_AddArcWithCapacityAndUnitCostVectorized( 18 | self, 19 | np.ndarray[DTYPE_t, ndim=1] tail, 20 | np.ndarray[DTYPE_t, ndim=1] head, 21 | np.ndarray[DTYPE_t, ndim=1] capacity, 22 | np.ndarray[DTYPE_t, ndim=1] unit_cost): 23 | 24 | cdef int len = tail.shape[0] 25 | 26 | assert tail.dtype == DTYPE 27 | assert head.dtype == DTYPE 28 | assert capacity.dtype == DTYPE 29 | assert unit_cost.dtype == DTYPE 30 | assert head.shape[0] == len 31 | assert capacity.shape[0] == len 32 | assert unit_cost.shape[0] == len 33 | 34 | for i in range(len): 35 | SimpleMinCostFlow_AddArcWithCapacityAndUnitCost(self, tail[i], head[i], capacity[i], unit_cost[i]) 36 | 37 | 38 | @cython.boundscheck(False) 39 | @cython.wraparound(False) 40 | def SimpleMinCostFlow_SetNodeSupplyVectorized(self, 41 | np.ndarray[DTYPE_t, ndim=1] node, 42 | np.ndarray[DTYPE_t, ndim=1] supply): 43 | cdef int len = node.shape[0] 44 | 45 | assert node.dtype == DTYPE 46 | assert supply.dtype == DTYPE 47 | assert supply.shape[0] == len 48 | 49 | for i in range(len): 50 | SimpleMinCostFlow_SetNodeSupply(self, node[i], supply[i]) 51 | 52 | 53 | @cython.boundscheck(False) 54 | @cython.wraparound(False) 55 | def SimpleMinCostFlow_FlowVectorized(self, 56 | np.ndarray[DTYPE_t, ndim=1] arc): 57 | 58 | cdef int len = arc.shape[0] 59 | 60 | assert arc.dtype == DTYPE 61 | 62 | cdef np.ndarray flow = np.zeros(len, dtype=DTYPE) 63 | 64 | for i in range(len): 65 | flow[i] = SimpleMinCostFlow_Flow(self, arc[i]) 66 | 67 | return flow 68 | 69 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/__init__.py: -------------------------------------------------------------------------------- 1 | def gen_batches(n, batch_size): 2 | """Generator to create slices containing batch_size elements, from 0 to n. 3 | 4 | The last slice may contain less than batch_size elements, when batch_size 5 | does not divide n. 
6 | 7 | Examples 8 | -------- 9 | >>> from sklearn.utils import gen_batches 10 | >>> list(gen_batches(7, 3)) 11 | [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] 12 | >>> list(gen_batches(6, 3)) 13 | [slice(0, 3, None), slice(3, 6, None)] 14 | >>> list(gen_batches(2, 3)) 15 | [slice(0, 2, None)] 16 | """ 17 | start = 0 18 | for _ in range(int(n // batch_size)): 19 | end = start + batch_size 20 | yield slice(start, end) 21 | start = end 22 | if start < n: 23 | yield slice(start, n) 24 | 25 | 26 | def gen_even_slices(n, n_packs, n_samples=None): 27 | """Generator to create n_packs slices going up to n. 28 | 29 | Pass n_samples when the slices are to be used for sparse matrix indexing; 30 | slicing off-the-end raises an exception, while it works for NumPy arrays. 31 | 32 | Examples 33 | -------- 34 | >>> from sklearn.utils import gen_even_slices 35 | >>> list(gen_even_slices(10, 1)) 36 | [slice(0, 10, None)] 37 | >>> list(gen_even_slices(10, 10)) #doctest: +ELLIPSIS 38 | [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)] 39 | >>> list(gen_even_slices(10, 5)) #doctest: +ELLIPSIS 40 | [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)] 41 | >>> list(gen_even_slices(10, 3)) 42 | [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)] 43 | """ 44 | start = 0 45 | if n_packs < 1: 46 | raise ValueError("gen_even_slices got n_packs=%s, must be >=1" 47 | % n_packs) 48 | for pack_num in range(n_packs): 49 | this_n = n // n_packs 50 | if pack_num < n % n_packs: 51 | this_n += 1 52 | if this_n > 0: 53 | end = start + this_n 54 | if n_samples is not None: 55 | end = min(n_samples, end) 56 | yield slice(start, end, None) 57 | start = end -------------------------------------------------------------------------------- /tests/test_minmax.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.6 2 | #-*-coding:utf-8-*- 3 | 4 | import pytest 5 | import sys 6 | import os 7 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(path) 9 | from size_constrained_clustering import minmax 10 | 11 | class TestMinMax: 12 | 13 | def test_input(self): 14 | with pytest.raises(AssertionError): 15 | minmax.MinMaxKMeansMinCostFlow(n_clusters=-1, size_min=1, size_max=2) 16 | with pytest.raises(AssertionError): 17 | minmax.MinMaxKMeansMinCostFlow(n_clusters=0, size_min=1, size_max=2) 18 | with pytest.raises(AssertionError): 19 | minmax.MinMaxKMeansMinCostFlow(n_clusters=1, max_iters=1.2, size_min=1, size_max=2) 20 | with pytest.raises(Exception): 21 | minmax.MinMaxKMeansMinCostFlow(n_clusters=1, distance_func="a", size_min=1, size_max=2) 22 | with pytest.raises(AssertionError): 23 | minmax.MinMaxKMeansMinCostFlow(n_clusters=1, size_min=None, size_max=2) 24 | with pytest.raises(AssertionError): 25 | minmax.MinMaxKMeansMinCostFlow(n_clusters=1, size_min=-1, size_max=2) 26 | with pytest.raises(AssertionError): 27 | minmax.MinMaxKMeansMinCostFlow(n_clusters=1, size_min=20, size_max=10) 28 | with pytest.raises(AssertionError): 29 | model = minmax.MinMaxKMeansMinCostFlow(n_clusters=1, size_min=10, size_max=20) 30 | import numpy as np 31 | X = np.random.random((1000, 2)) 32 | model.fit(X) 33 | 34 | def test_output(self): 35 | from sklearn.datasets import make_blobs 36 | import collections 37 | 38 | n_samples = 2000 39 | n_clusters = 4 # use 3 bins for calibration_curve as we have 3 clusters here 40 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 41 | 42 | X, _ = make_blobs(n_samples=n_samples, n_features=2, 
cluster_std=1.0, 43 | centers=centers, shuffle=False, random_state=42) 44 | 45 | minsize = 200 46 | maxsize = 800 47 | model = minmax.MinMaxKMeansMinCostFlow(n_clusters, size_min=minsize, 48 | size_max=maxsize) 49 | model.fit(X) 50 | 51 | label_counter = collections.Counter(model.labels_) 52 | 53 | for label, count in label_counter.items(): 54 | assert count >= minsize and count <= maxsize 55 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/sparsefuncs.py: -------------------------------------------------------------------------------- 1 | from scipy import sparse as sp 2 | from sklearn_import.utils.fixes import sparse_min_max 3 | 4 | from .sparsefuncs_fast import ( 5 | csr_mean_variance_axis0 as _csr_mean_var_axis0, 6 | csc_mean_variance_axis0 as _csc_mean_var_axis0) 7 | 8 | 9 | def mean_variance_axis(X, axis): 10 | """Compute mean and variance along an axix on a CSR or CSC matrix 11 | 12 | Parameters 13 | ---------- 14 | X : CSR or CSC sparse matrix, shape (n_samples, n_features) 15 | Input data. 16 | 17 | axis : int (either 0 or 1) 18 | Axis along which the axis should be computed. 19 | 20 | Returns 21 | ------- 22 | 23 | means : float array with shape (n_features,) 24 | Feature-wise means 25 | 26 | variances : float array with shape (n_features,) 27 | Feature-wise variances 28 | 29 | """ 30 | _raise_error_wrong_axis(axis) 31 | 32 | if isinstance(X, sp.csr_matrix): 33 | if axis == 0: 34 | return _csr_mean_var_axis0(X) 35 | else: 36 | return _csc_mean_var_axis0(X.T) 37 | elif isinstance(X, sp.csc_matrix): 38 | if axis == 0: 39 | return _csc_mean_var_axis0(X) 40 | else: 41 | return _csr_mean_var_axis0(X.T) 42 | else: 43 | _raise_typeerror(X) 44 | 45 | 46 | def min_max_axis(X, axis): 47 | """Compute minimum and maximum along an axis on a CSR or CSC matrix 48 | 49 | Parameters 50 | ---------- 51 | X : CSR or CSC sparse matrix, shape (n_samples, n_features) 52 | Input data. 53 | 54 | axis : int (either 0 or 1) 55 | Axis along which the axis should be computed. 56 | 57 | Returns 58 | ------- 59 | 60 | mins : float array with shape (n_features,) 61 | Feature-wise minima 62 | 63 | maxs : float array with shape (n_features,) 64 | Feature-wise maxima 65 | """ 66 | if isinstance(X, sp.csr_matrix) or isinstance(X, sp.csc_matrix): 67 | return sparse_min_max(X, axis=axis) 68 | else: 69 | _raise_typeerror(X) 70 | 71 | 72 | def _raise_typeerror(X): 73 | """Raises a TypeError if X is not a CSR or CSC matrix""" 74 | input_type = X.format if sp.issparse(X) else type(X) 75 | err = "Expected a CSR or CSC sparse matrix, got %s." % input_type 76 | raise TypeError(err) 77 | 78 | 79 | def _raise_error_wrong_axis(axis): 80 | if axis not in (0, 1): 81 | raise ValueError( 82 | "Unknown axis value: %d. 
Use 0 for rows, or 1 for columns" % axis) 83 | -------------------------------------------------------------------------------- /examples/examples.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 5 | sys.path.append(path) 6 | from size_constrained_clustering import fcm, equal, da, minmax, shrinkage 7 | 8 | from sklearn.datasets import make_blobs 9 | from matplotlib import pyplot as plt 10 | import numpy as np 11 | from seaborn import scatterplot as scatter 12 | from sklearn.metrics.pairwise import haversine_distances 13 | import collections 14 | 15 | def fcm_example(): 16 | n_samples = 2000 17 | n_clusters = 4 18 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 19 | 20 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, 21 | centers=centers, shuffle=False, random_state=42) 22 | 23 | model = fcm.FCM(n_clusters) 24 | model.fit(X) 25 | centers = model.cluster_centers_ 26 | labels = model.labels_ 27 | 28 | plot(centers, labels, X) 29 | 30 | def equal_example(): 31 | n_samples = 2000 32 | n_clusters = 3 33 | X = np.random.rand(n_samples, 2) 34 | # model = equal.SameSizeKMeansMinCostFlow(n_clusters) 35 | model = equal.SameSizeKMeansHeuristics(n_clusters) 36 | model.fit(X) 37 | 38 | centers = model.cluster_centers_ 39 | labels = model.labels_ 40 | 41 | print("Cluster size count: ", collections.Counter(labels)) 42 | plot(centers, labels, X) 43 | 44 | def minmax_example(): 45 | n_samples = 2000 46 | n_clusters = 3 47 | X = np.random.rand(n_samples, 2) 48 | model = minmax.MinMaxKMeansMinCostFlow(n_clusters, size_min=400, size_max=800) 49 | model.fit(X) 50 | 51 | centers = model.cluster_centers_ 52 | labels = model.labels_ 53 | 54 | print("Cluster size count: ", collections.Counter(labels)) 55 | plot(centers, labels, X) 56 | 57 | def da_example(): 58 | n_samples = 2000 59 | n_clusters = 3 60 | X = np.random.rand(n_samples, 2) 61 | model = da.DeterministicAnnealing(n_clusters, distribution=[0.1, 0.6, 0.3]) 62 | model.fit(X) 63 | 64 | centers = model.cluster_centers_ 65 | labels = model.labels_ 66 | 67 | cluster_size = list(collections.Counter(labels).values()) 68 | print("Cluster size: ", cluster_size) 69 | print("Cluster size ratio: ", [c / n_samples for c in cluster_size]) 70 | plot(centers, labels, X) 71 | 72 | def shrinkage_example(): 73 | n_samples = 1000 74 | n_clusters = 4 75 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 76 | 77 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, 78 | centers=centers, shuffle=False, random_state=42) 79 | 80 | model = shrinkage.Shrinkage(n_clusters, size_min=100) 81 | model.fit(X) 82 | centers = model.cluster_centers_ 83 | labels = model.labels_ 84 | 85 | plot(centers, labels, X) 86 | 87 | def plot(centers, labels, X): 88 | f, axes = plt.subplots(1, 2, figsize=(11, 5)) 89 | scatter(X[:, 0], X[:, 1], ax=axes[0]) 90 | scatter(X[:, 0], X[:, 1], ax=axes[1], hue=labels) 91 | scatter(centers[:, 0], centers[:, 1], ax=axes[1], marker="s", s=200) 92 | plt.show() 93 | 94 | if __name__ == "__main__": 95 | shrinkage_example() 96 | -------------------------------------------------------------------------------- /size_constrained_clustering/fcm.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: fcm.py, fuzzy c-means algorithm 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 06/06/2020 8 | @paper: Clustering with 
Size Constraints 9 | @github reference: https://github.com/omadson/fuzzy-c-means/blob/master/fcmeans/fcm.py 10 | ''' 11 | 12 | from scipy.spatial.distance import cdist 13 | import numpy as np 14 | from scipy.linalg import norm 15 | import sys 16 | import os 17 | path = os.path.dirname(os.path.abspath(__file__)) 18 | sys.path.append(path) 19 | import base 20 | 21 | class FCM(base.Base): 22 | 23 | def __init__(self, n_clusters, \ 24 | max_iters=1000, m=2, 25 | epsilon=1e-5, 26 | random_state=42, 27 | distance_func=cdist): 28 | ''' 29 | Args: 30 | n_clusters (int): number of clusters 31 | max_iters (int): maximum iterations 32 | m (float): membership order, in general it is 2 33 | epsilon (float): 1e-5 34 | random_state (int): random seed 35 | distance_func (callable function/None), default is Euclidean distance 36 | ''' 37 | super(FCM, self).__init__(n_clusters, max_iters, distance_func) 38 | assert m > 1 39 | assert epsilon > 0 40 | self.m = m 41 | self.epsilon = epsilon 42 | self.random_state = random_state 43 | self.u, self.cluster_centers_ = None, None 44 | 45 | def fit(self, X): 46 | ''' 47 | Args: 48 | X (array like): shape is (n_samples, n_dimensions) 49 | ''' 50 | np.random.seed(self.random_state) 51 | n_samples, n_dimensions = X.shape 52 | 53 | # initialize mu 54 | self.u = np.random.random(size=(n_samples, self.n_clusters)) 55 | self.u /= np.sum(self.u, axis=1).reshape((-1, 1)) 56 | 57 | # initialize centers 58 | itr = 0 59 | while True: 60 | last_u = self.u.copy() 61 | # update centers 62 | self.cluster_centers_ = self.update_centers(X) 63 | # update membership 64 | self.u = self.update_membership(X) 65 | if norm(self.u - last_u) < self.epsilon or itr >= self.max_iters: 66 | break 67 | itr += 1 68 | 69 | self.labels_ = np.argmax(self.u, axis=1) 70 | 71 | def update_centers(self, X): 72 | ''' 73 | Update centers based new u 74 | ''' 75 | um = np.power(self.u, self.m) # (n_samples, n_clusters) 76 | centers = (X.T.dot(um)).T / np.sum(um, axis=0).reshape((-1, 1)) 77 | return centers 78 | 79 | def update_membership(self, X): 80 | power = 2. / (self.m - 1) 81 | n_samples, n_dimensions = X.shape 82 | dist = self.distance_func(X, self.cluster_centers_) 83 | dist = np.power(dist, power) 84 | u = dist * np.sum(1. / dist, axis=1).reshape((-1, 1)) 85 | u = 1. 
/ u 86 | # normalize 87 | u /= np.sum(u, axis=1).reshape((-1, 1)) 88 | return u 89 | 90 | def predict(self, X): 91 | u = self.update_membership(X) 92 | labels = np.argmax(u, axis=1) 93 | return labels 94 | -------------------------------------------------------------------------------- /size_constrained_clustering/minmax.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: same_size_kmeans.py, equal size clustering with heuristics 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 06/18/2020 8 | @paper: 9 | @github reference: https://github.com/joshlk/k-means-constrained 10 | @Web: 11 | ''' 12 | 13 | import os 14 | import sys 15 | path = os.path.dirname(os.path.abspath(__file__)) 16 | sys.path.append(path) 17 | import base 18 | from k_means_constrained import KMeansConstrained 19 | import numpy as np 20 | import matplotlib.pyplot as plt 21 | from seaborn import scatterplot as scatter 22 | import collections 23 | from sklearn.metrics.pairwise import haversine_distances 24 | from sklearn.datasets import make_blobs 25 | from scipy.spatial.distance import cdist 26 | 27 | class MinMaxKMeansMinCostFlow(base.Base): 28 | 29 | def __init__(self, n_clusters, size_min=None, size_max=None, 30 | max_iters=1000, distance_func=cdist, random_state=42): 31 | ''' 32 | Args: 33 | n_clusters (int): number of clusters 34 | max_iters (int): maximum iterations 35 | distance_func (object): callable function with input (X, centers) / None, by default is l2-distance 36 | random_state (int): random state to initiate, by default it is 42 37 | ''' 38 | super(MinMaxKMeansMinCostFlow, self).__init__(n_clusters, max_iters, distance_func) 39 | self.clf = None 40 | self.size_min = size_min 41 | self.size_max = size_max 42 | assert size_min is not None and size_max is not None 43 | assert size_min >= 0 and size_max >= 0 44 | assert size_min <= size_max 45 | 46 | def fit(self, X): 47 | n_samples, n_features = X.shape 48 | assert self.size_max * self.n_clusters >= n_samples 49 | 50 | clf = KMeansConstrained(self.n_clusters, size_min=self.size_min, 51 | size_max=self.size_max, distance_func=self.distance_func) 52 | 53 | clf.fit(X) 54 | 55 | self.clf = clf 56 | self.cluster_centers_ = self.clf.cluster_centers_ 57 | self.labels_ = self.clf.labels_ 58 | 59 | def predict(self, X): 60 | return self.clf.predict(X) 61 | 62 | if __name__ == "__main__": 63 | n_samples = 2000 64 | n_clusters = 4 # use 3 bins for calibration_curve as we have 3 clusters here 65 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 66 | 67 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, 68 | centers=centers, shuffle=False, random_state=42) 69 | minsize = n_samples // n_clusters 70 | maxsize = n_samples // n_clusters 71 | minmax = MinMaxKMeansMinCostFlow(n_clusters, size_min=minsize, 72 | size_max=maxsize, distance_func=cdist) 73 | minmax.fit(X) 74 | 75 | fcm_centers = minmax.cluster_centers_ 76 | fcm_labels = minmax.labels_ 77 | 78 | print(collections.Counter(fcm_labels)) 79 | 80 | f, axes = plt.subplots(1, 2, figsize=(11, 5)) 81 | scatter(X[:, 0], X[:, 1], ax=axes[0]) 82 | scatter(X[:, 0], X[:, 1], ax=axes[1], hue=fcm_labels) 83 | scatter(fcm_centers[:, 0], fcm_centers[:, 1], ax=axes[1], marker="s",s=200) 84 | plt.show() 85 | -------------------------------------------------------------------------------- /size_constrained_clustering/shrinkage.py: -------------------------------------------------------------------------------- 1 | 
#!usr/bin/python 3.6 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: shrinkage.py, shrinkage clustering 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 06/24/2020 8 | @Paper reference: Shrinkage Clustering: A fast and \ 9 | size-constrained clustering algorithm for biomedical applications 10 | ''' 11 | 12 | import os 13 | import sys 14 | path = os.path.dirname(os.path.abspath(__file__)) 15 | sys.path.append(path) 16 | import base 17 | from scipy.spatial.distance import cdist 18 | import numpy as np 19 | import random 20 | 21 | class Shrinkage(base.Base): 22 | 23 | def __init__(self, n_clusters, size_min=1, max_iters=1000, \ 24 | distance_func=cdist, random_state=42): 25 | ''' 26 | Args: 27 | n_clusters (int): number of clusters 28 | max_iters (int): maximum iterations 29 | distance_func (object): callable function with input (X, centers) / None, by default is l2-distance 30 | random_state (int): random state to initiate, by default it is 42 31 | ''' 32 | super(Shrinkage, self).__init__(n_clusters, max_iters, distance_func) 33 | np.random.seed(random_state) 34 | random.seed(random_state) 35 | self.size_min = size_min 36 | assert isinstance(size_min, int) 37 | assert size_min >= 1 38 | 39 | def fit(self, X): 40 | 41 | n_samples, n_features = X.shape 42 | 43 | assert self.size_min <= n_samples // self.n_clusters 44 | # calculate similarity matrix, larger similarity means more resemblance 45 | S = self.distance_func(X, X) 46 | S /= np.max(S) 47 | S = 1 - S 48 | # initialize 49 | A, S_tilde = self._init(S) 50 | iters = 0 51 | while True: 52 | # remove empty clusters 53 | cluster_size = np.sum(A, axis=0) 54 | keep_cluster = np.where(cluster_size >= self.size_min)[0] 55 | A = A[:, keep_cluster] 56 | 57 | # permute cluster membership 58 | M = S_tilde @ A 59 | v = np.min(M - np.sum(M * A, axis=1).reshape((-1, 1)), axis=1) 60 | X_bar = np.argmin(v) 61 | C_prime = np.argmin(M[X_bar]) 62 | 63 | K = A.shape[1] 64 | A[X_bar] = np.zeros(K) 65 | A[X_bar, C_prime] = 1 66 | 67 | if abs(np.sum(v)) < 1e-5 or iters >= self.max_iters: 68 | break 69 | 70 | iters += 1 71 | 72 | self.labels_ = np.argmax(A, axis=1) 73 | self.cluster_centers_ = self.update_centers(X, A) 74 | 75 | 76 | def _init(self, S): 77 | ''' 78 | Initialize A and S_tilde 79 | ''' 80 | n_samples, _ = S.shape 81 | A = np.zeros((n_samples, self.n_clusters)) 82 | A[range(n_samples), [random.choice(range(self.n_clusters)) for _ in range(n_samples)]] = 1 83 | S_tilde = 1 - 2 * S 84 | return A, S_tilde 85 | 86 | def update_centers(self, X, labels): 87 | ''' 88 | Update centers 89 | Args: 90 | X (array like): (n_samples, n_features) 91 | labels (array like): (n_samples, n_clusters), one-hot array 92 | 93 | Return: 94 | centers (array like): (n_clusters, n_features) 95 | ''' 96 | centers = (X.T.dot(labels)).T / np.sum(labels, axis=0).reshape((-1, 1)) 97 | return centers 98 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, Extension, dist 2 | 3 | try: 4 | from setuptools import setup 5 | except: 6 | from distutils.core import setup 7 | 8 | import os 9 | this_directory = os.path.abspath(os.path.dirname(__file__)) 10 | with open(os.path.join(this_directory, 'README.md'), encoding='utf-8') as f: 11 | long_description = f.read() 12 | 13 | dist.Distribution().fetch_build_eggs(["cython>=0.27", "numpy>=1.13"]) 14 | 15 | 16 | try: 17 | from numpy import get_include 18 | except: 19 | def 
get_include(): 20 | # Defer import to later 21 | from numpy import get_include 22 | return get_include() 23 | 24 | try: 25 | from Cython.Build import cythonize 26 | except ImportError: 27 | print("! Could not import Cython !") 28 | cythonize = None 29 | 30 | 31 | # https://cython.readthedocs.io/en/latest/src/userguide/source_files_and_compilation.html#distributing-cython-modules 32 | def no_cythonize(extensions, **_ignore): 33 | for extension in extensions: 34 | sources = [] 35 | for sfile in extension.sources: 36 | path, ext = os.path.splitext(sfile) 37 | if ext in (".pyx", ".py"): 38 | if extension.language == "c++": 39 | ext = ".cpp" 40 | else: 41 | ext = ".c" 42 | sfile = path + ext 43 | sources.append(sfile) 44 | extension.sources[:] = sources 45 | return extensions 46 | 47 | path = os.path.dirname(os.path.abspath(__file__)) 48 | extensions = [ 49 | Extension("size_constrained_clustering.k_means_constrained.mincostflow_vectorized_", [os.path.join(path, "size_constrained_clustering/k_means_constrained/mincostflow_vectorized_.pyx")], 50 | include_dirs=[get_include()]), 51 | Extension("size_constrained_clustering.sklearn_import.cluster._k_means", [os.path.join(path, "size_constrained_clustering/sklearn_import/cluster/_k_means.pyx")], 52 | include_dirs=[get_include()]), 53 | Extension("size_constrained_clustering.sklearn_import.metrics.pairwise_fast", [os.path.join(path, "size_constrained_clustering/sklearn_import/metrics/pairwise_fast.pyx")], 54 | include_dirs=[get_include()]), 55 | Extension("size_constrained_clustering.sklearn_import.utils.sparsefuncs_fast", [os.path.join(path, "size_constrained_clustering/sklearn_import/utils/sparsefuncs_fast.pyx")], 56 | include_dirs=[get_include()]), 57 | ] 58 | 59 | CYTHONIZE = bool(int(os.getenv("CYTHONIZE", 1))) and cythonize is not None 60 | 61 | if CYTHONIZE: 62 | compiler_directives = {"language_level": 3, "embedsignature": True} 63 | extensions = cythonize(extensions, compiler_directives=compiler_directives) 64 | else: 65 | extensions = no_cythonize(extensions) 66 | 67 | with open(os.path.join(path, "requirements.txt")) as fp: 68 | install_requires = fp.read().strip().split("\n") 69 | 70 | VERSION = "0.1.1" 71 | LICENSE = 'MIT' 72 | setup( 73 | ext_modules=extensions, 74 | version=VERSION, 75 | setup_requires=["cython", "numpy"], 76 | install_requires=install_requires, 77 | name='size_constrained_clustering', 78 | description='Size Constrained Clustering solver', 79 | long_description=long_description, 80 | long_description_content_type='text/markdown', 81 | url='https://github.com/jingw2/size_constrained_clustering', 82 | author='Jing Wang', 83 | author_email='jingw2@foxmail.com', 84 | license=LICENSE, 85 | packages=find_packages(), 86 | python_requires='>=3.6') 87 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/preprocessing/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import sparse 3 | 4 | from sklearn_import.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2 5 | 6 | from sklearn_import.utils.sparsefuncs import min_max_axis 7 | 8 | from sklearn_import.utils.extmath import row_norms 9 | from sklearn_import.utils.validation import check_array, FLOAT_DTYPES 10 | 11 | 12 | def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): 13 | """Scale input vectors individually to unit norm (vector length). 
14 | 15 | Read more in the :ref:`User Guide `. 16 | 17 | Parameters 18 | ---------- 19 | X : {array-like, sparse matrix}, shape [n_samples, n_features] 20 | The data to normalize, element by element. 21 | scipy.sparse matrices should be in CSR format to avoid an 22 | un-necessary copy. 23 | 24 | norm : 'l1', 'l2', or 'max', optional ('l2' by default) 25 | The norm to use to normalize each non zero sample (or each non-zero 26 | feature if axis is 0). 27 | 28 | axis : 0 or 1, optional (1 by default) 29 | axis used to normalize the data along. If 1, independently normalize 30 | each sample, otherwise (if 0) normalize each feature. 31 | 32 | copy : boolean, optional, default True 33 | set to False to perform inplace row normalization and avoid a 34 | copy (if the input is already a numpy array or a scipy.sparse 35 | CSR matrix and if axis is 1). 36 | 37 | return_norm : boolean, default False 38 | whether to return the computed norms 39 | 40 | Returns 41 | ------- 42 | X : {array-like, sparse matrix}, shape [n_samples, n_features] 43 | Normalized input X. 44 | 45 | norms : array, shape [n_samples] if axis=1 else [n_features] 46 | An array of norms along given axis for X. 47 | When X is sparse, a NotImplementedError will be raised 48 | for norm 'l1' or 'l2'. 49 | 50 | See also 51 | -------- 52 | Normalizer: Performs normalization using the ``Transformer`` API 53 | (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`). 54 | 55 | Notes 56 | ----- 57 | For a comparison of the different scalers, transformers, and normalizers, 58 | see :ref:`examples/preprocessing/plot_all_scaling.py 59 | `. 60 | 61 | """ 62 | if norm not in ('l1', 'l2', 'max'): 63 | raise ValueError("'%s' is not a supported norm" % norm) 64 | 65 | if axis == 0: 66 | sparse_format = 'csc' 67 | elif axis == 1: 68 | sparse_format = 'csr' 69 | else: 70 | raise ValueError("'%d' is not a supported axis" % axis) 71 | 72 | X = check_array(X, sparse_format, copy=copy, 73 | estimator='the normalize function', dtype=FLOAT_DTYPES) 74 | if axis == 0: 75 | X = X.T 76 | 77 | if sparse.issparse(X): 78 | if return_norm and norm in ('l1', 'l2'): 79 | raise NotImplementedError("return_norm=True is not implemented " 80 | "for sparse matrices with norm 'l1' " 81 | "or norm 'l2'") 82 | if norm == 'l1': 83 | inplace_csr_row_normalize_l1(X) 84 | elif norm == 'l2': 85 | inplace_csr_row_normalize_l2(X) 86 | elif norm == 'max': 87 | _, norms = min_max_axis(X, 1) 88 | norms_elementwise = norms.repeat(np.diff(X.indptr)) 89 | mask = norms_elementwise != 0 90 | X.data[mask] /= norms_elementwise[mask] 91 | else: 92 | if norm == 'l1': 93 | norms = np.abs(X).sum(axis=1) 94 | elif norm == 'l2': 95 | norms = row_norms(X) 96 | elif norm == 'max': 97 | norms = np.max(X, axis=1) 98 | norms = _handle_zeros_in_scale(norms, copy=False) 99 | X /= norms[:, np.newaxis] 100 | 101 | if axis == 0: 102 | X = X.T 103 | 104 | if return_norm: 105 | return X, norms 106 | else: 107 | return X 108 | 109 | 110 | def _handle_zeros_in_scale(scale, copy=True): 111 | ''' Makes sure that whenever scale is zero, we handle it correctly. 112 | 113 | This happens in most scalers when we have constant features.''' 114 | 115 | # if we are fitting on 1D arrays, scale might be a scalar 116 | if np.isscalar(scale): 117 | if scale == .0: 118 | scale = 1. 
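# a zero scale would otherwise divide by zero downstream; constant features are left unscaled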
119 | return scale 120 | elif isinstance(scale, np.ndarray): 121 | if copy: 122 | # New array to avoid side-effects 123 | scale = scale.copy() 124 | scale[scale == 0.0] = 1.0 125 | return scale 126 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/extmath.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import numpy as np 4 | from scipy.sparse import issparse, csr_matrix 5 | from sklearn_import.utils.sparsefuncs_fast import csr_row_norms 6 | 7 | from sklearn_import.utils.fixes import np_version 8 | 9 | 10 | def row_norms(X, squared=False): 11 | """Row-wise (squared) Euclidean norm of X. 12 | 13 | Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse 14 | matrices and does not create an X.shape-sized temporary. 15 | 16 | Performs no input validation. 17 | """ 18 | if issparse(X): 19 | if not isinstance(X, csr_matrix): 20 | X = csr_matrix(X) 21 | norms = csr_row_norms(X) 22 | else: 23 | norms = np.einsum('ij,ij->i', X, X) 24 | 25 | if not squared: 26 | np.sqrt(norms, norms) 27 | return norms 28 | 29 | 30 | def squared_norm(x): 31 | """Squared Euclidean or Frobenius norm of x. 32 | 33 | Returns the Euclidean norm when x is a vector, the Frobenius norm when x 34 | is a matrix (2-d array). Faster than norm(x) ** 2. 35 | """ 36 | x = np.ravel(x, order='K') 37 | if np.issubdtype(x.dtype, np.integer): 38 | warnings.warn('Array type is integer, np.dot may overflow. ' 39 | 'Data should be float type to avoid this issue', 40 | UserWarning) 41 | return np.dot(x, x) 42 | 43 | 44 | def cartesian(arrays, out=None): 45 | """Generate a cartesian product of input arrays. 46 | 47 | Parameters 48 | ---------- 49 | arrays : list of array-like 50 | 1-D arrays to form the cartesian product of. 51 | out : ndarray 52 | Array to place the cartesian product in. 53 | 54 | Returns 55 | ------- 56 | out : ndarray 57 | 2-D array of shape (M, len(arrays)) containing cartesian products 58 | formed of input arrays. 59 | 60 | Examples 61 | -------- 62 | >>> cartesian(([1, 2, 3], [4, 5], [6, 7])) 63 | array([[1, 4, 6], 64 | [1, 4, 7], 65 | [1, 5, 6], 66 | [1, 5, 7], 67 | [2, 4, 6], 68 | [2, 4, 7], 69 | [2, 5, 6], 70 | [2, 5, 7], 71 | [3, 4, 6], 72 | [3, 4, 7], 73 | [3, 5, 6], 74 | [3, 5, 7]]) 75 | 76 | """ 77 | arrays = [np.asarray(x) for x in arrays] 78 | shape = (len(x) for x in arrays) 79 | dtype = arrays[0].dtype 80 | 81 | ix = np.indices(shape) 82 | ix = ix.reshape(len(arrays), -1).T 83 | 84 | if out is None: 85 | out = np.empty_like(ix, dtype=dtype) 86 | 87 | for n, arr in enumerate(arrays): 88 | out[:, n] = arrays[n][ix[:, n]] 89 | 90 | return out 91 | 92 | 93 | def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): 94 | """Use high precision for cumsum and check that final value matches sum 95 | 96 | Parameters 97 | ---------- 98 | arr : array-like 99 | To be cumulatively summed as flat 100 | axis : int, optional 101 | Axis along which the cumulative sum is computed. 102 | The default (None) is to compute the cumsum over the flattened array. 
103 | rtol : float 104 | Relative tolerance, see ``np.allclose`` 105 | atol : float 106 | Absolute tolerance, see ``np.allclose`` 107 | """ 108 | # sum is as unstable as cumsum for numpy < 1.9 109 | if np_version < (1, 9): 110 | return np.cumsum(arr, axis=axis, dtype=np.float64) 111 | 112 | out = np.cumsum(arr, axis=axis, dtype=np.float64) 113 | expected = np.sum(arr, axis=axis, dtype=np.float64) 114 | if not np.all(np.isclose(out.take(-1, axis=axis), expected, rtol=rtol, 115 | atol=atol, equal_nan=True)): 116 | warnings.warn('cumsum was found to be unstable: ' 117 | 'its last element does not correspond to sum', 118 | RuntimeWarning) 119 | return out 120 | 121 | 122 | def safe_sparse_dot(a, b, dense_output=False): 123 | """Dot product that handle the sparse matrix case correctly 124 | 125 | Uses BLAS GEMM as replacement for numpy.dot where possible 126 | to avoid unnecessary copies. 127 | 128 | Parameters 129 | ---------- 130 | a : array or sparse matrix 131 | b : array or sparse matrix 132 | dense_output : boolean, default False 133 | When False, either ``a`` or ``b`` being sparse will yield sparse 134 | output. When True, output will always be an array. 135 | 136 | Returns 137 | ------- 138 | dot_product : array or sparse matrix 139 | sparse if ``a`` or ``b`` is sparse and ``dense_output=False``. 140 | """ 141 | if issparse(a) or issparse(b): 142 | ret = a * b 143 | if dense_output and hasattr(ret, "toarray"): 144 | ret = ret.toarray() 145 | return ret 146 | else: 147 | return np.dot(a, b) 148 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/cluster/_k_means.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=True 2 | # Profiling is enabled by default as the overhead does not seem to be measurable 3 | # on this specific use case. 4 | 5 | # Author: Peter Prettenhofer 6 | # Olivier Grisel 7 | # Lars Buitinck 8 | # 9 | # License: BSD 3 clause 10 | 11 | import numpy as np 12 | cimport numpy as np 13 | cimport cython 14 | from cython cimport floating 15 | 16 | from sklearn_import.utils.sparsefuncs_fast import assign_rows_csr 17 | 18 | ctypedef np.float64_t DOUBLE 19 | ctypedef np.int32_t INT 20 | 21 | ctypedef floating (*DOT)(int N, floating *X, int incX, floating *Y, 22 | int incY) 23 | 24 | 25 | np.import_array() 26 | 27 | @cython.boundscheck(False) 28 | @cython.wraparound(False) 29 | @cython.cdivision(True) 30 | def _centers_dense(np.ndarray[floating, ndim=2] X, 31 | np.ndarray[INT, ndim=1] labels, int n_clusters, 32 | np.ndarray[floating, ndim=1] distances): 33 | """M step of the K-means EM algorithm 34 | 35 | Computation of cluster centers / means. 36 | 37 | Parameters 38 | ---------- 39 | X : array-like, shape (n_samples, n_features) 40 | 41 | labels : array of integers, shape (n_samples) 42 | Current label assignment 43 | 44 | n_clusters : int 45 | Number of desired clusters 46 | 47 | distances : array-like, shape (n_samples) 48 | Distance to closest cluster for each sample. 
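Only used to re-seed empty clusters: the samples farthest from their
assigned centers become the new centers.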
49 | 50 | Returns 51 | ------- 52 | centers : array, shape (n_clusters, n_features) 53 | The resulting centers 54 | """ 55 | ## TODO: add support for CSR input 56 | cdef int n_samples, n_features 57 | n_samples = X.shape[0] 58 | n_features = X.shape[1] 59 | cdef int i, j, c 60 | cdef np.ndarray[floating, ndim=2] centers 61 | if floating is float: 62 | centers = np.zeros((n_clusters, n_features), dtype=np.float32) 63 | else: 64 | centers = np.zeros((n_clusters, n_features), dtype=np.float64) 65 | 66 | n_samples_in_cluster = np.bincount(labels, minlength=n_clusters) 67 | empty_clusters = np.where(n_samples_in_cluster == 0)[0] 68 | # maybe also relocate small clusters? 69 | 70 | if len(empty_clusters): 71 | # find points to reassign empty clusters to 72 | far_from_centers = distances.argsort()[::-1] 73 | 74 | for i, cluster_id in enumerate(empty_clusters): 75 | # XXX two relocated clusters could be close to each other 76 | new_center = X[far_from_centers[i]] 77 | centers[cluster_id] = new_center 78 | n_samples_in_cluster[cluster_id] = 1 79 | 80 | for i in range(n_samples): 81 | for j in range(n_features): 82 | centers[labels[i], j] += X[i, j] 83 | 84 | centers /= n_samples_in_cluster[:, np.newaxis] 85 | 86 | return centers 87 | 88 | 89 | @cython.boundscheck(False) 90 | @cython.wraparound(False) 91 | @cython.cdivision(True) 92 | def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters, 93 | np.ndarray[floating, ndim=1] distances): 94 | """M step of the K-means EM algorithm 95 | 96 | Computation of cluster centers / means. 97 | 98 | Parameters 99 | ---------- 100 | X : scipy.sparse.csr_matrix, shape (n_samples, n_features) 101 | 102 | labels : array of integers, shape (n_samples) 103 | Current label assignment 104 | 105 | n_clusters : int 106 | Number of desired clusters 107 | 108 | distances : array-like, shape (n_samples) 109 | Distance to closest cluster for each sample. 110 | 111 | Returns 112 | ------- 113 | centers : array, shape (n_clusters, n_features) 114 | The resulting centers 115 | """ 116 | cdef int n_features = X.shape[1] 117 | cdef int curr_label 118 | 119 | cdef np.ndarray[floating, ndim=1] data = X.data 120 | cdef np.ndarray[int, ndim=1] indices = X.indices 121 | cdef np.ndarray[int, ndim=1] indptr = X.indptr 122 | 123 | cdef np.ndarray[floating, ndim=2, mode="c"] centers 124 | cdef np.ndarray[np.npy_intp, ndim=1] far_from_centers 125 | cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] n_samples_in_cluster = \ 126 | np.bincount(labels, minlength=n_clusters) 127 | cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] empty_clusters = \ 128 | np.where(n_samples_in_cluster == 0)[0] 129 | cdef int n_empty_clusters = empty_clusters.shape[0] 130 | 131 | if floating is float: 132 | centers = np.zeros((n_clusters, n_features), dtype=np.float32) 133 | else: 134 | centers = np.zeros((n_clusters, n_features), dtype=np.float64) 135 | 136 | # maybe also relocate small clusters? 
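# Each empty cluster is re-seeded with one of the samples currently farthest
# from its assigned center, so no cluster is left without members.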
137 | 138 | if n_empty_clusters > 0: 139 | # find points to reassign empty clusters to 140 | far_from_centers = distances.argsort()[::-1][:n_empty_clusters] 141 | 142 | # XXX two relocated clusters could be close to each other 143 | assign_rows_csr(X, far_from_centers, empty_clusters, centers) 144 | 145 | for i in range(n_empty_clusters): 146 | n_samples_in_cluster[empty_clusters[i]] = 1 147 | 148 | for i in range(labels.shape[0]): 149 | curr_label = labels[i] 150 | for ind in range(indptr[i], indptr[i + 1]): 151 | j = indices[ind] 152 | centers[curr_label, j] += data[ind] 153 | 154 | centers /= n_samples_in_cluster[:, np.newaxis] 155 | 156 | return centers 157 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/funcsigs.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import types 3 | from collections import OrderedDict 4 | 5 | from sklearn_import.externals.funcsigs import _NonUserDefinedCallables, _get_user_defined_method, \ 6 | _POSITIONAL_ONLY, _VAR_POSITIONAL, _VAR_KEYWORD, Signature 7 | 8 | 9 | def signature(obj): 10 | '''Get a signature object for the passed callable.''' 11 | 12 | if not callable(obj): 13 | raise TypeError('{0!r} is not a callable object'.format(obj)) 14 | 15 | if isinstance(obj, types.MethodType): 16 | sig = signature(obj.__func__) 17 | if obj.__self__ is None: 18 | # Unbound method: the first parameter becomes positional-only 19 | if sig.parameters: 20 | first = sig.parameters.values()[0].replace( 21 | kind=_POSITIONAL_ONLY) 22 | return sig.replace( 23 | parameters=(first,) + tuple(sig.parameters.values())[1:]) 24 | else: 25 | return sig 26 | else: 27 | # In this case we skip the first parameter of the underlying 28 | # function (usually `self` or `cls`). 29 | return sig.replace(parameters=tuple(sig.parameters.values())[1:]) 30 | 31 | try: 32 | sig = obj.__signature__ 33 | except AttributeError: 34 | pass 35 | else: 36 | if sig is not None: 37 | return sig 38 | 39 | try: 40 | # Was this function wrapped by a decorator? 41 | wrapped = obj.__wrapped__ 42 | except AttributeError: 43 | pass 44 | else: 45 | return signature(wrapped) 46 | 47 | if isinstance(obj, types.FunctionType): 48 | return Signature.from_function(obj) 49 | 50 | if isinstance(obj, functools.partial): 51 | sig = signature(obj.func) 52 | 53 | new_params = OrderedDict(sig.parameters.items()) 54 | 55 | partial_args = obj.args or () 56 | partial_keywords = obj.keywords or {} 57 | try: 58 | ba = sig.bind_partial(*partial_args, **partial_keywords) 59 | except TypeError as ex: 60 | msg = 'partial object {0!r} has incorrect arguments'.format(obj) 61 | raise ValueError(msg) 62 | 63 | for arg_name, arg_value in ba.arguments.items(): 64 | param = new_params[arg_name] 65 | if arg_name in partial_keywords: 66 | # We set a new default value, because the following code 67 | # is correct: 68 | # 69 | # >>> def foo(a): print(a) 70 | # >>> print(partial(partial(foo, a=10), a=20)()) 71 | # 20 72 | # >>> print(partial(partial(foo, a=10), a=20)(a=30)) 73 | # 30 74 | # 75 | # So, with 'partial' objects, passing a keyword argument is 76 | # like setting a new default value for the corresponding 77 | # parameter 78 | # 79 | # We also mark this parameter with '_partial_kwarg' 80 | # flag. Later, in '_bind', the 'default' value of this 81 | # parameter will be added to 'kwargs', to simulate 82 | # the 'functools.partial' real call. 
83 | new_params[arg_name] = param.replace(default=arg_value, 84 | _partial_kwarg=True) 85 | 86 | elif (param.kind not in (_VAR_KEYWORD, _VAR_POSITIONAL) and 87 | not param._partial_kwarg): 88 | new_params.pop(arg_name) 89 | 90 | return sig.replace(parameters=new_params.values()) 91 | 92 | sig = None 93 | if isinstance(obj, type): 94 | # obj is a class or a metaclass 95 | 96 | # First, let's see if it has an overloaded __call__ defined 97 | # in its metaclass 98 | call = _get_user_defined_method(type(obj), '__call__') 99 | if call is not None: 100 | sig = signature(call) 101 | else: 102 | # Now we check if the 'obj' class has a '__new__' method 103 | new = _get_user_defined_method(obj, '__new__') 104 | if new is not None: 105 | sig = signature(new) 106 | else: 107 | # Finally, we should have at least __init__ implemented 108 | init = _get_user_defined_method(obj, '__init__') 109 | if init is not None: 110 | sig = signature(init) 111 | elif not isinstance(obj, _NonUserDefinedCallables): 112 | # An object with __call__ 113 | # We also check that the 'obj' is not an instance of 114 | # _WrapperDescriptor or _MethodWrapper to avoid 115 | # infinite recursion (and even potential segfault) 116 | call = _get_user_defined_method(type(obj), '__call__', 'im_func') 117 | if call is not None: 118 | sig = signature(call) 119 | 120 | if sig is not None: 121 | # For classes and objects we skip the first parameter of their 122 | # __call__, __new__, or __init__ methods 123 | return sig.replace(parameters=tuple(sig.parameters.values())[1:]) 124 | 125 | if isinstance(obj, types.BuiltinFunctionType): 126 | # Raise a nicer error message for builtins 127 | msg = 'no signature found for builtin function {0!r}'.format(obj) 128 | raise ValueError(msg) 129 | 130 | raise ValueError('callable {0!r} is not supported by signature'.format(obj)) 131 | -------------------------------------------------------------------------------- /size_constrained_clustering/base.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: base.py, base for clustering algorithm 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 06/07/2020 8 | ''' 9 | from scipy.spatial.distance import cdist 10 | import numpy as np 11 | import warnings 12 | import scipy.sparse as sp 13 | 14 | import os 15 | import sys 16 | path = os.path.dirname(os.path.abspath(__file__)) 17 | sys.path.append(path) 18 | from sklearn_import.utils.extmath import stable_cumsum 19 | 20 | class Base(object): 21 | 22 | def __init__(self, n_clusters, max_iters, distance_func=cdist): 23 | ''' 24 | Base Cluster object 25 | 26 | Args: 27 | n_clusters (int): number of clusters 28 | max_iters (int): maximum iterations 29 | distance_func (callable function): distance function callback 30 | ''' 31 | assert isinstance(n_clusters, int) 32 | assert n_clusters >= 1 33 | assert isinstance(max_iters, int) 34 | assert max_iters >= 1 35 | self.n_clusters = n_clusters 36 | self.max_iters = max_iters 37 | if distance_func is not None and not callable(distance_func): 38 | raise Exception("Distance function is not callable") 39 | self.distance_func = distance_func 40 | 41 | def fit(self, X): 42 | pass 43 | 44 | def predict(self, X): 45 | pass 46 | 47 | def k_init(X, n_clusters, x_squared_norms, random_state=42, distance_func=cdist, n_local_trials=None): 48 | """Init n_clusters seeds according to k-means++ 49 | 50 | Parameters 51 | ---------- 52 | X : array or sparse matrix, shape (n_samples, 
n_features) 53 | The data to pick seeds for. To avoid memory copy, the input data 54 | should be double precision (dtype=np.float64). 55 | 56 | n_clusters : integer 57 | The number of seeds to choose 58 | 59 | x_squared_norms : array, shape (n_samples,) 60 | Squared Euclidean norm of each data point. 61 | 62 | random_state : int, RandomState instance 63 | The generator used to initialize the centers. Use an int to make the 64 | randomness deterministic. 65 | See :term:`Glossary `. 66 | 67 | n_local_trials : integer, optional 68 | The number of seeding trials for each center (except the first), 69 | of which the one reducing inertia the most is greedily chosen. 70 | Set to None to make the number of trials depend logarithmically 71 | on the number of seeds (2+log(k)); this is the default. 72 | 73 | Notes 74 | ----- 75 | Selects initial cluster centers for k-mean clustering in a smart way 76 | to speed up convergence. see: Arthur, D. and Vassilvitskii, S. 77 | "k-means++: the advantages of careful seeding". ACM-SIAM symposium 78 | on Discrete algorithms. 2007 79 | 80 | Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip, 81 | which is the implementation used in the aforementioned paper. 82 | """ 83 | n_samples, n_features = X.shape 84 | 85 | centers = np.empty((n_clusters, n_features), dtype=X.dtype) 86 | 87 | assert x_squared_norms is not None, 'x_squared_norms None in _k_init' 88 | 89 | # Set the number of local seeding trials if none is given 90 | if n_local_trials is None: 91 | # This is what Arthur/Vassilvitskii tried, but did not report 92 | # specific results for other than mentioning in the conclusion 93 | # that it helped. 94 | n_local_trials = 2 + int(np.log(n_clusters)) 95 | 96 | # Pick first center randomly 97 | center_id = random_state.randint(n_samples) 98 | if sp.issparse(X): 99 | centers[0] = X[center_id].toarray() 100 | else: 101 | centers[0] = X[center_id] 102 | 103 | # Initialize list of closest distances and calculate current potential 104 | closest_dist_sq = distance_func( 105 | centers[0, np.newaxis], X) 106 | current_pot = closest_dist_sq.sum() 107 | 108 | # Pick the remaining n_clusters-1 points 109 | for c in range(1, n_clusters): 110 | # Choose center candidates by sampling with probability proportional 111 | # to the squared distance to the closest existing center 112 | rand_vals = random_state.random_sample(n_local_trials) * current_pot 113 | candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), 114 | rand_vals) 115 | # XXX: numerical imprecision can result in a candidate_id out of range 116 | np.clip(candidate_ids, None, closest_dist_sq.size - 1, 117 | out=candidate_ids) 118 | 119 | # Compute distances to center candidates 120 | # distance_to_candidates = euclidean_distances( 121 | # X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True) 122 | distance_to_candidates = distance_func(X[candidate_ids], X) 123 | 124 | # update closest distances squared and potential for each candidate 125 | np.minimum(closest_dist_sq, distance_to_candidates, 126 | out=distance_to_candidates) 127 | candidates_pot = distance_to_candidates.sum(axis=1) 128 | 129 | # Decide which candidate is the best 130 | best_candidate = np.argmin(candidates_pot) 131 | current_pot = candidates_pot[best_candidate] 132 | closest_dist_sq = distance_to_candidates[best_candidate] 133 | best_candidate = candidate_ids[best_candidate] 134 | 135 | # Permanently add best center candidate found in local tries 136 | if sp.issparse(X): 137 | centers[c] = 
X[best_candidate].toarray() 138 | else: 139 | centers[c] = X[best_candidate] 140 | 141 | return centers 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Size Constrained Clustering Solver 2 | [![Build Status](https://travis-ci.org/jingw2/size_constrained_clustering.svg?branch=master)](https://travis-ci.org/jingw2/size_constrained_clustering) 3 | [![PyPI version](https://badge.fury.io/py/size-constrained-clustering.svg)](https://badge.fury.io/py/size-constrained-clustering) 4 | ![GitHub](https://img.shields.io/github/license/jingw2/size_constrained_clustering) 5 | [![codecov](https://codecov.io/gh/jingw2/size_constrained_clustering/branch/master/graph/badge.svg)](https://codecov.io/gh/jingw2/size_constrained_clustering) 6 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/size-constrained-clustering) 7 | ![Codecov](https://img.shields.io/codecov/c/github/jingw2/size_constrained_clustering) 8 | 9 | 10 | Implementation of Size Constrained Clustering. 11 | Size constrained clustering can be treated as an optimization problem; details can be found in the reference papers listed below. 12 | 13 | ### Installation 14 | Requirements: Python >= 3.6, Numpy >= 1.13, Cython >= 0.29 15 | * install from PyPI 16 | ```shell 17 | pip install size-constrained-clustering 18 | ``` 19 | 20 | ### Methods 21 | * Fuzzy C-means Algorithm: similar to KMeans, but uses membership probabilities rather than hard 0/1 assignments 22 | * Same Size Constrained KMeans Heuristics: uses heuristic methods to reach equal-size clusters 23 | * Same Size Constrained KMeans Inspired by Minimum Cost Flow Problem 24 | * Minimum and Maximum Size Constrained KMeans Inspired by Minimum Cost Flow Problem 25 | * Deterministic Annealing Algorithm: takes a target cluster size distribution as input and returns the corresponding clusters 26 | * Shrinkage Clustering: base algorithm and minimum size constraints 27 | 28 | ### Usage: 29 | ```python 30 | # setup 31 | from size_constrained_clustering import fcm, equal, minmax, shrinkage, da 32 | # by default it is euclidean distance, but other distance functions can be selected 33 | from sklearn.metrics.pairwise import haversine_distances 34 | import numpy as np 35 | ``` 36 | 37 | Fuzzy C-means (the example below also uses `make_blobs` from `sklearn.datasets`) 38 | ```python 39 | n_samples = 2000 40 | n_clusters = 4 41 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 42 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, 43 | centers=centers, shuffle=False, random_state=42) 44 | model = fcm.FCM(n_clusters) 45 | # use another distance function, e.g. haversine distance 46 | # model = fcm.FCM(n_clusters, distance_func=haversine_distances) 47 | model.fit(X) 48 | centers = model.cluster_centers_ 49 | labels = model.labels_ 50 | ``` 51 | ![alt text](https://github.com/jingw2/size_constrained_clustering/blob/master/pic/fcm.png) 52 | 53 | 54 | Equal Size Constraint 55 | ```python 56 | n_samples = 2000 57 | n_clusters = 3 58 | X = np.random.rand(n_samples, 2) 59 | # use the minimum cost flow framework to solve 60 | model = equal.SameSizeKMeansMinCostFlow(n_clusters) 61 | # or use the heuristics method to solve: 62 | # model = equal.SameSizeKMeansHeuristics(n_clusters) 63 | model.fit(X) 64 | centers = model.cluster_centers_ 65 | labels = model.labels_ 66 | ``` 67 | ![alt text](https://github.com/jingw2/size_constrained_clustering/blob/master/pic/equal.png) 68 | 69 | Cluster sizes: 667, 667 and 666 in the figure above.
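Once fitted, the model can also assign previously unseen points to the learned clusters via `predict`. A minimal sketch, reusing `model`, `labels` and `np` from the snippet above (the `new_points` array is just illustrative random data):
```python
import collections

# assign previously unseen points to the clusters learned above
new_points = np.random.rand(10, 2)
new_labels = model.predict(new_points)

# inspect the (near-)equal cluster sizes produced on the training data
print(collections.Counter(labels))
```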
70 | 71 | Minimum and Maximum Size Constraint 72 | ```python 73 | n_samples = 2000 74 | n_clusters = 3 75 | X = np.random.rand(n_samples, 2) 76 | model = minmax.MinMaxKMeansMinCostFlow(n_clusters, size_min=400, size_max=800) 77 | model.fit(X) 78 | centers = model.cluster_centers_ 79 | labels = model.labels_ 80 | ``` 81 | ![alt text](https://github.com/jingw2/size_constrained_clustering/blob/master/pic/minmax.png) 82 | 83 | Cluster sizes: 753, 645 and 602 in the figure above. 84 | 85 | Deterministic Annealing 86 | ```python 87 | n_samples = 2000 88 | n_clusters = 3 89 | X = np.random.rand(n_samples, 2) 90 | # distribution is the target fraction of samples assigned to each cluster 91 | model = da.DeterministicAnnealing(n_clusters, distribution=[0.1, 0.6, 0.3]) 92 | model.fit(X) 93 | centers = model.cluster_centers_ 94 | labels = model.labels_ 95 | ``` 96 | ![alt text](https://github.com/jingw2/size_constrained_clustering/blob/master/pic/da.png) 97 | 98 | Cluster sizes: 1200, 600 and 200 in the figure above, corresponding to the distribution ratios 0.6, 0.3 and 0.1. 99 | 100 | Shrinkage Clustering 101 | 102 | The result might not be available. 103 | ```python 104 | n_samples = 1000 105 | n_clusters = 4 106 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 107 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, centers=centers, shuffle=False, random_state=42) 108 | 109 | model = shrinkage.Shrinkage(n_clusters, size_min=100) 110 | model.fit(X) 111 | centers = model.cluster_centers_ 112 | labels = model.labels_ 113 | ``` 114 | ![alt text](https://github.com/jingw2/size_constrained_clustering/blob/master/pic/shrinkage.png) 115 | 116 | 117 | ## Copyright 118 | Copyright (c) 2020 Jing Wang. Released under the MIT License. 119 | 120 | Third-party copyright in this distribution is noted where applicable.
121 | 122 | ### Reference 123 | * [Clustering with Capacity and Size Constraints: A Deterministic 124 | Approach](http://web.eecs.umich.edu/~mayankb/docs/ClusterCap.pdf) 125 | * [Deterministic Annealing, Clustering and Optimization](https://thesis.library.caltech.edu/2858/1/Rose_k_1991.pdf) 126 | * [Deterministic Annealing, Constrained Clustering, and Optimization](https://authors.library.caltech.edu/78353/1/00170767.pdf) 127 | * [Shrinkage Clustering](https://www.researchgate.net/publication/322668506_Shrinkage_Clustering_A_fast_and_size-constrained_clustering_algorithm_for_biomedical_applications) 128 | * [Clustering with size constraints](https://www.researchgate.net/publication/268292668_Clustering_with_Size_Constraints) 129 | * [Data Clustering with Cluster Size Constraints Using a Modified k-means Algorithm](https://core.ac.uk/download/pdf/61217069.pdf) 130 | * [KMeans Constrained Clustering Inspired by Minimum Cost Flow Problem](https://github.com/joshlk/k-means-constrained) 131 | * [Same Size Kmeans Heuristics Methods](https://elki-project.github.io/tutorial/same-size_k_means) 132 | * [Google's Operations Research tools' 133 | `SimpleMinCostFlow`](https://developers.google.com/optimization/flow/mincostflow) 134 | * [Cluster KMeans Constrained](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-2000-65.pdf) 135 | -------------------------------------------------------------------------------- /size_constrained_clustering/da.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.6 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: da.py, deterministic annealing algorithm 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 11/28/2019 8 | @Paper reference: Clustering with Capacity and Size Constraints: A Deterministic Approach 9 | ''' 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from copy import deepcopy 14 | import collections 15 | import random 16 | from scipy.spatial.distance import cdist 17 | 18 | import os 19 | import sys 20 | path = os.path.dirname(os.path.abspath(__file__)) 21 | sys.path.append(path) 22 | import base 23 | 24 | class DeterministicAnnealing(base.Base): 25 | 26 | def __init__(self, n_clusters, distribution, max_iters=1000, 27 | distance_func=cdist, random_state=42, T=None): 28 | ''' 29 | Args: 30 | n_clusters (int): number of clusters 31 | distribution (list): a list of ratio distribution for each cluster 32 | T (list): inverse choice of beta coefficients 33 | ''' 34 | super(DeterministicAnnealing, self).__init__(n_clusters, max_iters, distance_func) 35 | self.lamb = distribution 36 | assert np.sum(distribution) == 1 37 | assert len(distribution) == n_clusters 38 | assert isinstance(T, list) or T is None 39 | 40 | self.beta = None 41 | self.T = T 42 | self.cluster_centers_ = None 43 | self.labels_ = None 44 | self._eta = None 45 | self._demands_prob = None 46 | random.seed(random_state) 47 | np.random.seed(random_state) 48 | 49 | def fit(self, X, demands_prob=None): 50 | # setting T, loop 51 | T = [1, 0.1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8] 52 | solutions = [] 53 | diff_list = [] 54 | is_early_terminated = False 55 | 56 | n_samples, n_features = X.shape 57 | self.capacity = [n_samples * d for d in self.lamb] 58 | if demands_prob is None: 59 | demands_prob = np.ones((n_samples, 1)) 60 | else: 61 | demands_prob = np.asarray(demands_prob).reshape((-1, 1)) 62 | assert demands_prob.shape[0] == X.shape[0] 63 | demands_prob = demands_prob / sum(demands_prob) 64 | for
t in T: 65 | self.T = t 66 | centers = self.initial_centers(X) 67 | 68 | eta = self.lamb 69 | labels = None 70 | for _ in range(self.max_iters): 71 | self.beta = 1. / self.T 72 | distance_matrix = self.distance_func(X, centers) 73 | eta = self.update_eta(eta, demands_prob, distance_matrix) 74 | gibbs = self.update_gibbs(eta, distance_matrix) 75 | centers = self.update_centers(demands_prob, gibbs, X) 76 | self.T *= 0.999 77 | 78 | labels = np.argmax(gibbs, axis=1) 79 | 80 | if self._is_satisfied(labels): break 81 | 82 | solutions.append([labels, centers]) 83 | resultant_clusters = len(collections.Counter(labels)) 84 | 85 | diff_list.append(abs(resultant_clusters - self.n_clusters)) 86 | if resultant_clusters == self.n_clusters: 87 | is_early_terminated = True 88 | break 89 | 90 | # modification for non-strictly satisfaction, only works for one demand per location 91 | # labels = self.modify(labels, centers, distance_matrix) 92 | if not is_early_terminated: 93 | best_index = np.argmin(diff_list) 94 | labels, centers = solutions[best_index] 95 | 96 | self.cluster_centers_ = centers 97 | self.labels_ = labels 98 | self._eta = eta 99 | self._demands_prob = demands_prob 100 | 101 | def predict(self, X): 102 | distance_matrix = self.distance_func(X, self.cluster_centers_) 103 | eta = self.update_eta(self._eta, self._demands_prob, distance_matrix) 104 | gibbs = self.update_gibbs(eta, distance_matrix) 105 | labels = np.argmax(gibbs, axis=1) 106 | return labels 107 | 108 | def modify(self, labels, centers, distance_matrix): 109 | centers_distance = self.distance_func(centers, centers) 110 | adjacent_centers = {i: np.argsort(centers_distance, axis=1)[i, 1:3].tolist() for i in range(self.n_clusters)} 111 | while not self._is_satisfied(labels): 112 | count = collections.Counter(labels) 113 | cluster_id_list = list(count.keys()) 114 | random.shuffle(cluster_id_list) 115 | for cluster_id in cluster_id_list: 116 | num_points = count[cluster_id] 117 | diff = num_points - self.capacity[cluster_id] 118 | if diff <= 0: 119 | continue 120 | adjacent_cluster = None 121 | adjacent_cluster = random.choice(adjacent_centers[cluster_id]) 122 | if adjacent_cluster is None: 123 | continue 124 | cluster_point_id = np.where(labels==cluster_id)[0].tolist() 125 | diff_distance = distance_matrix[cluster_point_id, adjacent_cluster] \ 126 | - distance_matrix[cluster_point_id, cluster_id] 127 | remove_point_id = np.asarray(cluster_point_id)[np.argsort(diff_distance)[:diff]] 128 | labels[remove_point_id] = adjacent_cluster 129 | 130 | return labels 131 | 132 | def initial_centers(self, X): 133 | selective_centers = random.sample(range(X.shape[0]), self.n_clusters) 134 | centers = X[selective_centers] 135 | return centers 136 | 137 | def _is_satisfied(self, labels): 138 | count = collections.Counter(labels) 139 | for cluster_id in range(len(self.capacity)): 140 | if cluster_id not in count: 141 | return False 142 | num_points = count[cluster_id] 143 | if num_points > self.capacity[cluster_id]: 144 | return False 145 | return True 146 | 147 | def update_eta(self, eta, demands_prob, distance_matrix): 148 | n_points, n_centers = distance_matrix.shape 149 | eta_repmat = np.tile(np.asarray(eta).reshape(1, -1), (n_points, 1)) 150 | exp_term = np.exp(- self.beta * distance_matrix) 151 | divider = exp_term / np.sum(np.multiply(exp_term, 152 | eta_repmat), axis=1).reshape((-1, 1)) 153 | eta = np.divide(np.asarray(self.lamb), 154 | np.sum(divider * demands_prob, axis=0)) 155 | 156 | return eta 157 | 158 | def update_gibbs(self, eta, 
distance_matrix): 159 | n_points, n_centers = distance_matrix.shape 160 | eta_repmat = np.tile(np.asarray(eta).reshape(1, -1), (n_points, 1)) 161 | exp_term = np.exp(- self.beta * distance_matrix) 162 | factor = np.multiply(exp_term, eta_repmat) 163 | gibbs = factor / np.sum(factor, axis=1).reshape((-1, 1)) 164 | return gibbs 165 | 166 | def update_centers(self, demands_prob, gibbs, X): 167 | n_points, n_features = X.shape 168 | divide_up = gibbs.T.dot(X * demands_prob)# n_cluster, n_features 169 | p_y = np.sum(gibbs * demands_prob, axis=0) # n_cluster, 170 | p_y_repmat = np.tile(p_y.reshape(-1, 1), (1, n_features)) 171 | centers = np.divide(divide_up, p_y_repmat) 172 | return centers 173 | 174 | if __name__ == "__main__": 175 | X = [] 176 | n_points = 1000 177 | random_state = 42 178 | random.seed(random_state) 179 | np.random.seed(random_state) 180 | # demands = np.random.randint(1, 24, (n_points, 1)) 181 | X = np.random.rand(n_points, 2) 182 | demands = np.ones((n_points, 1)) 183 | n_clusters = 4 184 | n_iters = 100 185 | max_size = [n_points / n_clusters] * n_clusters 186 | max_size = [0.25, 0.5, 0.1, 0.15] 187 | 188 | da = DeterministicAnnealing(n_clusters, max_size, n_iters) 189 | da.fit(X, demands) 190 | labels = da.labels_ 191 | centers = da.cluster_centers_ 192 | print(centers) 193 | labels_demand_cnt = {} 194 | for i, label in enumerate(labels): 195 | labels_demand_cnt[label] = labels_demand_cnt.get(label, 0) + demands[i][0] 196 | 197 | sorted_labels = sorted(labels_demand_cnt.items()) 198 | x = list(range(n_clusters)) 199 | y = [j for i, j in sorted_labels] 200 | plt.scatter(X[:, 0], X[:, 1], c=labels) 201 | print(collections.Counter(labels_demand_cnt)) 202 | # plt.show() 203 | plt.xlabel("X") 204 | plt.ylabel("Y") 205 | # plt.bar(x, y) 206 | plt.show() 207 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/base.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from collections import defaultdict 3 | 4 | import numpy as np 5 | import six 6 | 7 | from sklearn_import import __version__ 8 | from sklearn_import.funcsigs import signature 9 | 10 | 11 | class BaseEstimator(object): 12 | """Base class for all estimators in scikit-learn 13 | 14 | Notes 15 | ----- 16 | All estimators should specify all the parameters that can be set 17 | at the class level in their ``__init__`` as explicit keyword 18 | arguments (no ``*args`` or ``**kwargs``). 19 | """ 20 | 21 | @classmethod 22 | def _get_param_names(cls): 23 | """Get parameter names for the estimator""" 24 | # fetch the constructor or the original constructor before 25 | # deprecation wrapping if any 26 | init = getattr(cls.__init__, 'deprecated_original', cls.__init__) 27 | if init is object.__init__: 28 | # No explicit constructor to introspect 29 | return [] 30 | 31 | # introspect the constructor arguments to find the model parameters 32 | # to represent 33 | init_signature = signature(init) 34 | # Consider the constructor parameters excluding 'self' 35 | parameters = [p for p in init_signature.parameters.values() 36 | if p.name != 'self' and p.kind != p.VAR_KEYWORD] 37 | for p in parameters: 38 | if p.kind == p.VAR_POSITIONAL: 39 | raise RuntimeError("scikit-learn estimators should always " 40 | "specify their parameters in the signature" 41 | " of their __init__ (no varargs)." 42 | " %s with constructor %s doesn't " 43 | " follow this convention." 
44 | % (cls, init_signature)) 45 | # Extract and sort argument names excluding 'self' 46 | return sorted([p.name for p in parameters]) 47 | 48 | def get_params(self, deep=True): 49 | """Get parameters for this estimator. 50 | 51 | Parameters 52 | ---------- 53 | deep : boolean, optional 54 | If True, will return the parameters for this estimator and 55 | contained subobjects that are estimators. 56 | 57 | Returns 58 | ------- 59 | params : mapping of string to any 60 | Parameter names mapped to their values. 61 | """ 62 | out = dict() 63 | for key in self._get_param_names(): 64 | # We need deprecation warnings to always be on in order to 65 | # catch deprecated param values. 66 | # This is set in utils/__init__.py but it gets overwritten 67 | # when running under python3 somehow. 68 | warnings.simplefilter("always", DeprecationWarning) 69 | try: 70 | with warnings.catch_warnings(record=True) as w: 71 | value = getattr(self, key, None) 72 | if len(w) and w[0].category == DeprecationWarning: 73 | # if the parameter is deprecated, don't show it 74 | continue 75 | finally: 76 | warnings.filters.pop(0) 77 | 78 | # XXX: should we rather test if instance of estimator? 79 | if deep and hasattr(value, 'get_params'): 80 | deep_items = value.get_params().items() 81 | out.update((key + '__' + k, val) for k, val in deep_items) 82 | out[key] = value 83 | return out 84 | 85 | def set_params(self, **params): 86 | """Set the parameters of this estimator. 87 | 88 | The method works on simple estimators as well as on nested objects 89 | (such as pipelines). The latter have parameters of the form 90 | ``__`` so that it's possible to update each 91 | component of a nested object. 92 | 93 | Returns 94 | ------- 95 | self 96 | """ 97 | if not params: 98 | # Simple optimization to gain speed (inspect is slow) 99 | return self 100 | valid_params = self.get_params(deep=True) 101 | 102 | nested_params = defaultdict(dict) # grouped by prefix 103 | for key, value in params.items(): 104 | key, delim, sub_key = key.partition('__') 105 | if key not in valid_params: 106 | raise ValueError('Invalid parameter %s for estimator %s. ' 107 | 'Check the list of available parameters ' 108 | 'with `estimator.get_params().keys()`.' % 109 | (key, self)) 110 | 111 | if delim: 112 | nested_params[key][sub_key] = value 113 | else: 114 | setattr(self, key, value) 115 | 116 | for key, sub_params in nested_params.items(): 117 | valid_params[key].set_params(**sub_params) 118 | 119 | return self 120 | 121 | def __repr__(self): 122 | class_name = self.__class__.__name__ 123 | return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False), 124 | offset=len(class_name),),) 125 | 126 | def __getstate__(self): 127 | try: 128 | state = super(BaseEstimator, self).__getstate__() 129 | except AttributeError: 130 | state = self.__dict__.copy() 131 | 132 | if type(self).__module__.startswith('sklearn.'): 133 | return dict(state.items(), _sklearn_version=__version__) 134 | else: 135 | return state 136 | 137 | def __setstate__(self, state): 138 | if type(self).__module__.startswith('sklearn.'): 139 | pickle_version = state.pop("_sklearn_version", "pre-0.18") 140 | if pickle_version != __version__: 141 | warnings.warn( 142 | "Trying to unpickle estimator {0} from version {1} when " 143 | "using version {2}. This might lead to breaking code or " 144 | "invalid results. 
Use at your own risk.".format( 145 | self.__class__.__name__, pickle_version, __version__), 146 | UserWarning) 147 | try: 148 | super(BaseEstimator, self).__setstate__(state) 149 | except AttributeError: 150 | self.__dict__.update(state) 151 | 152 | 153 | class ClusterMixin(object): 154 | """Mixin class for all cluster estimators in scikit-learn.""" 155 | _estimator_type = "clusterer" 156 | 157 | def fit_predict(self, X, y=None): 158 | """Performs clustering on X and returns cluster labels. 159 | 160 | Parameters 161 | ---------- 162 | X : ndarray, shape (n_samples, n_features) 163 | Input data. 164 | 165 | Returns 166 | ------- 167 | y : ndarray, shape (n_samples,) 168 | cluster labels 169 | """ 170 | # non-optimized default implementation; override when a better 171 | # method is possible for a given clustering algorithm 172 | self.fit(X) 173 | return self.labels_ 174 | 175 | 176 | class TransformerMixin(object): 177 | """Mixin class for all transformers in scikit-learn.""" 178 | 179 | def fit_transform(self, X, y=None, **fit_params): 180 | """Fit to data, then transform it. 181 | 182 | Fits transformer to X and y with optional parameters fit_params 183 | and returns a transformed version of X. 184 | 185 | Parameters 186 | ---------- 187 | X : numpy array of shape [n_samples, n_features] 188 | Training set. 189 | 190 | y : numpy array of shape [n_samples] 191 | Target values. 192 | 193 | Returns 194 | ------- 195 | X_new : numpy array of shape [n_samples, n_features_new] 196 | Transformed array. 197 | 198 | """ 199 | # non-optimized default implementation; override when a better 200 | # method is possible for a given clustering algorithm 201 | if y is None: 202 | # fit method of arity 1 (unsupervised transformation) 203 | return self.fit(X, **fit_params).transform(X) 204 | else: 205 | # fit method of arity 2 (supervised transformation) 206 | return self.fit(X, y, **fit_params).transform(X) 207 | 208 | 209 | def _pprint(params, offset=0, printer=repr): 210 | """Pretty print the dictionary 'params' 211 | 212 | Parameters 213 | ---------- 214 | params : dict 215 | The dictionary to pretty print 216 | 217 | offset : int 218 | The offset in characters to add at the begin of each line. 219 | 220 | printer : callable 221 | The function to convert entries to strings, typically 222 | the builtin str or repr 223 | 224 | """ 225 | # Do a multi-line justified repr: 226 | options = np.get_printoptions() 227 | np.set_printoptions(precision=5, threshold=64, edgeitems=2) 228 | params_list = list() 229 | this_line_length = offset 230 | line_sep = ',\n' + (1 + offset // 2) * ' ' 231 | for i, (k, v) in enumerate(sorted(six.iteritems(params))): 232 | if type(v) is float: 233 | # use str for representing floating point numbers 234 | # this way we get consistent representation across 235 | # architectures and versions. 236 | this_repr = '%s=%s' % (k, str(v)) 237 | else: 238 | # use repr of the rest 239 | this_repr = '%s=%s' % (k, printer(v)) 240 | if len(this_repr) > 500: 241 | this_repr = this_repr[:300] + '...' 
+ this_repr[-100:] 242 | if i > 0: 243 | if (this_line_length + len(this_repr) >= 75 or '\n' in this_repr): 244 | params_list.append(line_sep) 245 | this_line_length = len(line_sep) 246 | else: 247 | params_list.append(', ') 248 | this_line_length += 2 249 | params_list.append(this_repr) 250 | this_line_length += len(this_repr) 251 | 252 | np.set_printoptions(**options) 253 | lines = ''.join(params_list) 254 | # Strip trailing space to avoid nightmare in doctests 255 | lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n')) 256 | return lines 257 | -------------------------------------------------------------------------------- /size_constrained_clustering/equal.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: same_size_kmeans.py, equal size clustering with heuristics 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 06/16/2020 8 | @paper: 9 | @github reference: https://github.com/joshlk/k-means-constrained 10 | @Web: https://elki-project.github.io/tutorial/same-size_k_means 11 | ''' 12 | 13 | from scipy.spatial.distance import cdist 14 | import numpy as np 15 | # from sklearn.cluster._k_means import _k_init 16 | from sklearn.preprocessing import OneHotEncoder 17 | import collections 18 | import warnings 19 | 20 | import sys 21 | import os 22 | path = os.path.dirname(os.path.abspath(__file__)) 23 | sys.path.append(path) 24 | import base 25 | from k_means_constrained import KMeansConstrained 26 | 27 | class SameSizeKMeansHeuristics(base.Base): 28 | 29 | def __init__(self, n_clusters, max_iters=1000, distance_func=cdist, random_state=42): 30 | ''' 31 | Args: 32 | n_clusters (int): number of clusters 33 | max_iters (int): maximum iterations 34 | distance_func (object): callable function with input (X, centers) / None, by default is l2-distance 35 | random_state (int): random state to initiate, by default it is 42 36 | ''' 37 | super(SameSizeKMeansHeuristics, self).__init__(n_clusters, max_iters, distance_func) 38 | self.random_state = np.random.RandomState(random_state) 39 | 40 | def fit(self, X): 41 | ''' 42 | Args: 43 | X (array like): shape (n_samples, n_features) 44 | ''' 45 | n_samples, _ = X.shape 46 | minsize = n_samples // self.n_clusters 47 | maxsize = (n_samples + self.n_clusters - 1) // self.n_clusters 48 | if minsize != maxsize: 49 | warnings.warn("Cluster minimum and maximum size are {} and {}, respectively".format(minsize, maxsize)) 50 | 51 | # initiate 52 | labels = self._init(X) 53 | encoder = OneHotEncoder() 54 | labels_onehot = encoder.fit_transform(labels.reshape((-1, 1))).toarray() 55 | itr = 0 56 | clusters = collections.Counter(labels) 57 | while True: 58 | # update centers 59 | labels_onehot = encoder.fit_transform(labels.reshape((-1, 1))).toarray() 60 | centers = self.update_centers(X, labels_onehot) 61 | # compute distance to centers 62 | dist_mat = self.distance_func(X, centers) 63 | # calculate preference 64 | labels = labels.astype(int) 65 | preference = dist_mat[range(n_samples), labels] - np.min(dist_mat, axis=1) 66 | argsort = np.argsort(preference)[::-1] # descending order 67 | # transfer list 68 | transfer = {c: [] for c in range(self.n_clusters)} 69 | 70 | for sample_id in argsort: 71 | source = labels[sample_id] 72 | dest = np.argmin(dist_mat[sample_id]) 73 | 74 | # cannot transfer to same cluster 75 | if source == dest: 76 | continue 77 | 78 | sample_gain = dist_mat[sample_id][source] - dist_mat[sample_id][dest] 79 | 80 | # find if there is 
pair transfer 81 | dest_transfer = transfer[dest] 82 | gains = {} 83 | for other_id in dest_transfer: 84 | other_gain = dist_mat[other_id][dest] - dist_mat[other_id][source] 85 | gain = sample_gain + other_gain 86 | if gain > 0: 87 | gains[other_id] = gain 88 | if len(gains) > 0: 89 | other_id = sorted(gains.items(), key=lambda x: x[1], reverse=True)[0][0] 90 | labels[other_id], labels[sample_id] \ 91 | = labels[sample_id], labels[other_id] 92 | transfer[dest].remove(other_id) 93 | if sample_id in transfer[source]: 94 | transfer[source].remove(sample_id) 95 | continue 96 | 97 | # if cluster size allows, move a single object 98 | if (sample_gain > 0 and clusters[dest] < maxsize and clusters[source] > minsize): 99 | labels[sample_id] = dest 100 | clusters[dest] += 1 101 | clusters[source] -= 1 102 | if sample_id in transfer[source]: 103 | transfer[source].remove(sample_id) 104 | continue 105 | 106 | # if the object would prefer a different cluster, put in transfer list 107 | if (sample_gain > 0): 108 | transfer[source].append(sample_id) 109 | 110 | if len(transfer) <= 0: 111 | break 112 | 113 | itr += 1 114 | pending = sum([len(val) for key, val in transfer.items()]) 115 | if itr >= self.max_iters: 116 | print("Reach maximum iterations! Now pending transfer samples {}!".format(pending)) 117 | break 118 | 119 | self.cluster_centers_ = centers 120 | self.labels_ = labels 121 | 122 | def predict(self, X): 123 | ''' 124 | Predict labels based input X 125 | Args: 126 | X (array like): shape (n_samples, n_features) 127 | ''' 128 | dist_mat = self.distance_func(X, self.cluster_centers_) 129 | labels = np.argmin(dist_mat, axis=1) 130 | return labels 131 | 132 | def update_centers(self, X, labels): 133 | ''' 134 | Update centers 135 | Args: 136 | X (array like): (n_samples, n_features) 137 | labels (array like): (n_samples, n_clusters), one-hot array 138 | 139 | Return: 140 | centers (array like): (n_clusters, n_features) 141 | ''' 142 | centers = (X.T.dot(labels)).T / np.sum(labels, axis=0).reshape((-1, 1)) 143 | return centers 144 | 145 | def _init(self, X): 146 | ''' 147 | Initiate centroids based on X input with kmeans ++ 148 | 149 | Args: 150 | X (array like): shape is (n_samples, n_features) 151 | 152 | Returns: 153 | labels (array like): shape is (n_samples,) 154 | ''' 155 | n_samples, n_features = X.shape 156 | max_size = (n_samples + self.n_clusters - 1) // self.n_clusters 157 | # initiate centroids with kmeans++ 158 | X_squared_norm = np.sum(np.square(X), axis=1) 159 | centers = base.k_init(X, self.n_clusters, X_squared_norm, self.random_state) 160 | 161 | # calculate priority 162 | dist_mat = self.distance_func(X, centers) # (n_samples, n_clusters) 163 | priority = np.max(dist_mat, axis=1) - np.min(dist_mat, axis=1) 164 | argsort = np.argsort(priority)[::-1] # descending order 165 | clusters = {i: 0 for i in range(self.n_clusters)} 166 | 167 | # assign to clusters based on priority 168 | samples = list(range(n_samples)) 169 | visited = set() 170 | dist_mat_copy = dist_mat.copy() 171 | m = np.zeros_like(dist_mat_copy) 172 | labels = np.zeros(n_samples) 173 | while len(samples) > 0: 174 | for sample_id in argsort: 175 | if sample_id in visited: 176 | continue 177 | cluster_id = np.argmin(dist_mat_copy[sample_id]) 178 | if clusters[cluster_id] < max_size: 179 | labels[sample_id] = cluster_id 180 | clusters[cluster_id] += 1 181 | samples.remove(sample_id) 182 | visited.add(sample_id) 183 | else: 184 | break 185 | dist_mat_copy = dist_mat.copy() 186 | # mask full cluster column 187 | m[:, 
cluster_id] = 1 188 | dist_mat_copy = np.ma.masked_array(dist_mat_copy, m) 189 | priority = np.max(dist_mat_copy, axis=1) - np.min(dist_mat_copy, axis=1) 190 | argsort = np.argsort(priority)[::-1] # descending order 191 | 192 | return labels 193 | 194 | class SameSizeKMeansMinCostFlow(base.Base): 195 | 196 | def __init__(self, n_clusters, max_iters=1000, distance_func=cdist, random_state=42): 197 | ''' 198 | Args: 199 | n_clusters (int): number of clusters 200 | max_iters (int): maximum iterations 201 | distance_func (object): callable function with input (X, centers) / None, by default is l2-distance 202 | random_state (int): random state to initiate, by default it is 42 203 | ''' 204 | super(SameSizeKMeansMinCostFlow, self).__init__(n_clusters, max_iters, distance_func) 205 | self.clf = None 206 | 207 | def fit(self, X): 208 | n_samples, n_features = X.shape 209 | minsize = n_samples // self.n_clusters 210 | maxsize = (n_samples + self.n_clusters - 1) // self.n_clusters 211 | 212 | clf = KMeansConstrained(self.n_clusters, size_min=minsize, 213 | size_max=maxsize, distance_func=self.distance_func) 214 | 215 | if minsize != maxsize: 216 | warnings.warn("Cluster minimum and maximum size are {} and {}, respectively".format(minsize, maxsize)) 217 | 218 | clf.fit(X) 219 | 220 | self.clf = clf 221 | self.cluster_centers_ = self.clf.cluster_centers_ 222 | self.labels_ = self.clf.labels_ 223 | 224 | def predict(self, X): 225 | return self.clf.predict(X) 226 | 227 | if __name__ == "__main__": 228 | from sklearn.datasets import make_blobs 229 | from matplotlib import pyplot as plt 230 | from seaborn import scatterplot as scatter 231 | from sklearn.metrics.pairwise import haversine_distances 232 | n_samples = 2000 233 | n_clusters = 4 # use 3 bins for calibration_curve as we have 3 clusters here 234 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 235 | 236 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, 237 | centers=centers, shuffle=False, random_state=42) 238 | 239 | # X = np.random.rand(n_samples, 2) 240 | equal = SameSizeKMeansMinCostFlow(n_clusters) 241 | equal.fit(X) 242 | 243 | fcm_centers = equal.cluster_centers_ 244 | fcm_labels = equal.labels_ 245 | 246 | f, axes = plt.subplots(1, 2, figsize=(11, 5)) 247 | scatter(X[:, 0], X[:, 1], ax=axes[0]) 248 | scatter(X[:, 0], X[:, 1], ax=axes[1], hue=fcm_labels) 249 | scatter(fcm_centers[:, 0], fcm_centers[:, 1], ax=axes[1], marker="s",s=200) 250 | plt.show() 251 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/sparsefuncs_fast.pyx: -------------------------------------------------------------------------------- 1 | # Authors: Mathieu Blondel 2 | # Olivier Grisel 3 | # Peter Prettenhofer 4 | # Lars Buitinck 5 | # Giorgio Patrini 6 | # 7 | # License: BSD 3 clause 8 | 9 | #!python 10 | #cython: boundscheck=False, wraparound=False, cdivision=True 11 | 12 | from libc.math cimport fabs, sqrt, pow 13 | cimport numpy as np 14 | import numpy as np 15 | import scipy.sparse as sp 16 | cimport cython 17 | from cython cimport floating 18 | 19 | np.import_array() 20 | 21 | 22 | ctypedef np.float64_t DOUBLE 23 | 24 | def csr_row_norms(X): 25 | """L2 norm of each row in CSR matrix X.""" 26 | if X.dtype != np.float32: 27 | X = X.astype(np.float64) 28 | return _csr_row_norms(X.data, X.shape, X.indices, X.indptr) 29 | 30 | 31 | def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data, 32 | shape, 33 | np.ndarray[int, ndim=1, mode="c"] X_indices, 34 | 
np.ndarray[int, ndim=1, mode="c"] X_indptr): 35 | cdef: 36 | unsigned int n_samples = shape[0] 37 | unsigned int n_features = shape[1] 38 | np.ndarray[DOUBLE, ndim=1, mode="c"] norms 39 | 40 | np.npy_intp i, j 41 | double sum_ 42 | 43 | norms = np.zeros(n_samples, dtype=np.float64) 44 | 45 | for i in range(n_samples): 46 | sum_ = 0.0 47 | for j in range(X_indptr[i], X_indptr[i + 1]): 48 | sum_ += X_data[j] * X_data[j] 49 | norms[i] = sum_ 50 | 51 | return norms 52 | 53 | 54 | def csr_mean_variance_axis0(X): 55 | """Compute mean and variance along axis 0 on a CSR matrix 56 | 57 | Parameters 58 | ---------- 59 | X : CSR sparse matrix, shape (n_samples, n_features) 60 | Input data. 61 | 62 | Returns 63 | ------- 64 | 65 | means : float array with shape (n_features,) 66 | Feature-wise means 67 | 68 | variances : float array with shape (n_features,) 69 | Feature-wise variances 70 | 71 | """ 72 | if X.dtype != np.float32: 73 | X = X.astype(np.float64) 74 | return _csr_mean_variance_axis0(X.data, X.shape, X.indices) 75 | 76 | 77 | def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, 78 | shape, 79 | np.ndarray[int, ndim=1] X_indices): 80 | # Implement the function here since variables using fused types 81 | # cannot be declared directly and can only be passed as function arguments 82 | cdef unsigned int n_samples = shape[0] 83 | cdef unsigned int n_features = shape[1] 84 | 85 | cdef unsigned int i 86 | cdef unsigned int non_zero = X_indices.shape[0] 87 | cdef unsigned int col_ind 88 | cdef floating diff 89 | 90 | # means[j] contains the mean of feature j 91 | cdef np.ndarray[floating, ndim=1] means 92 | # variances[j] contains the variance of feature j 93 | cdef np.ndarray[floating, ndim=1] variances 94 | 95 | if floating is float: 96 | dtype = np.float32 97 | else: 98 | dtype = np.float64 99 | 100 | means = np.zeros(n_features, dtype=dtype) 101 | variances = np.zeros_like(means, dtype=dtype) 102 | 103 | # counts[j] contains the number of samples where feature j is non-zero 104 | cdef np.ndarray[int, ndim=1] counts = np.zeros(n_features, 105 | dtype=np.int32) 106 | 107 | for i in xrange(non_zero): 108 | col_ind = X_indices[i] 109 | means[col_ind] += X_data[i] 110 | 111 | means /= n_samples 112 | 113 | for i in xrange(non_zero): 114 | col_ind = X_indices[i] 115 | diff = X_data[i] - means[col_ind] 116 | variances[col_ind] += diff * diff 117 | counts[col_ind] += 1 118 | 119 | for i in xrange(n_features): 120 | variances[i] += (n_samples - counts[i]) * means[i] ** 2 121 | variances[i] /= n_samples 122 | 123 | return means, variances 124 | 125 | 126 | def csc_mean_variance_axis0(X): 127 | """Compute mean and variance along axis 0 on a CSC matrix 128 | 129 | Parameters 130 | ---------- 131 | X : CSC sparse matrix, shape (n_samples, n_features) 132 | Input data. 
133 | 134 | Returns 135 | ------- 136 | 137 | means : float array with shape (n_features,) 138 | Feature-wise means 139 | 140 | variances : float array with shape (n_features,) 141 | Feature-wise variances 142 | 143 | """ 144 | if X.dtype != np.float32: 145 | X = X.astype(np.float64) 146 | return _csc_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr) 147 | 148 | 149 | def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, 150 | shape, 151 | np.ndarray[int, ndim=1] X_indices, 152 | np.ndarray[int, ndim=1] X_indptr): 153 | # Implement the function here since variables using fused types 154 | # cannot be declared directly and can only be passed as function arguments 155 | cdef unsigned int n_samples = shape[0] 156 | cdef unsigned int n_features = shape[1] 157 | 158 | cdef unsigned int i 159 | cdef unsigned int j 160 | cdef unsigned int counts 161 | cdef unsigned int startptr 162 | cdef unsigned int endptr 163 | cdef floating diff 164 | 165 | # means[j] contains the mean of feature j 166 | cdef np.ndarray[floating, ndim=1] means 167 | # variances[j] contains the variance of feature j 168 | cdef np.ndarray[floating, ndim=1] variances 169 | if floating is float: 170 | dtype = np.float32 171 | else: 172 | dtype = np.float64 173 | 174 | means = np.zeros(n_features, dtype=dtype) 175 | variances = np.zeros_like(means, dtype=dtype) 176 | 177 | for i in xrange(n_features): 178 | 179 | startptr = X_indptr[i] 180 | endptr = X_indptr[i + 1] 181 | counts = endptr - startptr 182 | 183 | for j in xrange(startptr, endptr): 184 | means[i] += X_data[j] 185 | means[i] /= n_samples 186 | 187 | for j in xrange(startptr, endptr): 188 | diff = X_data[j] - means[i] 189 | variances[i] += diff * diff 190 | 191 | variances[i] += (n_samples - counts) * means[i] * means[i] 192 | variances[i] /= n_samples 193 | 194 | return means, variances 195 | 196 | 197 | def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n): 198 | """Compute mean and variance along axis 0 on a CSR or CSC matrix. 199 | 200 | last_mean, last_var are the statistics computed at the last step by this 201 | function. Both must be initilized to 0.0. last_n is the 202 | number of samples encountered until now and is initialized at 0. 203 | 204 | Parameters 205 | ---------- 206 | X : CSR or CSC sparse matrix, shape (n_samples, n_features) 207 | Input data. 208 | 209 | last_mean : float array with shape (n_features,) 210 | Array of feature-wise means to update with the new data X. 211 | 212 | last_var : float array with shape (n_features,) 213 | Array of feature-wise var to update with the new data X. 214 | 215 | last_n : int 216 | Number of samples seen so far, before X. 217 | 218 | Returns 219 | ------- 220 | 221 | updated_mean : float array with shape (n_features,) 222 | Feature-wise means 223 | 224 | updated_variance : float array with shape (n_features,) 225 | Feature-wise variances 226 | 227 | updated_n : int 228 | Updated number of samples seen 229 | 230 | References 231 | ---------- 232 | 233 | T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample 234 | variance: recommendations, The American Statistician, Vol. 37, No. 3, 235 | pp. 242-247 236 | 237 | Also, see the non-sparse implementation of this in 238 | `utils.extmath._batch_mean_variance_update`. 
239 | 240 | """ 241 | if X.dtype != np.float32: 242 | X = X.astype(np.float64) 243 | return _incr_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr, 244 | X.format, last_mean, last_var, last_n) 245 | 246 | 247 | def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, 248 | shape, 249 | np.ndarray[int, ndim=1] X_indices, 250 | np.ndarray[int, ndim=1] X_indptr, 251 | X_format, 252 | last_mean, 253 | last_var, 254 | unsigned long last_n): 255 | # Implement the function here since variables using fused types 256 | # cannot be declared directly and can only be passed as function arguments 257 | cdef unsigned long n_samples = shape[0] 258 | cdef unsigned int n_features = shape[1] 259 | cdef unsigned int i 260 | 261 | # last = stats until now 262 | # new = the current increment 263 | # updated = the aggregated stats 264 | # when arrays, they are indexed by i per-feature 265 | cdef np.ndarray[floating, ndim=1] new_mean 266 | cdef np.ndarray[floating, ndim=1] new_var 267 | cdef np.ndarray[floating, ndim=1] updated_mean 268 | cdef np.ndarray[floating, ndim=1] updated_var 269 | if floating is float: 270 | dtype = np.float32 271 | else: 272 | dtype = np.float64 273 | 274 | new_mean = np.zeros(n_features, dtype=dtype) 275 | new_var = np.zeros_like(new_mean, dtype=dtype) 276 | updated_mean = np.zeros_like(new_mean, dtype=dtype) 277 | updated_var = np.zeros_like(new_mean, dtype=dtype) 278 | 279 | cdef unsigned long new_n 280 | cdef unsigned long updated_n 281 | cdef floating last_over_new_n 282 | 283 | # Obtain new stats first 284 | new_n = n_samples 285 | 286 | if X_format == 'csr': 287 | # X is a CSR matrix 288 | new_mean, new_var = _csr_mean_variance_axis0(X_data, shape, X_indices) 289 | else: 290 | # X is a CSC matrix 291 | new_mean, new_var = _csc_mean_variance_axis0(X_data, shape, X_indices, 292 | X_indptr) 293 | 294 | # First pass 295 | if last_n == 0: 296 | return new_mean, new_var, new_n 297 | # Next passes 298 | else: 299 | updated_n = last_n + new_n 300 | last_over_new_n = last_n / new_n 301 | 302 | for i in xrange(n_features): 303 | # Unnormalized old stats 304 | last_mean[i] *= last_n 305 | last_var[i] *= last_n 306 | 307 | # Unnormalized new stats 308 | new_mean[i] *= new_n 309 | new_var[i] *= new_n 310 | 311 | # Update stats 312 | updated_var[i] = (last_var[i] + new_var[i] + 313 | last_over_new_n / updated_n * 314 | (last_mean[i] / last_over_new_n - new_mean[i]) ** 2) 315 | 316 | updated_mean[i] = (last_mean[i] + new_mean[i]) / updated_n 317 | updated_var[i] = updated_var[i] / updated_n 318 | 319 | return updated_mean, updated_var, updated_n 320 | 321 | 322 | def inplace_csr_row_normalize_l1(X): 323 | """Inplace row normalize using the l1 norm""" 324 | _inplace_csr_row_normalize_l1(X.data, X.shape, X.indices, X.indptr) 325 | 326 | 327 | def _inplace_csr_row_normalize_l1(np.ndarray[floating, ndim=1] X_data, 328 | shape, 329 | np.ndarray[int, ndim=1] X_indices, 330 | np.ndarray[int, ndim=1] X_indptr): 331 | cdef unsigned int n_samples = shape[0] 332 | cdef unsigned int n_features = shape[1] 333 | 334 | # the column indices for row i are stored in: 335 | # indices[indptr[i]:indices[i+1]] 336 | # and their corresponding values are stored in: 337 | # data[indptr[i]:indptr[i+1]] 338 | cdef unsigned int i 339 | cdef unsigned int j 340 | cdef double sum_ 341 | 342 | for i in xrange(n_samples): 343 | sum_ = 0.0 344 | 345 | for j in xrange(X_indptr[i], X_indptr[i + 1]): 346 | sum_ += fabs(X_data[j]) 347 | 348 | if sum_ == 0.0: 349 | # do not normalize empty rows (can happen if CSR 
is not pruned 350 | # correctly) 351 | continue 352 | 353 | for j in xrange(X_indptr[i], X_indptr[i + 1]): 354 | X_data[j] /= sum_ 355 | 356 | 357 | def inplace_csr_row_normalize_l2(X): 358 | """Inplace row normalize using the l2 norm""" 359 | _inplace_csr_row_normalize_l2(X.data, X.shape, X.indices, X.indptr) 360 | 361 | 362 | def _inplace_csr_row_normalize_l2(np.ndarray[floating, ndim=1] X_data, 363 | shape, 364 | np.ndarray[int, ndim=1] X_indices, 365 | np.ndarray[int, ndim=1] X_indptr): 366 | cdef unsigned int n_samples = shape[0] 367 | cdef unsigned int n_features = shape[1] 368 | 369 | cdef unsigned int i 370 | cdef unsigned int j 371 | cdef double sum_ 372 | 373 | for i in xrange(n_samples): 374 | sum_ = 0.0 375 | 376 | for j in xrange(X_indptr[i], X_indptr[i + 1]): 377 | sum_ += (X_data[j] * X_data[j]) 378 | 379 | if sum_ == 0.0: 380 | # do not normalize empty rows (can happen if CSR is not pruned 381 | # correctly) 382 | continue 383 | 384 | sum_ = sqrt(sum_) 385 | 386 | for j in xrange(X_indptr[i], X_indptr[i + 1]): 387 | X_data[j] /= sum_ 388 | 389 | 390 | def assign_rows_csr(X, 391 | np.ndarray[np.npy_intp, ndim=1] X_rows, 392 | np.ndarray[np.npy_intp, ndim=1] out_rows, 393 | np.ndarray[floating, ndim=2, mode="c"] out): 394 | """Densify selected rows of a CSR matrix into a preallocated array. 395 | 396 | Like out[out_rows] = X[X_rows].toarray() but without copying. 397 | No-copy supported for both dtype=np.float32 and dtype=np.float64. 398 | 399 | Parameters 400 | ---------- 401 | X : scipy.sparse.csr_matrix, shape=(n_samples, n_features) 402 | X_rows : array, dtype=np.intp, shape=n_rows 403 | out_rows : array, dtype=np.intp, shape=n_rows 404 | out : array, shape=(arbitrary, n_features) 405 | """ 406 | cdef: 407 | # npy_intp (np.intp in Python) is what np.where returns, 408 | # but int is what scipy.sparse uses. 409 | int i, ind, j 410 | np.npy_intp rX 411 | np.ndarray[floating, ndim=1] data = X.data 412 | np.ndarray[int, ndim=1] indices = X.indices, indptr = X.indptr 413 | 414 | if X_rows.shape[0] != out_rows.shape[0]: 415 | raise ValueError("cannot assign %d rows to %d" 416 | % (X_rows.shape[0], out_rows.shape[0])) 417 | 418 | out[out_rows] = 0. 419 | for i in range(X_rows.shape[0]): 420 | rX = X_rows[i] 421 | for ind in range(indptr[rX], indptr[rX + 1]): 422 | j = indices[ind] 423 | out[out_rows[i], j] = data[ind] -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/validation.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import warnings 3 | 4 | import numpy as np 5 | from scipy import sparse as sp 6 | from sklearn_import.exceptions import NotFittedError 7 | 8 | from sklearn_import import get_config as _get_config 9 | 10 | from sklearn_import.exceptions import DataConversionWarning 11 | import six 12 | 13 | 14 | def check_array(array, accept_sparse=False, dtype="numeric", order=None, 15 | copy=False, force_all_finite=True, ensure_2d=True, 16 | allow_nd=False, ensure_min_samples=1, ensure_min_features=1, 17 | warn_on_dtype=False, estimator=None): 18 | """Input validation on an array, list, sparse matrix or similar. 19 | 20 | By default, the input is converted to an at least 2D numpy array. 21 | If the dtype of the array is object, attempt converting to float, 22 | raising on failure. 23 | 24 | Parameters 25 | ---------- 26 | array : object 27 | Input object to check / convert. 
28 | 29 | accept_sparse : string, boolean or list/tuple of strings (default=False) 30 | String[s] representing allowed sparse matrix formats, such as 'csc', 31 | 'csr', etc. If the input is sparse but not in the allowed format, 32 | it will be converted to the first listed format. True allows the input 33 | to be any format. False means that a sparse matrix input will 34 | raise an error. 35 | 36 | .. deprecated:: 0.19 37 | Passing 'None' to parameter ``accept_sparse`` in methods is 38 | deprecated in version 0.19 "and will be removed in 0.21. Use 39 | ``accept_sparse=False`` instead. 40 | 41 | dtype : string, type, list of types or None (default="numeric") 42 | Data type of result. If None, the dtype of the input is preserved. 43 | If "numeric", dtype is preserved unless array.dtype is object. 44 | If dtype is a list of types, conversion on the first type is only 45 | performed if the dtype of the input is not in the list. 46 | 47 | order : 'F', 'C' or None (default=None) 48 | Whether an array will be forced to be fortran or c-style. 49 | When order is None (default), then if copy=False, nothing is ensured 50 | about the memory layout of the output array; otherwise (copy=True) 51 | the memory layout of the returned array is kept as close as possible 52 | to the original array. 53 | 54 | copy : boolean (default=False) 55 | Whether a forced copy will be triggered. If copy=False, a copy might 56 | be triggered by a conversion. 57 | 58 | force_all_finite : boolean (default=True) 59 | Whether to raise an error on np.inf and np.nan in X. 60 | 61 | ensure_2d : boolean (default=True) 62 | Whether to raise a value error if X is not 2d. 63 | 64 | allow_nd : boolean (default=False) 65 | Whether to allow X.ndim > 2. 66 | 67 | ensure_min_samples : int (default=1) 68 | Make sure that the array has a minimum number of samples in its first 69 | axis (rows for a 2D array). Setting to 0 disables this check. 70 | 71 | ensure_min_features : int (default=1) 72 | Make sure that the 2D array has some minimum number of features 73 | (columns). The default value of 1 rejects empty datasets. 74 | This check is only enforced when the input data has effectively 2 75 | dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 76 | disables this check. 77 | 78 | warn_on_dtype : boolean (default=False) 79 | Raise DataConversionWarning if the dtype of the input data structure 80 | does not match the requested dtype, causing a memory copy. 81 | 82 | estimator : str or estimator instance (default=None) 83 | If passed, include the name of the estimator in warning messages. 84 | 85 | Returns 86 | ------- 87 | X_converted : object 88 | The converted and validated X. 89 | 90 | """ 91 | # accept_sparse 'None' deprecation check 92 | if accept_sparse is None: 93 | warnings.warn( 94 | "Passing 'None' to parameter 'accept_sparse' in methods " 95 | "check_array and check_X_y is deprecated in version 0.19 " 96 | "and will be removed in 0.21. Use 'accept_sparse=False' " 97 | " instead.", DeprecationWarning) 98 | accept_sparse = False 99 | 100 | # store whether originally we wanted numeric dtype 101 | dtype_numeric = isinstance(dtype, six.string_types) and dtype == "numeric" 102 | 103 | dtype_orig = getattr(array, "dtype", None) 104 | if not hasattr(dtype_orig, 'kind'): 105 | # not a data type (e.g. a column named dtype in a pandas DataFrame) 106 | dtype_orig = None 107 | 108 | if dtype_numeric: 109 | if dtype_orig is not None and dtype_orig.kind == "O": 110 | # if input is object, convert to float. 
111 | dtype = np.float64 112 | else: 113 | dtype = None 114 | 115 | if isinstance(dtype, (list, tuple)): 116 | if dtype_orig is not None and dtype_orig in dtype: 117 | # no dtype conversion required 118 | dtype = None 119 | else: 120 | # dtype conversion required. Let's select the first element of the 121 | # list of accepted types. 122 | dtype = dtype[0] 123 | 124 | if estimator is not None: 125 | if isinstance(estimator, six.string_types): 126 | estimator_name = estimator 127 | else: 128 | estimator_name = estimator.__class__.__name__ 129 | else: 130 | estimator_name = "Estimator" 131 | context = " by %s" % estimator_name if estimator is not None else "" 132 | 133 | if sp.issparse(array): 134 | array = _ensure_sparse_format(array, accept_sparse, dtype, copy, 135 | force_all_finite) 136 | else: 137 | array = np.array(array, dtype=dtype, order=order, copy=copy) 138 | 139 | if ensure_2d: 140 | if array.ndim == 1: 141 | raise ValueError( 142 | "Expected 2D array, got 1D array instead:\narray={}.\n" 143 | "Reshape your data either using array.reshape(-1, 1) if " 144 | "your data has a single feature or array.reshape(1, -1) " 145 | "if it contains a single sample.".format(array)) 146 | array = np.atleast_2d(array) 147 | # To ensure that array flags are maintained 148 | array = np.array(array, dtype=dtype, order=order, copy=copy) 149 | 150 | # make sure we actually converted to numeric: 151 | if dtype_numeric and array.dtype.kind == "O": 152 | array = array.astype(np.float64) 153 | if not allow_nd and array.ndim >= 3: 154 | raise ValueError("Found array with dim %d. %s expected <= 2." 155 | % (array.ndim, estimator_name)) 156 | if force_all_finite: 157 | _assert_all_finite(array) 158 | 159 | shape_repr = _shape_repr(array.shape) 160 | if ensure_min_samples > 0: 161 | n_samples = _num_samples(array) 162 | if n_samples < ensure_min_samples: 163 | raise ValueError("Found array with %d sample(s) (shape=%s) while a" 164 | " minimum of %d is required%s." 165 | % (n_samples, shape_repr, ensure_min_samples, 166 | context)) 167 | 168 | if ensure_min_features > 0 and array.ndim == 2: 169 | n_features = array.shape[1] 170 | if n_features < ensure_min_features: 171 | raise ValueError("Found array with %d feature(s) (shape=%s) while" 172 | " a minimum of %d is required%s." 173 | % (n_features, shape_repr, ensure_min_features, 174 | context)) 175 | 176 | if warn_on_dtype and dtype_orig is not None and array.dtype != dtype_orig: 177 | msg = ("Data with input dtype %s was converted to %s%s." 178 | % (dtype_orig, array.dtype, context)) 179 | warnings.warn(msg, DataConversionWarning) 180 | return array 181 | 182 | 183 | def check_random_state(seed): 184 | """Turn seed into a np.random.RandomState instance 185 | 186 | Parameters 187 | ---------- 188 | seed : None | int | instance of RandomState 189 | If seed is None, return the RandomState singleton used by np.random. 190 | If seed is an int, return a new RandomState instance seeded with seed. 191 | If seed is already a RandomState instance, return it. 192 | Otherwise raise ValueError. 
193 | """ 194 | if seed is None or seed is np.random: 195 | return np.random.mtrand._rand 196 | if isinstance(seed, (numbers.Integral, np.integer)): 197 | return np.random.RandomState(seed) 198 | if isinstance(seed, np.random.RandomState): 199 | return seed 200 | raise ValueError('%r cannot be used to seed a numpy.random.RandomState' 201 | ' instance' % seed) 202 | 203 | 204 | def as_float_array(X, copy=True, force_all_finite=True): 205 | """Converts an array-like to an array of floats. 206 | 207 | The new dtype will be np.float32 or np.float64, depending on the original 208 | type. The function can create a copy or modify the argument depending 209 | on the argument copy. 210 | 211 | Parameters 212 | ---------- 213 | X : {array-like, sparse matrix} 214 | 215 | copy : bool, optional 216 | If True, a copy of X will be created. If False, a copy may still be 217 | returned if X's dtype is not a floating point type. 218 | 219 | force_all_finite : boolean (default=True) 220 | Whether to raise an error on np.inf and np.nan in X. 221 | 222 | Returns 223 | ------- 224 | XT : {array, sparse matrix} 225 | An array of type np.float 226 | """ 227 | if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray) 228 | and not sp.issparse(X)): 229 | return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64, 230 | copy=copy, force_all_finite=force_all_finite, 231 | ensure_2d=False) 232 | elif sp.issparse(X) and X.dtype in [np.float32, np.float64]: 233 | return X.copy() if copy else X 234 | elif X.dtype in [np.float32, np.float64]: # is numpy array 235 | return X.copy('F' if X.flags['F_CONTIGUOUS'] else 'C') if copy else X 236 | else: 237 | if X.dtype.kind in 'uib' and X.dtype.itemsize <= 4: 238 | return_dtype = np.float32 239 | else: 240 | return_dtype = np.float64 241 | return X.astype(return_dtype) 242 | 243 | 244 | def _assert_all_finite(X): 245 | """Like assert_all_finite, but only for ndarray.""" 246 | if _get_config()['assume_finite']: 247 | return 248 | X = np.asanyarray(X) 249 | # First try an O(n) time, O(1) space solution for the common case that 250 | # everything is finite; fall back to O(n) space np.isfinite to prevent 251 | # false positives from overflow in sum method. 252 | if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) 253 | and not np.isfinite(X).all()): 254 | raise ValueError("Input contains NaN, infinity" 255 | " or a value too large for %r." % X.dtype) 256 | 257 | 258 | def _num_samples(x): 259 | """Return number of samples in array-like x.""" 260 | if hasattr(x, 'fit') and callable(x.fit): 261 | # Don't get num_samples from an ensembles length! 262 | raise TypeError('Expected sequence or array-like, got ' 263 | 'estimator %s' % x) 264 | if not hasattr(x, '__len__') and not hasattr(x, 'shape'): 265 | if hasattr(x, '__array__'): 266 | x = np.asarray(x) 267 | else: 268 | raise TypeError("Expected sequence or array-like, got %s" % 269 | type(x)) 270 | if hasattr(x, 'shape'): 271 | if len(x.shape) == 0: 272 | raise TypeError("Singleton array %r cannot be considered" 273 | " a valid collection." % x) 274 | return x.shape[0] 275 | else: 276 | return len(x) 277 | 278 | 279 | def _shape_repr(shape): 280 | """Return a platform independent representation of an array shape 281 | 282 | Under Python 2, the `long` type introduces an 'L' suffix when using the 283 | default %r format for tuples of integers (typically used to store the shape 284 | of an array). 
285 | 286 | Under Windows 64 bit (and Python 2), the `long` type is used by default 287 | in numpy shapes even when the integer dimensions are well below 32 bit. 288 | The platform specific type causes string messages or doctests to change 289 | from one platform to another which is not desirable. 290 | 291 | Under Python 3, there is no more `long` type so the `L` suffix is never 292 | introduced in string representation. 293 | 294 | >>> _shape_repr((1, 2)) 295 | '(1, 2)' 296 | >>> one = 2 ** 64 / 2 ** 64 # force an upcast to `long` under Python 2 297 | >>> _shape_repr((one, 2 * one)) 298 | '(1, 2)' 299 | >>> _shape_repr((1,)) 300 | '(1,)' 301 | >>> _shape_repr(()) 302 | '()' 303 | """ 304 | if len(shape) == 0: 305 | return "()" 306 | joined = ", ".join("%d" % e for e in shape) 307 | if len(shape) == 1: 308 | # special notation for singleton tuples 309 | joined += ',' 310 | return "(%s)" % joined 311 | 312 | 313 | def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, 314 | force_all_finite): 315 | """Convert a sparse matrix to a given format. 316 | 317 | Checks the sparse format of spmatrix and converts if necessary. 318 | 319 | Parameters 320 | ---------- 321 | spmatrix : scipy sparse matrix 322 | Input to validate and convert. 323 | 324 | accept_sparse : string, boolean or list/tuple of strings 325 | String[s] representing allowed sparse matrix formats ('csc', 326 | 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but 327 | not in the allowed format, it will be converted to the first listed 328 | format. True allows the input to be any format. False means 329 | that a sparse matrix input will raise an error. 330 | 331 | dtype : string, type or None 332 | Data type of result. If None, the dtype of the input is preserved. 333 | 334 | copy : boolean 335 | Whether a forced copy will be triggered. If copy=False, a copy might 336 | be triggered by a conversion. 337 | 338 | force_all_finite : boolean 339 | Whether to raise an error on np.inf and np.nan in X. 340 | 341 | Returns 342 | ------- 343 | spmatrix_converted : scipy sparse matrix. 344 | Matrix that is ensured to have an allowed type. 345 | """ 346 | if dtype is None: 347 | dtype = spmatrix.dtype 348 | 349 | changed_format = False 350 | 351 | if isinstance(accept_sparse, six.string_types): 352 | accept_sparse = [accept_sparse] 353 | 354 | if accept_sparse is False: 355 | raise TypeError('A sparse matrix was passed, but dense ' 356 | 'data is required. Use X.toarray() to ' 357 | 'convert to a dense numpy array.') 358 | elif isinstance(accept_sparse, (list, tuple)): 359 | if len(accept_sparse) == 0: 360 | raise ValueError("When providing 'accept_sparse' " 361 | "as a tuple or list, it must contain at " 362 | "least one string value.") 363 | # ensure correct sparse format 364 | if spmatrix.format not in accept_sparse: 365 | # create new with correct sparse 366 | spmatrix = spmatrix.asformat(accept_sparse[0]) 367 | changed_format = True 368 | elif accept_sparse is not True: 369 | # any other type 370 | raise ValueError("Parameter 'accept_sparse' should be a string, " 371 | "boolean or list of strings. You provided " 372 | "'accept_sparse={}'.".format(accept_sparse)) 373 | 374 | if dtype != spmatrix.dtype: 375 | # convert dtype 376 | spmatrix = spmatrix.astype(dtype) 377 | elif copy and not changed_format: 378 | # force copy 379 | spmatrix = spmatrix.copy() 380 | 381 | if force_all_finite: 382 | if not hasattr(spmatrix, "data"): 383 | warnings.warn("Can't check %s sparse matrix for nan or inf." 
384 | % spmatrix.format) 385 | else: 386 | _assert_all_finite(spmatrix.data) 387 | return spmatrix 388 | 389 | 390 | FLOAT_DTYPES = (np.float64, np.float32, np.float16) 391 | 392 | 393 | def check_is_fitted(estimator, attributes, msg=None, all_or_any=all): 394 | """Perform is_fitted validation for estimator. 395 | 396 | Checks if the estimator is fitted by verifying the presence of 397 | "all_or_any" of the passed attributes and raises a NotFittedError with the 398 | given message. 399 | 400 | Parameters 401 | ---------- 402 | estimator : estimator instance. 403 | estimator instance for which the check is performed. 404 | 405 | attributes : attribute name(s) given as string or a list/tuple of strings 406 | Eg.: 407 | ``["coef_", "estimator_", ...], "coef_"`` 408 | 409 | msg : string 410 | The default error message is, "This %(name)s instance is not fitted 411 | yet. Call 'fit' with appropriate arguments before using this method." 412 | 413 | For custom messages if "%(name)s" is present in the message string, 414 | it is substituted for the estimator name. 415 | 416 | Eg. : "Estimator, %(name)s, must be fitted before sparsifying". 417 | 418 | all_or_any : callable, {all, any}, default all 419 | Specify whether all or any of the given attributes must exist. 420 | 421 | Returns 422 | ------- 423 | None 424 | 425 | Raises 426 | ------ 427 | NotFittedError 428 | If the attributes are not found. 429 | """ 430 | if msg is None: 431 | msg = ("This %(name)s instance is not fitted yet. Call 'fit' with " 432 | "appropriate arguments before using this method.") 433 | 434 | if not hasattr(estimator, 'fit'): 435 | raise TypeError("%s is not an estimator instance." % (estimator)) 436 | 437 | if not isinstance(attributes, (list, tuple)): 438 | attributes = [attributes] 439 | 440 | if not all_or_any([hasattr(estimator, attr) for attr in attributes]): 441 | raise NotFittedError(msg % {'name': type(estimator).__name__}) 442 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/cluster/k_means_.py: -------------------------------------------------------------------------------- 1 | """K-means clustering""" 2 | 3 | # Authors: Gael Varoquaux 4 | # Thomas Rueckstiess 5 | # James Bergstra 6 | # Jan Schlueter 7 | # Nelle Varoquaux 8 | # Peter Prettenhofer 9 | # Olivier Grisel 10 | # Mathieu Blondel 11 | # Robert Layton 12 | # License: BSD 3 clause 13 | 14 | import warnings 15 | 16 | import numpy as np 17 | import scipy.sparse as sp 18 | from sklearn_import.base import BaseEstimator, ClusterMixin, TransformerMixin 19 | from six import string_types 20 | from sklearn_import.metrics.pairwise import euclidean_distances, pairwise_distances_argmin_min 21 | from sklearn_import.utils.validation import check_array, check_random_state, FLOAT_DTYPES, \ 22 | check_is_fitted 23 | from sklearn_import.utils.extmath import row_norms, stable_cumsum 24 | from sklearn_import.utils.sparsefuncs import mean_variance_axis 25 | 26 | from sklearn_import.cluster import _k_means 27 | 28 | 29 | ############################################################################### 30 | # Initialization heuristic 31 | 32 | 33 | def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): 34 | """Init n_clusters seeds according to k-means++ 35 | 36 | Parameters 37 | ----------- 38 | X : array or sparse matrix, shape (n_samples, n_features) 39 | The data to pick seeds for. To avoid memory copy, the input data 40 | should be double precision (dtype=np.float64). 
41 | 42 | n_clusters : integer 43 | The number of seeds to choose 44 | 45 | x_squared_norms : array, shape (n_samples,) 46 | Squared Euclidean norm of each data point. 47 | 48 | random_state : numpy.RandomState 49 | The generator used to initialize the centers. 50 | 51 | n_local_trials : integer, optional 52 | The number of seeding trials for each center (except the first), 53 | of which the one reducing inertia the most is greedily chosen. 54 | Set to None to make the number of trials depend logarithmically 55 | on the number of seeds (2+log(k)); this is the default. 56 | 57 | Notes 58 | ----- 59 | Selects initial cluster centers for k-mean clustering in a smart way 60 | to speed up convergence. see: Arthur, D. and Vassilvitskii, S. 61 | "k-means++: the advantages of careful seeding". ACM-SIAM symposium 62 | on Discrete algorithms. 2007 63 | 64 | Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip, 65 | which is the implementation used in the aforementioned paper. 66 | """ 67 | n_samples, n_features = X.shape 68 | 69 | centers = np.empty((n_clusters, n_features), dtype=X.dtype) 70 | 71 | assert x_squared_norms is not None, 'x_squared_norms None in _k_init' 72 | 73 | # Set the number of local seeding trials if none is given 74 | if n_local_trials is None: 75 | # This is what Arthur/Vassilvitskii tried, but did not report 76 | # specific results for other than mentioning in the conclusion 77 | # that it helped. 78 | n_local_trials = 2 + int(np.log(n_clusters)) 79 | 80 | # Pick first center randomly 81 | center_id = random_state.randint(n_samples) 82 | if sp.issparse(X): 83 | centers[0] = X[center_id].toarray() 84 | else: 85 | centers[0] = X[center_id] 86 | 87 | # Initialize list of closest distances and calculate current potential 88 | closest_dist_sq = euclidean_distances( 89 | centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, 90 | squared=True) 91 | current_pot = closest_dist_sq.sum() 92 | 93 | # Pick the remaining n_clusters-1 points 94 | for c in range(1, n_clusters): 95 | # Choose center candidates by sampling with probability proportional 96 | # to the squared distance to the closest existing center 97 | rand_vals = random_state.random_sample(n_local_trials) * current_pot 98 | candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), 99 | rand_vals) 100 | 101 | # Compute distances to center candidates 102 | distance_to_candidates = euclidean_distances( 103 | X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True) 104 | 105 | # Decide which candidate is the best 106 | best_candidate = None 107 | best_pot = None 108 | best_dist_sq = None 109 | for trial in range(n_local_trials): 110 | # Compute potential when including center candidate 111 | new_dist_sq = np.minimum(closest_dist_sq, 112 | distance_to_candidates[trial]) 113 | new_pot = new_dist_sq.sum() 114 | 115 | # Store result if it is the best local trial so far 116 | if (best_candidate is None) or (new_pot < best_pot): 117 | best_candidate = candidate_ids[trial] 118 | best_pot = new_pot 119 | best_dist_sq = new_dist_sq 120 | 121 | # Permanently add best center candidate found in local tries 122 | if sp.issparse(X): 123 | centers[c] = X[best_candidate].toarray() 124 | else: 125 | centers[c] = X[best_candidate] 126 | current_pot = best_pot 127 | closest_dist_sq = best_dist_sq 128 | 129 | return centers 130 | 131 | 132 | ############################################################################### 133 | # K-means batch estimation by EM (expectation maximization) 134 | 135 | def 
_validate_center_shape(X, n_centers, centers): 136 | """Check if centers is compatible with X and n_centers""" 137 | if len(centers) != n_centers: 138 | raise ValueError('The shape of the initial centers (%s) ' 139 | 'does not match the number of clusters %i' 140 | % (centers.shape, n_centers)) 141 | if centers.shape[1] != X.shape[1]: 142 | raise ValueError( 143 | "The number of features of the initial centers %s " 144 | "does not match the number of features of the data %s." 145 | % (centers.shape[1], X.shape[1])) 146 | 147 | 148 | def _tolerance(X, tol): 149 | """Return a tolerance which is independent of the dataset""" 150 | if sp.issparse(X): 151 | variances = mean_variance_axis(X, axis=0)[1] 152 | else: 153 | variances = np.var(X, axis=0) 154 | return np.mean(variances) * tol 155 | 156 | 157 | def _labels_inertia_precompute_dense(X, x_squared_norms, centers, distances): 158 | """Compute labels and inertia using a full distance matrix. 159 | 160 | This will overwrite the 'distances' array in-place. 161 | 162 | Parameters 163 | ---------- 164 | X : numpy array, shape (n_sample, n_features) 165 | Input data. 166 | 167 | x_squared_norms : numpy array, shape (n_samples,) 168 | Precomputed squared norms of X. 169 | 170 | centers : numpy array, shape (n_clusters, n_features) 171 | Cluster centers which data is assigned to. 172 | 173 | distances : numpy array, shape (n_samples,) 174 | Pre-allocated array in which distances are stored. 175 | 176 | Returns 177 | ------- 178 | labels : numpy array, dtype=np.int, shape (n_samples,) 179 | Indices of clusters that samples are assigned to. 180 | 181 | inertia : float 182 | Sum of distances of samples to their closest cluster center. 183 | 184 | """ 185 | n_samples = X.shape[0] 186 | 187 | # Breakup nearest neighbor distance computation into batches to prevent 188 | # memory blowup in the case of a large number of samples and clusters. 189 | # TODO: Once PR #7383 is merged use check_inputs=False in metric_kwargs. 190 | labels, mindist = pairwise_distances_argmin_min( 191 | X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True}) 192 | # cython k-means code assumes int32 inputs 193 | labels = labels.astype(np.int32) 194 | if n_samples == distances.shape[0]: 195 | # distances will be changed in-place 196 | distances[:] = mindist 197 | inertia = mindist.sum() 198 | return labels, inertia 199 | 200 | 201 | def _labels_inertia(X, x_squared_norms, centers, 202 | precompute_distances=True, distances=None): 203 | """E step of the K-means EM algorithm. 204 | 205 | Compute the labels and the inertia of the given samples and centers. 206 | This will compute the distances in-place. 207 | 208 | Parameters 209 | ---------- 210 | X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features) 211 | The input samples to assign to the labels. 212 | 213 | x_squared_norms : array, shape (n_samples,) 214 | Precomputed squared euclidean norm of each data point, to speed up 215 | computations. 216 | 217 | centers : float array, shape (k, n_features) 218 | The cluster centers. 219 | 220 | precompute_distances : boolean, default: True 221 | Precompute distances (faster but takes more memory). 222 | 223 | distances : float array, shape (n_samples,) 224 | Pre-allocated array to be filled in with each sample's distance 225 | to the closest center. 226 | 227 | Returns 228 | ------- 229 | labels : int array of shape(n) 230 | The resulting assignment 231 | 232 | inertia : float 233 | Sum of distances of samples to their closest cluster center. 
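Notes ----- Dispatch (as implemented below): sparse ``X`` uses the Cython routine ``_k_means._assign_labels_csr``; dense ``X`` with ``precompute_distances=True`` uses the full distance-matrix helper ``_labels_inertia_precompute_dense``; otherwise ``_k_means._assign_labels_array`` is used.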
234 | """ 235 | n_samples = X.shape[0] 236 | # set the default value of centers to -1 to be able to detect any anomaly 237 | # easily 238 | labels = -np.ones(n_samples, np.int32) 239 | if distances is None: 240 | distances = np.zeros(shape=(0,), dtype=X.dtype) 241 | # distances will be changed in-place 242 | if sp.issparse(X): 243 | inertia = _k_means._assign_labels_csr( 244 | X, x_squared_norms, centers, labels, distances=distances) 245 | else: 246 | if precompute_distances: 247 | return _labels_inertia_precompute_dense(X, x_squared_norms, 248 | centers, distances) 249 | inertia = _k_means._assign_labels_array( 250 | X, x_squared_norms, centers, labels, distances=distances) 251 | return labels, inertia 252 | 253 | 254 | def _init_centroids(X, k, init, random_state=None, x_squared_norms=None, 255 | init_size=None): 256 | """Compute the initial centroids 257 | 258 | Parameters 259 | ---------- 260 | 261 | X : array, shape (n_samples, n_features) 262 | 263 | k : int 264 | number of centroids 265 | 266 | init : {'k-means++', 'random' or ndarray or callable} optional 267 | Method for initialization 268 | 269 | random_state : int, RandomState instance or None, optional, default: None 270 | If int, random_state is the seed used by the random number generator; 271 | If RandomState instance, random_state is the random number generator; 272 | If None, the random number generator is the RandomState instance used 273 | by `np.random`. 274 | 275 | x_squared_norms : array, shape (n_samples,), optional 276 | Squared euclidean norm of each data point. Pass it if you have it at 277 | hands already to avoid it being recomputed here. Default: None 278 | 279 | init_size : int, optional 280 | Number of samples to randomly sample for speeding up the 281 | initialization (sometimes at the expense of accuracy): the 282 | only algorithm is initialized by running a batch KMeans on a 283 | random subset of the data. This needs to be larger than k. 284 | 285 | Returns 286 | ------- 287 | centers : array, shape(k, n_features) 288 | """ 289 | random_state = check_random_state(random_state) 290 | n_samples = X.shape[0] 291 | 292 | if x_squared_norms is None: 293 | x_squared_norms = row_norms(X, squared=True) 294 | 295 | if init_size is not None and init_size < n_samples: 296 | if init_size < k: 297 | warnings.warn( 298 | "init_size=%d should be larger than k=%d. 
" 299 | "Setting it to 3*k" % (init_size, k), 300 | RuntimeWarning, stacklevel=2) 301 | init_size = 3 * k 302 | init_indices = random_state.randint(0, n_samples, init_size) 303 | X = X[init_indices] 304 | x_squared_norms = x_squared_norms[init_indices] 305 | n_samples = X.shape[0] 306 | elif n_samples < k: 307 | raise ValueError( 308 | "n_samples=%d should be larger than k=%d" % (n_samples, k)) 309 | 310 | if isinstance(init, string_types) and init == 'k-means++': 311 | centers = _k_init(X, k, random_state=random_state, 312 | x_squared_norms=x_squared_norms) 313 | elif isinstance(init, string_types) and init == 'random': 314 | seeds = random_state.permutation(n_samples)[:k] 315 | centers = X[seeds] 316 | elif hasattr(init, '__array__'): 317 | # ensure that the centers have the same dtype as X 318 | # this is a requirement of fused types of cython 319 | centers = np.array(init, dtype=X.dtype) 320 | elif callable(init): 321 | centers = init(X, k, random_state=random_state) 322 | centers = np.asarray(centers, dtype=X.dtype) 323 | else: 324 | raise ValueError("the init parameter for the k-means should " 325 | "be 'k-means++' or 'random' or an ndarray, " 326 | "'%s' (type '%s') was passed." % (init, type(init))) 327 | 328 | if sp.issparse(centers): 329 | centers = centers.toarray() 330 | 331 | _validate_center_shape(X, k, centers) 332 | return centers 333 | 334 | 335 | class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): 336 | """K-Means clustering 337 | 338 | Read more in the :ref:`User Guide `. 339 | 340 | Parameters 341 | ---------- 342 | 343 | n_clusters : int, optional, default: 8 344 | The number of clusters to form as well as the number of 345 | centroids to generate. 346 | 347 | init : {'k-means++', 'random' or an ndarray} 348 | Method for initialization, defaults to 'k-means++': 349 | 350 | 'k-means++' : selects initial cluster centers for k-mean 351 | clustering in a smart way to speed up convergence. See section 352 | Notes in k_init for more details. 353 | 354 | 'random': choose k observations (rows) at random from data for 355 | the initial centroids. 356 | 357 | If an ndarray is passed, it should be of shape (n_clusters, n_features) 358 | and gives the initial centers. 359 | 360 | n_init : int, default: 10 361 | Number of time the k-means algorithm will be run with different 362 | centroid seeds. The final results will be the best output of 363 | n_init consecutive runs in terms of inertia. 364 | 365 | max_iter : int, default: 300 366 | Maximum number of iterations of the k-means algorithm for a 367 | single run. 368 | 369 | tol : float, default: 1e-4 370 | Relative tolerance with regards to inertia to declare convergence 371 | 372 | precompute_distances : {'auto', True, False} 373 | Precompute distances (faster but takes more memory). 374 | 375 | 'auto' : do not precompute distances if n_samples * n_clusters > 12 376 | million. This corresponds to about 100MB overhead per job using 377 | double precision. 378 | 379 | True : always precompute distances 380 | 381 | False : never precompute distances 382 | 383 | verbose : int, default 0 384 | Verbosity mode. 385 | 386 | random_state : int, RandomState instance or None, optional, default: None 387 | If int, random_state is the seed used by the random number generator; 388 | If RandomState instance, random_state is the random number generator; 389 | If None, the random number generator is the RandomState instance used 390 | by `np.random`. 
391 | 392 | copy_x : boolean, default True 393 | When pre-computing distances it is more numerically accurate to center 394 | the data first. If copy_x is True, then the original data is not 395 | modified. If False, the original data is modified, and put back before 396 | the function returns, but small numerical differences may be introduced 397 | by subtracting and then adding the data mean. 398 | 399 | n_jobs : int 400 | The number of jobs to use for the computation. This works by computing 401 | each of the n_init runs in parallel. 402 | 403 | If -1 all CPUs are used. If 1 is given, no parallel computing code is 404 | used at all, which is useful for debugging. For n_jobs below -1, 405 | (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one 406 | are used. 407 | 408 | algorithm : "auto", "full" or "elkan", default="auto" 409 | K-means algorithm to use. The classical EM-style algorithm is "full". 410 | The "elkan" variation is more efficient by using the triangle 411 | inequality, but currently doesn't support sparse data. "auto" chooses 412 | "elkan" for dense data and "full" for sparse data. 413 | 414 | Attributes 415 | ---------- 416 | cluster_centers_ : array, [n_clusters, n_features] 417 | Coordinates of cluster centers 418 | 419 | labels_ : 420 | Labels of each point 421 | 422 | inertia_ : float 423 | Sum of distances of samples to their closest cluster center. 424 | 425 | Examples 426 | -------- 427 | 428 | >>> from sklearn.cluster import KMeans 429 | >>> import numpy as np 430 | >>> X = np.array([[1, 2], [1, 4], [1, 0], 431 | ... [4, 2], [4, 4], [4, 0]]) 432 | >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X) 433 | >>> kmeans.labels_ 434 | array([0, 0, 0, 1, 1, 1], dtype=int32) 435 | >>> kmeans.predict([[0, 0], [4, 4]]) 436 | array([0, 1], dtype=int32) 437 | >>> kmeans.cluster_centers_ 438 | array([[ 1., 2.], 439 | [ 4., 2.]]) 440 | 441 | See also 442 | -------- 443 | 444 | MiniBatchKMeans 445 | Alternative online implementation that does incremental updates 446 | of the centers positions using mini-batches. 447 | For large scale learning (say n_samples > 10k) MiniBatchKMeans is 448 | probably much faster than the default batch implementation. 449 | 450 | Notes 451 | ------ 452 | The k-means problem is solved using Lloyd's algorithm. 453 | 454 | The average complexity is given by O(k n T), were n is the number of 455 | samples and T is the number of iteration. 456 | 457 | The worst case complexity is given by O(n^(k+2/p)) with 458 | n = n_samples, p = n_features. (D. Arthur and S. Vassilvitskii, 459 | 'How slow is the k-means method?' SoCG2006) 460 | 461 | In practice, the k-means algorithm is very fast (one of the fastest 462 | clustering algorithms available), but it falls in local minima. That's why 463 | it can be useful to restart it several times. 
464 | 465 | """ 466 | 467 | def __init__(self, n_clusters=8, init='k-means++', n_init=10, 468 | max_iter=300, tol=1e-4, precompute_distances='auto', 469 | verbose=0, random_state=None, copy_x=True, 470 | n_jobs=1, algorithm='auto'): 471 | 472 | self.n_clusters = n_clusters 473 | self.init = init 474 | self.max_iter = max_iter 475 | self.tol = tol 476 | self.precompute_distances = precompute_distances 477 | self.n_init = n_init 478 | self.verbose = verbose 479 | self.random_state = random_state 480 | self.copy_x = copy_x 481 | self.n_jobs = n_jobs 482 | self.algorithm = algorithm 483 | 484 | def _check_fit_data(self, X): 485 | """Verify that the number of samples given is larger than k""" 486 | X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) 487 | if X.shape[0] < self.n_clusters: 488 | raise ValueError("n_samples=%d should be >= n_clusters=%d" % ( 489 | X.shape[0], self.n_clusters)) 490 | return X 491 | 492 | def _check_test_data(self, X): 493 | X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES) 494 | n_samples, n_features = X.shape 495 | expected_n_features = self.cluster_centers_.shape[1] 496 | if not n_features == expected_n_features: 497 | raise ValueError("Incorrect number of features. " 498 | "Got %d features, expected %d" % ( 499 | n_features, expected_n_features)) 500 | 501 | return X 502 | 503 | def fit(self, X, y=None): 504 | """Compute k-means clustering. 505 | 506 | Parameters 507 | ---------- 508 | X : array-like or sparse matrix, shape=(n_samples, n_features) 509 | Training instances to cluster. 510 | """ 511 | # Added to remove scikit-learn internal dependencies 512 | raise NotImplementedError 513 | 514 | def fit_predict(self, X, y=None): 515 | """Compute cluster centers and predict cluster index for each sample. 516 | 517 | Convenience method; equivalent to calling fit(X) followed by 518 | predict(X). 519 | 520 | Parameters 521 | ---------- 522 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 523 | New data to transform. 524 | 525 | Returns 526 | ------- 527 | labels : array, shape [n_samples,] 528 | Index of the cluster each sample belongs to. 529 | """ 530 | return self.fit(X).labels_ 531 | 532 | def fit_transform(self, X, y=None): 533 | """Compute clustering and transform X to cluster-distance space. 534 | 535 | Equivalent to fit(X).transform(X), but more efficiently implemented. 536 | 537 | Parameters 538 | ---------- 539 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 540 | New data to transform. 541 | 542 | Returns 543 | ------- 544 | X_new : array, shape [n_samples, k] 545 | X transformed in the new space. 546 | """ 547 | # Currently, this just skips a copy of the data if it is not in 548 | # np.array or CSR format already. 549 | # XXX This skips _check_test_data, which may change the dtype; 550 | # we should refactor the input validation. 551 | X = self._check_fit_data(X) 552 | return self.fit(X)._transform(X) 553 | 554 | def transform(self, X): 555 | """Transform X to a cluster-distance space. 556 | 557 | In the new space, each dimension is the distance to the cluster 558 | centers. Note that even if X is sparse, the array returned by 559 | `transform` will typically be dense. 560 | 561 | Parameters 562 | ---------- 563 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 564 | New data to transform. 565 | 566 | Returns 567 | ------- 568 | X_new : array, shape [n_samples, k] 569 | X transformed in the new space.
570 | """ 571 | check_is_fitted(self, 'cluster_centers_') 572 | 573 | X = self._check_test_data(X) 574 | return self._transform(X) 575 | 576 | def _transform(self, X): 577 | """guts of transform method; no input validation""" 578 | return euclidean_distances(X, self.cluster_centers_) 579 | 580 | def predict(self, X): 581 | """Predict the closest cluster each sample in X belongs to. 582 | 583 | In the vector quantization literature, `cluster_centers_` is called 584 | the code book and each value returned by `predict` is the index of 585 | the closest code in the code book. 586 | 587 | Parameters 588 | ---------- 589 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 590 | New data to predict. 591 | 592 | Returns 593 | ------- 594 | labels : array, shape [n_samples,] 595 | Index of the cluster each sample belongs to. 596 | """ 597 | check_is_fitted(self, 'cluster_centers_') 598 | 599 | X = self._check_test_data(X) 600 | x_squared_norms = row_norms(X, squared=True) 601 | return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0] 602 | 603 | def score(self, X, y=None): 604 | """Opposite of the value of X on the K-means objective. 605 | 606 | Parameters 607 | ---------- 608 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 609 | New data. 610 | 611 | Returns 612 | ------- 613 | score : float 614 | Opposite of the value of X on the K-means objective. 615 | """ 616 | check_is_fitted(self, 'cluster_centers_') 617 | 618 | X = self._check_test_data(X) 619 | x_squared_norms = row_norms(X, squared=True) 620 | return -_labels_inertia(X, x_squared_norms, self.cluster_centers_)[1] 621 | 622 | 623 | -------------------------------------------------------------------------------- /size_constrained_clustering/k_means_constrained/k_means_constrained_.py: -------------------------------------------------------------------------------- 1 | """K-means clustering""" 2 | 3 | # Authors: Josh Levy-Kramer 4 | # Gael Varoquaux 5 | # Thomas Rueckstiess 6 | # James Bergstra 7 | # Jan Schlueter 8 | # Nelle Varoquaux 9 | # Peter Prettenhofer 10 | # Olivier Grisel 11 | # Mathieu Blondel 12 | # Robert Layton 13 | # License: BSD 3 clause 14 | 15 | import warnings 16 | import numpy as np 17 | import scipy.sparse as sp 18 | from scipy.spatial.distance import cdist 19 | 20 | import sys 21 | import os 22 | folderpath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 23 | sys.path.append(folderpath) 24 | from sklearn_import.metrics.pairwise import euclidean_distances 25 | from sklearn_import.utils.extmath import row_norms, squared_norm, cartesian 26 | from sklearn_import.utils.validation import check_array, check_random_state, as_float_array 27 | 28 | from joblib import Parallel 29 | from joblib import delayed 30 | 31 | # Internal scikit learn methods imported into this project 32 | from sklearn_import.cluster._k_means import _centers_dense, _centers_sparse 33 | from sklearn_import.cluster.k_means_ import _validate_center_shape, _tolerance, KMeans, _init_centroids 34 | 35 | from k_means_constrained.mincostflow_vectorized import SimpleMinCostFlowVectorized 36 | 37 | 38 | def k_means_constrained(X, n_clusters, size_min=None, size_max=None, init='k-means++', 39 | distance_func=cdist, 40 | n_init=10, max_iter=300, verbose=False, 41 | tol=1e-4, random_state=None, copy_x=True, n_jobs=1, 42 | return_n_iter=False): 43 | """K-Means clustering with minimum and maximum cluster size constraints. 44 | 45 | Read more in the :ref:`User Guide `. 
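A minimal usage sketch (illustrative only: ``X`` stands for any ``(n_samples, n_features)``
    array and the bounds must be satisfiable, i.e. ``size_min * n_clusters <= n_samples``)::

        centers, labels, inertia = k_means_constrained(
            X, n_clusters=3, size_min=2, size_max=5, random_state=0)

    The ``KMeansConstrained`` class further down wraps this function in an
    estimator-style interface.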
46 | 47 | Parameters 48 | ---------- 49 | X : array-like, shape (n_samples, n_features) 50 | The observations to cluster. 51 | 52 | size_min : int, optional, default: None 53 | Constrain the label assignment so that each cluster has a minimum 54 | size of size_min. If None, no constraints will be applied. 55 | 56 | size_max : int, optional, default: None 57 | Constrain the label assignment so that each cluster has a maximum 58 | size of size_max. If None, no constraints will be applied. 59 | 60 | n_clusters : int 61 | The number of clusters to form as well as the number of 62 | centroids to generate. 63 | 64 | init : {'k-means++', 'random', or ndarray, or a callable}, optional 65 | Method for initialization, defaults to 'k-means++': 66 | 67 | 'k-means++' : selects initial cluster centers for k-means 68 | clustering in a smart way to speed up convergence. See section 69 | Notes in k_init for more details. 70 | 71 | 'random': generate k centroids from a Gaussian with mean and 72 | variance estimated from the data. 73 | 74 | If an ndarray is passed, it should be of shape (n_clusters, n_features) 75 | and gives the initial centers. 76 | 77 | If a callable is passed, it should take arguments X, k and 78 | a random state and return an initialization. 79 | 80 | n_init : int, optional, default: 10 81 | Number of times the k-means algorithm will be run with different 82 | centroid seeds. The final results will be the best output of 83 | n_init consecutive runs in terms of inertia. 84 | 85 | max_iter : int, optional, default 300 86 | Maximum number of iterations of the k-means algorithm to run. 87 | 88 | verbose : boolean, optional 89 | Verbosity mode. 90 | 91 | tol : float, optional 92 | The relative increment in the results before declaring convergence. 93 | 94 | random_state : int, RandomState instance or None, optional, default: None 95 | If int, random_state is the seed used by the random number generator; 96 | If RandomState instance, random_state is the random number generator; 97 | If None, the random number generator is the RandomState instance used 98 | by `np.random`. 99 | 100 | copy_x : boolean, optional 101 | When pre-computing distances it is more numerically accurate to center 102 | the data first. If copy_x is True, then the original data is not 103 | modified. If False, the original data is modified, and put back before 104 | the function returns, but small numerical differences may be introduced 105 | by subtracting and then adding the data mean. 106 | 107 | n_jobs : int 108 | The number of jobs to use for the computation. This works by computing 109 | each of the n_init runs in parallel. 110 | 111 | If -1 all CPUs are used. If 1 is given, no parallel computing code is 112 | used at all, which is useful for debugging. For n_jobs below -1, 113 | (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one 114 | are used. 115 | 116 | return_n_iter : bool, optional 117 | Whether or not to return the number of iterations. 118 | 119 | Returns 120 | ------- 121 | centroid : float ndarray with shape (k, n_features) 122 | Centroids found at the last iteration of k-means. 123 | 124 | label : integer ndarray with shape (n_samples,) 125 | label[i] is the code or index of the centroid the 126 | i'th observation is closest to. 127 | 128 | inertia : float 129 | The final value of the inertia criterion (sum of squared distances to 130 | the closest centroid for all observations in the training set). 131 | 132 | best_n_iter : int 133 | Number of iterations corresponding to the best results.
134 | Returned only if `return_n_iter` is set to True. 135 | 136 | """ 137 | if sp.issparse(X): 138 | raise NotImplementedError("Not implemented for sparse X") 139 | 140 | if n_init <= 0: 141 | raise ValueError("Invalid number of initializations." 142 | " n_init=%d must be bigger than zero." % n_init) 143 | random_state = check_random_state(random_state) 144 | 145 | if max_iter <= 0: 146 | raise ValueError('Number of iterations should be a positive number,' 147 | ' got %d instead' % max_iter) 148 | 149 | X = as_float_array(X, copy=copy_x) 150 | tol = _tolerance(X, tol) 151 | 152 | # Validate init array 153 | if hasattr(init, '__array__'): 154 | init = check_array(init, dtype=X.dtype.type, copy=True) 155 | _validate_center_shape(X, n_clusters, init) 156 | 157 | if n_init != 1: 158 | warnings.warn( 159 | 'Explicit initial center position passed: ' 160 | 'performing only one init in k-means instead of n_init=%d' 161 | % n_init, RuntimeWarning, stacklevel=2) 162 | n_init = 1 163 | 164 | # subtract of mean of x for more accurate distance computations 165 | if not sp.issparse(X): 166 | X_mean = X.mean(axis=0) 167 | # The copy was already done above 168 | X -= X_mean 169 | 170 | if hasattr(init, '__array__'): 171 | init -= X_mean 172 | 173 | # precompute squared norms of data points 174 | x_squared_norms = row_norms(X, squared=True) 175 | 176 | best_labels, best_inertia, best_centers = None, None, None 177 | 178 | if n_jobs == 1: 179 | # For a single thread, less memory is needed if we just store one set 180 | # of the best results (as opposed to one set per run per thread). 181 | for it in range(n_init): 182 | # run a k-means once 183 | labels, inertia, centers, n_iter_ = kmeans_constrained_single( 184 | X, n_clusters, 185 | size_min=size_min, size_max=size_max, distance_func=distance_func, 186 | max_iter=max_iter, init=init, verbose=verbose, tol=tol, 187 | x_squared_norms=x_squared_norms, random_state=random_state) 188 | # determine if these results are the best so far 189 | if best_inertia is None or inertia < best_inertia: 190 | best_labels = labels.copy() 191 | best_centers = centers.copy() 192 | best_inertia = inertia 193 | best_n_iter = n_iter_ 194 | else: 195 | # parallelisation of k-means runs 196 | seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) 197 | results = Parallel(n_jobs=n_jobs, verbose=0)( 198 | delayed(kmeans_constrained_single)(X, n_clusters, 199 | size_min=size_min, size_max=size_max, 200 | max_iter=max_iter, init=init, distance_func=distance_func, 201 | verbose=verbose, tol=tol, 202 | x_squared_norms=x_squared_norms, 203 | # Change seed to ensure variety 204 | random_state=seed) 205 | for seed in seeds) 206 | # Get results with the lowest inertia 207 | labels, inertia, centers, n_iters = zip(*results) 208 | best = np.argmin(inertia) 209 | best_labels = labels[best] 210 | best_inertia = inertia[best] 211 | best_centers = centers[best] 212 | best_n_iter = n_iters[best] 213 | 214 | if not sp.issparse(X): 215 | if not copy_x: 216 | X += X_mean 217 | best_centers += X_mean 218 | 219 | if return_n_iter: 220 | return best_centers, best_labels, best_inertia, best_n_iter 221 | else: 222 | return best_centers, best_labels, best_inertia 223 | 224 | 225 | def kmeans_constrained_single(X, n_clusters, size_min=None, size_max=None, 226 | max_iter=300, init='k-means++', distance_func=cdist, 227 | verbose=False, x_squared_norms=None, 228 | random_state=None, tol=1e-4): 229 | """A single run of k-means constrained, assumes preparation completed prior. 
230 | 231 | Parameters 232 | ---------- 233 | X : array-like of floats, shape (n_samples, n_features) 234 | The observations to cluster. 235 | 236 | size_min : int, optional, default: None 237 | Constrain the label assignment so that each cluster has a minimum 238 | size of size_min. If None, no constraints will be applied. 239 | 240 | size_max : int, optional, default: None 241 | Constrain the label assignment so that each cluster has a maximum 242 | size of size_max. If None, no constraints will be applied. 243 | 244 | n_clusters : int 245 | The number of clusters to form as well as the number of 246 | centroids to generate. 247 | 248 | max_iter : int, optional, default 300 249 | Maximum number of iterations of the k-means algorithm to run. 250 | 251 | init : {'k-means++', 'random', or ndarray, or a callable}, optional 252 | Method for initialization, defaults to 'k-means++': 253 | 254 | 'k-means++' : selects initial cluster centers for k-means 255 | clustering in a smart way to speed up convergence. See section 256 | Notes in k_init for more details. 257 | 258 | 'random': generate k centroids from a Gaussian with mean and 259 | variance estimated from the data. 260 | 261 | If an ndarray is passed, it should be of shape (k, p) and gives 262 | the initial centers. 263 | 264 | If a callable is passed, it should take arguments X, k and 265 | a random state and return an initialization. 266 | 267 | tol : float, optional 268 | The relative increment in the results before declaring convergence. 269 | 270 | verbose : boolean, optional 271 | Verbosity mode. 272 | 273 | x_squared_norms : array 274 | Precomputed x_squared_norms. 275 | 276 | random_state : int, RandomState instance or None, optional, default: None 277 | If int, random_state is the seed used by the random number generator; 278 | If RandomState instance, random_state is the random number generator; 279 | If None, the random number generator is the RandomState instance used 280 | by `np.random`. 281 | 282 | Returns 283 | ------- 284 | centroid : float ndarray with shape (k, n_features) 285 | Centroids found at the last iteration of k-means. 286 | 287 | label : integer ndarray with shape (n_samples,) 288 | label[i] is the code or index of the centroid the 289 | i'th observation is closest to. 290 | 291 | inertia : float 292 | The final value of the inertia criterion (sum of squared distances to 293 | the closest centroid for all observations in the training set). 294 | 295 | n_iter : int 296 | Number of iterations run.
297 | """ 298 | if sp.issparse(X): 299 | raise NotImplementedError("Not implemented for sparse X") 300 | 301 | random_state = check_random_state(random_state) 302 | n_samples = X.shape[0] 303 | 304 | best_labels, best_inertia, best_centers = None, None, None 305 | # init 306 | centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms) 307 | if verbose: 308 | print("Initialization complete") 309 | 310 | # Allocate memory to store the distances for each sample to its 311 | # closer center for reallocation in case of ties 312 | distances = np.zeros(shape=(n_samples,), dtype=X.dtype) 313 | 314 | # Determine min and max sizes if non given 315 | if size_min is None: 316 | size_min = 0 317 | if size_max is None: 318 | size_max = n_samples # Number of data points 319 | 320 | # Check size min and max 321 | if not ((size_min >= 0) and (size_min <= n_samples) 322 | and (size_max >= 0) and (size_max <= n_samples)): 323 | raise ValueError("size_min and size_max must be a positive number smaller " 324 | "than the number of data points or `None`") 325 | if size_max < size_min: 326 | raise ValueError("size_max must be larger than size_min") 327 | if size_min*n_clusters > n_samples: 328 | raise ValueError("The product of size_min and n_clusters cannot exceed the number of samples (X)") 329 | 330 | # iterations 331 | for i in range(max_iter): 332 | centers_old = centers.copy() 333 | # labels assignment is also called the E-step of EM 334 | labels, inertia = \ 335 | _labels_constrained(X, centers, size_min, size_max, 336 | distances=distances, distance_func=distance_func) 337 | 338 | # computation of the means is also called the M-step of EM 339 | if sp.issparse(X): 340 | centers = _centers_sparse(X, labels, n_clusters, distances) 341 | else: 342 | centers = _centers_dense(X, labels, n_clusters, distances) 343 | 344 | if verbose: 345 | print("Iteration %2d, inertia %.3f" % (i, inertia)) 346 | 347 | if best_inertia is None or inertia < best_inertia: 348 | best_labels = labels.copy() 349 | best_centers = centers.copy() 350 | best_inertia = inertia 351 | 352 | center_shift_total = squared_norm(centers_old - centers) 353 | if center_shift_total <= tol: 354 | if verbose: 355 | print("Converged at iteration %d: " 356 | "center shift %e within tolerance %e" 357 | % (i, center_shift_total, tol)) 358 | break 359 | 360 | if center_shift_total > 0: 361 | # rerun E-step in case of non-convergence so that predicted labels 362 | # match cluster centers 363 | best_labels, best_inertia = \ 364 | _labels_constrained(X, centers, size_min, size_max, 365 | distances=distances, distance_func=distance_func) 366 | 367 | return best_labels, best_inertia, best_centers, i + 1 368 | 369 | 370 | def _labels_constrained(X, centers, size_min, size_max, distances, distance_func=cdist): 371 | """Compute labels using the min and max cluster size constraint 372 | 373 | This will overwrite the 'distances' array in-place. 374 | 375 | Parameters 376 | ---------- 377 | X : numpy array, shape (n_sample, n_features) 378 | Input data. 379 | 380 | size_min : int 381 | Minimum size for each cluster 382 | 383 | size_max : int 384 | Maximum size for each cluster 385 | 386 | centers : numpy array, shape (n_clusters, n_features) 387 | Cluster centers which data is assigned to. 388 | 389 | distances : numpy array, shape (n_samples,) 390 | Pre-allocated array in which distances are stored. 
391 | 392 | Returns 393 | ------- 394 | labels : numpy array, dtype=np.int, shape (n_samples,) 395 | Indices of clusters that samples are assigned to. 396 | 397 | inertia : float 398 | Sum of squared distances of samples to their closest cluster center. 399 | 400 | """ 401 | C = centers 402 | 403 | # Distances to each centre C. (the `distances` parameter is the distance to the closest centre) 404 | # The original k-means uses squared distances, but this is equivalent for constrained k-means 405 | # D = euclidean_distances(X, C, squared=False) 406 | D = distance_func(X, C) 407 | 408 | edges, costs, capacities, supplies, n_C, n_X = minimum_cost_flow_problem_graph(X, C, D, size_min, size_max) 409 | labels = solve_min_cost_flow_graph(edges, costs, capacities, supplies, n_C, n_X) 410 | 411 | # cython k-means M step code assumes int32 inputs 412 | labels = labels.astype(np.int32) 413 | 414 | # Change distances in-place 415 | distances[:] = D[np.arange(D.shape[0]), labels]**2 # Square for M step of EM 416 | inertia = distances.sum() 417 | 418 | return labels, inertia 419 | 420 | 421 | def minimum_cost_flow_problem_graph(X, C, D, size_min, size_max): 422 | 423 | # Set up the minimum cost flow formulation graph 424 | # Vertex indices: 425 | # X-nodes: [0, n(x)-1], C-nodes: [n(X), n(X)+n(C)-1], C-dummy nodes:[n(X)+n(C), n(X)+2*n(C)-1], 426 | # Artificial node: [n(X)+2*n(C), n(X)+2*n(C)+1-1] 427 | 428 | # Create indices of nodes 429 | n_X = X.shape[0] 430 | n_C = C.shape[0] 431 | X_ix = np.arange(n_X) 432 | C_dummy_ix = np.arange(X_ix[-1] + 1, X_ix[-1] + 1 + n_C) 433 | C_ix = np.arange(C_dummy_ix[-1] + 1, C_dummy_ix[-1] + 1 + n_C) 434 | art_ix = C_ix[-1] + 1 435 | 436 | # Edges 437 | edges_X_C_dummy = cartesian([X_ix, C_dummy_ix]) # All X's connect to all C dummy nodes (C') 438 | edges_C_dummy_C = np.stack([C_dummy_ix, C_ix], axis=1) # Each C' connects to a corresponding C (centroid) 439 | edges_C_art = np.stack([C_ix, art_ix * np.ones(n_C)], axis=1) # All C connect to artificial node 440 | 441 | edges = np.concatenate([edges_X_C_dummy, edges_C_dummy_C, edges_C_art]) 442 | 443 | # Costs 444 | costs_X_C_dummy = D.reshape(D.size) 445 | costs = np.concatenate([costs_X_C_dummy, np.zeros(edges.shape[0] - len(costs_X_C_dummy))]) 446 | 447 | # Capacities - the C' -> C arcs enforce the maximum cluster size (size_max) 448 | capacities_C_dummy_C = size_max * np.ones(n_C) 449 | cap_non = n_X # Equals the total supply and therefore won't restrict flow 450 | capacities = np.concatenate([ 451 | np.ones(edges_X_C_dummy.shape[0]), 452 | capacities_C_dummy_C, 453 | cap_non * np.ones(n_C) 454 | ]) 455 | 456 | # Sources and sinks 457 | supplies_X = np.ones(n_X) 458 | supplies_C = -1 * size_min * np.ones(n_C) # Demand node 459 | supplies_art = -1 * (n_X - n_C*size_min) # Demand node 460 | supplies = np.concatenate([ 461 | supplies_X, 462 | np.zeros(n_C), # C_dummies 463 | supplies_C, 464 | [supplies_art] 465 | ]) 466 | 467 | # All arrays must be of int dtype for `SimpleMinCostFlow` 468 | edges = edges.astype('int32') 469 | costs = np.around(costs*1000, 0).astype('int32') # Multiply by 1000 to keep three decimal places of precision 470 | capacities = capacities.astype('int32') 471 | supplies = supplies.astype('int32') 472 | 473 | return edges, costs, capacities, supplies, n_C, n_X 474 | 475 | 476 | def solve_min_cost_flow_graph(edges, costs, capacities, supplies, n_C, n_X): 477 | 478 | # Instantiate a SimpleMinCostFlow solver.
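    # How the assignment is decoded after solving (see the code below): every X-node
    # supplies one unit of flow and each X -> C' arc has capacity 1, so the optimal
    # flow on the first n_X * n_C arcs forms a 0/1 assignment matrix; reshaping it to
    # (n_X, n_C) and taking argmax along axis 1 yields each sample's cluster label.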
479 | min_cost_flow = SimpleMinCostFlowVectorized() 480 | 481 | if (edges.dtype != 'int32') or (costs.dtype != 'int32') \ 482 | or (capacities.dtype != 'int32') or (supplies.dtype != 'int32'): 483 | raise ValueError("`edges`, `costs`, `capacities`, `supplies` must all be int dtype") 484 | 485 | N_edges = edges.shape[0] 486 | N_nodes = len(supplies) 487 | 488 | # Add each edge with associated capacities and cost 489 | min_cost_flow.AddArcWithCapacityAndUnitCostVectorized(edges[:,0], edges[:,1], capacities, costs) 490 | 491 | # Add node supplies 492 | min_cost_flow.SetNodeSupplyVectorized(np.arange(N_nodes, dtype='int32'), supplies) 493 | 494 | # Solve for the minimum cost flow over the whole graph 495 | if min_cost_flow.Solve() != min_cost_flow.OPTIMAL: 496 | raise Exception('There was an issue with the min cost flow input.') 497 | 498 | # Assignment 499 | labels_M = min_cost_flow.FlowVectorized(np.arange(n_X * n_C, dtype='int32')).reshape(n_X, n_C) 500 | 501 | labels = labels_M.argmax(axis=1) 502 | return labels 503 | 504 | 505 | class KMeansConstrained(KMeans): 506 | """K-Means clustering with minimum and maximum cluster size constraints 507 | 508 | Parameters 509 | ---------- 510 | 511 | n_clusters : int, optional, default: 8 512 | The number of clusters to form as well as the number of 513 | centroids to generate. 514 | 515 | size_min : int, optional, default: None 516 | Constrain the label assignment so that each cluster has a minimum 517 | size of size_min. If None, no constraints will be applied. 518 | 519 | size_max : int, optional, default: None 520 | Constrain the label assignment so that each cluster has a maximum 521 | size of size_max. If None, no constraints will be applied. 522 | 523 | init : {'k-means++', 'random' or an ndarray} 524 | Method for initialization, defaults to 'k-means++': 525 | 526 | 'k-means++' : selects initial cluster centers for k-means 527 | clustering in a smart way to speed up convergence. See section 528 | Notes in k_init for more details. 529 | 530 | 'random': choose k observations (rows) at random from data for 531 | the initial centroids. 532 | 533 | If an ndarray is passed, it should be of shape (n_clusters, n_features) 534 | and gives the initial centers. 535 | 536 | n_init : int, default: 10 537 | Number of times the k-means algorithm will be run with different 538 | centroid seeds. The final results will be the best output of 539 | n_init consecutive runs in terms of inertia. 540 | 541 | max_iter : int, default: 300 542 | Maximum number of iterations of the k-means algorithm for a 543 | single run. 544 | 545 | tol : float, default: 1e-4 546 | Relative tolerance with regards to inertia to declare convergence 547 | 548 | verbose : int, default 0 549 | Verbosity mode. 550 | 551 | random_state : int, RandomState instance or None, optional, default: None 552 | If int, random_state is the seed used by the random number generator; 553 | If RandomState instance, random_state is the random number generator; 554 | If None, the random number generator is the RandomState instance used 555 | by `np.random`. 556 | 557 | copy_x : boolean, default True 558 | When pre-computing distances it is more numerically accurate to center 559 | the data first. If copy_x is True, then the original data is not 560 | modified. If False, the original data is modified, and put back before 561 | the function returns, but small numerical differences may be introduced 562 | by subtracting and then adding the data mean.
563 | 564 | n_jobs : int 565 | The number of jobs to use for the computation. This works by computing 566 | each of the n_init runs in parallel. 567 | 568 | If -1 all CPUs are used. If 1 is given, no parallel computing code is 569 | used at all, which is useful for debugging. For n_jobs below -1, 570 | (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one 571 | are used. 572 | 573 | Attributes 574 | ---------- 575 | cluster_centers_ : array, [n_clusters, n_features] 576 | Coordinates of cluster centers 577 | 578 | labels_ : 579 | Labels of each point 580 | 581 | inertia_ : float 582 | Sum of squared distances of samples to their closest cluster center. 583 | 584 | Examples 585 | -------- 586 | 587 | >>> from k_means_constrained import KMeansConstrained 588 | >>> import numpy as np 589 | >>> X = np.array([[1, 2], [1, 4], [1, 0], 590 | ... [4, 2], [4, 4], [4, 0]]) 591 | >>> clf = KMeansConstrained(n_clusters=2, size_min=2, size_max=5, random_state=0).fit(X) 592 | >>> clf.labels_ 593 | array([0, 0, 0, 1, 1, 1], dtype=int32) 594 | >>> clf.predict([[0, 0], [4, 4]]) 595 | array([0, 1], dtype=int32) 596 | >>> clf.cluster_centers_ 597 | array([[ 1., 2.], 598 | [ 4., 2.]]) 599 | 600 | Notes 601 | ------ 602 | K-means problem constrained with a minimum and/or maximum size for each cluster. 603 | 604 | The constrained assignment is formulated as a Minimum Cost Flow (MCF) linear network optimisation 605 | problem. This is then solved using a cost-scaling push-relabel algorithm. The implementation used is 606 | Google's Operations Research tools's `SimpleMinCostFlow`. 607 | 608 | Ref: 609 | 1. Bradley, P. S., K. P. Bennett, and Ayhan Demiriz. "Constrained k-means clustering." 610 | Microsoft Research, Redmond (2000): 1-8. 611 | 2. Google's SimpleMinCostFlow implementation: 612 | https://github.com/google/or-tools/blob/master/ortools/graph/min_cost_flow.h 613 | """ 614 | 615 | def __init__(self, n_clusters=8, size_min=None, size_max=None, distance_func=cdist, 616 | init='k-means++', n_init=10, max_iter=300, tol=1e-4, 617 | verbose=False, random_state=None, copy_x=True, n_jobs=1): 618 | 619 | self.size_min = size_min 620 | self.size_max = size_max 621 | 622 | super().__init__(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol, 623 | verbose=verbose, random_state=random_state, copy_x=copy_x, n_jobs=n_jobs) 624 | self.distance_func = distance_func 625 | 626 | def fit(self, X, y=None): 627 | """Compute k-means clustering. 628 | 629 | Parameters 630 | ---------- 631 | X : array-like, shape=(n_samples, n_features) 632 | Training instances to cluster. 
633 | 634 | y : Ignored 635 | 636 | """ 637 | if sp.issparse(X): 638 | raise NotImplementedError("Not implemented for sparse X") 639 | 640 | random_state = check_random_state(self.random_state) 641 | X = self._check_fit_data(X) 642 | 643 | self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ 644 | k_means_constrained( 645 | X, n_clusters=self.n_clusters, 646 | size_min=self.size_min, size_max=self.size_max, 647 | init=self.init, 648 | distance_func=self.distance_func, 649 | n_init=self.n_init, max_iter=self.max_iter, verbose=self.verbose, 650 | tol=self.tol, random_state=random_state, copy_x=self.copy_x, 651 | n_jobs=self.n_jobs, 652 | return_n_iter=True) 653 | return self 654 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/metrics/pairwise.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import warnings 3 | from functools import partial 4 | 5 | import numpy as np 6 | from scipy.sparse import issparse, csr_matrix 7 | from scipy.spatial import distance 8 | from joblib import cpu_count, delayed, Parallel 9 | 10 | from sklearn_import.metrics.pairwise_fast import _sparse_manhattan 11 | 12 | from sklearn_import.preprocessing.data import normalize 13 | 14 | from sklearn_import.utils import gen_batches, gen_even_slices 15 | 16 | from sklearn_import.utils.validation import check_array 17 | from sklearn_import.utils.extmath import row_norms, safe_sparse_dot 18 | 19 | 20 | def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, 21 | X_norm_squared=None): 22 | """ 23 | Considering the rows of X (and Y=X) as vectors, compute the 24 | distance matrix between each pair of vectors. 25 | 26 | For efficiency reasons, the euclidean distance between a pair of row 27 | vector x and y is computed as:: 28 | 29 | dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) 30 | 31 | This formulation has two advantages over other ways of computing distances. 32 | First, it is computationally efficient when dealing with sparse data. 33 | Second, if one argument varies but the other remains unchanged, then 34 | `dot(x, x)` and/or `dot(y, y)` can be pre-computed. 35 | 36 | However, this is not the most precise way of doing this computation, and 37 | the distance matrix returned by this function may not be exactly 38 | symmetric as required by, e.g., ``scipy.spatial.distance`` functions. 39 | 40 | Read more in the :ref:`User Guide `. 41 | 42 | Parameters 43 | ---------- 44 | X : {array-like, sparse matrix}, shape (n_samples_1, n_features) 45 | 46 | Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) 47 | 48 | Y_norm_squared : array-like, shape (n_samples_2, ), optional 49 | Pre-computed dot-products of vectors in Y (e.g., 50 | ``(Y**2).sum(axis=1)``) 51 | 52 | squared : boolean, optional 53 | Return squared Euclidean distances. 54 | 55 | X_norm_squared : array-like, shape = [n_samples_1], optional 56 | Pre-computed dot-products of vectors in X (e.g., 57 | ``(X**2).sum(axis=1)``) 58 | 59 | Returns 60 | ------- 61 | distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) 62 | 63 | Examples 64 | -------- 65 | >>> from sklearn.metrics.pairwise import euclidean_distances 66 | >>> X = [[0, 1], [1, 1]] 67 | >>> # distance between rows of X 68 | >>> euclidean_distances(X, X) 69 | array([[ 0., 1.], 70 | [ 1., 0.]]) 71 | >>> # get distance to origin 72 | >>> euclidean_distances(X, [[0, 0]]) 73 | array([[ 1. 
], 74 | [ 1.41421356]]) 75 | 76 | See also 77 | -------- 78 | paired_distances : distances betweens pairs of elements of X and Y. 79 | """ 80 | X, Y = check_pairwise_arrays(X, Y) 81 | 82 | if X_norm_squared is not None: 83 | XX = check_array(X_norm_squared) 84 | if XX.shape == (1, X.shape[0]): 85 | XX = XX.T 86 | elif XX.shape != (X.shape[0], 1): 87 | raise ValueError( 88 | "Incompatible dimensions for X and X_norm_squared") 89 | else: 90 | XX = row_norms(X, squared=True)[:, np.newaxis] 91 | 92 | if X is Y: # shortcut in the common case euclidean_distances(X, X) 93 | YY = XX.T 94 | elif Y_norm_squared is not None: 95 | YY = np.atleast_2d(Y_norm_squared) 96 | 97 | if YY.shape != (1, Y.shape[0]): 98 | raise ValueError( 99 | "Incompatible dimensions for Y and Y_norm_squared") 100 | else: 101 | YY = row_norms(Y, squared=True)[np.newaxis, :] 102 | 103 | distances = safe_sparse_dot(X, Y.T, dense_output=True) 104 | distances *= -2 105 | distances += XX 106 | distances += YY 107 | np.maximum(distances, 0, out=distances) 108 | 109 | if X is Y: 110 | # Ensure that distances between vectors and themselves are set to 0.0. 111 | # This may not be the case due to floating point rounding errors. 112 | distances.flat[::distances.shape[0] + 1] = 0.0 113 | 114 | return distances if squared else np.sqrt(distances, out=distances) 115 | 116 | 117 | def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", 118 | batch_size=500, metric_kwargs=None): 119 | """Compute minimum distances between one point and a set of points. 120 | 121 | This function computes for each row in X, the index of the row of Y which 122 | is closest (according to the specified distance). The minimal distances are 123 | also returned. 124 | 125 | This is mostly equivalent to calling: 126 | 127 | (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis), 128 | pairwise_distances(X, Y=Y, metric=metric).min(axis=axis)) 129 | 130 | but uses much less memory, and is faster for large arrays. 131 | 132 | Parameters 133 | ---------- 134 | X : {array-like, sparse matrix}, shape (n_samples1, n_features) 135 | Array containing points. 136 | 137 | Y : {array-like, sparse matrix}, shape (n_samples2, n_features) 138 | Arrays containing points. 139 | 140 | axis : int, optional, default 1 141 | Axis along which the argmin and distances are to be computed. 142 | 143 | metric : string or callable, default 'euclidean' 144 | metric to use for distance computation. Any metric from scikit-learn 145 | or scipy.spatial.distance can be used. 146 | 147 | If metric is a callable function, it is called on each 148 | pair of instances (rows) and the resulting value recorded. The callable 149 | should take two arrays as input and return one value indicating the 150 | distance between them. This works for Scipy's metrics, but is less 151 | efficient than passing the metric name as a string. 152 | 153 | Distance matrices are not supported. 154 | 155 | Valid values for metric are: 156 | 157 | - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 158 | 'manhattan'] 159 | 160 | - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 161 | 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 162 | 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 163 | 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 164 | 'sqeuclidean', 'yule'] 165 | 166 | See the documentation for scipy.spatial.distance for details on these 167 | metrics. 
168 | 169 | batch_size : integer 170 | To reduce memory consumption over the naive solution, data are 171 | processed in batches, comprising batch_size rows of X and 172 | batch_size rows of Y. The default value is quite conservative, but 173 | can be changed for fine-tuning. The larger the number, the larger the 174 | memory usage. 175 | 176 | metric_kwargs : dict, optional 177 | Keyword arguments to pass to specified metric function. 178 | 179 | Returns 180 | ------- 181 | argmin : numpy.ndarray 182 | Y[argmin[i], :] is the row in Y that is closest to X[i, :]. 183 | 184 | distances : numpy.ndarray 185 | distances[i] is the distance between the i-th row in X and the 186 | argmin[i]-th row in Y. 187 | 188 | See also 189 | -------- 190 | sklearn.metrics.pairwise_distances 191 | sklearn.metrics.pairwise_distances_argmin 192 | """ 193 | dist_func = None 194 | if metric in PAIRWISE_DISTANCE_FUNCTIONS: 195 | dist_func = PAIRWISE_DISTANCE_FUNCTIONS[metric] 196 | elif not callable(metric) and not isinstance(metric, str): 197 | raise ValueError("'metric' must be a string or a callable") 198 | 199 | X, Y = check_pairwise_arrays(X, Y) 200 | 201 | if metric_kwargs is None: 202 | metric_kwargs = {} 203 | 204 | if axis == 0: 205 | X, Y = Y, X 206 | 207 | # Allocate output arrays 208 | indices = np.empty(X.shape[0], dtype=np.intp) 209 | values = np.empty(X.shape[0]) 210 | values.fill(np.infty) 211 | 212 | for chunk_x in gen_batches(X.shape[0], batch_size): 213 | X_chunk = X[chunk_x, :] 214 | 215 | for chunk_y in gen_batches(Y.shape[0], batch_size): 216 | Y_chunk = Y[chunk_y, :] 217 | 218 | if dist_func is not None: 219 | if metric == 'euclidean': # special case, for speed 220 | d_chunk = safe_sparse_dot(X_chunk, Y_chunk.T, 221 | dense_output=True) 222 | d_chunk *= -2 223 | d_chunk += row_norms(X_chunk, squared=True)[:, np.newaxis] 224 | d_chunk += row_norms(Y_chunk, squared=True)[np.newaxis, :] 225 | np.maximum(d_chunk, 0, d_chunk) 226 | else: 227 | d_chunk = dist_func(X_chunk, Y_chunk, **metric_kwargs) 228 | else: 229 | d_chunk = pairwise_distances(X_chunk, Y_chunk, 230 | metric=metric, **metric_kwargs) 231 | 232 | # Update indices and minimum values using chunk 233 | min_indices = d_chunk.argmin(axis=1) 234 | min_values = d_chunk[np.arange(chunk_x.stop - chunk_x.start), 235 | min_indices] 236 | 237 | flags = values[chunk_x] > min_values 238 | indices[chunk_x][flags] = min_indices[flags] + chunk_y.start 239 | values[chunk_x][flags] = min_values[flags] 240 | 241 | if metric == "euclidean" and not metric_kwargs.get("squared", False): 242 | np.sqrt(values, values) 243 | return indices, values 244 | 245 | 246 | def check_pairwise_arrays(X, Y, precomputed=False, dtype=None): 247 | """ Set X and Y appropriately and checks inputs 248 | 249 | If Y is None, it is set as a pointer to X (i.e. not a copy). 250 | If Y is given, this does not happen. 251 | All distance metrics should use this function first to assert that the 252 | given parameters are correct and safe to use. 253 | 254 | Specifically, this function first ensures that both X and Y are arrays, 255 | then checks that they are at least two dimensional while ensuring that 256 | their elements are floats (or dtype if provided). Finally, the function 257 | checks that the size of the second dimension of the two arrays is equal, or 258 | the equivalent check for a precomputed distance matrix. 
259 | 260 | Parameters 261 | ---------- 262 | X : {array-like, sparse matrix}, shape (n_samples_a, n_features) 263 | 264 | Y : {array-like, sparse matrix}, shape (n_samples_b, n_features) 265 | 266 | precomputed : bool 267 | True if X is to be treated as precomputed distances to the samples in 268 | Y. 269 | 270 | dtype : string, type, list of types or None (default=None) 271 | Data type required for X and Y. If None, the dtype will be an 272 | appropriate float type selected by _return_float_dtype. 273 | 274 | .. versionadded:: 0.18 275 | 276 | Returns 277 | ------- 278 | safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features) 279 | An array equal to X, guaranteed to be a numpy array. 280 | 281 | safe_Y : {array-like, sparse matrix}, shape (n_samples_b, n_features) 282 | An array equal to Y if Y was not None, guaranteed to be a numpy array. 283 | If Y was None, safe_Y will be a pointer to X. 284 | 285 | """ 286 | X, Y, dtype_float = _return_float_dtype(X, Y) 287 | 288 | warn_on_dtype = dtype is not None 289 | estimator = 'check_pairwise_arrays' 290 | if dtype is None: 291 | dtype = dtype_float 292 | 293 | if Y is X or Y is None: 294 | X = Y = check_array(X, accept_sparse='csr', dtype=dtype, 295 | warn_on_dtype=warn_on_dtype, estimator=estimator) 296 | else: 297 | X = check_array(X, accept_sparse='csr', dtype=dtype, 298 | warn_on_dtype=warn_on_dtype, estimator=estimator) 299 | Y = check_array(Y, accept_sparse='csr', dtype=dtype, 300 | warn_on_dtype=warn_on_dtype, estimator=estimator) 301 | 302 | if precomputed: 303 | if X.shape[1] != Y.shape[0]: 304 | raise ValueError("Precomputed metric requires shape " 305 | "(n_queries, n_indexed). Got (%d, %d) " 306 | "for %d indexed." % 307 | (X.shape[0], X.shape[1], Y.shape[0])) 308 | elif X.shape[1] != Y.shape[1]: 309 | raise ValueError("Incompatible dimension for X and Y matrices: " 310 | "X.shape[1] == %d while Y.shape[1] == %d" % ( 311 | X.shape[1], Y.shape[1])) 312 | 313 | return X, Y 314 | 315 | 316 | def manhattan_distances(X, Y=None, sum_over_features=True, 317 | size_threshold=None): 318 | """ Compute the L1 distances between the vectors in X and Y. 319 | 320 | With sum_over_features equal to False it returns the componentwise 321 | distances. 322 | 323 | Read more in the :ref:`User Guide `. 324 | 325 | Parameters 326 | ---------- 327 | X : array_like 328 | An array with shape (n_samples_X, n_features). 329 | 330 | Y : array_like, optional 331 | An array with shape (n_samples_Y, n_features). 332 | 333 | sum_over_features : bool, default=True 334 | If True the function returns the pairwise distance matrix 335 | else it returns the componentwise L1 pairwise-distances. 336 | Not supported for sparse matrix inputs. 337 | 338 | size_threshold : int, default=5e8 339 | Unused parameter. 340 | 341 | Returns 342 | ------- 343 | D : array 344 | If sum_over_features is False shape is 345 | (n_samples_X * n_samples_Y, n_features) and D contains the 346 | componentwise L1 pairwise-distances (ie. absolute difference), 347 | else shape is (n_samples_X, n_samples_Y) and D contains 348 | the pairwise L1 distances. 
349 | 350 | Examples 351 | -------- 352 | >>> from sklearn.metrics.pairwise import manhattan_distances 353 | >>> manhattan_distances([[3]], [[3]])#doctest:+ELLIPSIS 354 | array([[ 0.]]) 355 | >>> manhattan_distances([[3]], [[2]])#doctest:+ELLIPSIS 356 | array([[ 1.]]) 357 | >>> manhattan_distances([[2]], [[3]])#doctest:+ELLIPSIS 358 | array([[ 1.]]) 359 | >>> manhattan_distances([[1, 2], [3, 4]],\ 360 | [[1, 2], [0, 3]])#doctest:+ELLIPSIS 361 | array([[ 0., 2.], 362 | [ 4., 4.]]) 363 | >>> import numpy as np 364 | >>> X = np.ones((1, 2)) 365 | >>> y = 2 * np.ones((2, 2)) 366 | >>> manhattan_distances(X, y, sum_over_features=False)#doctest:+ELLIPSIS 367 | array([[ 1., 1.], 368 | [ 1., 1.]]...) 369 | """ 370 | if size_threshold is not None: 371 | warnings.warn('Use of the "size_threshold" is deprecated ' 372 | 'in 0.19 and it will be removed version ' 373 | '0.21 of scikit-learn', DeprecationWarning) 374 | X, Y = check_pairwise_arrays(X, Y) 375 | 376 | if issparse(X) or issparse(Y): 377 | if not sum_over_features: 378 | raise TypeError("sum_over_features=%r not supported" 379 | " for sparse matrices" % sum_over_features) 380 | 381 | X = csr_matrix(X, copy=False) 382 | Y = csr_matrix(Y, copy=False) 383 | D = np.zeros((X.shape[0], Y.shape[0])) 384 | _sparse_manhattan(X.data, X.indices, X.indptr, 385 | Y.data, Y.indices, Y.indptr, 386 | X.shape[1], D) 387 | return D 388 | 389 | if sum_over_features: 390 | return distance.cdist(X, Y, 'cityblock') 391 | 392 | D = X[:, np.newaxis, :] - Y[np.newaxis, :, :] 393 | D = np.abs(D, D) 394 | return D.reshape((-1, X.shape[1])) 395 | 396 | 397 | def cosine_distances(X, Y=None): 398 | """Compute cosine distance between samples in X and Y. 399 | 400 | Cosine distance is defined as 1.0 minus the cosine similarity. 401 | 402 | Read more in the :ref:`User Guide `. 403 | 404 | Parameters 405 | ---------- 406 | X : array_like, sparse matrix 407 | with shape (n_samples_X, n_features). 408 | 409 | Y : array_like, sparse matrix (optional) 410 | with shape (n_samples_Y, n_features). 411 | 412 | Returns 413 | ------- 414 | distance matrix : array 415 | An array with shape (n_samples_X, n_samples_Y). 416 | 417 | See also 418 | -------- 419 | sklearn.metrics.pairwise.cosine_similarity 420 | scipy.spatial.distance.cosine (dense matrices only) 421 | """ 422 | # 1.0 - cosine_similarity(X, Y) without copy 423 | S = cosine_similarity(X, Y) 424 | S *= -1 425 | S += 1 426 | np.clip(S, 0, 2, out=S) 427 | if X is Y or Y is None: 428 | # Ensure that distances between vectors and themselves are set to 0.0. 429 | # This may not be the case due to floating point rounding errors. 430 | S[np.diag_indices_from(S)] = 0.0 431 | return S 432 | 433 | 434 | PAIRWISE_DISTANCE_FUNCTIONS = { 435 | # If updating this dictionary, update the doc in both distance_metrics() 436 | # and also in pairwise_distances()! 437 | 'cityblock': manhattan_distances, 438 | 'cosine': cosine_distances, 439 | 'euclidean': euclidean_distances, 440 | 'l2': euclidean_distances, 441 | 'l1': manhattan_distances, 442 | 'manhattan': manhattan_distances, 443 | 'precomputed': None, # HACK: precomputed is always allowed, never called 444 | } 445 | 446 | 447 | def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): 448 | """ Compute the distance matrix from a vector array X and optional Y. 449 | 450 | This method takes either a vector array or a distance matrix, and returns 451 | a distance matrix. If the input is a vector array, the distances are 452 | computed. 
If the input is a distances matrix, it is returned instead. 453 | 454 | This method provides a safe way to take a distance matrix as input, while 455 | preserving compatibility with many other algorithms that take a vector 456 | array. 457 | 458 | If Y is given (default is None), then the returned matrix is the pairwise 459 | distance between the arrays from both X and Y. 460 | 461 | Valid values for metric are: 462 | 463 | - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 464 | 'manhattan']. These metrics support sparse matrix inputs. 465 | 466 | - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 467 | 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 468 | 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 469 | 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] 470 | See the documentation for scipy.spatial.distance for details on these 471 | metrics. These metrics do not support sparse matrix inputs. 472 | 473 | Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are 474 | valid scipy.spatial.distance metrics), the scikit-learn implementation 475 | will be used, which is faster and has support for sparse matrices (except 476 | for 'cityblock'). For a verbose description of the metrics from 477 | scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics 478 | function. 479 | 480 | Read more in the :ref:`User Guide `. 481 | 482 | Parameters 483 | ---------- 484 | X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \ 485 | [n_samples_a, n_features] otherwise 486 | Array of pairwise distances between samples, or a feature array. 487 | 488 | Y : array [n_samples_b, n_features], optional 489 | An optional second feature array. Only allowed if metric != "precomputed". 490 | 491 | metric : string, or callable 492 | The metric to use when calculating distance between instances in a 493 | feature array. If metric is a string, it must be one of the options 494 | allowed by scipy.spatial.distance.pdist for its metric parameter, or 495 | a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. 496 | If metric is "precomputed", X is assumed to be a distance matrix. 497 | Alternatively, if metric is a callable function, it is called on each 498 | pair of instances (rows) and the resulting value recorded. The callable 499 | should take two arrays from X as input and return a value indicating 500 | the distance between them. 501 | 502 | n_jobs : int 503 | The number of jobs to use for the computation. This works by breaking 504 | down the pairwise matrix into n_jobs even slices and computing them in 505 | parallel. 506 | 507 | If -1 all CPUs are used. If 1 is given, no parallel computing code is 508 | used at all, which is useful for debugging. For n_jobs below -1, 509 | (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one 510 | are used. 511 | 512 | **kwds : optional keyword parameters 513 | Any further parameters are passed directly to the distance function. 514 | If using a scipy.spatial.distance metric, the parameters are still 515 | metric dependent. See the scipy docs for usage examples. 516 | 517 | Returns 518 | ------- 519 | D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b] 520 | A distance matrix D such that D_{i, j} is the distance between the 521 | ith and jth vectors of the given matrix X, if Y is None. 522 | If Y is not None, then D_{i, j} is the distance between the ith array 523 | from X and the jth array from Y. 
524 | 525 | """ 526 | if (metric not in _VALID_METRICS and 527 | not callable(metric) and metric != "precomputed"): 528 | raise ValueError("Unknown metric %s. " 529 | "Valid metrics are %s, or 'precomputed', or a " 530 | "callable" % (metric, _VALID_METRICS)) 531 | 532 | if metric == "precomputed": 533 | X, _ = check_pairwise_arrays(X, Y, precomputed=True) 534 | return X 535 | elif metric in PAIRWISE_DISTANCE_FUNCTIONS: 536 | func = PAIRWISE_DISTANCE_FUNCTIONS[metric] 537 | elif callable(metric): 538 | func = partial(_pairwise_callable, metric=metric, **kwds) 539 | else: 540 | if issparse(X) or issparse(Y): 541 | raise TypeError("scipy distance metrics do not" 542 | " support sparse matrices.") 543 | 544 | dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None 545 | 546 | X, Y = check_pairwise_arrays(X, Y, dtype=dtype) 547 | 548 | if n_jobs == 1 and X is Y: 549 | return distance.squareform(distance.pdist(X, metric=metric, 550 | **kwds)) 551 | func = partial(distance.cdist, metric=metric, **kwds) 552 | 553 | return _parallel_pairwise(X, Y, func, n_jobs, **kwds) 554 | 555 | 556 | def _return_float_dtype(X, Y): 557 | """ 558 | 1. If dtype of X and Y is float32, then dtype float32 is returned. 559 | 2. Else dtype float is returned. 560 | """ 561 | if not issparse(X) and not isinstance(X, np.ndarray): 562 | X = np.asarray(X) 563 | 564 | if Y is None: 565 | Y_dtype = X.dtype 566 | elif not issparse(Y) and not isinstance(Y, np.ndarray): 567 | Y = np.asarray(Y) 568 | Y_dtype = Y.dtype 569 | else: 570 | Y_dtype = Y.dtype 571 | 572 | if X.dtype == Y_dtype == np.float32: 573 | dtype = np.float32 574 | else: 575 | dtype = np.float 576 | 577 | return X, Y, dtype 578 | 579 | 580 | def _parallel_pairwise(X, Y, func, n_jobs, **kwds): 581 | """Break the pairwise matrix in n_jobs even slices 582 | and compute them in parallel""" 583 | if n_jobs < 0: 584 | n_jobs = max(cpu_count() + 1 + n_jobs, 1) 585 | 586 | if Y is None: 587 | Y = X 588 | 589 | if n_jobs == 1: 590 | # Special case to avoid picklability checks in delayed 591 | return func(X, Y, **kwds) 592 | 593 | # TODO: in some cases, backend='threading' may be appropriate 594 | fd = delayed(func) 595 | ret = Parallel(n_jobs=n_jobs, verbose=0)( 596 | fd(X, Y[s], **kwds) 597 | for s in gen_even_slices(Y.shape[0], n_jobs)) 598 | 599 | return np.hstack(ret) 600 | 601 | 602 | def _pairwise_callable(X, Y, metric, **kwds): 603 | """Handle the callable case for pairwise_{distances,kernels} 604 | """ 605 | X, Y = check_pairwise_arrays(X, Y) 606 | 607 | if X is Y: 608 | # Only calculate metric for upper triangle 609 | out = np.zeros((X.shape[0], Y.shape[0]), dtype='float') 610 | iterator = itertools.combinations(range(X.shape[0]), 2) 611 | for i, j in iterator: 612 | out[i, j] = metric(X[i], Y[j], **kwds) 613 | 614 | # Make symmetric 615 | # NB: out += out.T will produce incorrect results 616 | out = out + out.T 617 | 618 | # Calculate diagonal 619 | # NB: nonzero diagonals are allowed for both metrics and kernels 620 | for i in range(X.shape[0]): 621 | x = X[i] 622 | out[i, i] = metric(x, x, **kwds) 623 | 624 | else: 625 | # Calculate all cells 626 | out = np.empty((X.shape[0], Y.shape[0]), dtype='float') 627 | iterator = itertools.product(range(X.shape[0]), range(Y.shape[0])) 628 | for i, j in iterator: 629 | out[i, j] = metric(X[i], Y[j], **kwds) 630 | 631 | return out 632 | 633 | 634 | PAIRWISE_BOOLEAN_FUNCTIONS = [ 635 | 'dice', 636 | 'jaccard', 637 | 'kulsinski', 638 | 'matching', 639 | 'rogerstanimoto', 640 | 'russellrao', 641 | 'sokalmichener', 642 
|     'sokalsneath',
643 |     'yule',
644 | ]
645 | 
646 | 
647 | def cosine_similarity(X, Y=None, dense_output=True):
648 |     """Compute cosine similarity between samples in X and Y.
649 | 
650 |     Cosine similarity, or the cosine kernel, computes similarity as the
651 |     normalized dot product of X and Y:
652 | 
653 |         K(X, Y) = <X, Y> / (||X||*||Y||)
654 | 
655 |     On L2-normalized data, this function is equivalent to linear_kernel.
656 | 
657 |     Read more in the :ref:`User Guide `.
658 | 
659 |     Parameters
660 |     ----------
661 |     X : ndarray or sparse array, shape: (n_samples_X, n_features)
662 |         Input data.
663 | 
664 |     Y : ndarray or sparse array, shape: (n_samples_Y, n_features)
665 |         Input data. If ``None``, the output will be the pairwise
666 |         similarities between all samples in ``X``.
667 | 
668 |     dense_output : boolean (optional), default True
669 |         Whether to return dense output even when the input is sparse. If
670 |         ``False``, the output is sparse if both input arrays are sparse.
671 | 
672 |         .. versionadded:: 0.17
673 |            parameter ``dense_output`` for dense output.
674 | 
675 |     Returns
676 |     -------
677 |     kernel matrix : array
678 |         An array with shape (n_samples_X, n_samples_Y).
679 |     """
680 |     # to avoid recursive import
681 | 
682 |     X, Y = check_pairwise_arrays(X, Y)
683 | 
684 |     X_normalized = normalize(X, copy=True)
685 |     if X is Y:
686 |         Y_normalized = X_normalized
687 |     else:
688 |         Y_normalized = normalize(Y, copy=True)
689 | 
690 |     K = safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output)
691 | 
692 |     return K
693 | 
694 | 
695 | _VALID_METRICS = ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock',
696 |                   'braycurtis', 'canberra', 'chebyshev', 'correlation',
697 |                   'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski',
698 |                   'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
699 |                   'russellrao', 'seuclidean', 'sokalmichener',
700 |                   'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"]
701 | 
--------------------------------------------------------------------------------
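The Notes section of `KMeansConstrained` above formulates the size-constrained assignment as a Minimum Cost Flow problem solved with Google OR-Tools' `SimpleMinCostFlow`. The sketch below is only an illustration of that idea on a toy dataset: it uses the plain scalar OR-Tools API rather than the vectorized Cython wrapper used by the package, and it fixes each cluster to an exact size, whereas the package's graph also has to honour a `size_min`/`size_max` range. The data, the `PRECISION` scaling factor and all variable names are made up here, and the `ortools.graph.pywrapgraph` import assumes the classic OR-Tools Python API.

import numpy as np
from scipy.spatial.distance import cdist
from ortools.graph import pywrapgraph  # classic API; newer OR-Tools releases expose ortools.graph.python.min_cost_flow instead

X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])   # 4 samples
C = np.array([[0.0, 0.0], [5.0, 5.0]])                           # 2 tentative centres
SIZE_EXACT = 2                                                    # toy constraint: each cluster receives exactly 2 points
PRECISION = 1000                                                  # SimpleMinCostFlow works on integer costs

costs = (cdist(X, C) * PRECISION).astype('int32')
n_X, n_C = costs.shape

mcf = pywrapgraph.SimpleMinCostFlow()
for i in range(n_X):
    mcf.SetNodeSupply(i, 1)                                       # every sample ships one unit of flow
    for j in range(n_C):
        # one arc per (sample, cluster) pair: capacity 1, cost = scaled distance
        mcf.AddArcWithCapacityAndUnitCost(i, n_X + j, 1, int(costs[i, j]))
for j in range(n_C):
    mcf.SetNodeSupply(n_X + j, -SIZE_EXACT)                       # each cluster absorbs exactly SIZE_EXACT units

assert mcf.Solve() == mcf.OPTIMAL
labels = np.empty(n_X, dtype='int32')
for arc in range(mcf.NumArcs()):
    if mcf.Flow(arc) > 0:
        labels[mcf.Tail(arc)] = mcf.Head(arc) - n_X
print(labels)   # -> [0 0 1 1]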
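The vendored `euclidean_distances` and the `metric == 'euclidean'` fast path inside `pairwise_distances_argmin_min` both rely on the expansion documented above, `dist(x, y)**2 = dot(x, x) - 2 * dot(x, y) + dot(y, y)`, so that a single matrix product replaces explicit pairwise differences. A self-contained NumPy sketch of that trick, checked against SciPy; the helper name `squared_euclidean` is made up here and is not part of the package.

import numpy as np
from scipy.spatial.distance import cdist

def squared_euclidean(X, Y):
    XX = (X * X).sum(axis=1)[:, np.newaxis]    # ||x||^2, shape (n_x, 1)
    YY = (Y * Y).sum(axis=1)[np.newaxis, :]    # ||y||^2, shape (1, n_y)
    D = XX - 2.0 * (X @ Y.T) + YY              # one matrix product instead of explicit differences
    np.maximum(D, 0, out=D)                    # clip tiny negatives caused by rounding
    return D

rng = np.random.RandomState(0)
X, Y = rng.rand(5, 3), rng.rand(4, 3)
assert np.allclose(np.sqrt(squared_euclidean(X, Y)), cdist(X, Y))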
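`pairwise_distances_argmin_min` documents that it matches `pairwise_distances(X, Y).argmin(axis)` / `.min(axis)` while scanning `Y` in `batch_size` blocks to bound memory. The following simplified sketch shows that batching idea only; it is not the vendored implementation, and `argmin_min_chunked` plus the tiny `batch_size` are for illustration.

import numpy as np
from scipy.spatial.distance import cdist

def argmin_min_chunked(X, Y, batch_size=2):
    best_idx = np.zeros(len(X), dtype=np.intp)
    best_val = np.full(len(X), np.inf)
    for start in range(0, len(Y), batch_size):
        D = cdist(X, Y[start:start + batch_size])    # distances to one block of Y
        idx = D.argmin(axis=1)
        val = D[np.arange(len(X)), idx]
        better = val < best_val                      # keep the best candidate seen so far
        best_idx[better] = idx[better] + start
        best_val[better] = val[better]
    return best_idx, best_val

rng = np.random.RandomState(1)
X, Y = rng.rand(6, 3), rng.rand(5, 3)
idx, val = argmin_min_chunked(X, Y)
D_full = cdist(X, Y)
assert np.array_equal(idx, D_full.argmin(axis=1))
assert np.allclose(val, D_full.min(axis=1))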