├── size_constrained_clustering ├── sklearn_import │ ├── cluster │ │ ├── __init__.py │ │ ├── _k_means.pyx │ │ └── k_means_.py │ ├── metrics │ │ ├── __init__.py │ │ ├── pairwise_fast.pyx │ │ └── pairwise.py │ ├── externals │ │ └── __init__.py │ ├── preprocessing │ │ ├── __init__.py │ │ └── data.py │ ├── utils │ │ ├── fixes.py │ │ ├── __init__.py │ │ ├── sparsefuncs.py │ │ ├── extmath.py │ │ ├── sparsefuncs_fast.pyx │ │ └── validation.py │ ├── fixes.py │ ├── __init__.py │ ├── exceptions.py │ ├── funcsigs.py │ └── base.py ├── k_means_constrained │ ├── __init__.py │ ├── mincostflow_vectorized.py │ ├── mincostflow_vectorized_.pyx │ └── k_means_constrained_.py ├── __init__.py ├── fcm.py ├── minmax.py ├── shrinkage.py ├── base.py ├── da.py └── equal.py ├── pic ├── da.png ├── fcm.png ├── equal.png ├── minmax.png ├── shrinkage.png └── equal_heuristics.png ├── __init__.py ├── requirements.txt ├── .travis.yml ├── tests ├── test_pypi.py ├── memory_test.py ├── memory_monitor.py ├── test_da.py ├── test_equal.py ├── test_fcm.py └── test_minmax.py ├── LICENSE ├── examples └── examples.py ├── setup.py └── README.md /size_constrained_clustering/sklearn_import/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/externals/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pic/da.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/da.png -------------------------------------------------------------------------------- /pic/fcm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/fcm.png -------------------------------------------------------------------------------- /pic/equal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/equal.png -------------------------------------------------------------------------------- /pic/minmax.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/minmax.png -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .size_constrained_clustering import base, da, equal, fcm, minmax, shrinkage 3 | -------------------------------------------------------------------------------- /pic/shrinkage.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/shrinkage.png -------------------------------------------------------------------------------- /pic/equal_heuristics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jingw2/size_constrained_clustering/HEAD/pic/equal_heuristics.png -------------------------------------------------------------------------------- /size_constrained_clustering/k_means_constrained/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | __all__ = ['KMeansConstrained'] 3 | 4 | from .k_means_constrained_ import KMeansConstrained 5 | 6 | -------------------------------------------------------------------------------- /size_constrained_clustering/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | path = os.path.dirname(os.path.abspath(__file__)) 4 | import sys 5 | sys.path.append(path) 6 | import base, da, equal, fcm, minmax, shrinkage 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib==0.15.1 2 | psutil>=5.6.6 3 | numpy>=1.16.5 4 | scipy==1.6.0 5 | ortools>=6.7 6 | six==1.12.0 7 | matplotlib==3.1.0 8 | seaborn==0.10.1 9 | Cython==0.29.20 10 | scikit_learn==0.24.1 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.7' 4 | before_install: 5 | - pip install -U pytest pytest-cov codecov 6 | install: 7 | - python setup.py build_ext --inplace 8 | - pip install -r requirements.txt 9 | script: 10 | - pytest --cov=./tests 11 | after_success: 12 | codecov 13 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/fixes.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn_import.fixes import _parse_version 3 | 4 | np_version = _parse_version(np.__version__) 5 | 6 | 7 | def sparse_min_max(X, axis): 8 | return (X.min(axis=axis).toarray().ravel(), 9 | X.max(axis=axis).toarray().ravel()) 10 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/fixes.py: -------------------------------------------------------------------------------- 1 | def _parse_version(version_string): 2 | version = [] 3 | for x in version_string.split('.'): 4 | try: 5 | version.append(int(x)) 6 | except ValueError: 7 | # x may be of the form dev-1ea1592 8 | version.append(x) 9 | return tuple(version) -------------------------------------------------------------------------------- /tests/test_pypi.py: -------------------------------------------------------------------------------- 1 | 2 | from size_constrained_clustering import fcm, equal, minmax, shrinkage 3 | import numpy as np 4 | n_samples = 2000 5 | n_clusters = 3 6 | X = np.random.rand(n_samples, 2) 7 | # solve using the minmax flow approach 8 | model = equal.SameSizeKMeansMinCostFlow(n_clusters) 9 | # solve using the heuristics approach 10 | model.fit(X) 11 | centers = model.cluster_centers_ 12 | labels = model.labels_ 13 | -------------------------------------------------------------------------------- /tests/memory_test.py: 
-------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import collections 4 | import os 5 | import sys 6 | 7 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(path) 9 | from size_constrained_clustering import equal, da 10 | 11 | n_samples = 10000 12 | n_clusters = 4 13 | X = np.random.rand(n_samples, 2) 14 | distribution = [0.25] * n_clusters 15 | model = da.DeterministicAnnealing(n_clusters, distribution) 16 | model.fit(X) 17 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def get_config(): 5 | """Retrieve current values for configuration set by :func:`set_config` 6 | 7 | Returns 8 | ------- 9 | config : dict 10 | Keys are parameter names that can be passed to :func:`set_config`. 11 | """ 12 | return {'assume_finite': _ASSUME_FINITE} 13 | 14 | 15 | __version__ = '0.19.2' 16 | _ASSUME_FINITE = bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)) -------------------------------------------------------------------------------- /size_constrained_clustering/k_means_constrained/mincostflow_vectorized.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | from ortools.graph.pywrapgraph import SimpleMinCostFlow 5 | 6 | 7 | # Cython paths must be fully qualified 8 | from k_means_constrained.mincostflow_vectorized_ import \ 9 | SimpleMinCostFlow_AddArcWithCapacityAndUnitCostVectorized, \ 10 | SimpleMinCostFlow_SetNodeSupplyVectorized, \ 11 | SimpleMinCostFlow_FlowVectorized 12 | 13 | 14 | class SimpleMinCostFlowVectorized(SimpleMinCostFlow): 15 | 16 | def AddArcWithCapacityAndUnitCostVectorized(self, tail, head, capacity, unit_cost): 17 | return SimpleMinCostFlow_AddArcWithCapacityAndUnitCostVectorized(self, tail, head, capacity, unit_cost) 18 | 19 | def SetNodeSupplyVectorized(self, node, supply): 20 | return SimpleMinCostFlow_SetNodeSupplyVectorized(self, node, supply) 21 | 22 | def FlowVectorized(self, arc): 23 | return SimpleMinCostFlow_FlowVectorized(self, arc) 24 | 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018 The Python Packaging Authority 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /tests/memory_monitor.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | import subprocess 5 | import psutil 6 | import matplotlib.pyplot as plt 7 | import time 8 | 9 | cmd = "python memory_test.py" 10 | process = subprocess.Popen(cmd.split(" ")) 11 | 12 | pid = process.pid 13 | print("process id: ", pid) 14 | 15 | def get_memory_list(): 16 | process = psutil.Process(pid) 17 | memory_list = [] 18 | while process_running(process): 19 | try: 20 | memo = process.memory_info().rss / 1024 / 1024 #MB 21 | except: 22 | break 23 | memory_list.append(memo) 24 | time.sleep(1) 25 | return memory_list 26 | 27 | def process_running(process): 28 | try: 29 | memo = process.memory_info().rss / 1024 / 1024 30 | return True 31 | except: 32 | return False 33 | 34 | def plot(): 35 | start = time.time() 36 | memory_list = get_memory_list() 37 | end = time.time() 38 | print("Time spent to run {}s".format(round(end-start, 2))) 39 | plt.plot([x for x in range(len(memory_list))], memory_list) 40 | plt.xlabel("record point") 41 | plt.ylabel("memory (MB)") 42 | plt.show() 43 | 44 | if __name__ == "__main__": 45 | plot() 46 | -------------------------------------------------------------------------------- /tests/test_da.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | import pytest 5 | import sys 6 | import os 7 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(path) 9 | from size_constrained_clustering import da 10 | 11 | class TestDA: 12 | 13 | def test_input(self): 14 | with pytest.raises(AssertionError): 15 | da.DeterministicAnnealing(n_clusters=2, distribution=[0.25, 0.3]) 16 | with pytest.raises(AssertionError): 17 | da.DeterministicAnnealing(n_clusters=1, distribution=[0.25, 0.3]) 18 | with pytest.raises(AssertionError): 19 | da.DeterministicAnnealing(n_clusters=2, distribution=[0.25, 0.75], T=0.1) 20 | 21 | def test_output(self): 22 | import collections 23 | import random 24 | import numpy as np 25 | n_samples = 1000 26 | random_state = 42 27 | random.seed(random_state) 28 | np.random.seed(random_state) 29 | X = np.random.rand(n_samples, 2) 30 | n_clusters = 4 31 | distribution = [0.25] * n_clusters 32 | 33 | model = da.DeterministicAnnealing(n_clusters, distribution) 34 | model.fit(X) 35 | 36 | labels = model.labels_ 37 | label_counter = collections.Counter(labels) 38 | label_dist = list(label_counter.values()) 39 | label_dist = [d / np.sum(label_dist) for d in label_dist] 40 | 41 | assert np.sum(np.array(label_dist) - np.array(distribution)) <= 1e-6 42 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/exceptions.py: -------------------------------------------------------------------------------- 1 | class DataConversionWarning(UserWarning): 2 | """Warning used to notify implicit data conversions happening in the code. 
3 | 4 | This warning occurs when some input data needs to be converted or 5 | interpreted in a way that may not match the user's expectations. 6 | 7 | For example, this warning may occur when the user 8 | - passes an integer array to a function which expects float input and 9 | will convert the input 10 | - requests a non-copying operation, but a copy is required to meet the 11 | implementation's data-type expectations; 12 | - passes an input whose shape can be interpreted ambiguously. 13 | 14 | .. versionchanged:: 0.18 15 | Moved from sklearn.utils.validation. 16 | """ 17 | 18 | 19 | class NotFittedError(ValueError, AttributeError): 20 | """Exception class to raise if estimator is used before fitting. 21 | 22 | This class inherits from both ValueError and AttributeError to help with 23 | exception handling and backward compatibility. 24 | 25 | Examples 26 | -------- 27 | >>> from sklearn.svm import LinearSVC 28 | >>> from sklearn.exceptions import NotFittedError 29 | >>> try: 30 | ... LinearSVC().predict([[1, 2], [2, 3], [3, 4]]) 31 | ... except NotFittedError as e: 32 | ... print(repr(e)) 33 | ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS 34 | NotFittedError('This LinearSVC instance is not fitted yet'...) 35 | 36 | .. versionchanged:: 0.18 37 | Moved from sklearn.utils.validation. 38 | """ -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/metrics/pairwise_fast.pyx: -------------------------------------------------------------------------------- 1 | #cython: boundscheck=False 2 | #cython: cdivision=True 3 | #cython: wraparound=False 4 | 5 | # Author: Andreas Mueller 6 | # Lars Buitinck 7 | # 8 | # License: BSD 3 clause 9 | 10 | from libc.string cimport memset 11 | import numpy as np 12 | cimport numpy as np 13 | 14 | ctypedef float [:, :] float_array_2d_t 15 | ctypedef double [:, :] double_array_2d_t 16 | 17 | cdef fused floating1d: 18 | float[::1] 19 | double[::1] 20 | 21 | cdef fused floating_array_2d_t: 22 | float_array_2d_t 23 | double_array_2d_t 24 | 25 | 26 | np.import_array() 27 | 28 | 29 | def _sparse_manhattan(floating1d X_data, int[:] X_indices, int[:] X_indptr, 30 | floating1d Y_data, int[:] Y_indices, int[:] Y_indptr, 31 | np.npy_intp n_features, double[:, ::1] D): 32 | """Pairwise L1 distances for CSR matrices. 33 | 34 | Usage: 35 | 36 | >>> D = np.zeros(X.shape[0], Y.shape[0]) 37 | >>> sparse_manhattan(X.data, X.indices, X.indptr, 38 | ... Y.data, Y.indices, Y.indptr, 39 | ... X.shape[1], D) 40 | """ 41 | cdef double[::1] row = np.empty(n_features) 42 | cdef np.npy_intp ix, iy, j 43 | 44 | with nogil: 45 | for ix in range(D.shape[0]): 46 | for iy in range(D.shape[1]): 47 | # Simple strategy: densify current row of X, then subtract the 48 | # corresponding row of Y. 
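# After the two loops below, `row` holds X[ix] - Y[iy] on the union of
# their nonzero columns (and zeros elsewhere), so the L1 distance is the
# sum of absolute values over `row`.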
49 | memset(&row[0], 0, n_features * sizeof(double)) 50 | for j in range(X_indptr[ix], X_indptr[ix + 1]): 51 | row[X_indices[j]] = X_data[j] 52 | for j in range(Y_indptr[iy], Y_indptr[iy + 1]): 53 | row[Y_indices[j]] -= Y_data[j] 54 | 55 | with gil: 56 | D[ix, iy] = np.abs(np.asarray(row)).sum() -------------------------------------------------------------------------------- /tests/test_equal.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.6 2 | #-*-coding:utf-8-*- 3 | 4 | import pytest 5 | import sys 6 | import os 7 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(path) 9 | from size_constrained_clustering import equal 10 | 11 | class TestEqual: 12 | 13 | def test_input(self): 14 | with pytest.raises(AssertionError): 15 | equal.SameSizeKMeansHeuristics(n_clusters=-1) 16 | with pytest.raises(AssertionError): 17 | equal.SameSizeKMeansMinCostFlow(n_clusters=-1) 18 | with pytest.raises(AssertionError): 19 | equal.SameSizeKMeansHeuristics(n_clusters=0) 20 | with pytest.raises(AssertionError): 21 | equal.SameSizeKMeansMinCostFlow(n_clusters=0) 22 | with pytest.raises(AssertionError): 23 | equal.SameSizeKMeansHeuristics(n_clusters=1, max_iters=1.2) 24 | with pytest.raises(AssertionError): 25 | equal.SameSizeKMeansMinCostFlow(n_clusters=1, max_iters=1.2) 26 | with pytest.raises(Exception): 27 | equal.SameSizeKMeansHeuristics(n_clusters=1, distance_func="a") 28 | with pytest.raises(Exception): 29 | equal.SameSizeKMeansMinCostFlow(n_clusters=1, distance_func="a") 30 | 31 | def test_output(self): 32 | import numpy as np 33 | import collections 34 | n_samples = 2000 35 | n_clusters = 4 36 | X = np.random.rand(n_samples, 2) 37 | model = equal.SameSizeKMeansHeuristics(n_clusters) 38 | model.fit(X) 39 | labels = model.labels_ 40 | label_counts = collections.Counter(labels) 41 | assert_cluster_equal(label_counts) 42 | 43 | model = equal.SameSizeKMeansMinCostFlow(n_clusters) 44 | model.fit(X) 45 | labels = model.labels_ 46 | label_counts = collections.Counter(labels) 47 | assert_cluster_equal(label_counts) 48 | 49 | def assert_cluster_equal(label_counts): 50 | size = label_counts[0] 51 | for i in range(1, len(label_counts)): 52 | assert label_counts[i] == size 53 | 54 | -------------------------------------------------------------------------------- /tests/test_fcm.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.6 2 | #-*-coding:utf-8-*- 3 | 4 | import pytest 5 | import sys 6 | import os 7 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(path) 9 | from size_constrained_clustering import fcm 10 | 11 | class TestFCM: 12 | 13 | def test_input(self): 14 | with pytest.raises(AssertionError): 15 | fcm.FCM(n_clusters=-1) 16 | with pytest.raises(AssertionError): 17 | fcm.FCM(n_clusters=0) 18 | with pytest.raises(AssertionError): 19 | fcm.FCM(n_clusters=3, m=1) 20 | with pytest.raises(AssertionError): 21 | fcm.FCM(n_clusters=3, max_iters=1.0) 22 | with pytest.raises(AssertionError): 23 | fcm.FCM(n_clusters=3, epsilon=-1) 24 | with pytest.raises(Exception): 25 | fcm.FCM(n_clusters=3, distance_func="a") 26 | 27 | def test_output(self): 28 | from sklearn.datasets import make_blobs 29 | import numpy as np 30 | import collections 31 | n_samples = 5000 32 | n_bins = 4 # fit 4 clusters to match the 4 blob centers below 33 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 34 | 35 | X, _ = make_blobs(n_samples=n_samples,
n_features=2, cluster_std=1.0, 36 | centers=centers, shuffle=False, random_state=42) 37 | 38 | model = fcm.FCM(n_bins) 39 | model.fit(X) 40 | fcm_centers = model.cluster_centers_ 41 | fcm_labels = model.labels_ 42 | 43 | target_centers = np.array([[-0.020799, -0.03094044], 44 | [-4.99797698, -4.96240717], 45 | [7.01237337, 10.03848252], 46 | [4.97931177, 4.94258691]]) 47 | # within tolerance 48 | fcm_centers = np.round(fcm_centers, 3) 49 | target_centers = np.round(target_centers, 3) 50 | label_counts = dict(collections.Counter(fcm_labels)) 51 | assert label_counts == {2: 1252, 0: 1250, 1: 1249, 3: 1249} 52 | assert np.array_equal(fcm_centers, target_centers) 53 | 54 | if __name__ == "__main__": 55 | pass 56 | -------------------------------------------------------------------------------- /size_constrained_clustering/k_means_constrained/mincostflow_vectorized_.pyx: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | cimport numpy as np 4 | cimport cython 5 | 6 | from ortools.graph._pywrapgraph import \ 7 | SimpleMinCostFlow_AddArcWithCapacityAndUnitCost,\ 8 | SimpleMinCostFlow_SetNodeSupply,\ 9 | SimpleMinCostFlow_Flow 10 | 11 | DTYPE = np.int32 12 | ctypedef np.int32_t DTYPE_t 13 | 14 | 15 | @cython.boundscheck(False) 16 | @cython.wraparound(False) 17 | def SimpleMinCostFlow_AddArcWithCapacityAndUnitCostVectorized( 18 | self, 19 | np.ndarray[DTYPE_t, ndim=1] tail, 20 | np.ndarray[DTYPE_t, ndim=1] head, 21 | np.ndarray[DTYPE_t, ndim=1] capacity, 22 | np.ndarray[DTYPE_t, ndim=1] unit_cost): 23 | 24 | cdef int len = tail.shape[0] 25 | 26 | assert tail.dtype == DTYPE 27 | assert head.dtype == DTYPE 28 | assert capacity.dtype == DTYPE 29 | assert unit_cost.dtype == DTYPE 30 | assert head.shape[0] == len 31 | assert capacity.shape[0] == len 32 | assert unit_cost.shape[0] == len 33 | 34 | for i in range(len): 35 | SimpleMinCostFlow_AddArcWithCapacityAndUnitCost(self, tail[i], head[i], capacity[i], unit_cost[i]) 36 | 37 | 38 | @cython.boundscheck(False) 39 | @cython.wraparound(False) 40 | def SimpleMinCostFlow_SetNodeSupplyVectorized(self, 41 | np.ndarray[DTYPE_t, ndim=1] node, 42 | np.ndarray[DTYPE_t, ndim=1] supply): 43 | cdef int len = node.shape[0] 44 | 45 | assert node.dtype == DTYPE 46 | assert supply.dtype == DTYPE 47 | assert supply.shape[0] == len 48 | 49 | for i in range(len): 50 | SimpleMinCostFlow_SetNodeSupply(self, node[i], supply[i]) 51 | 52 | 53 | @cython.boundscheck(False) 54 | @cython.wraparound(False) 55 | def SimpleMinCostFlow_FlowVectorized(self, 56 | np.ndarray[DTYPE_t, ndim=1] arc): 57 | 58 | cdef int len = arc.shape[0] 59 | 60 | assert arc.dtype == DTYPE 61 | 62 | cdef np.ndarray flow = np.zeros(len, dtype=DTYPE) 63 | 64 | for i in range(len): 65 | flow[i] = SimpleMinCostFlow_Flow(self, arc[i]) 66 | 67 | return flow 68 | 69 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/__init__.py: -------------------------------------------------------------------------------- 1 | def gen_batches(n, batch_size): 2 | """Generator to create slices containing batch_size elements, from 0 to n. 3 | 4 | The last slice may contain less than batch_size elements, when batch_size 5 | does not divide n. 
6 | 7 | Examples 8 | -------- 9 | >>> from sklearn.utils import gen_batches 10 | >>> list(gen_batches(7, 3)) 11 | [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] 12 | >>> list(gen_batches(6, 3)) 13 | [slice(0, 3, None), slice(3, 6, None)] 14 | >>> list(gen_batches(2, 3)) 15 | [slice(0, 2, None)] 16 | """ 17 | start = 0 18 | for _ in range(int(n // batch_size)): 19 | end = start + batch_size 20 | yield slice(start, end) 21 | start = end 22 | if start < n: 23 | yield slice(start, n) 24 | 25 | 26 | def gen_even_slices(n, n_packs, n_samples=None): 27 | """Generator to create n_packs slices going up to n. 28 | 29 | Pass n_samples when the slices are to be used for sparse matrix indexing; 30 | slicing off-the-end raises an exception, while it works for NumPy arrays. 31 | 32 | Examples 33 | -------- 34 | >>> from sklearn.utils import gen_even_slices 35 | >>> list(gen_even_slices(10, 1)) 36 | [slice(0, 10, None)] 37 | >>> list(gen_even_slices(10, 10)) #doctest: +ELLIPSIS 38 | [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)] 39 | >>> list(gen_even_slices(10, 5)) #doctest: +ELLIPSIS 40 | [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)] 41 | >>> list(gen_even_slices(10, 3)) 42 | [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)] 43 | """ 44 | start = 0 45 | if n_packs < 1: 46 | raise ValueError("gen_even_slices got n_packs=%s, must be >=1" 47 | % n_packs) 48 | for pack_num in range(n_packs): 49 | this_n = n // n_packs 50 | if pack_num < n % n_packs: 51 | this_n += 1 52 | if this_n > 0: 53 | end = start + this_n 54 | if n_samples is not None: 55 | end = min(n_samples, end) 56 | yield slice(start, end, None) 57 | start = end -------------------------------------------------------------------------------- /tests/test_minmax.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.6 2 | #-*-coding:utf-8-*- 3 | 4 | import pytest 5 | import sys 6 | import os 7 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 8 | sys.path.append(path) 9 | from size_constrained_clustering import minmax 10 | 11 | class TestMinMax: 12 | 13 | def test_input(self): 14 | with pytest.raises(AssertionError): 15 | minmax.MinMaxKMeansMinCostFlow(n_clusters=-1, size_min=1, size_max=2) 16 | with pytest.raises(AssertionError): 17 | minmax.MinMaxKMeansMinCostFlow(n_clusters=0, size_min=1, size_max=2) 18 | with pytest.raises(AssertionError): 19 | minmax.MinMaxKMeansMinCostFlow(n_clusters=1, max_iters=1.2, size_min=1, size_max=2) 20 | with pytest.raises(Exception): 21 | minmax.MinMaxKMeansMinCostFlow(n_clusters=1, distance_func="a", size_min=1, size_max=2) 22 | with pytest.raises(AssertionError): 23 | minmax.MinMaxKMeansMinCostFlow(n_clusters=1, size_min=None, size_max=2) 24 | with pytest.raises(AssertionError): 25 | minmax.MinMaxKMeansMinCostFlow(n_clusters=1, size_min=-1, size_max=2) 26 | with pytest.raises(AssertionError): 27 | minmax.MinMaxKMeansMinCostFlow(n_clusters=1, size_min=20, size_max=10) 28 | with pytest.raises(AssertionError): 29 | model = minmax.MinMaxKMeansMinCostFlow(n_clusters=1, size_min=10, size_max=20) 30 | import numpy as np 31 | X = np.random.random((1000, 2)) 32 | model.fit(X) 33 | 34 | def test_output(self): 35 | from sklearn.datasets import make_blobs 36 | import collections 37 | 38 | n_samples = 2000 39 | n_clusters = 4 # use 3 bins for calibration_curve as we have 3 clusters here 40 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 41 | 42 | X, _ = make_blobs(n_samples=n_samples, n_features=2, 
cluster_std=1.0, 43 | centers=centers, shuffle=False, random_state=42) 44 | 45 | minsize = 200 46 | maxsize = 800 47 | model = minmax.MinMaxKMeansMinCostFlow(n_clusters, size_min=minsize, 48 | size_max=maxsize) 49 | model.fit(X) 50 | 51 | label_counter = collections.Counter(model.labels_) 52 | 53 | for label, count in label_counter.items(): 54 | assert count >= minsize and count <= maxsize 55 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/sparsefuncs.py: -------------------------------------------------------------------------------- 1 | from scipy import sparse as sp 2 | from sklearn_import.utils.fixes import sparse_min_max 3 | 4 | from .sparsefuncs_fast import ( 5 | csr_mean_variance_axis0 as _csr_mean_var_axis0, 6 | csc_mean_variance_axis0 as _csc_mean_var_axis0) 7 | 8 | 9 | def mean_variance_axis(X, axis): 10 | """Compute mean and variance along an axix on a CSR or CSC matrix 11 | 12 | Parameters 13 | ---------- 14 | X : CSR or CSC sparse matrix, shape (n_samples, n_features) 15 | Input data. 16 | 17 | axis : int (either 0 or 1) 18 | Axis along which the axis should be computed. 19 | 20 | Returns 21 | ------- 22 | 23 | means : float array with shape (n_features,) 24 | Feature-wise means 25 | 26 | variances : float array with shape (n_features,) 27 | Feature-wise variances 28 | 29 | """ 30 | _raise_error_wrong_axis(axis) 31 | 32 | if isinstance(X, sp.csr_matrix): 33 | if axis == 0: 34 | return _csr_mean_var_axis0(X) 35 | else: 36 | return _csc_mean_var_axis0(X.T) 37 | elif isinstance(X, sp.csc_matrix): 38 | if axis == 0: 39 | return _csc_mean_var_axis0(X) 40 | else: 41 | return _csr_mean_var_axis0(X.T) 42 | else: 43 | _raise_typeerror(X) 44 | 45 | 46 | def min_max_axis(X, axis): 47 | """Compute minimum and maximum along an axis on a CSR or CSC matrix 48 | 49 | Parameters 50 | ---------- 51 | X : CSR or CSC sparse matrix, shape (n_samples, n_features) 52 | Input data. 53 | 54 | axis : int (either 0 or 1) 55 | Axis along which the axis should be computed. 56 | 57 | Returns 58 | ------- 59 | 60 | mins : float array with shape (n_features,) 61 | Feature-wise minima 62 | 63 | maxs : float array with shape (n_features,) 64 | Feature-wise maxima 65 | """ 66 | if isinstance(X, sp.csr_matrix) or isinstance(X, sp.csc_matrix): 67 | return sparse_min_max(X, axis=axis) 68 | else: 69 | _raise_typeerror(X) 70 | 71 | 72 | def _raise_typeerror(X): 73 | """Raises a TypeError if X is not a CSR or CSC matrix""" 74 | input_type = X.format if sp.issparse(X) else type(X) 75 | err = "Expected a CSR or CSC sparse matrix, got %s." % input_type 76 | raise TypeError(err) 77 | 78 | 79 | def _raise_error_wrong_axis(axis): 80 | if axis not in (0, 1): 81 | raise ValueError( 82 | "Unknown axis value: %d. 
Use 0 for rows, or 1 for columns" % axis) 83 | -------------------------------------------------------------------------------- /examples/examples.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 5 | sys.path.append(path) 6 | from size_constrained_clustering import fcm, equal, da, minmax, shrinkage 7 | 8 | from sklearn.datasets import make_blobs 9 | from matplotlib import pyplot as plt 10 | import numpy as np 11 | from seaborn import scatterplot as scatter 12 | from sklearn.metrics.pairwise import haversine_distances 13 | import collections 14 | 15 | def fcm_example(): 16 | n_samples = 2000 17 | n_clusters = 4 18 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 19 | 20 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, 21 | centers=centers, shuffle=False, random_state=42) 22 | 23 | model = fcm.FCM(n_clusters) 24 | model.fit(X) 25 | centers = model.cluster_centers_ 26 | labels = model.labels_ 27 | 28 | plot(centers, labels, X) 29 | 30 | def equal_example(): 31 | n_samples = 2000 32 | n_clusters = 3 33 | X = np.random.rand(n_samples, 2) 34 | # model = equal.SameSizeKMeansMinCostFlow(n_clusters) 35 | model = equal.SameSizeKMeansHeuristics(n_clusters) 36 | model.fit(X) 37 | 38 | centers = model.cluster_centers_ 39 | labels = model.labels_ 40 | 41 | print("Cluster size count: ", collections.Counter(labels)) 42 | plot(centers, labels, X) 43 | 44 | def minmax_example(): 45 | n_samples = 2000 46 | n_clusters = 3 47 | X = np.random.rand(n_samples, 2) 48 | model = minmax.MinMaxKMeansMinCostFlow(n_clusters, size_min=400, size_max=800) 49 | model.fit(X) 50 | 51 | centers = model.cluster_centers_ 52 | labels = model.labels_ 53 | 54 | print("Cluster size count: ", collections.Counter(labels)) 55 | plot(centers, labels, X) 56 | 57 | def da_example(): 58 | n_samples = 2000 59 | n_clusters = 3 60 | X = np.random.rand(n_samples, 2) 61 | model = da.DeterministicAnnealing(n_clusters, distribution=[0.1, 0.6, 0.3]) 62 | model.fit(X) 63 | 64 | centers = model.cluster_centers_ 65 | labels = model.labels_ 66 | 67 | cluster_size = list(collections.Counter(labels).values()) 68 | print("Cluster size: ", cluster_size) 69 | print("Cluster size ratio: ", [c / n_samples for c in cluster_size]) 70 | plot(centers, labels, X) 71 | 72 | def shrinkage_example(): 73 | n_samples = 1000 74 | n_clusters = 4 75 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 76 | 77 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, 78 | centers=centers, shuffle=False, random_state=42) 79 | 80 | model = shrinkage.Shrinkage(n_clusters, size_min=100) 81 | model.fit(X) 82 | centers = model.cluster_centers_ 83 | labels = model.labels_ 84 | 85 | plot(centers, labels, X) 86 | 87 | def plot(centers, labels, X): 88 | f, axes = plt.subplots(1, 2, figsize=(11, 5)) 89 | scatter(X[:, 0], X[:, 1], ax=axes[0]) 90 | scatter(X[:, 0], X[:, 1], ax=axes[1], hue=labels) 91 | scatter(centers[:, 0], centers[:, 1], ax=axes[1], marker="s", s=200) 92 | plt.show() 93 | 94 | if __name__ == "__main__": 95 | shrinkage_example() 96 | -------------------------------------------------------------------------------- /size_constrained_clustering/fcm.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: fcm.py, fuzzy c-means algorithm 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 06/06/2020 8 | @paper: Clustering with 
Size Constraints 9 | @github reference: https://github.com/omadson/fuzzy-c-means/blob/master/fcmeans/fcm.py 10 | ''' 11 | 12 | from scipy.spatial.distance import cdist 13 | import numpy as np 14 | from scipy.linalg import norm 15 | import sys 16 | import os 17 | path = os.path.dirname(os.path.abspath(__file__)) 18 | sys.path.append(path) 19 | import base 20 | 21 | class FCM(base.Base): 22 | 23 | def __init__(self, n_clusters, \ 24 | max_iters=1000, m=2, 25 | epsilon=1e-5, 26 | random_state=42, 27 | distance_func=cdist): 28 | ''' 29 | Args: 30 | n_clusters (int): number of clusters 31 | max_iters (int): maximum iterations 32 | m (float): membership order, in general it is 2 33 | epsilon (float): 1e-5 34 | random_state (int): random seed 35 | distance_func (callable function/None), default is Euclidean distance 36 | ''' 37 | super(FCM, self).__init__(n_clusters, max_iters, distance_func) 38 | assert m > 1 39 | assert epsilon > 0 40 | self.m = m 41 | self.epsilon = epsilon 42 | self.random_state = random_state 43 | self.u, self.cluster_centers_ = None, None 44 | 45 | def fit(self, X): 46 | ''' 47 | Args: 48 | X (array like): shape is (n_samples, n_dimensions) 49 | ''' 50 | np.random.seed(self.random_state) 51 | n_samples, n_dimensions = X.shape 52 | 53 | # initialize mu 54 | self.u = np.random.random(size=(n_samples, self.n_clusters)) 55 | self.u /= np.sum(self.u, axis=1).reshape((-1, 1)) 56 | 57 | # initialize centers 58 | itr = 0 59 | while True: 60 | last_u = self.u.copy() 61 | # update centers 62 | self.cluster_centers_ = self.update_centers(X) 63 | # update membership 64 | self.u = self.update_membership(X) 65 | if norm(self.u - last_u) < self.epsilon or itr >= self.max_iters: 66 | break 67 | itr += 1 68 | 69 | self.labels_ = np.argmax(self.u, axis=1) 70 | 71 | def update_centers(self, X): 72 | ''' 73 | Update centers based new u 74 | ''' 75 | um = np.power(self.u, self.m) # (n_samples, n_clusters) 76 | centers = (X.T.dot(um)).T / np.sum(um, axis=0).reshape((-1, 1)) 77 | return centers 78 | 79 | def update_membership(self, X): 80 | power = 2. / (self.m - 1) 81 | n_samples, n_dimensions = X.shape 82 | dist = self.distance_func(X, self.cluster_centers_) 83 | dist = np.power(dist, power) 84 | u = dist * np.sum(1. / dist, axis=1).reshape((-1, 1)) 85 | u = 1. 
/ u 86 | # normalize 87 | u /= np.sum(u, axis=1).reshape((-1, 1)) 88 | return u 89 | 90 | def predict(self, X): 91 | u = self.update_membership(X) 92 | labels = np.argmax(u, axis=1) 93 | return labels 94 | -------------------------------------------------------------------------------- /size_constrained_clustering/minmax.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: same_size_kmeans.py, equal size clustering with heuristics 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 06/18/2020 8 | @paper: 9 | @github reference: https://github.com/joshlk/k-means-constrained 10 | @Web: 11 | ''' 12 | 13 | import os 14 | import sys 15 | path = os.path.dirname(os.path.abspath(__file__)) 16 | sys.path.append(path) 17 | import base 18 | from k_means_constrained import KMeansConstrained 19 | import numpy as np 20 | import matplotlib.pyplot as plt 21 | from seaborn import scatterplot as scatter 22 | import collections 23 | from sklearn.metrics.pairwise import haversine_distances 24 | from sklearn.datasets import make_blobs 25 | from scipy.spatial.distance import cdist 26 | 27 | class MinMaxKMeansMinCostFlow(base.Base): 28 | 29 | def __init__(self, n_clusters, size_min=None, size_max=None, 30 | max_iters=1000, distance_func=cdist, random_state=42): 31 | ''' 32 | Args: 33 | n_clusters (int): number of clusters 34 | max_iters (int): maximum iterations 35 | distance_func (object): callable function with input (X, centers) / None, by default is l2-distance 36 | random_state (int): random state to initiate, by default it is 42 37 | ''' 38 | super(MinMaxKMeansMinCostFlow, self).__init__(n_clusters, max_iters, distance_func) 39 | self.clf = None 40 | self.size_min = size_min 41 | self.size_max = size_max 42 | assert size_min is not None and size_max is not None 43 | assert size_min >= 0 and size_max >= 0 44 | assert size_min <= size_max 45 | 46 | def fit(self, X): 47 | n_samples, n_features = X.shape 48 | assert self.size_max * self.n_clusters >= n_samples 49 | 50 | clf = KMeansConstrained(self.n_clusters, size_min=self.size_min, 51 | size_max=self.size_max, distance_func=self.distance_func) 52 | 53 | clf.fit(X) 54 | 55 | self.clf = clf 56 | self.cluster_centers_ = self.clf.cluster_centers_ 57 | self.labels_ = self.clf.labels_ 58 | 59 | def predict(self, X): 60 | return self.clf.predict(X) 61 | 62 | if __name__ == "__main__": 63 | n_samples = 2000 64 | n_clusters = 4 # use 3 bins for calibration_curve as we have 3 clusters here 65 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 66 | 67 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, 68 | centers=centers, shuffle=False, random_state=42) 69 | minsize = n_samples // n_clusters 70 | maxsize = n_samples // n_clusters 71 | minmax = MinMaxKMeansMinCostFlow(n_clusters, size_min=minsize, 72 | size_max=maxsize, distance_func=cdist) 73 | minmax.fit(X) 74 | 75 | fcm_centers = minmax.cluster_centers_ 76 | fcm_labels = minmax.labels_ 77 | 78 | print(collections.Counter(fcm_labels)) 79 | 80 | f, axes = plt.subplots(1, 2, figsize=(11, 5)) 81 | scatter(X[:, 0], X[:, 1], ax=axes[0]) 82 | scatter(X[:, 0], X[:, 1], ax=axes[1], hue=fcm_labels) 83 | scatter(fcm_centers[:, 0], fcm_centers[:, 1], ax=axes[1], marker="s",s=200) 84 | plt.show() 85 | -------------------------------------------------------------------------------- /size_constrained_clustering/shrinkage.py: -------------------------------------------------------------------------------- 1 | 
#!usr/bin/python 3.6 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: shrinkage.py, shrinkage clustering 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 06/24/2020 8 | @Paper reference: Shrinkage Clustering: A fast and \ 9 | size-constrained clustering algorithm for biomedical applications 10 | ''' 11 | 12 | import os 13 | import sys 14 | path = os.path.dirname(os.path.abspath(__file__)) 15 | sys.path.append(path) 16 | import base 17 | from scipy.spatial.distance import cdist 18 | import numpy as np 19 | import random 20 | 21 | class Shrinkage(base.Base): 22 | 23 | def __init__(self, n_clusters, size_min=1, max_iters=1000, \ 24 | distance_func=cdist, random_state=42): 25 | ''' 26 | Args: 27 | n_clusters (int): number of clusters 28 | max_iters (int): maximum iterations 29 | distance_func (object): callable function with input (X, centers) / None, by default is l2-distance 30 | random_state (int): random state to initiate, by default it is 42 31 | ''' 32 | super(Shrinkage, self).__init__(n_clusters, max_iters, distance_func) 33 | np.random.seed(random_state) 34 | random.seed(random_state) 35 | self.size_min = size_min 36 | assert isinstance(size_min, int) 37 | assert size_min >= 1 38 | 39 | def fit(self, X): 40 | 41 | n_samples, n_features = X.shape 42 | 43 | assert self.size_min <= n_samples // self.n_clusters 44 | # calculate similarity matrix, larger similarity means more resemblance 45 | S = self.distance_func(X, X) 46 | S /= np.max(S) 47 | S = 1 - S 48 | # initialize 49 | A, S_tilde = self._init(S) 50 | iters = 0 51 | while True: 52 | # remove empty clusters 53 | cluster_size = np.sum(A, axis=0) 54 | keep_cluster = np.where(cluster_size >= self.size_min)[0] 55 | A = A[:, keep_cluster] 56 | 57 | # permute cluster membership 58 | M = S_tilde @ A 59 | v = np.min(M - np.sum(M * A, axis=1).reshape((-1, 1)), axis=1) 60 | X_bar = np.argmin(v) 61 | C_prime = np.argmin(M[X_bar]) 62 | 63 | K = A.shape[1] 64 | A[X_bar] = np.zeros(K) 65 | A[X_bar, C_prime] = 1 66 | 67 | if abs(np.sum(v)) < 1e-5 or iters >= self.max_iters: 68 | break 69 | 70 | iters += 1 71 | 72 | self.labels_ = np.argmax(A, axis=1) 73 | self.cluster_centers_ = self.update_centers(X, A) 74 | 75 | 76 | def _init(self, S): 77 | ''' 78 | Initialize A and S_tilde 79 | ''' 80 | n_samples, _ = S.shape 81 | A = np.zeros((n_samples, self.n_clusters)) 82 | A[range(n_samples), [random.choice(range(self.n_clusters)) for _ in range(n_samples)]] = 1 83 | S_tilde = 1 - 2 * S 84 | return A, S_tilde 85 | 86 | def update_centers(self, X, labels): 87 | ''' 88 | Update centers 89 | Args: 90 | X (array like): (n_samples, n_features) 91 | labels (array like): (n_samples, n_clusters), one-hot array 92 | 93 | Return: 94 | centers (array like): (n_clusters, n_features) 95 | ''' 96 | centers = (X.T.dot(labels)).T / np.sum(labels, axis=0).reshape((-1, 1)) 97 | return centers 98 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, Extension, dist 2 | 3 | try: 4 | from setuptools import setup 5 | except: 6 | from distutils.core import setup 7 | 8 | import os 9 | this_directory = os.path.abspath(os.path.dirname(__file__)) 10 | with open(os.path.join(this_directory, 'README.md'), encoding='utf-8') as f: 11 | long_description = f.read() 12 | 13 | dist.Distribution().fetch_build_eggs(["cython>=0.27", "numpy>=1.13"]) 14 | 15 | 16 | try: 17 | from numpy import get_include 18 | except: 19 | def 
get_include(): 20 | # Defer import to later 21 | from numpy import get_include 22 | return get_include() 23 | 24 | try: 25 | from Cython.Build import cythonize 26 | except ImportError: 27 | print("! Could not import Cython !") 28 | cythonize = None 29 | 30 | 31 | # https://cython.readthedocs.io/en/latest/src/userguide/source_files_and_compilation.html#distributing-cython-modules 32 | def no_cythonize(extensions, **_ignore): 33 | for extension in extensions: 34 | sources = [] 35 | for sfile in extension.sources: 36 | path, ext = os.path.splitext(sfile) 37 | if ext in (".pyx", ".py"): 38 | if extension.language == "c++": 39 | ext = ".cpp" 40 | else: 41 | ext = ".c" 42 | sfile = path + ext 43 | sources.append(sfile) 44 | extension.sources[:] = sources 45 | return extensions 46 | 47 | path = os.path.dirname(os.path.abspath(__file__)) 48 | extensions = [ 49 | Extension("size_constrained_clustering.k_means_constrained.mincostflow_vectorized_", [os.path.join(path, "size_constrained_clustering/k_means_constrained/mincostflow_vectorized_.pyx")], 50 | include_dirs=[get_include()]), 51 | Extension("size_constrained_clustering.sklearn_import.cluster._k_means", [os.path.join(path, "size_constrained_clustering/sklearn_import/cluster/_k_means.pyx")], 52 | include_dirs=[get_include()]), 53 | Extension("size_constrained_clustering.sklearn_import.metrics.pairwise_fast", [os.path.join(path, "size_constrained_clustering/sklearn_import/metrics/pairwise_fast.pyx")], 54 | include_dirs=[get_include()]), 55 | Extension("size_constrained_clustering.sklearn_import.utils.sparsefuncs_fast", [os.path.join(path, "size_constrained_clustering/sklearn_import/utils/sparsefuncs_fast.pyx")], 56 | include_dirs=[get_include()]), 57 | ] 58 | 59 | CYTHONIZE = bool(int(os.getenv("CYTHONIZE", 1))) and cythonize is not None 60 | 61 | if CYTHONIZE: 62 | compiler_directives = {"language_level": 3, "embedsignature": True} 63 | extensions = cythonize(extensions, compiler_directives=compiler_directives) 64 | else: 65 | extensions = no_cythonize(extensions) 66 | 67 | with open(os.path.join(path, "requirements.txt")) as fp: 68 | install_requires = fp.read().strip().split("\n") 69 | 70 | VERSION = "0.1.1" 71 | LICENSE = 'MIT' 72 | setup( 73 | ext_modules=extensions, 74 | version=VERSION, 75 | setup_requires=["cython", "numpy"], 76 | install_requires=install_requires, 77 | name='size_constrained_clustering', 78 | description='Size Constrained Clustering solver', 79 | long_description=long_description, 80 | long_description_content_type='text/markdown', 81 | url='https://github.com/jingw2/size_constrained_clustering', 82 | author='Jing Wang', 83 | author_email='jingw2@foxmail.com', 84 | license=LICENSE, 85 | packages=find_packages(), 86 | python_requires='>=3.6') 87 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/preprocessing/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import sparse 3 | 4 | from sklearn_import.utils.sparsefuncs_fast import inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2 5 | 6 | from sklearn_import.utils.sparsefuncs import min_max_axis 7 | 8 | from sklearn_import.utils.extmath import row_norms 9 | from sklearn_import.utils.validation import check_array, FLOAT_DTYPES 10 | 11 | 12 | def normalize(X, norm='l2', axis=1, copy=True, return_norm=False): 13 | """Scale input vectors individually to unit norm (vector length). 
14 | 15 | Read more in the :ref:`User Guide `. 16 | 17 | Parameters 18 | ---------- 19 | X : {array-like, sparse matrix}, shape [n_samples, n_features] 20 | The data to normalize, element by element. 21 | scipy.sparse matrices should be in CSR format to avoid an 22 | un-necessary copy. 23 | 24 | norm : 'l1', 'l2', or 'max', optional ('l2' by default) 25 | The norm to use to normalize each non zero sample (or each non-zero 26 | feature if axis is 0). 27 | 28 | axis : 0 or 1, optional (1 by default) 29 | axis used to normalize the data along. If 1, independently normalize 30 | each sample, otherwise (if 0) normalize each feature. 31 | 32 | copy : boolean, optional, default True 33 | set to False to perform inplace row normalization and avoid a 34 | copy (if the input is already a numpy array or a scipy.sparse 35 | CSR matrix and if axis is 1). 36 | 37 | return_norm : boolean, default False 38 | whether to return the computed norms 39 | 40 | Returns 41 | ------- 42 | X : {array-like, sparse matrix}, shape [n_samples, n_features] 43 | Normalized input X. 44 | 45 | norms : array, shape [n_samples] if axis=1 else [n_features] 46 | An array of norms along given axis for X. 47 | When X is sparse, a NotImplementedError will be raised 48 | for norm 'l1' or 'l2'. 49 | 50 | See also 51 | -------- 52 | Normalizer: Performs normalization using the ``Transformer`` API 53 | (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`). 54 | 55 | Notes 56 | ----- 57 | For a comparison of the different scalers, transformers, and normalizers, 58 | see :ref:`examples/preprocessing/plot_all_scaling.py 59 | `. 60 | 61 | """ 62 | if norm not in ('l1', 'l2', 'max'): 63 | raise ValueError("'%s' is not a supported norm" % norm) 64 | 65 | if axis == 0: 66 | sparse_format = 'csc' 67 | elif axis == 1: 68 | sparse_format = 'csr' 69 | else: 70 | raise ValueError("'%d' is not a supported axis" % axis) 71 | 72 | X = check_array(X, sparse_format, copy=copy, 73 | estimator='the normalize function', dtype=FLOAT_DTYPES) 74 | if axis == 0: 75 | X = X.T 76 | 77 | if sparse.issparse(X): 78 | if return_norm and norm in ('l1', 'l2'): 79 | raise NotImplementedError("return_norm=True is not implemented " 80 | "for sparse matrices with norm 'l1' " 81 | "or norm 'l2'") 82 | if norm == 'l1': 83 | inplace_csr_row_normalize_l1(X) 84 | elif norm == 'l2': 85 | inplace_csr_row_normalize_l2(X) 86 | elif norm == 'max': 87 | _, norms = min_max_axis(X, 1) 88 | norms_elementwise = norms.repeat(np.diff(X.indptr)) 89 | mask = norms_elementwise != 0 90 | X.data[mask] /= norms_elementwise[mask] 91 | else: 92 | if norm == 'l1': 93 | norms = np.abs(X).sum(axis=1) 94 | elif norm == 'l2': 95 | norms = row_norms(X) 96 | elif norm == 'max': 97 | norms = np.max(X, axis=1) 98 | norms = _handle_zeros_in_scale(norms, copy=False) 99 | X /= norms[:, np.newaxis] 100 | 101 | if axis == 0: 102 | X = X.T 103 | 104 | if return_norm: 105 | return X, norms 106 | else: 107 | return X 108 | 109 | 110 | def _handle_zeros_in_scale(scale, copy=True): 111 | ''' Makes sure that whenever scale is zero, we handle it correctly. 112 | 113 | This happens in most scalers when we have constant features.''' 114 | 115 | # if we are fitting on 1D arrays, scale might be a scalar 116 | if np.isscalar(scale): 117 | if scale == .0: 118 | scale = 1. 
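# a zero scale would otherwise divide by zero downstream; constant features are left unscaled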
119 | return scale 120 | elif isinstance(scale, np.ndarray): 121 | if copy: 122 | # New array to avoid side-effects 123 | scale = scale.copy() 124 | scale[scale == 0.0] = 1.0 125 | return scale 126 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/extmath.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import numpy as np 4 | from scipy.sparse import issparse, csr_matrix 5 | from sklearn_import.utils.sparsefuncs_fast import csr_row_norms 6 | 7 | from sklearn_import.utils.fixes import np_version 8 | 9 | 10 | def row_norms(X, squared=False): 11 | """Row-wise (squared) Euclidean norm of X. 12 | 13 | Equivalent to np.sqrt((X * X).sum(axis=1)), but also supports sparse 14 | matrices and does not create an X.shape-sized temporary. 15 | 16 | Performs no input validation. 17 | """ 18 | if issparse(X): 19 | if not isinstance(X, csr_matrix): 20 | X = csr_matrix(X) 21 | norms = csr_row_norms(X) 22 | else: 23 | norms = np.einsum('ij,ij->i', X, X) 24 | 25 | if not squared: 26 | np.sqrt(norms, norms) 27 | return norms 28 | 29 | 30 | def squared_norm(x): 31 | """Squared Euclidean or Frobenius norm of x. 32 | 33 | Returns the Euclidean norm when x is a vector, the Frobenius norm when x 34 | is a matrix (2-d array). Faster than norm(x) ** 2. 35 | """ 36 | x = np.ravel(x, order='K') 37 | if np.issubdtype(x.dtype, np.integer): 38 | warnings.warn('Array type is integer, np.dot may overflow. ' 39 | 'Data should be float type to avoid this issue', 40 | UserWarning) 41 | return np.dot(x, x) 42 | 43 | 44 | def cartesian(arrays, out=None): 45 | """Generate a cartesian product of input arrays. 46 | 47 | Parameters 48 | ---------- 49 | arrays : list of array-like 50 | 1-D arrays to form the cartesian product of. 51 | out : ndarray 52 | Array to place the cartesian product in. 53 | 54 | Returns 55 | ------- 56 | out : ndarray 57 | 2-D array of shape (M, len(arrays)) containing cartesian products 58 | formed of input arrays. 59 | 60 | Examples 61 | -------- 62 | >>> cartesian(([1, 2, 3], [4, 5], [6, 7])) 63 | array([[1, 4, 6], 64 | [1, 4, 7], 65 | [1, 5, 6], 66 | [1, 5, 7], 67 | [2, 4, 6], 68 | [2, 4, 7], 69 | [2, 5, 6], 70 | [2, 5, 7], 71 | [3, 4, 6], 72 | [3, 4, 7], 73 | [3, 5, 6], 74 | [3, 5, 7]]) 75 | 76 | """ 77 | arrays = [np.asarray(x) for x in arrays] 78 | shape = (len(x) for x in arrays) 79 | dtype = arrays[0].dtype 80 | 81 | ix = np.indices(shape) 82 | ix = ix.reshape(len(arrays), -1).T 83 | 84 | if out is None: 85 | out = np.empty_like(ix, dtype=dtype) 86 | 87 | for n, arr in enumerate(arrays): 88 | out[:, n] = arrays[n][ix[:, n]] 89 | 90 | return out 91 | 92 | 93 | def stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08): 94 | """Use high precision for cumsum and check that final value matches sum 95 | 96 | Parameters 97 | ---------- 98 | arr : array-like 99 | To be cumulatively summed as flat 100 | axis : int, optional 101 | Axis along which the cumulative sum is computed. 102 | The default (None) is to compute the cumsum over the flattened array. 
103 | rtol : float 104 | Relative tolerance, see ``np.allclose`` 105 | atol : float 106 | Absolute tolerance, see ``np.allclose`` 107 | """ 108 | # sum is as unstable as cumsum for numpy < 1.9 109 | if np_version < (1, 9): 110 | return np.cumsum(arr, axis=axis, dtype=np.float64) 111 | 112 | out = np.cumsum(arr, axis=axis, dtype=np.float64) 113 | expected = np.sum(arr, axis=axis, dtype=np.float64) 114 | if not np.all(np.isclose(out.take(-1, axis=axis), expected, rtol=rtol, 115 | atol=atol, equal_nan=True)): 116 | warnings.warn('cumsum was found to be unstable: ' 117 | 'its last element does not correspond to sum', 118 | RuntimeWarning) 119 | return out 120 | 121 | 122 | def safe_sparse_dot(a, b, dense_output=False): 123 | """Dot product that handle the sparse matrix case correctly 124 | 125 | Uses BLAS GEMM as replacement for numpy.dot where possible 126 | to avoid unnecessary copies. 127 | 128 | Parameters 129 | ---------- 130 | a : array or sparse matrix 131 | b : array or sparse matrix 132 | dense_output : boolean, default False 133 | When False, either ``a`` or ``b`` being sparse will yield sparse 134 | output. When True, output will always be an array. 135 | 136 | Returns 137 | ------- 138 | dot_product : array or sparse matrix 139 | sparse if ``a`` or ``b`` is sparse and ``dense_output=False``. 140 | """ 141 | if issparse(a) or issparse(b): 142 | ret = a * b 143 | if dense_output and hasattr(ret, "toarray"): 144 | ret = ret.toarray() 145 | return ret 146 | else: 147 | return np.dot(a, b) 148 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/cluster/_k_means.pyx: -------------------------------------------------------------------------------- 1 | # cython: profile=True 2 | # Profiling is enabled by default as the overhead does not seem to be measurable 3 | # on this specific use case. 4 | 5 | # Author: Peter Prettenhofer 6 | # Olivier Grisel 7 | # Lars Buitinck 8 | # 9 | # License: BSD 3 clause 10 | 11 | import numpy as np 12 | cimport numpy as np 13 | cimport cython 14 | from cython cimport floating 15 | 16 | from sklearn_import.utils.sparsefuncs_fast import assign_rows_csr 17 | 18 | ctypedef np.float64_t DOUBLE 19 | ctypedef np.int32_t INT 20 | 21 | ctypedef floating (*DOT)(int N, floating *X, int incX, floating *Y, 22 | int incY) 23 | 24 | 25 | np.import_array() 26 | 27 | @cython.boundscheck(False) 28 | @cython.wraparound(False) 29 | @cython.cdivision(True) 30 | def _centers_dense(np.ndarray[floating, ndim=2] X, 31 | np.ndarray[INT, ndim=1] labels, int n_clusters, 32 | np.ndarray[floating, ndim=1] distances): 33 | """M step of the K-means EM algorithm 34 | 35 | Computation of cluster centers / means. 36 | 37 | Parameters 38 | ---------- 39 | X : array-like, shape (n_samples, n_features) 40 | 41 | labels : array of integers, shape (n_samples) 42 | Current label assignment 43 | 44 | n_clusters : int 45 | Number of desired clusters 46 | 47 | distances : array-like, shape (n_samples) 48 | Distance to closest cluster for each sample. 
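Only used to re-seed empty clusters: the samples farthest from their
assigned centers become the new centers.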
49 | 50 | Returns 51 | ------- 52 | centers : array, shape (n_clusters, n_features) 53 | The resulting centers 54 | """ 55 | ## TODO: add support for CSR input 56 | cdef int n_samples, n_features 57 | n_samples = X.shape[0] 58 | n_features = X.shape[1] 59 | cdef int i, j, c 60 | cdef np.ndarray[floating, ndim=2] centers 61 | if floating is float: 62 | centers = np.zeros((n_clusters, n_features), dtype=np.float32) 63 | else: 64 | centers = np.zeros((n_clusters, n_features), dtype=np.float64) 65 | 66 | n_samples_in_cluster = np.bincount(labels, minlength=n_clusters) 67 | empty_clusters = np.where(n_samples_in_cluster == 0)[0] 68 | # maybe also relocate small clusters? 69 | 70 | if len(empty_clusters): 71 | # find points to reassign empty clusters to 72 | far_from_centers = distances.argsort()[::-1] 73 | 74 | for i, cluster_id in enumerate(empty_clusters): 75 | # XXX two relocated clusters could be close to each other 76 | new_center = X[far_from_centers[i]] 77 | centers[cluster_id] = new_center 78 | n_samples_in_cluster[cluster_id] = 1 79 | 80 | for i in range(n_samples): 81 | for j in range(n_features): 82 | centers[labels[i], j] += X[i, j] 83 | 84 | centers /= n_samples_in_cluster[:, np.newaxis] 85 | 86 | return centers 87 | 88 | 89 | @cython.boundscheck(False) 90 | @cython.wraparound(False) 91 | @cython.cdivision(True) 92 | def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters, 93 | np.ndarray[floating, ndim=1] distances): 94 | """M step of the K-means EM algorithm 95 | 96 | Computation of cluster centers / means. 97 | 98 | Parameters 99 | ---------- 100 | X : scipy.sparse.csr_matrix, shape (n_samples, n_features) 101 | 102 | labels : array of integers, shape (n_samples) 103 | Current label assignment 104 | 105 | n_clusters : int 106 | Number of desired clusters 107 | 108 | distances : array-like, shape (n_samples) 109 | Distance to closest cluster for each sample. 110 | 111 | Returns 112 | ------- 113 | centers : array, shape (n_clusters, n_features) 114 | The resulting centers 115 | """ 116 | cdef int n_features = X.shape[1] 117 | cdef int curr_label 118 | 119 | cdef np.ndarray[floating, ndim=1] data = X.data 120 | cdef np.ndarray[int, ndim=1] indices = X.indices 121 | cdef np.ndarray[int, ndim=1] indptr = X.indptr 122 | 123 | cdef np.ndarray[floating, ndim=2, mode="c"] centers 124 | cdef np.ndarray[np.npy_intp, ndim=1] far_from_centers 125 | cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] n_samples_in_cluster = \ 126 | np.bincount(labels, minlength=n_clusters) 127 | cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] empty_clusters = \ 128 | np.where(n_samples_in_cluster == 0)[0] 129 | cdef int n_empty_clusters = empty_clusters.shape[0] 130 | 131 | if floating is float: 132 | centers = np.zeros((n_clusters, n_features), dtype=np.float32) 133 | else: 134 | centers = np.zeros((n_clusters, n_features), dtype=np.float64) 135 | 136 | # maybe also relocate small clusters? 
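# Each empty cluster is re-seeded with one of the samples currently farthest
# from its assigned center, so no cluster is left without members.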
137 | 138 | if n_empty_clusters > 0: 139 | # find points to reassign empty clusters to 140 | far_from_centers = distances.argsort()[::-1][:n_empty_clusters] 141 | 142 | # XXX two relocated clusters could be close to each other 143 | assign_rows_csr(X, far_from_centers, empty_clusters, centers) 144 | 145 | for i in range(n_empty_clusters): 146 | n_samples_in_cluster[empty_clusters[i]] = 1 147 | 148 | for i in range(labels.shape[0]): 149 | curr_label = labels[i] 150 | for ind in range(indptr[i], indptr[i + 1]): 151 | j = indices[ind] 152 | centers[curr_label, j] += data[ind] 153 | 154 | centers /= n_samples_in_cluster[:, np.newaxis] 155 | 156 | return centers 157 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/funcsigs.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import types 3 | from collections import OrderedDict 4 | 5 | from sklearn_import.externals.funcsigs import _NonUserDefinedCallables, _get_user_defined_method, \ 6 | _POSITIONAL_ONLY, _VAR_POSITIONAL, _VAR_KEYWORD, Signature 7 | 8 | 9 | def signature(obj): 10 | '''Get a signature object for the passed callable.''' 11 | 12 | if not callable(obj): 13 | raise TypeError('{0!r} is not a callable object'.format(obj)) 14 | 15 | if isinstance(obj, types.MethodType): 16 | sig = signature(obj.__func__) 17 | if obj.__self__ is None: 18 | # Unbound method: the first parameter becomes positional-only 19 | if sig.parameters: 20 | first = sig.parameters.values()[0].replace( 21 | kind=_POSITIONAL_ONLY) 22 | return sig.replace( 23 | parameters=(first,) + tuple(sig.parameters.values())[1:]) 24 | else: 25 | return sig 26 | else: 27 | # In this case we skip the first parameter of the underlying 28 | # function (usually `self` or `cls`). 29 | return sig.replace(parameters=tuple(sig.parameters.values())[1:]) 30 | 31 | try: 32 | sig = obj.__signature__ 33 | except AttributeError: 34 | pass 35 | else: 36 | if sig is not None: 37 | return sig 38 | 39 | try: 40 | # Was this function wrapped by a decorator? 41 | wrapped = obj.__wrapped__ 42 | except AttributeError: 43 | pass 44 | else: 45 | return signature(wrapped) 46 | 47 | if isinstance(obj, types.FunctionType): 48 | return Signature.from_function(obj) 49 | 50 | if isinstance(obj, functools.partial): 51 | sig = signature(obj.func) 52 | 53 | new_params = OrderedDict(sig.parameters.items()) 54 | 55 | partial_args = obj.args or () 56 | partial_keywords = obj.keywords or {} 57 | try: 58 | ba = sig.bind_partial(*partial_args, **partial_keywords) 59 | except TypeError as ex: 60 | msg = 'partial object {0!r} has incorrect arguments'.format(obj) 61 | raise ValueError(msg) 62 | 63 | for arg_name, arg_value in ba.arguments.items(): 64 | param = new_params[arg_name] 65 | if arg_name in partial_keywords: 66 | # We set a new default value, because the following code 67 | # is correct: 68 | # 69 | # >>> def foo(a): print(a) 70 | # >>> print(partial(partial(foo, a=10), a=20)()) 71 | # 20 72 | # >>> print(partial(partial(foo, a=10), a=20)(a=30)) 73 | # 30 74 | # 75 | # So, with 'partial' objects, passing a keyword argument is 76 | # like setting a new default value for the corresponding 77 | # parameter 78 | # 79 | # We also mark this parameter with '_partial_kwarg' 80 | # flag. Later, in '_bind', the 'default' value of this 81 | # parameter will be added to 'kwargs', to simulate 82 | # the 'functools.partial' real call. 
83 | new_params[arg_name] = param.replace(default=arg_value, 84 | _partial_kwarg=True) 85 | 86 | elif (param.kind not in (_VAR_KEYWORD, _VAR_POSITIONAL) and 87 | not param._partial_kwarg): 88 | new_params.pop(arg_name) 89 | 90 | return sig.replace(parameters=new_params.values()) 91 | 92 | sig = None 93 | if isinstance(obj, type): 94 | # obj is a class or a metaclass 95 | 96 | # First, let's see if it has an overloaded __call__ defined 97 | # in its metaclass 98 | call = _get_user_defined_method(type(obj), '__call__') 99 | if call is not None: 100 | sig = signature(call) 101 | else: 102 | # Now we check if the 'obj' class has a '__new__' method 103 | new = _get_user_defined_method(obj, '__new__') 104 | if new is not None: 105 | sig = signature(new) 106 | else: 107 | # Finally, we should have at least __init__ implemented 108 | init = _get_user_defined_method(obj, '__init__') 109 | if init is not None: 110 | sig = signature(init) 111 | elif not isinstance(obj, _NonUserDefinedCallables): 112 | # An object with __call__ 113 | # We also check that the 'obj' is not an instance of 114 | # _WrapperDescriptor or _MethodWrapper to avoid 115 | # infinite recursion (and even potential segfault) 116 | call = _get_user_defined_method(type(obj), '__call__', 'im_func') 117 | if call is not None: 118 | sig = signature(call) 119 | 120 | if sig is not None: 121 | # For classes and objects we skip the first parameter of their 122 | # __call__, __new__, or __init__ methods 123 | return sig.replace(parameters=tuple(sig.parameters.values())[1:]) 124 | 125 | if isinstance(obj, types.BuiltinFunctionType): 126 | # Raise a nicer error message for builtins 127 | msg = 'no signature found for builtin function {0!r}'.format(obj) 128 | raise ValueError(msg) 129 | 130 | raise ValueError('callable {0!r} is not supported by signature'.format(obj)) 131 | -------------------------------------------------------------------------------- /size_constrained_clustering/base.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: base.py, base for clustering algorithm 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 06/07/2020 8 | ''' 9 | from scipy.spatial.distance import cdist 10 | import numpy as np 11 | import warnings 12 | import scipy.sparse as sp 13 | 14 | import os 15 | import sys 16 | path = os.path.dirname(os.path.abspath(__file__)) 17 | sys.path.append(path) 18 | from sklearn_import.utils.extmath import stable_cumsum 19 | 20 | class Base(object): 21 | 22 | def __init__(self, n_clusters, max_iters, distance_func=cdist): 23 | ''' 24 | Base Cluster object 25 | 26 | Args: 27 | n_clusters (int): number of clusters 28 | max_iters (int): maximum iterations 29 | distance_func (callable function): distance function callback 30 | ''' 31 | assert isinstance(n_clusters, int) 32 | assert n_clusters >= 1 33 | assert isinstance(max_iters, int) 34 | assert max_iters >= 1 35 | self.n_clusters = n_clusters 36 | self.max_iters = max_iters 37 | if distance_func is not None and not callable(distance_func): 38 | raise Exception("Distance function is not callable") 39 | self.distance_func = distance_func 40 | 41 | def fit(self, X): 42 | pass 43 | 44 | def predict(self, X): 45 | pass 46 | 47 | def k_init(X, n_clusters, x_squared_norms, random_state=42, distance_func=cdist, n_local_trials=None): 48 | """Init n_clusters seeds according to k-means++ 49 | 50 | Parameters 51 | ---------- 52 | X : array or sparse matrix, shape (n_samples, 
n_features) 53 | The data to pick seeds for. To avoid memory copy, the input data 54 | should be double precision (dtype=np.float64). 55 | 56 | n_clusters : integer 57 | The number of seeds to choose 58 | 59 | x_squared_norms : array, shape (n_samples,) 60 | Squared Euclidean norm of each data point. 61 | 62 | random_state : int, RandomState instance 63 | The generator used to initialize the centers. Use an int to make the 64 | randomness deterministic. 65 | See :term:`Glossary `. 66 | 67 | n_local_trials : integer, optional 68 | The number of seeding trials for each center (except the first), 69 | of which the one reducing inertia the most is greedily chosen. 70 | Set to None to make the number of trials depend logarithmically 71 | on the number of seeds (2+log(k)); this is the default. 72 | 73 | Notes 74 | ----- 75 | Selects initial cluster centers for k-mean clustering in a smart way 76 | to speed up convergence. see: Arthur, D. and Vassilvitskii, S. 77 | "k-means++: the advantages of careful seeding". ACM-SIAM symposium 78 | on Discrete algorithms. 2007 79 | 80 | Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip, 81 | which is the implementation used in the aforementioned paper. 82 | """ 83 | n_samples, n_features = X.shape 84 | 85 | centers = np.empty((n_clusters, n_features), dtype=X.dtype) 86 | 87 | assert x_squared_norms is not None, 'x_squared_norms None in _k_init' 88 | 89 | # Set the number of local seeding trials if none is given 90 | if n_local_trials is None: 91 | # This is what Arthur/Vassilvitskii tried, but did not report 92 | # specific results for other than mentioning in the conclusion 93 | # that it helped. 94 | n_local_trials = 2 + int(np.log(n_clusters)) 95 | 96 | # Pick first center randomly 97 | center_id = random_state.randint(n_samples) 98 | if sp.issparse(X): 99 | centers[0] = X[center_id].toarray() 100 | else: 101 | centers[0] = X[center_id] 102 | 103 | # Initialize list of closest distances and calculate current potential 104 | closest_dist_sq = distance_func( 105 | centers[0, np.newaxis], X) 106 | current_pot = closest_dist_sq.sum() 107 | 108 | # Pick the remaining n_clusters-1 points 109 | for c in range(1, n_clusters): 110 | # Choose center candidates by sampling with probability proportional 111 | # to the squared distance to the closest existing center 112 | rand_vals = random_state.random_sample(n_local_trials) * current_pot 113 | candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), 114 | rand_vals) 115 | # XXX: numerical imprecision can result in a candidate_id out of range 116 | np.clip(candidate_ids, None, closest_dist_sq.size - 1, 117 | out=candidate_ids) 118 | 119 | # Compute distances to center candidates 120 | # distance_to_candidates = euclidean_distances( 121 | # X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True) 122 | distance_to_candidates = distance_func(X[candidate_ids], X) 123 | 124 | # update closest distances squared and potential for each candidate 125 | np.minimum(closest_dist_sq, distance_to_candidates, 126 | out=distance_to_candidates) 127 | candidates_pot = distance_to_candidates.sum(axis=1) 128 | 129 | # Decide which candidate is the best 130 | best_candidate = np.argmin(candidates_pot) 131 | current_pot = candidates_pot[best_candidate] 132 | closest_dist_sq = distance_to_candidates[best_candidate] 133 | best_candidate = candidate_ids[best_candidate] 134 | 135 | # Permanently add best center candidate found in local tries 136 | if sp.issparse(X): 137 | centers[c] = 
X[best_candidate].toarray() 138 | else: 139 | centers[c] = X[best_candidate] 140 | 141 | return centers 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Size Constrained Clustering Solver 2 | [![Build Status](https://travis-ci.org/jingw2/size_constrained_clustering.svg?branch=master)](https://travis-ci.org/jingw2/size_constrained_clustering) 3 | [![PyPI version](https://badge.fury.io/py/size-constrained-clustering.svg)](https://badge.fury.io/py/size-constrained-clustering) 4 | ![GitHub](https://img.shields.io/github/license/jingw2/size_constrained_clustering) 5 | [![codecov](https://codecov.io/gh/jingw2/size_constrained_clustering/branch/master/graph/badge.svg)](https://codecov.io/gh/jingw2/size_constrained_clustering) 6 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/size-constrained-clustering) 7 | ![Codecov](https://img.shields.io/codecov/c/github/jingw2/size_constrained_clustering) 8 | 9 | 10 | Implementation of Size Constrained Clustering. 11 | Size constrained clustering can be treated as an optimization problem; details can be found in the reference papers listed below. 12 | 13 | ### Installation 14 | Requirements: Python >= 3.6, Numpy >= 1.13, Cython >= 0.29 15 | * install from PyPI 16 | ```shell 17 | pip install size-constrained-clustering 18 | ``` 19 | 20 | ### Methods 21 | * Fuzzy C-means Algorithm: similar to KMeans, but uses membership probabilities rather than hard 0/1 assignments 22 | * Same Size Constrained KMeans Heuristics: uses heuristic methods to reach equal-size clusters 23 | * Same Size Constrained KMeans Inspired by Minimum Cost Flow Problem 24 | * Minimum and Maximum Size Constrained KMeans Inspired by Minimum Cost Flow Problem 25 | * Deterministic Annealing Algorithm: takes a target cluster size distribution as input and returns the corresponding clusters 26 | * Shrinkage Clustering: base algorithm and minimum size constraints 27 | 28 | ### Usage: 29 | ```python 30 | # setup 31 | from size_constrained_clustering import fcm, equal, minmax, shrinkage, da 32 | # by default it is euclidean distance, but other distance functions can be selected 33 | from sklearn.metrics.pairwise import haversine_distances 34 | import numpy as np 35 | ``` 36 | 37 | Fuzzy C-means (the example below also uses `make_blobs` from `sklearn.datasets`) 38 | ```python 39 | n_samples = 2000 40 | n_clusters = 4 41 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 42 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, 43 | centers=centers, shuffle=False, random_state=42) 44 | model = fcm.FCM(n_clusters) 45 | # use another distance function, e.g. haversine distance 46 | # model = fcm.FCM(n_clusters, distance_func=haversine_distances) 47 | model.fit(X) 48 | centers = model.cluster_centers_ 49 | labels = model.labels_ 50 | ``` 51 | ![alt text](https://github.com/jingw2/size_constrained_clustering/blob/master/pic/fcm.png) 52 | 53 | 54 | Equal Size Constraint 55 | ```python 56 | n_samples = 2000 57 | n_clusters = 3 58 | X = np.random.rand(n_samples, 2) 59 | # use the minimum cost flow framework to solve 60 | model = equal.SameSizeKMeansMinCostFlow(n_clusters) 61 | # or use the heuristics method to solve: 62 | # model = equal.SameSizeKMeansHeuristics(n_clusters) 63 | model.fit(X) 64 | centers = model.cluster_centers_ 65 | labels = model.labels_ 66 | ``` 67 | ![alt text](https://github.com/jingw2/size_constrained_clustering/blob/master/pic/equal.png) 68 | 69 | Cluster sizes: 667, 667 and 666 in the figure above.
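Once fitted, the model can also assign previously unseen points to the learned clusters via `predict`. A minimal sketch, reusing `model`, `labels` and `np` from the snippet above (the `new_points` array is just illustrative random data):
```python
import collections

# assign previously unseen points to the clusters learned above
new_points = np.random.rand(10, 2)
new_labels = model.predict(new_points)

# inspect the (near-)equal cluster sizes produced on the training data
print(collections.Counter(labels))
```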
70 | 71 | Minimum and Maximum Size Constraint 72 | ```python 73 | n_samples = 2000 74 | n_clusters = 3 75 | X = np.random.rand(n_samples, 2) 76 | model = minmax.MinMaxKMeansMinCostFlow(n_clusters, size_min=400, size_max=800) 77 | model.fit(X) 78 | centers = model.cluster_centers_ 79 | labels = model.labels_ 80 | ``` 81 | ![alt text](https://github.com/jingw2/size_constrained_clustering/blob/master/pic/minmax.png) 82 | 83 | Cluster sizes: 753, 645 and 602 in the figure above. 84 | 85 | Deterministic Annealing 86 | ```python 87 | n_samples = 2000 88 | n_clusters = 3 89 | X = np.random.rand(n_samples, 2) 90 | # distribution is the target fraction of samples assigned to each cluster 91 | model = da.DeterministicAnnealing(n_clusters, distribution=[0.1, 0.6, 0.3]) 92 | model.fit(X) 93 | centers = model.cluster_centers_ 94 | labels = model.labels_ 95 | ``` 96 | ![alt text](https://github.com/jingw2/size_constrained_clustering/blob/master/pic/da.png) 97 | 98 | Cluster sizes: 1200, 600 and 200 in the figure above, corresponding to the distribution ratios 0.6, 0.3 and 0.1. 99 | 100 | Shrinkage Clustering 101 | 102 | The result might not be available. 103 | ```python 104 | n_samples = 1000 105 | n_clusters = 4 106 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 107 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, centers=centers, shuffle=False, random_state=42) 108 | 109 | model = shrinkage.Shrinkage(n_clusters, size_min=100) 110 | model.fit(X) 111 | centers = model.cluster_centers_ 112 | labels = model.labels_ 113 | ``` 114 | ![alt text](https://github.com/jingw2/size_constrained_clustering/blob/master/pic/shrinkage.png) 115 | 116 | 117 | ## Copyright 118 | Copyright (c) 2020 Jing Wang. Released under the MIT License. 119 | 120 | Third-party copyright in this distribution is noted where applicable.
121 | 122 | ### Reference 123 | * [Clustering with Capacity and Size Constraints: A Deterministic 124 | Approach](http://web.eecs.umich.edu/~mayankb/docs/ClusterCap.pdf) 125 | * [Deterministic Annealing, Clustering and Optimization](https://thesis.library.caltech.edu/2858/1/Rose_k_1991.pdf) 126 | * [Deterministic Annealing, Constrained Clustering, and Optimization](https://authors.library.caltech.edu/78353/1/00170767.pdf) 127 | * [Shrinkage Clustering](https://www.researchgate.net/publication/322668506_Shrinkage_Clustering_A_fast_and_size-constrained_clustering_algorithm_for_biomedical_applications) 128 | * [Clustering with size constraints](https://www.researchgate.net/publication/268292668_Clustering_with_Size_Constraints) 129 | * [Data Clustering with Cluster Size Constraints Using a Modified k-means Algorithm](https://core.ac.uk/download/pdf/61217069.pdf) 130 | * [KMeans Constrained Clustering Inspired by Minimum Cost Flow Problem](https://github.com/joshlk/k-means-constrained) 131 | * [Same Size Kmeans Heuristics Methods](https://elki-project.github.io/tutorial/same-size_k_means) 132 | * [Google's Operations Research tools' 133 | `SimpleMinCostFlow`](https://developers.google.com/optimization/flow/mincostflow) 134 | * [Cluster KMeans Constrained](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-2000-65.pdf) 135 | -------------------------------------------------------------------------------- /size_constrained_clustering/da.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.6 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: da.py, deterministic annealing algorithm 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 11/28/2019 8 | @Paper reference: Clustering with Capacity and Size Constraints: A Deterministic Approach 9 | ''' 10 | 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from copy import deepcopy 14 | import collections 15 | import random 16 | from scipy.spatial.distance import cdist 17 | 18 | import os 19 | import sys 20 | path = os.path.dirname(os.path.abspath(__file__)) 21 | sys.path.append(path) 22 | import base 23 | 24 | class DeterministicAnnealing(base.Base): 25 | 26 | def __init__(self, n_clusters, distribution, max_iters=1000, 27 | distance_func=cdist, random_state=42, T=None): 28 | ''' 29 | Args: 30 | n_clusters (int): number of clusters 31 | distribution (list): a list of ratio distribution for each cluster 32 | T (list): inverse choice of beta coefficients 33 | ''' 34 | super(DeterministicAnnealing, self).__init__(n_clusters, max_iters, distance_func) 35 | self.lamb = distribution 36 | assert np.sum(distribution) == 1 37 | assert len(distribution) == n_clusters 38 | assert isinstance(T, list) or T is None 39 | 40 | self.beta = None 41 | self.T = T 42 | self.cluster_centers_ = None 43 | self.labels_ = None 44 | self._eta = None 45 | self._demands_prob = None 46 | random.seed(random_state) 47 | np.random.seed(random_state) 48 | 49 | def fit(self, X, demands_prob=None): 50 | # setting T, loop 51 | T = [1, 0.1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8] 52 | solutions = [] 53 | diff_list = [] 54 | is_early_terminated = False 55 | 56 | n_samples, n_features = X.shape 57 | self.capacity = [n_samples * d for d in self.lamb] 58 | if demands_prob is None: 59 | demands_prob = np.ones((n_samples, 1)) 60 | else: 61 | demands_prob = np.asarray(demands_prob).reshape((-1, 1)) 62 | assert demands_prob.shape[0] == X.shape[0] 63 | demands_prob = demands_prob / sum(demands_prob) 64 | for
t in T: 65 | self.T = t 66 | centers = self.initial_centers(X) 67 | 68 | eta = self.lamb 69 | labels = None 70 | for _ in range(self.max_iters): 71 | self.beta = 1. / self.T 72 | distance_matrix = self.distance_func(X, centers) 73 | eta = self.update_eta(eta, demands_prob, distance_matrix) 74 | gibbs = self.update_gibbs(eta, distance_matrix) 75 | centers = self.update_centers(demands_prob, gibbs, X) 76 | self.T *= 0.999 77 | 78 | labels = np.argmax(gibbs, axis=1) 79 | 80 | if self._is_satisfied(labels): break 81 | 82 | solutions.append([labels, centers]) 83 | resultant_clusters = len(collections.Counter(labels)) 84 | 85 | diff_list.append(abs(resultant_clusters - self.n_clusters)) 86 | if resultant_clusters == self.n_clusters: 87 | is_early_terminated = True 88 | break 89 | 90 | # modification for non-strictly satisfaction, only works for one demand per location 91 | # labels = self.modify(labels, centers, distance_matrix) 92 | if not is_early_terminated: 93 | best_index = np.argmin(diff_list) 94 | labels, centers = solutions[best_index] 95 | 96 | self.cluster_centers_ = centers 97 | self.labels_ = labels 98 | self._eta = eta 99 | self._demands_prob = demands_prob 100 | 101 | def predict(self, X): 102 | distance_matrix = self.distance_func(X, self.cluster_centers_) 103 | eta = self.update_eta(self._eta, self._demands_prob, distance_matrix) 104 | gibbs = self.update_gibbs(eta, distance_matrix) 105 | labels = np.argmax(gibbs, axis=1) 106 | return labels 107 | 108 | def modify(self, labels, centers, distance_matrix): 109 | centers_distance = self.distance_func(centers, centers) 110 | adjacent_centers = {i: np.argsort(centers_distance, axis=1)[i, 1:3].tolist() for i in range(self.n_clusters)} 111 | while not self._is_satisfied(labels): 112 | count = collections.Counter(labels) 113 | cluster_id_list = list(count.keys()) 114 | random.shuffle(cluster_id_list) 115 | for cluster_id in cluster_id_list: 116 | num_points = count[cluster_id] 117 | diff = num_points - self.capacity[cluster_id] 118 | if diff <= 0: 119 | continue 120 | adjacent_cluster = None 121 | adjacent_cluster = random.choice(adjacent_centers[cluster_id]) 122 | if adjacent_cluster is None: 123 | continue 124 | cluster_point_id = np.where(labels==cluster_id)[0].tolist() 125 | diff_distance = distance_matrix[cluster_point_id, adjacent_cluster] \ 126 | - distance_matrix[cluster_point_id, cluster_id] 127 | remove_point_id = np.asarray(cluster_point_id)[np.argsort(diff_distance)[:diff]] 128 | labels[remove_point_id] = adjacent_cluster 129 | 130 | return labels 131 | 132 | def initial_centers(self, X): 133 | selective_centers = random.sample(range(X.shape[0]), self.n_clusters) 134 | centers = X[selective_centers] 135 | return centers 136 | 137 | def _is_satisfied(self, labels): 138 | count = collections.Counter(labels) 139 | for cluster_id in range(len(self.capacity)): 140 | if cluster_id not in count: 141 | return False 142 | num_points = count[cluster_id] 143 | if num_points > self.capacity[cluster_id]: 144 | return False 145 | return True 146 | 147 | def update_eta(self, eta, demands_prob, distance_matrix): 148 | n_points, n_centers = distance_matrix.shape 149 | eta_repmat = np.tile(np.asarray(eta).reshape(1, -1), (n_points, 1)) 150 | exp_term = np.exp(- self.beta * distance_matrix) 151 | divider = exp_term / np.sum(np.multiply(exp_term, 152 | eta_repmat), axis=1).reshape((-1, 1)) 153 | eta = np.divide(np.asarray(self.lamb), 154 | np.sum(divider * demands_prob, axis=0)) 155 | 156 | return eta 157 | 158 | def update_gibbs(self, eta, 
distance_matrix): 159 | n_points, n_centers = distance_matrix.shape 160 | eta_repmat = np.tile(np.asarray(eta).reshape(1, -1), (n_points, 1)) 161 | exp_term = np.exp(- self.beta * distance_matrix) 162 | factor = np.multiply(exp_term, eta_repmat) 163 | gibbs = factor / np.sum(factor, axis=1).reshape((-1, 1)) 164 | return gibbs 165 | 166 | def update_centers(self, demands_prob, gibbs, X): 167 | n_points, n_features = X.shape 168 | divide_up = gibbs.T.dot(X * demands_prob)# n_cluster, n_features 169 | p_y = np.sum(gibbs * demands_prob, axis=0) # n_cluster, 170 | p_y_repmat = np.tile(p_y.reshape(-1, 1), (1, n_features)) 171 | centers = np.divide(divide_up, p_y_repmat) 172 | return centers 173 | 174 | if __name__ == "__main__": 175 | X = [] 176 | n_points = 1000 177 | random_state = 42 178 | random.seed(random_state) 179 | np.random.seed(random_state) 180 | # demands = np.random.randint(1, 24, (n_points, 1)) 181 | X = np.random.rand(n_points, 2) 182 | demands = np.ones((n_points, 1)) 183 | n_clusters = 4 184 | n_iters = 100 185 | max_size = [n_points / n_clusters] * n_clusters 186 | max_size = [0.25, 0.5, 0.1, 0.15] 187 | 188 | da = DeterministicAnnealing(n_clusters, max_size, n_iters) 189 | da.fit(X, demands) 190 | labels = da.labels_ 191 | centers = da.cluster_centers_ 192 | print(centers) 193 | labels_demand_cnt = {} 194 | for i, label in enumerate(labels): 195 | labels_demand_cnt[label] = labels_demand_cnt.get(label, 0) + demands[i][0] 196 | 197 | sorted_labels = sorted(labels_demand_cnt.items()) 198 | x = list(range(n_clusters)) 199 | y = [j for i, j in sorted_labels] 200 | plt.scatter(X[:, 0], X[:, 1], c=labels) 201 | print(collections.Counter(labels_demand_cnt)) 202 | # plt.show() 203 | plt.xlabel("X") 204 | plt.ylabel("Y") 205 | # plt.bar(x, y) 206 | plt.show() 207 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/base.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from collections import defaultdict 3 | 4 | import numpy as np 5 | import six 6 | 7 | from sklearn_import import __version__ 8 | from sklearn_import.funcsigs import signature 9 | 10 | 11 | class BaseEstimator(object): 12 | """Base class for all estimators in scikit-learn 13 | 14 | Notes 15 | ----- 16 | All estimators should specify all the parameters that can be set 17 | at the class level in their ``__init__`` as explicit keyword 18 | arguments (no ``*args`` or ``**kwargs``). 19 | """ 20 | 21 | @classmethod 22 | def _get_param_names(cls): 23 | """Get parameter names for the estimator""" 24 | # fetch the constructor or the original constructor before 25 | # deprecation wrapping if any 26 | init = getattr(cls.__init__, 'deprecated_original', cls.__init__) 27 | if init is object.__init__: 28 | # No explicit constructor to introspect 29 | return [] 30 | 31 | # introspect the constructor arguments to find the model parameters 32 | # to represent 33 | init_signature = signature(init) 34 | # Consider the constructor parameters excluding 'self' 35 | parameters = [p for p in init_signature.parameters.values() 36 | if p.name != 'self' and p.kind != p.VAR_KEYWORD] 37 | for p in parameters: 38 | if p.kind == p.VAR_POSITIONAL: 39 | raise RuntimeError("scikit-learn estimators should always " 40 | "specify their parameters in the signature" 41 | " of their __init__ (no varargs)." 42 | " %s with constructor %s doesn't " 43 | " follow this convention." 
44 | % (cls, init_signature)) 45 | # Extract and sort argument names excluding 'self' 46 | return sorted([p.name for p in parameters]) 47 | 48 | def get_params(self, deep=True): 49 | """Get parameters for this estimator. 50 | 51 | Parameters 52 | ---------- 53 | deep : boolean, optional 54 | If True, will return the parameters for this estimator and 55 | contained subobjects that are estimators. 56 | 57 | Returns 58 | ------- 59 | params : mapping of string to any 60 | Parameter names mapped to their values. 61 | """ 62 | out = dict() 63 | for key in self._get_param_names(): 64 | # We need deprecation warnings to always be on in order to 65 | # catch deprecated param values. 66 | # This is set in utils/__init__.py but it gets overwritten 67 | # when running under python3 somehow. 68 | warnings.simplefilter("always", DeprecationWarning) 69 | try: 70 | with warnings.catch_warnings(record=True) as w: 71 | value = getattr(self, key, None) 72 | if len(w) and w[0].category == DeprecationWarning: 73 | # if the parameter is deprecated, don't show it 74 | continue 75 | finally: 76 | warnings.filters.pop(0) 77 | 78 | # XXX: should we rather test if instance of estimator? 79 | if deep and hasattr(value, 'get_params'): 80 | deep_items = value.get_params().items() 81 | out.update((key + '__' + k, val) for k, val in deep_items) 82 | out[key] = value 83 | return out 84 | 85 | def set_params(self, **params): 86 | """Set the parameters of this estimator. 87 | 88 | The method works on simple estimators as well as on nested objects 89 | (such as pipelines). The latter have parameters of the form 90 | ``__`` so that it's possible to update each 91 | component of a nested object. 92 | 93 | Returns 94 | ------- 95 | self 96 | """ 97 | if not params: 98 | # Simple optimization to gain speed (inspect is slow) 99 | return self 100 | valid_params = self.get_params(deep=True) 101 | 102 | nested_params = defaultdict(dict) # grouped by prefix 103 | for key, value in params.items(): 104 | key, delim, sub_key = key.partition('__') 105 | if key not in valid_params: 106 | raise ValueError('Invalid parameter %s for estimator %s. ' 107 | 'Check the list of available parameters ' 108 | 'with `estimator.get_params().keys()`.' % 109 | (key, self)) 110 | 111 | if delim: 112 | nested_params[key][sub_key] = value 113 | else: 114 | setattr(self, key, value) 115 | 116 | for key, sub_params in nested_params.items(): 117 | valid_params[key].set_params(**sub_params) 118 | 119 | return self 120 | 121 | def __repr__(self): 122 | class_name = self.__class__.__name__ 123 | return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False), 124 | offset=len(class_name),),) 125 | 126 | def __getstate__(self): 127 | try: 128 | state = super(BaseEstimator, self).__getstate__() 129 | except AttributeError: 130 | state = self.__dict__.copy() 131 | 132 | if type(self).__module__.startswith('sklearn.'): 133 | return dict(state.items(), _sklearn_version=__version__) 134 | else: 135 | return state 136 | 137 | def __setstate__(self, state): 138 | if type(self).__module__.startswith('sklearn.'): 139 | pickle_version = state.pop("_sklearn_version", "pre-0.18") 140 | if pickle_version != __version__: 141 | warnings.warn( 142 | "Trying to unpickle estimator {0} from version {1} when " 143 | "using version {2}. This might lead to breaking code or " 144 | "invalid results. 
Use at your own risk.".format( 145 | self.__class__.__name__, pickle_version, __version__), 146 | UserWarning) 147 | try: 148 | super(BaseEstimator, self).__setstate__(state) 149 | except AttributeError: 150 | self.__dict__.update(state) 151 | 152 | 153 | class ClusterMixin(object): 154 | """Mixin class for all cluster estimators in scikit-learn.""" 155 | _estimator_type = "clusterer" 156 | 157 | def fit_predict(self, X, y=None): 158 | """Performs clustering on X and returns cluster labels. 159 | 160 | Parameters 161 | ---------- 162 | X : ndarray, shape (n_samples, n_features) 163 | Input data. 164 | 165 | Returns 166 | ------- 167 | y : ndarray, shape (n_samples,) 168 | cluster labels 169 | """ 170 | # non-optimized default implementation; override when a better 171 | # method is possible for a given clustering algorithm 172 | self.fit(X) 173 | return self.labels_ 174 | 175 | 176 | class TransformerMixin(object): 177 | """Mixin class for all transformers in scikit-learn.""" 178 | 179 | def fit_transform(self, X, y=None, **fit_params): 180 | """Fit to data, then transform it. 181 | 182 | Fits transformer to X and y with optional parameters fit_params 183 | and returns a transformed version of X. 184 | 185 | Parameters 186 | ---------- 187 | X : numpy array of shape [n_samples, n_features] 188 | Training set. 189 | 190 | y : numpy array of shape [n_samples] 191 | Target values. 192 | 193 | Returns 194 | ------- 195 | X_new : numpy array of shape [n_samples, n_features_new] 196 | Transformed array. 197 | 198 | """ 199 | # non-optimized default implementation; override when a better 200 | # method is possible for a given clustering algorithm 201 | if y is None: 202 | # fit method of arity 1 (unsupervised transformation) 203 | return self.fit(X, **fit_params).transform(X) 204 | else: 205 | # fit method of arity 2 (supervised transformation) 206 | return self.fit(X, y, **fit_params).transform(X) 207 | 208 | 209 | def _pprint(params, offset=0, printer=repr): 210 | """Pretty print the dictionary 'params' 211 | 212 | Parameters 213 | ---------- 214 | params : dict 215 | The dictionary to pretty print 216 | 217 | offset : int 218 | The offset in characters to add at the begin of each line. 219 | 220 | printer : callable 221 | The function to convert entries to strings, typically 222 | the builtin str or repr 223 | 224 | """ 225 | # Do a multi-line justified repr: 226 | options = np.get_printoptions() 227 | np.set_printoptions(precision=5, threshold=64, edgeitems=2) 228 | params_list = list() 229 | this_line_length = offset 230 | line_sep = ',\n' + (1 + offset // 2) * ' ' 231 | for i, (k, v) in enumerate(sorted(six.iteritems(params))): 232 | if type(v) is float: 233 | # use str for representing floating point numbers 234 | # this way we get consistent representation across 235 | # architectures and versions. 236 | this_repr = '%s=%s' % (k, str(v)) 237 | else: 238 | # use repr of the rest 239 | this_repr = '%s=%s' % (k, printer(v)) 240 | if len(this_repr) > 500: 241 | this_repr = this_repr[:300] + '...' 
+ this_repr[-100:] 242 | if i > 0: 243 | if (this_line_length + len(this_repr) >= 75 or '\n' in this_repr): 244 | params_list.append(line_sep) 245 | this_line_length = len(line_sep) 246 | else: 247 | params_list.append(', ') 248 | this_line_length += 2 249 | params_list.append(this_repr) 250 | this_line_length += len(this_repr) 251 | 252 | np.set_printoptions(**options) 253 | lines = ''.join(params_list) 254 | # Strip trailing space to avoid nightmare in doctests 255 | lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n')) 256 | return lines 257 | -------------------------------------------------------------------------------- /size_constrained_clustering/equal.py: -------------------------------------------------------------------------------- 1 | #!usr/bin/python 3.7 2 | #-*-coding:utf-8-*- 3 | 4 | ''' 5 | @file: same_size_kmeans.py, equal size clustering with heuristics 6 | @Author: Jing Wang (jingw2@foxmail.com) 7 | @Date: 06/16/2020 8 | @paper: 9 | @github reference: https://github.com/joshlk/k-means-constrained 10 | @Web: https://elki-project.github.io/tutorial/same-size_k_means 11 | ''' 12 | 13 | from scipy.spatial.distance import cdist 14 | import numpy as np 15 | # from sklearn.cluster._k_means import _k_init 16 | from sklearn.preprocessing import OneHotEncoder 17 | import collections 18 | import warnings 19 | 20 | import sys 21 | import os 22 | path = os.path.dirname(os.path.abspath(__file__)) 23 | sys.path.append(path) 24 | import base 25 | from k_means_constrained import KMeansConstrained 26 | 27 | class SameSizeKMeansHeuristics(base.Base): 28 | 29 | def __init__(self, n_clusters, max_iters=1000, distance_func=cdist, random_state=42): 30 | ''' 31 | Args: 32 | n_clusters (int): number of clusters 33 | max_iters (int): maximum iterations 34 | distance_func (object): callable function with input (X, centers) / None, by default is l2-distance 35 | random_state (int): random state to initiate, by default it is 42 36 | ''' 37 | super(SameSizeKMeansHeuristics, self).__init__(n_clusters, max_iters, distance_func) 38 | self.random_state = np.random.RandomState(random_state) 39 | 40 | def fit(self, X): 41 | ''' 42 | Args: 43 | X (array like): shape (n_samples, n_features) 44 | ''' 45 | n_samples, _ = X.shape 46 | minsize = n_samples // self.n_clusters 47 | maxsize = (n_samples + self.n_clusters - 1) // self.n_clusters 48 | if minsize != maxsize: 49 | warnings.warn("Cluster minimum and maximum size are {} and {}, respectively".format(minsize, maxsize)) 50 | 51 | # initiate 52 | labels = self._init(X) 53 | encoder = OneHotEncoder() 54 | labels_onehot = encoder.fit_transform(labels.reshape((-1, 1))).toarray() 55 | itr = 0 56 | clusters = collections.Counter(labels) 57 | while True: 58 | # update centers 59 | labels_onehot = encoder.fit_transform(labels.reshape((-1, 1))).toarray() 60 | centers = self.update_centers(X, labels_onehot) 61 | # compute distance to centers 62 | dist_mat = self.distance_func(X, centers) 63 | # calculate preference 64 | labels = labels.astype(int) 65 | preference = dist_mat[range(n_samples), labels] - np.min(dist_mat, axis=1) 66 | argsort = np.argsort(preference)[::-1] # descending order 67 | # transfer list 68 | transfer = {c: [] for c in range(self.n_clusters)} 69 | 70 | for sample_id in argsort: 71 | source = labels[sample_id] 72 | dest = np.argmin(dist_mat[sample_id]) 73 | 74 | # cannot transfer to same cluster 75 | if source == dest: 76 | continue 77 | 78 | sample_gain = dist_mat[sample_id][source] - dist_mat[sample_id][dest] 79 | 80 | # find if there is 
pair transfer 81 | dest_transfer = transfer[dest] 82 | gains = {} 83 | for other_id in dest_transfer: 84 | other_gain = dist_mat[other_id][dest] - dist_mat[other_id][source] 85 | gain = sample_gain + other_gain 86 | if gain > 0: 87 | gains[other_id] = gain 88 | if len(gains) > 0: 89 | other_id = sorted(gains.items(), key=lambda x: x[1], reverse=True)[0][0] 90 | labels[other_id], labels[sample_id] \ 91 | = labels[sample_id], labels[other_id] 92 | transfer[dest].remove(other_id) 93 | if sample_id in transfer[source]: 94 | transfer[source].remove(sample_id) 95 | continue 96 | 97 | # if cluster size allows, move a single object 98 | if (sample_gain > 0 and clusters[dest] < maxsize and clusters[source] > minsize): 99 | labels[sample_id] = dest 100 | clusters[dest] += 1 101 | clusters[source] -= 1 102 | if sample_id in transfer[source]: 103 | transfer[source].remove(sample_id) 104 | continue 105 | 106 | # if the object would prefer a different cluster, put in transfer list 107 | if (sample_gain > 0): 108 | transfer[source].append(sample_id) 109 | 110 | if len(transfer) <= 0: 111 | break 112 | 113 | itr += 1 114 | pending = sum([len(val) for key, val in transfer.items()]) 115 | if itr >= self.max_iters: 116 | print("Reach maximum iterations! Now pending transfer samples {}!".format(pending)) 117 | break 118 | 119 | self.cluster_centers_ = centers 120 | self.labels_ = labels 121 | 122 | def predict(self, X): 123 | ''' 124 | Predict labels based input X 125 | Args: 126 | X (array like): shape (n_samples, n_features) 127 | ''' 128 | dist_mat = self.distance_func(X, self.cluster_centers_) 129 | labels = np.argmin(dist_mat, axis=1) 130 | return labels 131 | 132 | def update_centers(self, X, labels): 133 | ''' 134 | Update centers 135 | Args: 136 | X (array like): (n_samples, n_features) 137 | labels (array like): (n_samples, n_clusters), one-hot array 138 | 139 | Return: 140 | centers (array like): (n_clusters, n_features) 141 | ''' 142 | centers = (X.T.dot(labels)).T / np.sum(labels, axis=0).reshape((-1, 1)) 143 | return centers 144 | 145 | def _init(self, X): 146 | ''' 147 | Initiate centroids based on X input with kmeans ++ 148 | 149 | Args: 150 | X (array like): shape is (n_samples, n_features) 151 | 152 | Returns: 153 | labels (array like): shape is (n_samples,) 154 | ''' 155 | n_samples, n_features = X.shape 156 | max_size = (n_samples + self.n_clusters - 1) // self.n_clusters 157 | # initiate centroids with kmeans++ 158 | X_squared_norm = np.sum(np.square(X), axis=1) 159 | centers = base.k_init(X, self.n_clusters, X_squared_norm, self.random_state) 160 | 161 | # calculate priority 162 | dist_mat = self.distance_func(X, centers) # (n_samples, n_clusters) 163 | priority = np.max(dist_mat, axis=1) - np.min(dist_mat, axis=1) 164 | argsort = np.argsort(priority)[::-1] # descending order 165 | clusters = {i: 0 for i in range(self.n_clusters)} 166 | 167 | # assign to clusters based on priority 168 | samples = list(range(n_samples)) 169 | visited = set() 170 | dist_mat_copy = dist_mat.copy() 171 | m = np.zeros_like(dist_mat_copy) 172 | labels = np.zeros(n_samples) 173 | while len(samples) > 0: 174 | for sample_id in argsort: 175 | if sample_id in visited: 176 | continue 177 | cluster_id = np.argmin(dist_mat_copy[sample_id]) 178 | if clusters[cluster_id] < max_size: 179 | labels[sample_id] = cluster_id 180 | clusters[cluster_id] += 1 181 | samples.remove(sample_id) 182 | visited.add(sample_id) 183 | else: 184 | break 185 | dist_mat_copy = dist_mat.copy() 186 | # mask full cluster column 187 | m[:, 
cluster_id] = 1 188 | dist_mat_copy = np.ma.masked_array(dist_mat_copy, m) 189 | priority = np.max(dist_mat_copy, axis=1) - np.min(dist_mat_copy, axis=1) 190 | argsort = np.argsort(priority)[::-1] # descending order 191 | 192 | return labels 193 | 194 | class SameSizeKMeansMinCostFlow(base.Base): 195 | 196 | def __init__(self, n_clusters, max_iters=1000, distance_func=cdist, random_state=42): 197 | ''' 198 | Args: 199 | n_clusters (int): number of clusters 200 | max_iters (int): maximum iterations 201 | distance_func (object): callable function with input (X, centers) / None, by default is l2-distance 202 | random_state (int): random state to initiate, by default it is 42 203 | ''' 204 | super(SameSizeKMeansMinCostFlow, self).__init__(n_clusters, max_iters, distance_func) 205 | self.clf = None 206 | 207 | def fit(self, X): 208 | n_samples, n_features = X.shape 209 | minsize = n_samples // self.n_clusters 210 | maxsize = (n_samples + self.n_clusters - 1) // self.n_clusters 211 | 212 | clf = KMeansConstrained(self.n_clusters, size_min=minsize, 213 | size_max=maxsize, distance_func=self.distance_func) 214 | 215 | if minsize != maxsize: 216 | warnings.warn("Cluster minimum and maximum size are {} and {}, respectively".format(minsize, maxsize)) 217 | 218 | clf.fit(X) 219 | 220 | self.clf = clf 221 | self.cluster_centers_ = self.clf.cluster_centers_ 222 | self.labels_ = self.clf.labels_ 223 | 224 | def predict(self, X): 225 | return self.clf.predict(X) 226 | 227 | if __name__ == "__main__": 228 | from sklearn.datasets import make_blobs 229 | from matplotlib import pyplot as plt 230 | from seaborn import scatterplot as scatter 231 | from sklearn.metrics.pairwise import haversine_distances 232 | n_samples = 2000 233 | n_clusters = 4 # use 3 bins for calibration_curve as we have 3 clusters here 234 | centers = [(-5, -5), (0, 0), (5, 5), (7, 10)] 235 | 236 | X, _ = make_blobs(n_samples=n_samples, n_features=2, cluster_std=1.0, 237 | centers=centers, shuffle=False, random_state=42) 238 | 239 | # X = np.random.rand(n_samples, 2) 240 | equal = SameSizeKMeansMinCostFlow(n_clusters) 241 | equal.fit(X) 242 | 243 | fcm_centers = equal.cluster_centers_ 244 | fcm_labels = equal.labels_ 245 | 246 | f, axes = plt.subplots(1, 2, figsize=(11, 5)) 247 | scatter(X[:, 0], X[:, 1], ax=axes[0]) 248 | scatter(X[:, 0], X[:, 1], ax=axes[1], hue=fcm_labels) 249 | scatter(fcm_centers[:, 0], fcm_centers[:, 1], ax=axes[1], marker="s",s=200) 250 | plt.show() 251 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/sparsefuncs_fast.pyx: -------------------------------------------------------------------------------- 1 | # Authors: Mathieu Blondel 2 | # Olivier Grisel 3 | # Peter Prettenhofer 4 | # Lars Buitinck 5 | # Giorgio Patrini 6 | # 7 | # License: BSD 3 clause 8 | 9 | #!python 10 | #cython: boundscheck=False, wraparound=False, cdivision=True 11 | 12 | from libc.math cimport fabs, sqrt, pow 13 | cimport numpy as np 14 | import numpy as np 15 | import scipy.sparse as sp 16 | cimport cython 17 | from cython cimport floating 18 | 19 | np.import_array() 20 | 21 | 22 | ctypedef np.float64_t DOUBLE 23 | 24 | def csr_row_norms(X): 25 | """L2 norm of each row in CSR matrix X.""" 26 | if X.dtype != np.float32: 27 | X = X.astype(np.float64) 28 | return _csr_row_norms(X.data, X.shape, X.indices, X.indptr) 29 | 30 | 31 | def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data, 32 | shape, 33 | np.ndarray[int, ndim=1, mode="c"] X_indices, 34 | 
np.ndarray[int, ndim=1, mode="c"] X_indptr): 35 | cdef: 36 | unsigned int n_samples = shape[0] 37 | unsigned int n_features = shape[1] 38 | np.ndarray[DOUBLE, ndim=1, mode="c"] norms 39 | 40 | np.npy_intp i, j 41 | double sum_ 42 | 43 | norms = np.zeros(n_samples, dtype=np.float64) 44 | 45 | for i in range(n_samples): 46 | sum_ = 0.0 47 | for j in range(X_indptr[i], X_indptr[i + 1]): 48 | sum_ += X_data[j] * X_data[j] 49 | norms[i] = sum_ 50 | 51 | return norms 52 | 53 | 54 | def csr_mean_variance_axis0(X): 55 | """Compute mean and variance along axis 0 on a CSR matrix 56 | 57 | Parameters 58 | ---------- 59 | X : CSR sparse matrix, shape (n_samples, n_features) 60 | Input data. 61 | 62 | Returns 63 | ------- 64 | 65 | means : float array with shape (n_features,) 66 | Feature-wise means 67 | 68 | variances : float array with shape (n_features,) 69 | Feature-wise variances 70 | 71 | """ 72 | if X.dtype != np.float32: 73 | X = X.astype(np.float64) 74 | return _csr_mean_variance_axis0(X.data, X.shape, X.indices) 75 | 76 | 77 | def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, 78 | shape, 79 | np.ndarray[int, ndim=1] X_indices): 80 | # Implement the function here since variables using fused types 81 | # cannot be declared directly and can only be passed as function arguments 82 | cdef unsigned int n_samples = shape[0] 83 | cdef unsigned int n_features = shape[1] 84 | 85 | cdef unsigned int i 86 | cdef unsigned int non_zero = X_indices.shape[0] 87 | cdef unsigned int col_ind 88 | cdef floating diff 89 | 90 | # means[j] contains the mean of feature j 91 | cdef np.ndarray[floating, ndim=1] means 92 | # variances[j] contains the variance of feature j 93 | cdef np.ndarray[floating, ndim=1] variances 94 | 95 | if floating is float: 96 | dtype = np.float32 97 | else: 98 | dtype = np.float64 99 | 100 | means = np.zeros(n_features, dtype=dtype) 101 | variances = np.zeros_like(means, dtype=dtype) 102 | 103 | # counts[j] contains the number of samples where feature j is non-zero 104 | cdef np.ndarray[int, ndim=1] counts = np.zeros(n_features, 105 | dtype=np.int32) 106 | 107 | for i in xrange(non_zero): 108 | col_ind = X_indices[i] 109 | means[col_ind] += X_data[i] 110 | 111 | means /= n_samples 112 | 113 | for i in xrange(non_zero): 114 | col_ind = X_indices[i] 115 | diff = X_data[i] - means[col_ind] 116 | variances[col_ind] += diff * diff 117 | counts[col_ind] += 1 118 | 119 | for i in xrange(n_features): 120 | variances[i] += (n_samples - counts[i]) * means[i] ** 2 121 | variances[i] /= n_samples 122 | 123 | return means, variances 124 | 125 | 126 | def csc_mean_variance_axis0(X): 127 | """Compute mean and variance along axis 0 on a CSC matrix 128 | 129 | Parameters 130 | ---------- 131 | X : CSC sparse matrix, shape (n_samples, n_features) 132 | Input data. 
133 | 134 | Returns 135 | ------- 136 | 137 | means : float array with shape (n_features,) 138 | Feature-wise means 139 | 140 | variances : float array with shape (n_features,) 141 | Feature-wise variances 142 | 143 | """ 144 | if X.dtype != np.float32: 145 | X = X.astype(np.float64) 146 | return _csc_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr) 147 | 148 | 149 | def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, 150 | shape, 151 | np.ndarray[int, ndim=1] X_indices, 152 | np.ndarray[int, ndim=1] X_indptr): 153 | # Implement the function here since variables using fused types 154 | # cannot be declared directly and can only be passed as function arguments 155 | cdef unsigned int n_samples = shape[0] 156 | cdef unsigned int n_features = shape[1] 157 | 158 | cdef unsigned int i 159 | cdef unsigned int j 160 | cdef unsigned int counts 161 | cdef unsigned int startptr 162 | cdef unsigned int endptr 163 | cdef floating diff 164 | 165 | # means[j] contains the mean of feature j 166 | cdef np.ndarray[floating, ndim=1] means 167 | # variances[j] contains the variance of feature j 168 | cdef np.ndarray[floating, ndim=1] variances 169 | if floating is float: 170 | dtype = np.float32 171 | else: 172 | dtype = np.float64 173 | 174 | means = np.zeros(n_features, dtype=dtype) 175 | variances = np.zeros_like(means, dtype=dtype) 176 | 177 | for i in xrange(n_features): 178 | 179 | startptr = X_indptr[i] 180 | endptr = X_indptr[i + 1] 181 | counts = endptr - startptr 182 | 183 | for j in xrange(startptr, endptr): 184 | means[i] += X_data[j] 185 | means[i] /= n_samples 186 | 187 | for j in xrange(startptr, endptr): 188 | diff = X_data[j] - means[i] 189 | variances[i] += diff * diff 190 | 191 | variances[i] += (n_samples - counts) * means[i] * means[i] 192 | variances[i] /= n_samples 193 | 194 | return means, variances 195 | 196 | 197 | def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n): 198 | """Compute mean and variance along axis 0 on a CSR or CSC matrix. 199 | 200 | last_mean, last_var are the statistics computed at the last step by this 201 | function. Both must be initilized to 0.0. last_n is the 202 | number of samples encountered until now and is initialized at 0. 203 | 204 | Parameters 205 | ---------- 206 | X : CSR or CSC sparse matrix, shape (n_samples, n_features) 207 | Input data. 208 | 209 | last_mean : float array with shape (n_features,) 210 | Array of feature-wise means to update with the new data X. 211 | 212 | last_var : float array with shape (n_features,) 213 | Array of feature-wise var to update with the new data X. 214 | 215 | last_n : int 216 | Number of samples seen so far, before X. 217 | 218 | Returns 219 | ------- 220 | 221 | updated_mean : float array with shape (n_features,) 222 | Feature-wise means 223 | 224 | updated_variance : float array with shape (n_features,) 225 | Feature-wise variances 226 | 227 | updated_n : int 228 | Updated number of samples seen 229 | 230 | References 231 | ---------- 232 | 233 | T. Chan, G. Golub, R. LeVeque. Algorithms for computing the sample 234 | variance: recommendations, The American Statistician, Vol. 37, No. 3, 235 | pp. 242-247 236 | 237 | Also, see the non-sparse implementation of this in 238 | `utils.extmath._batch_mean_variance_update`. 
239 | 240 | """ 241 | if X.dtype != np.float32: 242 | X = X.astype(np.float64) 243 | return _incr_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr, 244 | X.format, last_mean, last_var, last_n) 245 | 246 | 247 | def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, 248 | shape, 249 | np.ndarray[int, ndim=1] X_indices, 250 | np.ndarray[int, ndim=1] X_indptr, 251 | X_format, 252 | last_mean, 253 | last_var, 254 | unsigned long last_n): 255 | # Implement the function here since variables using fused types 256 | # cannot be declared directly and can only be passed as function arguments 257 | cdef unsigned long n_samples = shape[0] 258 | cdef unsigned int n_features = shape[1] 259 | cdef unsigned int i 260 | 261 | # last = stats until now 262 | # new = the current increment 263 | # updated = the aggregated stats 264 | # when arrays, they are indexed by i per-feature 265 | cdef np.ndarray[floating, ndim=1] new_mean 266 | cdef np.ndarray[floating, ndim=1] new_var 267 | cdef np.ndarray[floating, ndim=1] updated_mean 268 | cdef np.ndarray[floating, ndim=1] updated_var 269 | if floating is float: 270 | dtype = np.float32 271 | else: 272 | dtype = np.float64 273 | 274 | new_mean = np.zeros(n_features, dtype=dtype) 275 | new_var = np.zeros_like(new_mean, dtype=dtype) 276 | updated_mean = np.zeros_like(new_mean, dtype=dtype) 277 | updated_var = np.zeros_like(new_mean, dtype=dtype) 278 | 279 | cdef unsigned long new_n 280 | cdef unsigned long updated_n 281 | cdef floating last_over_new_n 282 | 283 | # Obtain new stats first 284 | new_n = n_samples 285 | 286 | if X_format == 'csr': 287 | # X is a CSR matrix 288 | new_mean, new_var = _csr_mean_variance_axis0(X_data, shape, X_indices) 289 | else: 290 | # X is a CSC matrix 291 | new_mean, new_var = _csc_mean_variance_axis0(X_data, shape, X_indices, 292 | X_indptr) 293 | 294 | # First pass 295 | if last_n == 0: 296 | return new_mean, new_var, new_n 297 | # Next passes 298 | else: 299 | updated_n = last_n + new_n 300 | last_over_new_n = last_n / new_n 301 | 302 | for i in xrange(n_features): 303 | # Unnormalized old stats 304 | last_mean[i] *= last_n 305 | last_var[i] *= last_n 306 | 307 | # Unnormalized new stats 308 | new_mean[i] *= new_n 309 | new_var[i] *= new_n 310 | 311 | # Update stats 312 | updated_var[i] = (last_var[i] + new_var[i] + 313 | last_over_new_n / updated_n * 314 | (last_mean[i] / last_over_new_n - new_mean[i]) ** 2) 315 | 316 | updated_mean[i] = (last_mean[i] + new_mean[i]) / updated_n 317 | updated_var[i] = updated_var[i] / updated_n 318 | 319 | return updated_mean, updated_var, updated_n 320 | 321 | 322 | def inplace_csr_row_normalize_l1(X): 323 | """Inplace row normalize using the l1 norm""" 324 | _inplace_csr_row_normalize_l1(X.data, X.shape, X.indices, X.indptr) 325 | 326 | 327 | def _inplace_csr_row_normalize_l1(np.ndarray[floating, ndim=1] X_data, 328 | shape, 329 | np.ndarray[int, ndim=1] X_indices, 330 | np.ndarray[int, ndim=1] X_indptr): 331 | cdef unsigned int n_samples = shape[0] 332 | cdef unsigned int n_features = shape[1] 333 | 334 | # the column indices for row i are stored in: 335 | # indices[indptr[i]:indices[i+1]] 336 | # and their corresponding values are stored in: 337 | # data[indptr[i]:indptr[i+1]] 338 | cdef unsigned int i 339 | cdef unsigned int j 340 | cdef double sum_ 341 | 342 | for i in xrange(n_samples): 343 | sum_ = 0.0 344 | 345 | for j in xrange(X_indptr[i], X_indptr[i + 1]): 346 | sum_ += fabs(X_data[j]) 347 | 348 | if sum_ == 0.0: 349 | # do not normalize empty rows (can happen if CSR 
is not pruned 350 | # correctly) 351 | continue 352 | 353 | for j in xrange(X_indptr[i], X_indptr[i + 1]): 354 | X_data[j] /= sum_ 355 | 356 | 357 | def inplace_csr_row_normalize_l2(X): 358 | """Inplace row normalize using the l2 norm""" 359 | _inplace_csr_row_normalize_l2(X.data, X.shape, X.indices, X.indptr) 360 | 361 | 362 | def _inplace_csr_row_normalize_l2(np.ndarray[floating, ndim=1] X_data, 363 | shape, 364 | np.ndarray[int, ndim=1] X_indices, 365 | np.ndarray[int, ndim=1] X_indptr): 366 | cdef unsigned int n_samples = shape[0] 367 | cdef unsigned int n_features = shape[1] 368 | 369 | cdef unsigned int i 370 | cdef unsigned int j 371 | cdef double sum_ 372 | 373 | for i in xrange(n_samples): 374 | sum_ = 0.0 375 | 376 | for j in xrange(X_indptr[i], X_indptr[i + 1]): 377 | sum_ += (X_data[j] * X_data[j]) 378 | 379 | if sum_ == 0.0: 380 | # do not normalize empty rows (can happen if CSR is not pruned 381 | # correctly) 382 | continue 383 | 384 | sum_ = sqrt(sum_) 385 | 386 | for j in xrange(X_indptr[i], X_indptr[i + 1]): 387 | X_data[j] /= sum_ 388 | 389 | 390 | def assign_rows_csr(X, 391 | np.ndarray[np.npy_intp, ndim=1] X_rows, 392 | np.ndarray[np.npy_intp, ndim=1] out_rows, 393 | np.ndarray[floating, ndim=2, mode="c"] out): 394 | """Densify selected rows of a CSR matrix into a preallocated array. 395 | 396 | Like out[out_rows] = X[X_rows].toarray() but without copying. 397 | No-copy supported for both dtype=np.float32 and dtype=np.float64. 398 | 399 | Parameters 400 | ---------- 401 | X : scipy.sparse.csr_matrix, shape=(n_samples, n_features) 402 | X_rows : array, dtype=np.intp, shape=n_rows 403 | out_rows : array, dtype=np.intp, shape=n_rows 404 | out : array, shape=(arbitrary, n_features) 405 | """ 406 | cdef: 407 | # npy_intp (np.intp in Python) is what np.where returns, 408 | # but int is what scipy.sparse uses. 409 | int i, ind, j 410 | np.npy_intp rX 411 | np.ndarray[floating, ndim=1] data = X.data 412 | np.ndarray[int, ndim=1] indices = X.indices, indptr = X.indptr 413 | 414 | if X_rows.shape[0] != out_rows.shape[0]: 415 | raise ValueError("cannot assign %d rows to %d" 416 | % (X_rows.shape[0], out_rows.shape[0])) 417 | 418 | out[out_rows] = 0. 419 | for i in range(X_rows.shape[0]): 420 | rX = X_rows[i] 421 | for ind in range(indptr[rX], indptr[rX + 1]): 422 | j = indices[ind] 423 | out[out_rows[i], j] = data[ind] -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/utils/validation.py: -------------------------------------------------------------------------------- 1 | import numbers 2 | import warnings 3 | 4 | import numpy as np 5 | from scipy import sparse as sp 6 | from sklearn_import.exceptions import NotFittedError 7 | 8 | from sklearn_import import get_config as _get_config 9 | 10 | from sklearn_import.exceptions import DataConversionWarning 11 | import six 12 | 13 | 14 | def check_array(array, accept_sparse=False, dtype="numeric", order=None, 15 | copy=False, force_all_finite=True, ensure_2d=True, 16 | allow_nd=False, ensure_min_samples=1, ensure_min_features=1, 17 | warn_on_dtype=False, estimator=None): 18 | """Input validation on an array, list, sparse matrix or similar. 19 | 20 | By default, the input is converted to an at least 2D numpy array. 21 | If the dtype of the array is object, attempt converting to float, 22 | raising on failure. 23 | 24 | Parameters 25 | ---------- 26 | array : object 27 | Input object to check / convert. 
28 | 29 | accept_sparse : string, boolean or list/tuple of strings (default=False) 30 | String[s] representing allowed sparse matrix formats, such as 'csc', 31 | 'csr', etc. If the input is sparse but not in the allowed format, 32 | it will be converted to the first listed format. True allows the input 33 | to be any format. False means that a sparse matrix input will 34 | raise an error. 35 | 36 | .. deprecated:: 0.19 37 | Passing 'None' to parameter ``accept_sparse`` in methods is 38 | deprecated in version 0.19 "and will be removed in 0.21. Use 39 | ``accept_sparse=False`` instead. 40 | 41 | dtype : string, type, list of types or None (default="numeric") 42 | Data type of result. If None, the dtype of the input is preserved. 43 | If "numeric", dtype is preserved unless array.dtype is object. 44 | If dtype is a list of types, conversion on the first type is only 45 | performed if the dtype of the input is not in the list. 46 | 47 | order : 'F', 'C' or None (default=None) 48 | Whether an array will be forced to be fortran or c-style. 49 | When order is None (default), then if copy=False, nothing is ensured 50 | about the memory layout of the output array; otherwise (copy=True) 51 | the memory layout of the returned array is kept as close as possible 52 | to the original array. 53 | 54 | copy : boolean (default=False) 55 | Whether a forced copy will be triggered. If copy=False, a copy might 56 | be triggered by a conversion. 57 | 58 | force_all_finite : boolean (default=True) 59 | Whether to raise an error on np.inf and np.nan in X. 60 | 61 | ensure_2d : boolean (default=True) 62 | Whether to raise a value error if X is not 2d. 63 | 64 | allow_nd : boolean (default=False) 65 | Whether to allow X.ndim > 2. 66 | 67 | ensure_min_samples : int (default=1) 68 | Make sure that the array has a minimum number of samples in its first 69 | axis (rows for a 2D array). Setting to 0 disables this check. 70 | 71 | ensure_min_features : int (default=1) 72 | Make sure that the 2D array has some minimum number of features 73 | (columns). The default value of 1 rejects empty datasets. 74 | This check is only enforced when the input data has effectively 2 75 | dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0 76 | disables this check. 77 | 78 | warn_on_dtype : boolean (default=False) 79 | Raise DataConversionWarning if the dtype of the input data structure 80 | does not match the requested dtype, causing a memory copy. 81 | 82 | estimator : str or estimator instance (default=None) 83 | If passed, include the name of the estimator in warning messages. 84 | 85 | Returns 86 | ------- 87 | X_converted : object 88 | The converted and validated X. 89 | 90 | """ 91 | # accept_sparse 'None' deprecation check 92 | if accept_sparse is None: 93 | warnings.warn( 94 | "Passing 'None' to parameter 'accept_sparse' in methods " 95 | "check_array and check_X_y is deprecated in version 0.19 " 96 | "and will be removed in 0.21. Use 'accept_sparse=False' " 97 | " instead.", DeprecationWarning) 98 | accept_sparse = False 99 | 100 | # store whether originally we wanted numeric dtype 101 | dtype_numeric = isinstance(dtype, six.string_types) and dtype == "numeric" 102 | 103 | dtype_orig = getattr(array, "dtype", None) 104 | if not hasattr(dtype_orig, 'kind'): 105 | # not a data type (e.g. a column named dtype in a pandas DataFrame) 106 | dtype_orig = None 107 | 108 | if dtype_numeric: 109 | if dtype_orig is not None and dtype_orig.kind == "O": 110 | # if input is object, convert to float. 
111 | dtype = np.float64 112 | else: 113 | dtype = None 114 | 115 | if isinstance(dtype, (list, tuple)): 116 | if dtype_orig is not None and dtype_orig in dtype: 117 | # no dtype conversion required 118 | dtype = None 119 | else: 120 | # dtype conversion required. Let's select the first element of the 121 | # list of accepted types. 122 | dtype = dtype[0] 123 | 124 | if estimator is not None: 125 | if isinstance(estimator, six.string_types): 126 | estimator_name = estimator 127 | else: 128 | estimator_name = estimator.__class__.__name__ 129 | else: 130 | estimator_name = "Estimator" 131 | context = " by %s" % estimator_name if estimator is not None else "" 132 | 133 | if sp.issparse(array): 134 | array = _ensure_sparse_format(array, accept_sparse, dtype, copy, 135 | force_all_finite) 136 | else: 137 | array = np.array(array, dtype=dtype, order=order, copy=copy) 138 | 139 | if ensure_2d: 140 | if array.ndim == 1: 141 | raise ValueError( 142 | "Expected 2D array, got 1D array instead:\narray={}.\n" 143 | "Reshape your data either using array.reshape(-1, 1) if " 144 | "your data has a single feature or array.reshape(1, -1) " 145 | "if it contains a single sample.".format(array)) 146 | array = np.atleast_2d(array) 147 | # To ensure that array flags are maintained 148 | array = np.array(array, dtype=dtype, order=order, copy=copy) 149 | 150 | # make sure we actually converted to numeric: 151 | if dtype_numeric and array.dtype.kind == "O": 152 | array = array.astype(np.float64) 153 | if not allow_nd and array.ndim >= 3: 154 | raise ValueError("Found array with dim %d. %s expected <= 2." 155 | % (array.ndim, estimator_name)) 156 | if force_all_finite: 157 | _assert_all_finite(array) 158 | 159 | shape_repr = _shape_repr(array.shape) 160 | if ensure_min_samples > 0: 161 | n_samples = _num_samples(array) 162 | if n_samples < ensure_min_samples: 163 | raise ValueError("Found array with %d sample(s) (shape=%s) while a" 164 | " minimum of %d is required%s." 165 | % (n_samples, shape_repr, ensure_min_samples, 166 | context)) 167 | 168 | if ensure_min_features > 0 and array.ndim == 2: 169 | n_features = array.shape[1] 170 | if n_features < ensure_min_features: 171 | raise ValueError("Found array with %d feature(s) (shape=%s) while" 172 | " a minimum of %d is required%s." 173 | % (n_features, shape_repr, ensure_min_features, 174 | context)) 175 | 176 | if warn_on_dtype and dtype_orig is not None and array.dtype != dtype_orig: 177 | msg = ("Data with input dtype %s was converted to %s%s." 178 | % (dtype_orig, array.dtype, context)) 179 | warnings.warn(msg, DataConversionWarning) 180 | return array 181 | 182 | 183 | def check_random_state(seed): 184 | """Turn seed into a np.random.RandomState instance 185 | 186 | Parameters 187 | ---------- 188 | seed : None | int | instance of RandomState 189 | If seed is None, return the RandomState singleton used by np.random. 190 | If seed is an int, return a new RandomState instance seeded with seed. 191 | If seed is already a RandomState instance, return it. 192 | Otherwise raise ValueError. 
193 | """ 194 | if seed is None or seed is np.random: 195 | return np.random.mtrand._rand 196 | if isinstance(seed, (numbers.Integral, np.integer)): 197 | return np.random.RandomState(seed) 198 | if isinstance(seed, np.random.RandomState): 199 | return seed 200 | raise ValueError('%r cannot be used to seed a numpy.random.RandomState' 201 | ' instance' % seed) 202 | 203 | 204 | def as_float_array(X, copy=True, force_all_finite=True): 205 | """Converts an array-like to an array of floats. 206 | 207 | The new dtype will be np.float32 or np.float64, depending on the original 208 | type. The function can create a copy or modify the argument depending 209 | on the argument copy. 210 | 211 | Parameters 212 | ---------- 213 | X : {array-like, sparse matrix} 214 | 215 | copy : bool, optional 216 | If True, a copy of X will be created. If False, a copy may still be 217 | returned if X's dtype is not a floating point type. 218 | 219 | force_all_finite : boolean (default=True) 220 | Whether to raise an error on np.inf and np.nan in X. 221 | 222 | Returns 223 | ------- 224 | XT : {array, sparse matrix} 225 | An array of type np.float 226 | """ 227 | if isinstance(X, np.matrix) or (not isinstance(X, np.ndarray) 228 | and not sp.issparse(X)): 229 | return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64, 230 | copy=copy, force_all_finite=force_all_finite, 231 | ensure_2d=False) 232 | elif sp.issparse(X) and X.dtype in [np.float32, np.float64]: 233 | return X.copy() if copy else X 234 | elif X.dtype in [np.float32, np.float64]: # is numpy array 235 | return X.copy('F' if X.flags['F_CONTIGUOUS'] else 'C') if copy else X 236 | else: 237 | if X.dtype.kind in 'uib' and X.dtype.itemsize <= 4: 238 | return_dtype = np.float32 239 | else: 240 | return_dtype = np.float64 241 | return X.astype(return_dtype) 242 | 243 | 244 | def _assert_all_finite(X): 245 | """Like assert_all_finite, but only for ndarray.""" 246 | if _get_config()['assume_finite']: 247 | return 248 | X = np.asanyarray(X) 249 | # First try an O(n) time, O(1) space solution for the common case that 250 | # everything is finite; fall back to O(n) space np.isfinite to prevent 251 | # false positives from overflow in sum method. 252 | if (X.dtype.char in np.typecodes['AllFloat'] and not np.isfinite(X.sum()) 253 | and not np.isfinite(X).all()): 254 | raise ValueError("Input contains NaN, infinity" 255 | " or a value too large for %r." % X.dtype) 256 | 257 | 258 | def _num_samples(x): 259 | """Return number of samples in array-like x.""" 260 | if hasattr(x, 'fit') and callable(x.fit): 261 | # Don't get num_samples from an ensembles length! 262 | raise TypeError('Expected sequence or array-like, got ' 263 | 'estimator %s' % x) 264 | if not hasattr(x, '__len__') and not hasattr(x, 'shape'): 265 | if hasattr(x, '__array__'): 266 | x = np.asarray(x) 267 | else: 268 | raise TypeError("Expected sequence or array-like, got %s" % 269 | type(x)) 270 | if hasattr(x, 'shape'): 271 | if len(x.shape) == 0: 272 | raise TypeError("Singleton array %r cannot be considered" 273 | " a valid collection." % x) 274 | return x.shape[0] 275 | else: 276 | return len(x) 277 | 278 | 279 | def _shape_repr(shape): 280 | """Return a platform independent representation of an array shape 281 | 282 | Under Python 2, the `long` type introduces an 'L' suffix when using the 283 | default %r format for tuples of integers (typically used to store the shape 284 | of an array). 
285 | 286 | Under Windows 64 bit (and Python 2), the `long` type is used by default 287 | in numpy shapes even when the integer dimensions are well below 32 bit. 288 | The platform specific type causes string messages or doctests to change 289 | from one platform to another which is not desirable. 290 | 291 | Under Python 3, there is no more `long` type so the `L` suffix is never 292 | introduced in string representation. 293 | 294 | >>> _shape_repr((1, 2)) 295 | '(1, 2)' 296 | >>> one = 2 ** 64 / 2 ** 64 # force an upcast to `long` under Python 2 297 | >>> _shape_repr((one, 2 * one)) 298 | '(1, 2)' 299 | >>> _shape_repr((1,)) 300 | '(1,)' 301 | >>> _shape_repr(()) 302 | '()' 303 | """ 304 | if len(shape) == 0: 305 | return "()" 306 | joined = ", ".join("%d" % e for e in shape) 307 | if len(shape) == 1: 308 | # special notation for singleton tuples 309 | joined += ',' 310 | return "(%s)" % joined 311 | 312 | 313 | def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy, 314 | force_all_finite): 315 | """Convert a sparse matrix to a given format. 316 | 317 | Checks the sparse format of spmatrix and converts if necessary. 318 | 319 | Parameters 320 | ---------- 321 | spmatrix : scipy sparse matrix 322 | Input to validate and convert. 323 | 324 | accept_sparse : string, boolean or list/tuple of strings 325 | String[s] representing allowed sparse matrix formats ('csc', 326 | 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but 327 | not in the allowed format, it will be converted to the first listed 328 | format. True allows the input to be any format. False means 329 | that a sparse matrix input will raise an error. 330 | 331 | dtype : string, type or None 332 | Data type of result. If None, the dtype of the input is preserved. 333 | 334 | copy : boolean 335 | Whether a forced copy will be triggered. If copy=False, a copy might 336 | be triggered by a conversion. 337 | 338 | force_all_finite : boolean 339 | Whether to raise an error on np.inf and np.nan in X. 340 | 341 | Returns 342 | ------- 343 | spmatrix_converted : scipy sparse matrix. 344 | Matrix that is ensured to have an allowed type. 345 | """ 346 | if dtype is None: 347 | dtype = spmatrix.dtype 348 | 349 | changed_format = False 350 | 351 | if isinstance(accept_sparse, six.string_types): 352 | accept_sparse = [accept_sparse] 353 | 354 | if accept_sparse is False: 355 | raise TypeError('A sparse matrix was passed, but dense ' 356 | 'data is required. Use X.toarray() to ' 357 | 'convert to a dense numpy array.') 358 | elif isinstance(accept_sparse, (list, tuple)): 359 | if len(accept_sparse) == 0: 360 | raise ValueError("When providing 'accept_sparse' " 361 | "as a tuple or list, it must contain at " 362 | "least one string value.") 363 | # ensure correct sparse format 364 | if spmatrix.format not in accept_sparse: 365 | # create new with correct sparse 366 | spmatrix = spmatrix.asformat(accept_sparse[0]) 367 | changed_format = True 368 | elif accept_sparse is not True: 369 | # any other type 370 | raise ValueError("Parameter 'accept_sparse' should be a string, " 371 | "boolean or list of strings. You provided " 372 | "'accept_sparse={}'.".format(accept_sparse)) 373 | 374 | if dtype != spmatrix.dtype: 375 | # convert dtype 376 | spmatrix = spmatrix.astype(dtype) 377 | elif copy and not changed_format: 378 | # force copy 379 | spmatrix = spmatrix.copy() 380 | 381 | if force_all_finite: 382 | if not hasattr(spmatrix, "data"): 383 | warnings.warn("Can't check %s sparse matrix for nan or inf." 
384 | % spmatrix.format) 385 | else: 386 | _assert_all_finite(spmatrix.data) 387 | return spmatrix 388 | 389 | 390 | FLOAT_DTYPES = (np.float64, np.float32, np.float16) 391 | 392 | 393 | def check_is_fitted(estimator, attributes, msg=None, all_or_any=all): 394 | """Perform is_fitted validation for estimator. 395 | 396 | Checks if the estimator is fitted by verifying the presence of 397 | "all_or_any" of the passed attributes and raises a NotFittedError with the 398 | given message. 399 | 400 | Parameters 401 | ---------- 402 | estimator : estimator instance. 403 | estimator instance for which the check is performed. 404 | 405 | attributes : attribute name(s) given as string or a list/tuple of strings 406 | Eg.: 407 | ``["coef_", "estimator_", ...], "coef_"`` 408 | 409 | msg : string 410 | The default error message is, "This %(name)s instance is not fitted 411 | yet. Call 'fit' with appropriate arguments before using this method." 412 | 413 | For custom messages if "%(name)s" is present in the message string, 414 | it is substituted for the estimator name. 415 | 416 | Eg. : "Estimator, %(name)s, must be fitted before sparsifying". 417 | 418 | all_or_any : callable, {all, any}, default all 419 | Specify whether all or any of the given attributes must exist. 420 | 421 | Returns 422 | ------- 423 | None 424 | 425 | Raises 426 | ------ 427 | NotFittedError 428 | If the attributes are not found. 429 | """ 430 | if msg is None: 431 | msg = ("This %(name)s instance is not fitted yet. Call 'fit' with " 432 | "appropriate arguments before using this method.") 433 | 434 | if not hasattr(estimator, 'fit'): 435 | raise TypeError("%s is not an estimator instance." % (estimator)) 436 | 437 | if not isinstance(attributes, (list, tuple)): 438 | attributes = [attributes] 439 | 440 | if not all_or_any([hasattr(estimator, attr) for attr in attributes]): 441 | raise NotFittedError(msg % {'name': type(estimator).__name__}) 442 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/cluster/k_means_.py: -------------------------------------------------------------------------------- 1 | """K-means clustering""" 2 | 3 | # Authors: Gael Varoquaux 4 | # Thomas Rueckstiess 5 | # James Bergstra 6 | # Jan Schlueter 7 | # Nelle Varoquaux 8 | # Peter Prettenhofer 9 | # Olivier Grisel 10 | # Mathieu Blondel 11 | # Robert Layton 12 | # License: BSD 3 clause 13 | 14 | import warnings 15 | 16 | import numpy as np 17 | import scipy.sparse as sp 18 | from sklearn_import.base import BaseEstimator, ClusterMixin, TransformerMixin 19 | from six import string_types 20 | from sklearn_import.metrics.pairwise import euclidean_distances, pairwise_distances_argmin_min 21 | from sklearn_import.utils.validation import check_array, check_random_state, FLOAT_DTYPES, \ 22 | check_is_fitted 23 | from sklearn_import.utils.extmath import row_norms, stable_cumsum 24 | from sklearn_import.utils.sparsefuncs import mean_variance_axis 25 | 26 | from sklearn_import.cluster import _k_means 27 | 28 | 29 | ############################################################################### 30 | # Initialization heuristic 31 | 32 | 33 | def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None): 34 | """Init n_clusters seeds according to k-means++ 35 | 36 | Parameters 37 | ----------- 38 | X : array or sparse matrix, shape (n_samples, n_features) 39 | The data to pick seeds for. To avoid memory copy, the input data 40 | should be double precision (dtype=np.float64). 
41 | 42 | n_clusters : integer 43 | The number of seeds to choose 44 | 45 | x_squared_norms : array, shape (n_samples,) 46 | Squared Euclidean norm of each data point. 47 | 48 | random_state : numpy.RandomState 49 | The generator used to initialize the centers. 50 | 51 | n_local_trials : integer, optional 52 | The number of seeding trials for each center (except the first), 53 | of which the one reducing inertia the most is greedily chosen. 54 | Set to None to make the number of trials depend logarithmically 55 | on the number of seeds (2+log(k)); this is the default. 56 | 57 | Notes 58 | ----- 59 | Selects initial cluster centers for k-mean clustering in a smart way 60 | to speed up convergence. see: Arthur, D. and Vassilvitskii, S. 61 | "k-means++: the advantages of careful seeding". ACM-SIAM symposium 62 | on Discrete algorithms. 2007 63 | 64 | Version ported from http://www.stanford.edu/~darthur/kMeansppTest.zip, 65 | which is the implementation used in the aforementioned paper. 66 | """ 67 | n_samples, n_features = X.shape 68 | 69 | centers = np.empty((n_clusters, n_features), dtype=X.dtype) 70 | 71 | assert x_squared_norms is not None, 'x_squared_norms None in _k_init' 72 | 73 | # Set the number of local seeding trials if none is given 74 | if n_local_trials is None: 75 | # This is what Arthur/Vassilvitskii tried, but did not report 76 | # specific results for other than mentioning in the conclusion 77 | # that it helped. 78 | n_local_trials = 2 + int(np.log(n_clusters)) 79 | 80 | # Pick first center randomly 81 | center_id = random_state.randint(n_samples) 82 | if sp.issparse(X): 83 | centers[0] = X[center_id].toarray() 84 | else: 85 | centers[0] = X[center_id] 86 | 87 | # Initialize list of closest distances and calculate current potential 88 | closest_dist_sq = euclidean_distances( 89 | centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms, 90 | squared=True) 91 | current_pot = closest_dist_sq.sum() 92 | 93 | # Pick the remaining n_clusters-1 points 94 | for c in range(1, n_clusters): 95 | # Choose center candidates by sampling with probability proportional 96 | # to the squared distance to the closest existing center 97 | rand_vals = random_state.random_sample(n_local_trials) * current_pot 98 | candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), 99 | rand_vals) 100 | 101 | # Compute distances to center candidates 102 | distance_to_candidates = euclidean_distances( 103 | X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True) 104 | 105 | # Decide which candidate is the best 106 | best_candidate = None 107 | best_pot = None 108 | best_dist_sq = None 109 | for trial in range(n_local_trials): 110 | # Compute potential when including center candidate 111 | new_dist_sq = np.minimum(closest_dist_sq, 112 | distance_to_candidates[trial]) 113 | new_pot = new_dist_sq.sum() 114 | 115 | # Store result if it is the best local trial so far 116 | if (best_candidate is None) or (new_pot < best_pot): 117 | best_candidate = candidate_ids[trial] 118 | best_pot = new_pot 119 | best_dist_sq = new_dist_sq 120 | 121 | # Permanently add best center candidate found in local tries 122 | if sp.issparse(X): 123 | centers[c] = X[best_candidate].toarray() 124 | else: 125 | centers[c] = X[best_candidate] 126 | current_pot = best_pot 127 | closest_dist_sq = best_dist_sq 128 | 129 | return centers 130 | 131 | 132 | ############################################################################### 133 | # K-means batch estimation by EM (expectation maximization) 134 | 135 | def 
_validate_center_shape(X, n_centers, centers): 136 | """Check if centers is compatible with X and n_centers""" 137 | if len(centers) != n_centers: 138 | raise ValueError('The shape of the initial centers (%s) ' 139 | 'does not match the number of clusters %i' 140 | % (centers.shape, n_centers)) 141 | if centers.shape[1] != X.shape[1]: 142 | raise ValueError( 143 | "The number of features of the initial centers %s " 144 | "does not match the number of features of the data %s." 145 | % (centers.shape[1], X.shape[1])) 146 | 147 | 148 | def _tolerance(X, tol): 149 | """Return a tolerance which is independent of the dataset""" 150 | if sp.issparse(X): 151 | variances = mean_variance_axis(X, axis=0)[1] 152 | else: 153 | variances = np.var(X, axis=0) 154 | return np.mean(variances) * tol 155 | 156 | 157 | def _labels_inertia_precompute_dense(X, x_squared_norms, centers, distances): 158 | """Compute labels and inertia using a full distance matrix. 159 | 160 | This will overwrite the 'distances' array in-place. 161 | 162 | Parameters 163 | ---------- 164 | X : numpy array, shape (n_sample, n_features) 165 | Input data. 166 | 167 | x_squared_norms : numpy array, shape (n_samples,) 168 | Precomputed squared norms of X. 169 | 170 | centers : numpy array, shape (n_clusters, n_features) 171 | Cluster centers which data is assigned to. 172 | 173 | distances : numpy array, shape (n_samples,) 174 | Pre-allocated array in which distances are stored. 175 | 176 | Returns 177 | ------- 178 | labels : numpy array, dtype=np.int, shape (n_samples,) 179 | Indices of clusters that samples are assigned to. 180 | 181 | inertia : float 182 | Sum of distances of samples to their closest cluster center. 183 | 184 | """ 185 | n_samples = X.shape[0] 186 | 187 | # Breakup nearest neighbor distance computation into batches to prevent 188 | # memory blowup in the case of a large number of samples and clusters. 189 | # TODO: Once PR #7383 is merged use check_inputs=False in metric_kwargs. 190 | labels, mindist = pairwise_distances_argmin_min( 191 | X=X, Y=centers, metric='euclidean', metric_kwargs={'squared': True}) 192 | # cython k-means code assumes int32 inputs 193 | labels = labels.astype(np.int32) 194 | if n_samples == distances.shape[0]: 195 | # distances will be changed in-place 196 | distances[:] = mindist 197 | inertia = mindist.sum() 198 | return labels, inertia 199 | 200 | 201 | def _labels_inertia(X, x_squared_norms, centers, 202 | precompute_distances=True, distances=None): 203 | """E step of the K-means EM algorithm. 204 | 205 | Compute the labels and the inertia of the given samples and centers. 206 | This will compute the distances in-place. 207 | 208 | Parameters 209 | ---------- 210 | X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features) 211 | The input samples to assign to the labels. 212 | 213 | x_squared_norms : array, shape (n_samples,) 214 | Precomputed squared euclidean norm of each data point, to speed up 215 | computations. 216 | 217 | centers : float array, shape (k, n_features) 218 | The cluster centers. 219 | 220 | precompute_distances : boolean, default: True 221 | Precompute distances (faster but takes more memory). 222 | 223 | distances : float array, shape (n_samples,) 224 | Pre-allocated array to be filled in with each sample's distance 225 | to the closest center. 226 | 227 | Returns 228 | ------- 229 | labels : int array of shape(n) 230 | The resulting assignment 231 | 232 | inertia : float 233 | Sum of distances of samples to their closest cluster center. 
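Notes ----- Dispatch (as implemented below): sparse ``X`` uses the Cython routine ``_k_means._assign_labels_csr``; dense ``X`` with ``precompute_distances=True`` uses the full distance-matrix helper ``_labels_inertia_precompute_dense``; otherwise ``_k_means._assign_labels_array`` is used.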
234 | """ 235 | n_samples = X.shape[0] 236 | # set the default value of centers to -1 to be able to detect any anomaly 237 | # easily 238 | labels = -np.ones(n_samples, np.int32) 239 | if distances is None: 240 | distances = np.zeros(shape=(0,), dtype=X.dtype) 241 | # distances will be changed in-place 242 | if sp.issparse(X): 243 | inertia = _k_means._assign_labels_csr( 244 | X, x_squared_norms, centers, labels, distances=distances) 245 | else: 246 | if precompute_distances: 247 | return _labels_inertia_precompute_dense(X, x_squared_norms, 248 | centers, distances) 249 | inertia = _k_means._assign_labels_array( 250 | X, x_squared_norms, centers, labels, distances=distances) 251 | return labels, inertia 252 | 253 | 254 | def _init_centroids(X, k, init, random_state=None, x_squared_norms=None, 255 | init_size=None): 256 | """Compute the initial centroids 257 | 258 | Parameters 259 | ---------- 260 | 261 | X : array, shape (n_samples, n_features) 262 | 263 | k : int 264 | number of centroids 265 | 266 | init : {'k-means++', 'random' or ndarray or callable} optional 267 | Method for initialization 268 | 269 | random_state : int, RandomState instance or None, optional, default: None 270 | If int, random_state is the seed used by the random number generator; 271 | If RandomState instance, random_state is the random number generator; 272 | If None, the random number generator is the RandomState instance used 273 | by `np.random`. 274 | 275 | x_squared_norms : array, shape (n_samples,), optional 276 | Squared euclidean norm of each data point. Pass it if you have it at 277 | hands already to avoid it being recomputed here. Default: None 278 | 279 | init_size : int, optional 280 | Number of samples to randomly sample for speeding up the 281 | initialization (sometimes at the expense of accuracy): the 282 | only algorithm is initialized by running a batch KMeans on a 283 | random subset of the data. This needs to be larger than k. 284 | 285 | Returns 286 | ------- 287 | centers : array, shape(k, n_features) 288 | """ 289 | random_state = check_random_state(random_state) 290 | n_samples = X.shape[0] 291 | 292 | if x_squared_norms is None: 293 | x_squared_norms = row_norms(X, squared=True) 294 | 295 | if init_size is not None and init_size < n_samples: 296 | if init_size < k: 297 | warnings.warn( 298 | "init_size=%d should be larger than k=%d. 
" 299 | "Setting it to 3*k" % (init_size, k), 300 | RuntimeWarning, stacklevel=2) 301 | init_size = 3 * k 302 | init_indices = random_state.randint(0, n_samples, init_size) 303 | X = X[init_indices] 304 | x_squared_norms = x_squared_norms[init_indices] 305 | n_samples = X.shape[0] 306 | elif n_samples < k: 307 | raise ValueError( 308 | "n_samples=%d should be larger than k=%d" % (n_samples, k)) 309 | 310 | if isinstance(init, string_types) and init == 'k-means++': 311 | centers = _k_init(X, k, random_state=random_state, 312 | x_squared_norms=x_squared_norms) 313 | elif isinstance(init, string_types) and init == 'random': 314 | seeds = random_state.permutation(n_samples)[:k] 315 | centers = X[seeds] 316 | elif hasattr(init, '__array__'): 317 | # ensure that the centers have the same dtype as X 318 | # this is a requirement of fused types of cython 319 | centers = np.array(init, dtype=X.dtype) 320 | elif callable(init): 321 | centers = init(X, k, random_state=random_state) 322 | centers = np.asarray(centers, dtype=X.dtype) 323 | else: 324 | raise ValueError("the init parameter for the k-means should " 325 | "be 'k-means++' or 'random' or an ndarray, " 326 | "'%s' (type '%s') was passed." % (init, type(init))) 327 | 328 | if sp.issparse(centers): 329 | centers = centers.toarray() 330 | 331 | _validate_center_shape(X, k, centers) 332 | return centers 333 | 334 | 335 | class KMeans(BaseEstimator, ClusterMixin, TransformerMixin): 336 | """K-Means clustering 337 | 338 | Read more in the :ref:`User Guide `. 339 | 340 | Parameters 341 | ---------- 342 | 343 | n_clusters : int, optional, default: 8 344 | The number of clusters to form as well as the number of 345 | centroids to generate. 346 | 347 | init : {'k-means++', 'random' or an ndarray} 348 | Method for initialization, defaults to 'k-means++': 349 | 350 | 'k-means++' : selects initial cluster centers for k-mean 351 | clustering in a smart way to speed up convergence. See section 352 | Notes in k_init for more details. 353 | 354 | 'random': choose k observations (rows) at random from data for 355 | the initial centroids. 356 | 357 | If an ndarray is passed, it should be of shape (n_clusters, n_features) 358 | and gives the initial centers. 359 | 360 | n_init : int, default: 10 361 | Number of time the k-means algorithm will be run with different 362 | centroid seeds. The final results will be the best output of 363 | n_init consecutive runs in terms of inertia. 364 | 365 | max_iter : int, default: 300 366 | Maximum number of iterations of the k-means algorithm for a 367 | single run. 368 | 369 | tol : float, default: 1e-4 370 | Relative tolerance with regards to inertia to declare convergence 371 | 372 | precompute_distances : {'auto', True, False} 373 | Precompute distances (faster but takes more memory). 374 | 375 | 'auto' : do not precompute distances if n_samples * n_clusters > 12 376 | million. This corresponds to about 100MB overhead per job using 377 | double precision. 378 | 379 | True : always precompute distances 380 | 381 | False : never precompute distances 382 | 383 | verbose : int, default 0 384 | Verbosity mode. 385 | 386 | random_state : int, RandomState instance or None, optional, default: None 387 | If int, random_state is the seed used by the random number generator; 388 | If RandomState instance, random_state is the random number generator; 389 | If None, the random number generator is the RandomState instance used 390 | by `np.random`. 
391 | 392 | copy_x : boolean, default True 393 | When pre-computing distances it is more numerically accurate to center 394 | the data first. If copy_x is True, then the original data is not 395 | modified. If False, the original data is modified, and put back before 396 | the function returns, but small numerical differences may be introduced 397 | by subtracting and then adding the data mean. 398 | 399 | n_jobs : int 400 | The number of jobs to use for the computation. This works by computing 401 | each of the n_init runs in parallel. 402 | 403 | If -1 all CPUs are used. If 1 is given, no parallel computing code is 404 | used at all, which is useful for debugging. For n_jobs below -1, 405 | (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one 406 | are used. 407 | 408 | algorithm : "auto", "full" or "elkan", default="auto" 409 | K-means algorithm to use. The classical EM-style algorithm is "full". 410 | The "elkan" variation is more efficient by using the triangle 411 | inequality, but currently doesn't support sparse data. "auto" chooses 412 | "elkan" for dense data and "full" for sparse data. 413 | 414 | Attributes 415 | ---------- 416 | cluster_centers_ : array, [n_clusters, n_features] 417 | Coordinates of cluster centers 418 | 419 | labels_ : 420 | Labels of each point 421 | 422 | inertia_ : float 423 | Sum of distances of samples to their closest cluster center. 424 | 425 | Examples 426 | -------- 427 | 428 | >>> from sklearn.cluster import KMeans 429 | >>> import numpy as np 430 | >>> X = np.array([[1, 2], [1, 4], [1, 0], 431 | ... [4, 2], [4, 4], [4, 0]]) 432 | >>> kmeans = KMeans(n_clusters=2, random_state=0).fit(X) 433 | >>> kmeans.labels_ 434 | array([0, 0, 0, 1, 1, 1], dtype=int32) 435 | >>> kmeans.predict([[0, 0], [4, 4]]) 436 | array([0, 1], dtype=int32) 437 | >>> kmeans.cluster_centers_ 438 | array([[ 1., 2.], 439 | [ 4., 2.]]) 440 | 441 | See also 442 | -------- 443 | 444 | MiniBatchKMeans 445 | Alternative online implementation that does incremental updates 446 | of the centers positions using mini-batches. 447 | For large scale learning (say n_samples > 10k) MiniBatchKMeans is 448 | probably much faster than the default batch implementation. 449 | 450 | Notes 451 | ------ 452 | The k-means problem is solved using Lloyd's algorithm. 453 | 454 | The average complexity is given by O(k n T), were n is the number of 455 | samples and T is the number of iteration. 456 | 457 | The worst case complexity is given by O(n^(k+2/p)) with 458 | n = n_samples, p = n_features. (D. Arthur and S. Vassilvitskii, 459 | 'How slow is the k-means method?' SoCG2006) 460 | 461 | In practice, the k-means algorithm is very fast (one of the fastest 462 | clustering algorithms available), but it falls in local minima. That's why 463 | it can be useful to restart it several times. 
464 | 465 | """ 466 | 467 | def __init__(self, n_clusters=8, init='k-means++', n_init=10, 468 | max_iter=300, tol=1e-4, precompute_distances='auto', 469 | verbose=0, random_state=None, copy_x=True, 470 | n_jobs=1, algorithm='auto'): 471 | 472 | self.n_clusters = n_clusters 473 | self.init = init 474 | self.max_iter = max_iter 475 | self.tol = tol 476 | self.precompute_distances = precompute_distances 477 | self.n_init = n_init 478 | self.verbose = verbose 479 | self.random_state = random_state 480 | self.copy_x = copy_x 481 | self.n_jobs = n_jobs 482 | self.algorithm = algorithm 483 | 484 | def _check_fit_data(self, X): 485 | """Verify that the number of samples given is larger than k""" 486 | X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32]) 487 | if X.shape[0] < self.n_clusters: 488 | raise ValueError("n_samples=%d should be >= n_clusters=%d" % ( 489 | X.shape[0], self.n_clusters)) 490 | return X 491 | 492 | def _check_test_data(self, X): 493 | X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES) 494 | n_samples, n_features = X.shape 495 | expected_n_features = self.cluster_centers_.shape[1] 496 | if not n_features == expected_n_features: 497 | raise ValueError("Incorrect number of features. " 498 | "Got %d features, expected %d" % ( 499 | n_features, expected_n_features)) 500 | 501 | return X 502 | 503 | def fit(self, X, y=None): 504 | """Compute k-means clustering. 505 | 506 | Parameters 507 | ---------- 508 | X : array-like or sparse matrix, shape=(n_samples, n_features) 509 | Training instances to cluster. 510 | """ 511 | # Added to remove scikit-learn internal dependencies 512 | raise NotImplementedError 513 | 514 | def fit_predict(self, X, y=None): 515 | """Compute cluster centers and predict cluster index for each sample. 516 | 517 | Convenience method; equivalent to calling fit(X) followed by 518 | predict(X). 519 | 520 | Parameters 521 | ---------- 522 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 523 | New data to transform. 524 | 525 | Returns 526 | ------- 527 | labels : array, shape [n_samples,] 528 | Index of the cluster each sample belongs to. 529 | """ 530 | return self.fit(X).labels_ 531 | 532 | def fit_transform(self, X, y=None): 533 | """Compute clustering and transform X to cluster-distance space. 534 | 535 | Equivalent to fit(X).transform(X), but more efficiently implemented. 536 | 537 | Parameters 538 | ---------- 539 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 540 | New data to transform. 541 | 542 | Returns 543 | ------- 544 | X_new : array, shape [n_samples, k] 545 | X transformed in the new space. 546 | """ 547 | # Currently, this just skips a copy of the data if it is not in 548 | # np.array or CSR format already. 549 | # XXX This skips _check_test_data, which may change the dtype; 550 | # we should refactor the input validation. 551 | X = self._check_fit_data(X) 552 | return self.fit(X)._transform(X) 553 | 554 | def transform(self, X): 555 | """Transform X to a cluster-distance space. 556 | 557 | In the new space, each dimension is the distance to the cluster 558 | centers. Note that even if X is sparse, the array returned by 559 | `transform` will typically be dense. 560 | 561 | Parameters 562 | ---------- 563 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 564 | New data to transform. 565 | 566 | Returns 567 | ------- 568 | X_new : array, shape [n_samples, k] 569 | X transformed in the new space.
570 | """ 571 | check_is_fitted(self, 'cluster_centers_') 572 | 573 | X = self._check_test_data(X) 574 | return self._transform(X) 575 | 576 | def _transform(self, X): 577 | """guts of transform method; no input validation""" 578 | return euclidean_distances(X, self.cluster_centers_) 579 | 580 | def predict(self, X): 581 | """Predict the closest cluster each sample in X belongs to. 582 | 583 | In the vector quantization literature, `cluster_centers_` is called 584 | the code book and each value returned by `predict` is the index of 585 | the closest code in the code book. 586 | 587 | Parameters 588 | ---------- 589 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 590 | New data to predict. 591 | 592 | Returns 593 | ------- 594 | labels : array, shape [n_samples,] 595 | Index of the cluster each sample belongs to. 596 | """ 597 | check_is_fitted(self, 'cluster_centers_') 598 | 599 | X = self._check_test_data(X) 600 | x_squared_norms = row_norms(X, squared=True) 601 | return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0] 602 | 603 | def score(self, X, y=None): 604 | """Opposite of the value of X on the K-means objective. 605 | 606 | Parameters 607 | ---------- 608 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 609 | New data. 610 | 611 | Returns 612 | ------- 613 | score : float 614 | Opposite of the value of X on the K-means objective. 615 | """ 616 | check_is_fitted(self, 'cluster_centers_') 617 | 618 | X = self._check_test_data(X) 619 | x_squared_norms = row_norms(X, squared=True) 620 | return -_labels_inertia(X, x_squared_norms, self.cluster_centers_)[1] 621 | 622 | 623 | -------------------------------------------------------------------------------- /size_constrained_clustering/k_means_constrained/k_means_constrained_.py: -------------------------------------------------------------------------------- 1 | """K-means clustering""" 2 | 3 | # Authors: Josh Levy-Kramer 4 | # Gael Varoquaux 5 | # Thomas Rueckstiess 6 | # James Bergstra 7 | # Jan Schlueter 8 | # Nelle Varoquaux 9 | # Peter Prettenhofer 10 | # Olivier Grisel 11 | # Mathieu Blondel 12 | # Robert Layton 13 | # License: BSD 3 clause 14 | 15 | import warnings 16 | import numpy as np 17 | import scipy.sparse as sp 18 | from scipy.spatial.distance import cdist 19 | 20 | import sys 21 | import os 22 | folderpath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 23 | sys.path.append(folderpath) 24 | from sklearn_import.metrics.pairwise import euclidean_distances 25 | from sklearn_import.utils.extmath import row_norms, squared_norm, cartesian 26 | from sklearn_import.utils.validation import check_array, check_random_state, as_float_array 27 | 28 | from joblib import Parallel 29 | from joblib import delayed 30 | 31 | # Internal scikit learn methods imported into this project 32 | from sklearn_import.cluster._k_means import _centers_dense, _centers_sparse 33 | from sklearn_import.cluster.k_means_ import _validate_center_shape, _tolerance, KMeans, _init_centroids 34 | 35 | from k_means_constrained.mincostflow_vectorized import SimpleMinCostFlowVectorized 36 | 37 | 38 | def k_means_constrained(X, n_clusters, size_min=None, size_max=None, init='k-means++', 39 | distance_func=cdist, 40 | n_init=10, max_iter=300, verbose=False, 41 | tol=1e-4, random_state=None, copy_x=True, n_jobs=1, 42 | return_n_iter=False): 43 | """K-Means clustering with minimum and maximum cluster size constraints. 44 | 45 | Read more in the :ref:`User Guide `. 
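A minimal usage sketch (illustrative only: ``X`` stands for any ``(n_samples, n_features)``
    array and the bounds must be satisfiable, i.e. ``size_min * n_clusters <= n_samples``)::

        centers, labels, inertia = k_means_constrained(
            X, n_clusters=3, size_min=2, size_max=5, random_state=0)

    The ``KMeansConstrained`` class further down wraps this function in an
    estimator-style interface.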
46 | 47 | Parameters 48 | ---------- 49 | X : array-like, shape (n_samples, n_features) 50 | The observations to cluster. 51 | 52 | size_min : int, optional, default: None 53 | Constrain the label assignment so that each cluster has a minimum 54 | size of size_min. If None, no constraints will be applied. 55 | 56 | size_max : int, optional, default: None 57 | Constrain the label assignment so that each cluster has a maximum 58 | size of size_max. If None, no constraints will be applied. 59 | 60 | n_clusters : int 61 | The number of clusters to form as well as the number of 62 | centroids to generate. 63 | 64 | init : {'k-means++', 'random', or ndarray, or a callable}, optional 65 | Method for initialization, defaults to 'k-means++': 66 | 67 | 'k-means++' : selects initial cluster centers for k-means 68 | clustering in a smart way to speed up convergence. See section 69 | Notes in k_init for more details. 70 | 71 | 'random': generate k centroids from a Gaussian with mean and 72 | variance estimated from the data. 73 | 74 | If an ndarray is passed, it should be of shape (n_clusters, n_features) 75 | and gives the initial centers. 76 | 77 | If a callable is passed, it should take arguments X, k and 78 | a random state and return an initialization. 79 | 80 | n_init : int, optional, default: 10 81 | Number of times the k-means algorithm will be run with different 82 | centroid seeds. The final results will be the best output of 83 | n_init consecutive runs in terms of inertia. 84 | 85 | max_iter : int, optional, default 300 86 | Maximum number of iterations of the k-means algorithm to run. 87 | 88 | verbose : boolean, optional 89 | Verbosity mode. 90 | 91 | tol : float, optional 92 | The relative increment in the results before declaring convergence. 93 | 94 | random_state : int, RandomState instance or None, optional, default: None 95 | If int, random_state is the seed used by the random number generator; 96 | If RandomState instance, random_state is the random number generator; 97 | If None, the random number generator is the RandomState instance used 98 | by `np.random`. 99 | 100 | copy_x : boolean, optional 101 | When pre-computing distances it is more numerically accurate to center 102 | the data first. If copy_x is True, then the original data is not 103 | modified. If False, the original data is modified, and put back before 104 | the function returns, but small numerical differences may be introduced 105 | by subtracting and then adding the data mean. 106 | 107 | n_jobs : int 108 | The number of jobs to use for the computation. This works by computing 109 | each of the n_init runs in parallel. 110 | 111 | If -1 all CPUs are used. If 1 is given, no parallel computing code is 112 | used at all, which is useful for debugging. For n_jobs below -1, 113 | (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one 114 | are used. 115 | 116 | return_n_iter : bool, optional 117 | Whether or not to return the number of iterations. 118 | 119 | Returns 120 | ------- 121 | centroid : float ndarray with shape (k, n_features) 122 | Centroids found at the last iteration of k-means. 123 | 124 | label : integer ndarray with shape (n_samples,) 125 | label[i] is the code or index of the centroid the 126 | i'th observation is closest to. 127 | 128 | inertia : float 129 | The final value of the inertia criterion (sum of squared distances to 130 | the closest centroid for all observations in the training set). 131 | 132 | best_n_iter : int 133 | Number of iterations corresponding to the best results.
134 | Returned only if `return_n_iter` is set to True. 135 | 136 | """ 137 | if sp.issparse(X): 138 | raise NotImplementedError("Not implemented for sparse X") 139 | 140 | if n_init <= 0: 141 | raise ValueError("Invalid number of initializations." 142 | " n_init=%d must be bigger than zero." % n_init) 143 | random_state = check_random_state(random_state) 144 | 145 | if max_iter <= 0: 146 | raise ValueError('Number of iterations should be a positive number,' 147 | ' got %d instead' % max_iter) 148 | 149 | X = as_float_array(X, copy=copy_x) 150 | tol = _tolerance(X, tol) 151 | 152 | # Validate init array 153 | if hasattr(init, '__array__'): 154 | init = check_array(init, dtype=X.dtype.type, copy=True) 155 | _validate_center_shape(X, n_clusters, init) 156 | 157 | if n_init != 1: 158 | warnings.warn( 159 | 'Explicit initial center position passed: ' 160 | 'performing only one init in k-means instead of n_init=%d' 161 | % n_init, RuntimeWarning, stacklevel=2) 162 | n_init = 1 163 | 164 | # subtract of mean of x for more accurate distance computations 165 | if not sp.issparse(X): 166 | X_mean = X.mean(axis=0) 167 | # The copy was already done above 168 | X -= X_mean 169 | 170 | if hasattr(init, '__array__'): 171 | init -= X_mean 172 | 173 | # precompute squared norms of data points 174 | x_squared_norms = row_norms(X, squared=True) 175 | 176 | best_labels, best_inertia, best_centers = None, None, None 177 | 178 | if n_jobs == 1: 179 | # For a single thread, less memory is needed if we just store one set 180 | # of the best results (as opposed to one set per run per thread). 181 | for it in range(n_init): 182 | # run a k-means once 183 | labels, inertia, centers, n_iter_ = kmeans_constrained_single( 184 | X, n_clusters, 185 | size_min=size_min, size_max=size_max, distance_func=distance_func, 186 | max_iter=max_iter, init=init, verbose=verbose, tol=tol, 187 | x_squared_norms=x_squared_norms, random_state=random_state) 188 | # determine if these results are the best so far 189 | if best_inertia is None or inertia < best_inertia: 190 | best_labels = labels.copy() 191 | best_centers = centers.copy() 192 | best_inertia = inertia 193 | best_n_iter = n_iter_ 194 | else: 195 | # parallelisation of k-means runs 196 | seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init) 197 | results = Parallel(n_jobs=n_jobs, verbose=0)( 198 | delayed(kmeans_constrained_single)(X, n_clusters, 199 | size_min=size_min, size_max=size_max, 200 | max_iter=max_iter, init=init, distance_func=distance_func, 201 | verbose=verbose, tol=tol, 202 | x_squared_norms=x_squared_norms, 203 | # Change seed to ensure variety 204 | random_state=seed) 205 | for seed in seeds) 206 | # Get results with the lowest inertia 207 | labels, inertia, centers, n_iters = zip(*results) 208 | best = np.argmin(inertia) 209 | best_labels = labels[best] 210 | best_inertia = inertia[best] 211 | best_centers = centers[best] 212 | best_n_iter = n_iters[best] 213 | 214 | if not sp.issparse(X): 215 | if not copy_x: 216 | X += X_mean 217 | best_centers += X_mean 218 | 219 | if return_n_iter: 220 | return best_centers, best_labels, best_inertia, best_n_iter 221 | else: 222 | return best_centers, best_labels, best_inertia 223 | 224 | 225 | def kmeans_constrained_single(X, n_clusters, size_min=None, size_max=None, 226 | max_iter=300, init='k-means++', distance_func=cdist, 227 | verbose=False, x_squared_norms=None, 228 | random_state=None, tol=1e-4): 229 | """A single run of k-means constrained, assumes preparation completed prior. 
230 | 231 | Parameters 232 | ---------- 233 | X : array-like of floats, shape (n_samples, n_features) 234 | The observations to cluster. 235 | 236 | size_min : int, optional, default: None 237 | Constrain the label assignment so that each cluster has a minimum 238 | size of size_min. If None, no constraints will be applied. 239 | 240 | size_max : int, optional, default: None 241 | Constrain the label assignment so that each cluster has a maximum 242 | size of size_max. If None, no constraints will be applied. 243 | 244 | n_clusters : int 245 | The number of clusters to form as well as the number of 246 | centroids to generate. 247 | 248 | max_iter : int, optional, default 300 249 | Maximum number of iterations of the k-means algorithm to run. 250 | 251 | init : {'k-means++', 'random', or ndarray, or a callable}, optional 252 | Method for initialization, defaults to 'k-means++': 253 | 254 | 'k-means++' : selects initial cluster centers for k-means 255 | clustering in a smart way to speed up convergence. See section 256 | Notes in k_init for more details. 257 | 258 | 'random': generate k centroids from a Gaussian with mean and 259 | variance estimated from the data. 260 | 261 | If an ndarray is passed, it should be of shape (k, p) and gives 262 | the initial centers. 263 | 264 | If a callable is passed, it should take arguments X, k and 265 | a random state and return an initialization. 266 | 267 | tol : float, optional 268 | The relative increment in the results before declaring convergence. 269 | 270 | verbose : boolean, optional 271 | Verbosity mode. 272 | 273 | x_squared_norms : array 274 | Precomputed x_squared_norms. 275 | 276 | random_state : int, RandomState instance or None, optional, default: None 277 | If int, random_state is the seed used by the random number generator; 278 | If RandomState instance, random_state is the random number generator; 279 | If None, the random number generator is the RandomState instance used 280 | by `np.random`. 281 | 282 | Returns 283 | ------- 284 | centroid : float ndarray with shape (k, n_features) 285 | Centroids found at the last iteration of k-means. 286 | 287 | label : integer ndarray with shape (n_samples,) 288 | label[i] is the code or index of the centroid the 289 | i'th observation is closest to. 290 | 291 | inertia : float 292 | The final value of the inertia criterion (sum of squared distances to 293 | the closest centroid for all observations in the training set). 294 | 295 | n_iter : int 296 | Number of iterations run.
297 | """ 298 | if sp.issparse(X): 299 | raise NotImplementedError("Not implemented for sparse X") 300 | 301 | random_state = check_random_state(random_state) 302 | n_samples = X.shape[0] 303 | 304 | best_labels, best_inertia, best_centers = None, None, None 305 | # init 306 | centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms) 307 | if verbose: 308 | print("Initialization complete") 309 | 310 | # Allocate memory to store the distances for each sample to its 311 | # closer center for reallocation in case of ties 312 | distances = np.zeros(shape=(n_samples,), dtype=X.dtype) 313 | 314 | # Determine min and max sizes if non given 315 | if size_min is None: 316 | size_min = 0 317 | if size_max is None: 318 | size_max = n_samples # Number of data points 319 | 320 | # Check size min and max 321 | if not ((size_min >= 0) and (size_min <= n_samples) 322 | and (size_max >= 0) and (size_max <= n_samples)): 323 | raise ValueError("size_min and size_max must be a positive number smaller " 324 | "than the number of data points or `None`") 325 | if size_max < size_min: 326 | raise ValueError("size_max must be larger than size_min") 327 | if size_min*n_clusters > n_samples: 328 | raise ValueError("The product of size_min and n_clusters cannot exceed the number of samples (X)") 329 | 330 | # iterations 331 | for i in range(max_iter): 332 | centers_old = centers.copy() 333 | # labels assignment is also called the E-step of EM 334 | labels, inertia = \ 335 | _labels_constrained(X, centers, size_min, size_max, 336 | distances=distances, distance_func=distance_func) 337 | 338 | # computation of the means is also called the M-step of EM 339 | if sp.issparse(X): 340 | centers = _centers_sparse(X, labels, n_clusters, distances) 341 | else: 342 | centers = _centers_dense(X, labels, n_clusters, distances) 343 | 344 | if verbose: 345 | print("Iteration %2d, inertia %.3f" % (i, inertia)) 346 | 347 | if best_inertia is None or inertia < best_inertia: 348 | best_labels = labels.copy() 349 | best_centers = centers.copy() 350 | best_inertia = inertia 351 | 352 | center_shift_total = squared_norm(centers_old - centers) 353 | if center_shift_total <= tol: 354 | if verbose: 355 | print("Converged at iteration %d: " 356 | "center shift %e within tolerance %e" 357 | % (i, center_shift_total, tol)) 358 | break 359 | 360 | if center_shift_total > 0: 361 | # rerun E-step in case of non-convergence so that predicted labels 362 | # match cluster centers 363 | best_labels, best_inertia = \ 364 | _labels_constrained(X, centers, size_min, size_max, 365 | distances=distances, distance_func=distance_func) 366 | 367 | return best_labels, best_inertia, best_centers, i + 1 368 | 369 | 370 | def _labels_constrained(X, centers, size_min, size_max, distances, distance_func=cdist): 371 | """Compute labels using the min and max cluster size constraint 372 | 373 | This will overwrite the 'distances' array in-place. 374 | 375 | Parameters 376 | ---------- 377 | X : numpy array, shape (n_sample, n_features) 378 | Input data. 379 | 380 | size_min : int 381 | Minimum size for each cluster 382 | 383 | size_max : int 384 | Maximum size for each cluster 385 | 386 | centers : numpy array, shape (n_clusters, n_features) 387 | Cluster centers which data is assigned to. 388 | 389 | distances : numpy array, shape (n_samples,) 390 | Pre-allocated array in which distances are stored. 
391 | 392 | Returns 393 | ------- 394 | labels : numpy array, dtype=np.int, shape (n_samples,) 395 | Indices of clusters that samples are assigned to. 396 | 397 | inertia : float 398 | Sum of squared distances of samples to their closest cluster center. 399 | 400 | """ 401 | C = centers 402 | 403 | # Distances to each centre C. (the `distances` parameter is the distance to the closest centre) 404 | # The original k-means uses squared distances, but this is equivalent for constrained k-means 405 | # D = euclidean_distances(X, C, squared=False) 406 | D = distance_func(X, C) 407 | 408 | edges, costs, capacities, supplies, n_C, n_X = minimum_cost_flow_problem_graph(X, C, D, size_min, size_max) 409 | labels = solve_min_cost_flow_graph(edges, costs, capacities, supplies, n_C, n_X) 410 | 411 | # cython k-means M step code assumes int32 inputs 412 | labels = labels.astype(np.int32) 413 | 414 | # Change distances in-place 415 | distances[:] = D[np.arange(D.shape[0]), labels]**2 # Square for M step of EM 416 | inertia = distances.sum() 417 | 418 | return labels, inertia 419 | 420 | 421 | def minimum_cost_flow_problem_graph(X, C, D, size_min, size_max): 422 | 423 | # Set up the minimum cost flow formulation graph 424 | # Vertex indices: 425 | # X-nodes: [0, n(x)-1], C-nodes: [n(X), n(X)+n(C)-1], C-dummy nodes:[n(X)+n(C), n(X)+2*n(C)-1], 426 | # Artificial node: [n(X)+2*n(C), n(X)+2*n(C)+1-1] 427 | 428 | # Create indices of nodes 429 | n_X = X.shape[0] 430 | n_C = C.shape[0] 431 | X_ix = np.arange(n_X) 432 | C_dummy_ix = np.arange(X_ix[-1] + 1, X_ix[-1] + 1 + n_C) 433 | C_ix = np.arange(C_dummy_ix[-1] + 1, C_dummy_ix[-1] + 1 + n_C) 434 | art_ix = C_ix[-1] + 1 435 | 436 | # Edges 437 | edges_X_C_dummy = cartesian([X_ix, C_dummy_ix]) # All X's connect to all C dummy nodes (C') 438 | edges_C_dummy_C = np.stack([C_dummy_ix, C_ix], axis=1) # Each C' connects to a corresponding C (centroid) 439 | edges_C_art = np.stack([C_ix, art_ix * np.ones(n_C)], axis=1) # All C connect to artificial node 440 | 441 | edges = np.concatenate([edges_X_C_dummy, edges_C_dummy_C, edges_C_art]) 442 | 443 | # Costs 444 | costs_X_C_dummy = D.reshape(D.size) 445 | costs = np.concatenate([costs_X_C_dummy, np.zeros(edges.shape[0] - len(costs_X_C_dummy))]) 446 | 447 | # Capacities - the C' -> C arcs enforce the maximum cluster size (size_max) 448 | capacities_C_dummy_C = size_max * np.ones(n_C) 449 | cap_non = n_X # Equals the total supply and therefore won't restrict flow 450 | capacities = np.concatenate([ 451 | np.ones(edges_X_C_dummy.shape[0]), 452 | capacities_C_dummy_C, 453 | cap_non * np.ones(n_C) 454 | ]) 455 | 456 | # Sources and sinks 457 | supplies_X = np.ones(n_X) 458 | supplies_C = -1 * size_min * np.ones(n_C) # Demand node 459 | supplies_art = -1 * (n_X - n_C*size_min) # Demand node 460 | supplies = np.concatenate([ 461 | supplies_X, 462 | np.zeros(n_C), # C_dummies 463 | supplies_C, 464 | [supplies_art] 465 | ]) 466 | 467 | # All arrays must be of int dtype for `SimpleMinCostFlow` 468 | edges = edges.astype('int32') 469 | costs = np.around(costs*1000, 0).astype('int32') # Multiply by 1000 to keep three decimal places of precision 470 | capacities = capacities.astype('int32') 471 | supplies = supplies.astype('int32') 472 | 473 | return edges, costs, capacities, supplies, n_C, n_X 474 | 475 | 476 | def solve_min_cost_flow_graph(edges, costs, capacities, supplies, n_C, n_X): 477 | 478 | # Instantiate a SimpleMinCostFlow solver.
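    # How the assignment is decoded after solving (see the code below): every X-node
    # supplies one unit of flow and each X -> C' arc has capacity 1, so the optimal
    # flow on the first n_X * n_C arcs forms a 0/1 assignment matrix; reshaping it to
    # (n_X, n_C) and taking argmax along axis 1 yields each sample's cluster label.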
479 | min_cost_flow = SimpleMinCostFlowVectorized() 480 | 481 | if (edges.dtype != 'int32') or (costs.dtype != 'int32') \ 482 | or (capacities.dtype != 'int32') or (supplies.dtype != 'int32'): 483 | raise ValueError("`edges`, `costs`, `capacities`, `supplies` must all be int dtype") 484 | 485 | N_edges = edges.shape[0] 486 | N_nodes = len(supplies) 487 | 488 | # Add each edge with associated capacities and cost 489 | min_cost_flow.AddArcWithCapacityAndUnitCostVectorized(edges[:,0], edges[:,1], capacities, costs) 490 | 491 | # Add node supplies 492 | min_cost_flow.SetNodeSupplyVectorized(np.arange(N_nodes, dtype='int32'), supplies) 493 | 494 | # Solve for the minimum cost flow over the whole graph 495 | if min_cost_flow.Solve() != min_cost_flow.OPTIMAL: 496 | raise Exception('There was an issue with the min cost flow input.') 497 | 498 | # Assignment 499 | labels_M = min_cost_flow.FlowVectorized(np.arange(n_X * n_C, dtype='int32')).reshape(n_X, n_C) 500 | 501 | labels = labels_M.argmax(axis=1) 502 | return labels 503 | 504 | 505 | class KMeansConstrained(KMeans): 506 | """K-Means clustering with minimum and maximum cluster size constraints 507 | 508 | Parameters 509 | ---------- 510 | 511 | n_clusters : int, optional, default: 8 512 | The number of clusters to form as well as the number of 513 | centroids to generate. 514 | 515 | size_min : int, optional, default: None 516 | Constrain the label assignment so that each cluster has a minimum 517 | size of size_min. If None, no constraints will be applied. 518 | 519 | size_max : int, optional, default: None 520 | Constrain the label assignment so that each cluster has a maximum 521 | size of size_max. If None, no constraints will be applied. 522 | 523 | init : {'k-means++', 'random' or an ndarray} 524 | Method for initialization, defaults to 'k-means++': 525 | 526 | 'k-means++' : selects initial cluster centers for k-means 527 | clustering in a smart way to speed up convergence. See section 528 | Notes in k_init for more details. 529 | 530 | 'random': choose k observations (rows) at random from data for 531 | the initial centroids. 532 | 533 | If an ndarray is passed, it should be of shape (n_clusters, n_features) 534 | and gives the initial centers. 535 | 536 | n_init : int, default: 10 537 | Number of times the k-means algorithm will be run with different 538 | centroid seeds. The final results will be the best output of 539 | n_init consecutive runs in terms of inertia. 540 | 541 | max_iter : int, default: 300 542 | Maximum number of iterations of the k-means algorithm for a 543 | single run. 544 | 545 | tol : float, default: 1e-4 546 | Relative tolerance with regards to inertia to declare convergence 547 | 548 | verbose : int, default 0 549 | Verbosity mode. 550 | 551 | random_state : int, RandomState instance or None, optional, default: None 552 | If int, random_state is the seed used by the random number generator; 553 | If RandomState instance, random_state is the random number generator; 554 | If None, the random number generator is the RandomState instance used 555 | by `np.random`. 556 | 557 | copy_x : boolean, default True 558 | When pre-computing distances it is more numerically accurate to center 559 | the data first. If copy_x is True, then the original data is not 560 | modified. If False, the original data is modified, and put back before 561 | the function returns, but small numerical differences may be introduced 562 | by subtracting and then adding the data mean.
563 | 564 | n_jobs : int 565 | The number of jobs to use for the computation. This works by computing 566 | each of the n_init runs in parallel. 567 | 568 | If -1 all CPUs are used. If 1 is given, no parallel computing code is 569 | used at all, which is useful for debugging. For n_jobs below -1, 570 | (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one 571 | are used. 572 | 573 | Attributes 574 | ---------- 575 | cluster_centers_ : array, [n_clusters, n_features] 576 | Coordinates of cluster centers 577 | 578 | labels_ : 579 | Labels of each point 580 | 581 | inertia_ : float 582 | Sum of squared distances of samples to their closest cluster center. 583 | 584 | Examples 585 | -------- 586 | 587 | >>> from k_means_constrained import KMeansConstrained 588 | >>> import numpy as np 589 | >>> X = np.array([[1, 2], [1, 4], [1, 0], 590 | ... [4, 2], [4, 4], [4, 0]]) 591 | >>> clf = KMeansConstrained(n_clusters=2, size_min=2, size_max=5, random_state=0).fit(X) 592 | >>> clf.labels_ 593 | array([0, 0, 0, 1, 1, 1], dtype=int32) 594 | >>> clf.predict([[0, 0], [4, 4]]) 595 | array([0, 1], dtype=int32) 596 | >>> clf.cluster_centers_ 597 | array([[ 1., 2.], 598 | [ 4., 2.]]) 599 | 600 | Notes 601 | ------ 602 | K-means problem constrained with a minimum and/or maximum size for each cluster. 603 | 604 | The constrained assignment is formulated as a Minimum Cost Flow (MCF) linear network optimisation 605 | problem. This is then solved using a cost-scaling push-relabel algorithm. The implementation used is 606 | Google's Operations Research tools's `SimpleMinCostFlow`. 607 | 608 | Ref: 609 | 1. Bradley, P. S., K. P. Bennett, and Ayhan Demiriz. "Constrained k-means clustering." 610 | Microsoft Research, Redmond (2000): 1-8. 611 | 2. Google's SimpleMinCostFlow implementation: 612 | https://github.com/google/or-tools/blob/master/ortools/graph/min_cost_flow.h 613 | """ 614 | 615 | def __init__(self, n_clusters=8, size_min=None, size_max=None, distance_func=cdist, 616 | init='k-means++', n_init=10, max_iter=300, tol=1e-4, 617 | verbose=False, random_state=None, copy_x=True, n_jobs=1): 618 | 619 | self.size_min = size_min 620 | self.size_max = size_max 621 | 622 | super().__init__(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol, 623 | verbose=verbose, random_state=random_state, copy_x=copy_x, n_jobs=n_jobs) 624 | self.distance_func = distance_func 625 | 626 | def fit(self, X, y=None): 627 | """Compute k-means clustering. 628 | 629 | Parameters 630 | ---------- 631 | X : array-like, shape=(n_samples, n_features) 632 | Training instances to cluster. 
633 | 634 | y : Ignored 635 | 636 | """ 637 | if sp.issparse(X): 638 | raise NotImplementedError("Not implemented for sparse X") 639 | 640 | random_state = check_random_state(self.random_state) 641 | X = self._check_fit_data(X) 642 | 643 | self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \ 644 | k_means_constrained( 645 | X, n_clusters=self.n_clusters, 646 | size_min=self.size_min, size_max=self.size_max, 647 | init=self.init, 648 | distance_func=self.distance_func, 649 | n_init=self.n_init, max_iter=self.max_iter, verbose=self.verbose, 650 | tol=self.tol, random_state=random_state, copy_x=self.copy_x, 651 | n_jobs=self.n_jobs, 652 | return_n_iter=True) 653 | return self 654 | -------------------------------------------------------------------------------- /size_constrained_clustering/sklearn_import/metrics/pairwise.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import warnings 3 | from functools import partial 4 | 5 | import numpy as np 6 | from scipy.sparse import issparse, csr_matrix 7 | from scipy.spatial import distance 8 | from joblib import cpu_count, delayed, Parallel 9 | 10 | from sklearn_import.metrics.pairwise_fast import _sparse_manhattan 11 | 12 | from sklearn_import.preprocessing.data import normalize 13 | 14 | from sklearn_import.utils import gen_batches, gen_even_slices 15 | 16 | from sklearn_import.utils.validation import check_array 17 | from sklearn_import.utils.extmath import row_norms, safe_sparse_dot 18 | 19 | 20 | def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, 21 | X_norm_squared=None): 22 | """ 23 | Considering the rows of X (and Y=X) as vectors, compute the 24 | distance matrix between each pair of vectors. 25 | 26 | For efficiency reasons, the euclidean distance between a pair of row 27 | vector x and y is computed as:: 28 | 29 | dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) 30 | 31 | This formulation has two advantages over other ways of computing distances. 32 | First, it is computationally efficient when dealing with sparse data. 33 | Second, if one argument varies but the other remains unchanged, then 34 | `dot(x, x)` and/or `dot(y, y)` can be pre-computed. 35 | 36 | However, this is not the most precise way of doing this computation, and 37 | the distance matrix returned by this function may not be exactly 38 | symmetric as required by, e.g., ``scipy.spatial.distance`` functions. 39 | 40 | Read more in the :ref:`User Guide `. 41 | 42 | Parameters 43 | ---------- 44 | X : {array-like, sparse matrix}, shape (n_samples_1, n_features) 45 | 46 | Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) 47 | 48 | Y_norm_squared : array-like, shape (n_samples_2, ), optional 49 | Pre-computed dot-products of vectors in Y (e.g., 50 | ``(Y**2).sum(axis=1)``) 51 | 52 | squared : boolean, optional 53 | Return squared Euclidean distances. 54 | 55 | X_norm_squared : array-like, shape = [n_samples_1], optional 56 | Pre-computed dot-products of vectors in X (e.g., 57 | ``(X**2).sum(axis=1)``) 58 | 59 | Returns 60 | ------- 61 | distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) 62 | 63 | Examples 64 | -------- 65 | >>> from sklearn.metrics.pairwise import euclidean_distances 66 | >>> X = [[0, 1], [1, 1]] 67 | >>> # distance between rows of X 68 | >>> euclidean_distances(X, X) 69 | array([[ 0., 1.], 70 | [ 1., 0.]]) 71 | >>> # get distance to origin 72 | >>> euclidean_distances(X, [[0, 0]]) 73 | array([[ 1. 
], 74 | [ 1.41421356]]) 75 | 76 | See also 77 | -------- 78 | paired_distances : distances betweens pairs of elements of X and Y. 79 | """ 80 | X, Y = check_pairwise_arrays(X, Y) 81 | 82 | if X_norm_squared is not None: 83 | XX = check_array(X_norm_squared) 84 | if XX.shape == (1, X.shape[0]): 85 | XX = XX.T 86 | elif XX.shape != (X.shape[0], 1): 87 | raise ValueError( 88 | "Incompatible dimensions for X and X_norm_squared") 89 | else: 90 | XX = row_norms(X, squared=True)[:, np.newaxis] 91 | 92 | if X is Y: # shortcut in the common case euclidean_distances(X, X) 93 | YY = XX.T 94 | elif Y_norm_squared is not None: 95 | YY = np.atleast_2d(Y_norm_squared) 96 | 97 | if YY.shape != (1, Y.shape[0]): 98 | raise ValueError( 99 | "Incompatible dimensions for Y and Y_norm_squared") 100 | else: 101 | YY = row_norms(Y, squared=True)[np.newaxis, :] 102 | 103 | distances = safe_sparse_dot(X, Y.T, dense_output=True) 104 | distances *= -2 105 | distances += XX 106 | distances += YY 107 | np.maximum(distances, 0, out=distances) 108 | 109 | if X is Y: 110 | # Ensure that distances between vectors and themselves are set to 0.0. 111 | # This may not be the case due to floating point rounding errors. 112 | distances.flat[::distances.shape[0] + 1] = 0.0 113 | 114 | return distances if squared else np.sqrt(distances, out=distances) 115 | 116 | 117 | def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", 118 | batch_size=500, metric_kwargs=None): 119 | """Compute minimum distances between one point and a set of points. 120 | 121 | This function computes for each row in X, the index of the row of Y which 122 | is closest (according to the specified distance). The minimal distances are 123 | also returned. 124 | 125 | This is mostly equivalent to calling: 126 | 127 | (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis), 128 | pairwise_distances(X, Y=Y, metric=metric).min(axis=axis)) 129 | 130 | but uses much less memory, and is faster for large arrays. 131 | 132 | Parameters 133 | ---------- 134 | X : {array-like, sparse matrix}, shape (n_samples1, n_features) 135 | Array containing points. 136 | 137 | Y : {array-like, sparse matrix}, shape (n_samples2, n_features) 138 | Arrays containing points. 139 | 140 | axis : int, optional, default 1 141 | Axis along which the argmin and distances are to be computed. 142 | 143 | metric : string or callable, default 'euclidean' 144 | metric to use for distance computation. Any metric from scikit-learn 145 | or scipy.spatial.distance can be used. 146 | 147 | If metric is a callable function, it is called on each 148 | pair of instances (rows) and the resulting value recorded. The callable 149 | should take two arrays as input and return one value indicating the 150 | distance between them. This works for Scipy's metrics, but is less 151 | efficient than passing the metric name as a string. 152 | 153 | Distance matrices are not supported. 154 | 155 | Valid values for metric are: 156 | 157 | - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 158 | 'manhattan'] 159 | 160 | - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 161 | 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 162 | 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 163 | 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 164 | 'sqeuclidean', 'yule'] 165 | 166 | See the documentation for scipy.spatial.distance for details on these 167 | metrics. 
168 | 169 | batch_size : integer 170 | To reduce memory consumption over the naive solution, data are 171 | processed in batches, comprising batch_size rows of X and 172 | batch_size rows of Y. The default value is quite conservative, but 173 | can be changed for fine-tuning. The larger the number, the larger the 174 | memory usage. 175 | 176 | metric_kwargs : dict, optional 177 | Keyword arguments to pass to specified metric function. 178 | 179 | Returns 180 | ------- 181 | argmin : numpy.ndarray 182 | Y[argmin[i], :] is the row in Y that is closest to X[i, :]. 183 | 184 | distances : numpy.ndarray 185 | distances[i] is the distance between the i-th row in X and the 186 | argmin[i]-th row in Y. 187 | 188 | See also 189 | -------- 190 | sklearn.metrics.pairwise_distances 191 | sklearn.metrics.pairwise_distances_argmin 192 | """ 193 | dist_func = None 194 | if metric in PAIRWISE_DISTANCE_FUNCTIONS: 195 | dist_func = PAIRWISE_DISTANCE_FUNCTIONS[metric] 196 | elif not callable(metric) and not isinstance(metric, str): 197 | raise ValueError("'metric' must be a string or a callable") 198 | 199 | X, Y = check_pairwise_arrays(X, Y) 200 | 201 | if metric_kwargs is None: 202 | metric_kwargs = {} 203 | 204 | if axis == 0: 205 | X, Y = Y, X 206 | 207 | # Allocate output arrays 208 | indices = np.empty(X.shape[0], dtype=np.intp) 209 | values = np.empty(X.shape[0]) 210 | values.fill(np.infty) 211 | 212 | for chunk_x in gen_batches(X.shape[0], batch_size): 213 | X_chunk = X[chunk_x, :] 214 | 215 | for chunk_y in gen_batches(Y.shape[0], batch_size): 216 | Y_chunk = Y[chunk_y, :] 217 | 218 | if dist_func is not None: 219 | if metric == 'euclidean': # special case, for speed 220 | d_chunk = safe_sparse_dot(X_chunk, Y_chunk.T, 221 | dense_output=True) 222 | d_chunk *= -2 223 | d_chunk += row_norms(X_chunk, squared=True)[:, np.newaxis] 224 | d_chunk += row_norms(Y_chunk, squared=True)[np.newaxis, :] 225 | np.maximum(d_chunk, 0, d_chunk) 226 | else: 227 | d_chunk = dist_func(X_chunk, Y_chunk, **metric_kwargs) 228 | else: 229 | d_chunk = pairwise_distances(X_chunk, Y_chunk, 230 | metric=metric, **metric_kwargs) 231 | 232 | # Update indices and minimum values using chunk 233 | min_indices = d_chunk.argmin(axis=1) 234 | min_values = d_chunk[np.arange(chunk_x.stop - chunk_x.start), 235 | min_indices] 236 | 237 | flags = values[chunk_x] > min_values 238 | indices[chunk_x][flags] = min_indices[flags] + chunk_y.start 239 | values[chunk_x][flags] = min_values[flags] 240 | 241 | if metric == "euclidean" and not metric_kwargs.get("squared", False): 242 | np.sqrt(values, values) 243 | return indices, values 244 | 245 | 246 | def check_pairwise_arrays(X, Y, precomputed=False, dtype=None): 247 | """ Set X and Y appropriately and checks inputs 248 | 249 | If Y is None, it is set as a pointer to X (i.e. not a copy). 250 | If Y is given, this does not happen. 251 | All distance metrics should use this function first to assert that the 252 | given parameters are correct and safe to use. 253 | 254 | Specifically, this function first ensures that both X and Y are arrays, 255 | then checks that they are at least two dimensional while ensuring that 256 | their elements are floats (or dtype if provided). Finally, the function 257 | checks that the size of the second dimension of the two arrays is equal, or 258 | the equivalent check for a precomputed distance matrix. 
259 | 260 | Parameters 261 | ---------- 262 | X : {array-like, sparse matrix}, shape (n_samples_a, n_features) 263 | 264 | Y : {array-like, sparse matrix}, shape (n_samples_b, n_features) 265 | 266 | precomputed : bool 267 | True if X is to be treated as precomputed distances to the samples in 268 | Y. 269 | 270 | dtype : string, type, list of types or None (default=None) 271 | Data type required for X and Y. If None, the dtype will be an 272 | appropriate float type selected by _return_float_dtype. 273 | 274 | .. versionadded:: 0.18 275 | 276 | Returns 277 | ------- 278 | safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features) 279 | An array equal to X, guaranteed to be a numpy array. 280 | 281 | safe_Y : {array-like, sparse matrix}, shape (n_samples_b, n_features) 282 | An array equal to Y if Y was not None, guaranteed to be a numpy array. 283 | If Y was None, safe_Y will be a pointer to X. 284 | 285 | """ 286 | X, Y, dtype_float = _return_float_dtype(X, Y) 287 | 288 | warn_on_dtype = dtype is not None 289 | estimator = 'check_pairwise_arrays' 290 | if dtype is None: 291 | dtype = dtype_float 292 | 293 | if Y is X or Y is None: 294 | X = Y = check_array(X, accept_sparse='csr', dtype=dtype, 295 | warn_on_dtype=warn_on_dtype, estimator=estimator) 296 | else: 297 | X = check_array(X, accept_sparse='csr', dtype=dtype, 298 | warn_on_dtype=warn_on_dtype, estimator=estimator) 299 | Y = check_array(Y, accept_sparse='csr', dtype=dtype, 300 | warn_on_dtype=warn_on_dtype, estimator=estimator) 301 | 302 | if precomputed: 303 | if X.shape[1] != Y.shape[0]: 304 | raise ValueError("Precomputed metric requires shape " 305 | "(n_queries, n_indexed). Got (%d, %d) " 306 | "for %d indexed." % 307 | (X.shape[0], X.shape[1], Y.shape[0])) 308 | elif X.shape[1] != Y.shape[1]: 309 | raise ValueError("Incompatible dimension for X and Y matrices: " 310 | "X.shape[1] == %d while Y.shape[1] == %d" % ( 311 | X.shape[1], Y.shape[1])) 312 | 313 | return X, Y 314 | 315 | 316 | def manhattan_distances(X, Y=None, sum_over_features=True, 317 | size_threshold=None): 318 | """ Compute the L1 distances between the vectors in X and Y. 319 | 320 | With sum_over_features equal to False it returns the componentwise 321 | distances. 322 | 323 | Read more in the :ref:`User Guide `. 324 | 325 | Parameters 326 | ---------- 327 | X : array_like 328 | An array with shape (n_samples_X, n_features). 329 | 330 | Y : array_like, optional 331 | An array with shape (n_samples_Y, n_features). 332 | 333 | sum_over_features : bool, default=True 334 | If True the function returns the pairwise distance matrix 335 | else it returns the componentwise L1 pairwise-distances. 336 | Not supported for sparse matrix inputs. 337 | 338 | size_threshold : int, default=5e8 339 | Unused parameter. 340 | 341 | Returns 342 | ------- 343 | D : array 344 | If sum_over_features is False shape is 345 | (n_samples_X * n_samples_Y, n_features) and D contains the 346 | componentwise L1 pairwise-distances (ie. absolute difference), 347 | else shape is (n_samples_X, n_samples_Y) and D contains 348 | the pairwise L1 distances. 
349 | 350 | Examples 351 | -------- 352 | >>> from sklearn.metrics.pairwise import manhattan_distances 353 | >>> manhattan_distances([[3]], [[3]])#doctest:+ELLIPSIS 354 | array([[ 0.]]) 355 | >>> manhattan_distances([[3]], [[2]])#doctest:+ELLIPSIS 356 | array([[ 1.]]) 357 | >>> manhattan_distances([[2]], [[3]])#doctest:+ELLIPSIS 358 | array([[ 1.]]) 359 | >>> manhattan_distances([[1, 2], [3, 4]],\ 360 | [[1, 2], [0, 3]])#doctest:+ELLIPSIS 361 | array([[ 0., 2.], 362 | [ 4., 4.]]) 363 | >>> import numpy as np 364 | >>> X = np.ones((1, 2)) 365 | >>> y = 2 * np.ones((2, 2)) 366 | >>> manhattan_distances(X, y, sum_over_features=False)#doctest:+ELLIPSIS 367 | array([[ 1., 1.], 368 | [ 1., 1.]]...) 369 | """ 370 | if size_threshold is not None: 371 | warnings.warn('Use of the "size_threshold" is deprecated ' 372 | 'in 0.19 and it will be removed version ' 373 | '0.21 of scikit-learn', DeprecationWarning) 374 | X, Y = check_pairwise_arrays(X, Y) 375 | 376 | if issparse(X) or issparse(Y): 377 | if not sum_over_features: 378 | raise TypeError("sum_over_features=%r not supported" 379 | " for sparse matrices" % sum_over_features) 380 | 381 | X = csr_matrix(X, copy=False) 382 | Y = csr_matrix(Y, copy=False) 383 | D = np.zeros((X.shape[0], Y.shape[0])) 384 | _sparse_manhattan(X.data, X.indices, X.indptr, 385 | Y.data, Y.indices, Y.indptr, 386 | X.shape[1], D) 387 | return D 388 | 389 | if sum_over_features: 390 | return distance.cdist(X, Y, 'cityblock') 391 | 392 | D = X[:, np.newaxis, :] - Y[np.newaxis, :, :] 393 | D = np.abs(D, D) 394 | return D.reshape((-1, X.shape[1])) 395 | 396 | 397 | def cosine_distances(X, Y=None): 398 | """Compute cosine distance between samples in X and Y. 399 | 400 | Cosine distance is defined as 1.0 minus the cosine similarity. 401 | 402 | Read more in the :ref:`User Guide `. 403 | 404 | Parameters 405 | ---------- 406 | X : array_like, sparse matrix 407 | with shape (n_samples_X, n_features). 408 | 409 | Y : array_like, sparse matrix (optional) 410 | with shape (n_samples_Y, n_features). 411 | 412 | Returns 413 | ------- 414 | distance matrix : array 415 | An array with shape (n_samples_X, n_samples_Y). 416 | 417 | See also 418 | -------- 419 | sklearn.metrics.pairwise.cosine_similarity 420 | scipy.spatial.distance.cosine (dense matrices only) 421 | """ 422 | # 1.0 - cosine_similarity(X, Y) without copy 423 | S = cosine_similarity(X, Y) 424 | S *= -1 425 | S += 1 426 | np.clip(S, 0, 2, out=S) 427 | if X is Y or Y is None: 428 | # Ensure that distances between vectors and themselves are set to 0.0. 429 | # This may not be the case due to floating point rounding errors. 430 | S[np.diag_indices_from(S)] = 0.0 431 | return S 432 | 433 | 434 | PAIRWISE_DISTANCE_FUNCTIONS = { 435 | # If updating this dictionary, update the doc in both distance_metrics() 436 | # and also in pairwise_distances()! 437 | 'cityblock': manhattan_distances, 438 | 'cosine': cosine_distances, 439 | 'euclidean': euclidean_distances, 440 | 'l2': euclidean_distances, 441 | 'l1': manhattan_distances, 442 | 'manhattan': manhattan_distances, 443 | 'precomputed': None, # HACK: precomputed is always allowed, never called 444 | } 445 | 446 | 447 | def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): 448 | """ Compute the distance matrix from a vector array X and optional Y. 449 | 450 | This method takes either a vector array or a distance matrix, and returns 451 | a distance matrix. If the input is a vector array, the distances are 452 | computed. 
If the input is a distances matrix, it is returned instead. 453 | 454 | This method provides a safe way to take a distance matrix as input, while 455 | preserving compatibility with many other algorithms that take a vector 456 | array. 457 | 458 | If Y is given (default is None), then the returned matrix is the pairwise 459 | distance between the arrays from both X and Y. 460 | 461 | Valid values for metric are: 462 | 463 | - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 464 | 'manhattan']. These metrics support sparse matrix inputs. 465 | 466 | - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 467 | 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 468 | 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 469 | 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] 470 | See the documentation for scipy.spatial.distance for details on these 471 | metrics. These metrics do not support sparse matrix inputs. 472 | 473 | Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are 474 | valid scipy.spatial.distance metrics), the scikit-learn implementation 475 | will be used, which is faster and has support for sparse matrices (except 476 | for 'cityblock'). For a verbose description of the metrics from 477 | scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics 478 | function. 479 | 480 | Read more in the :ref:`User Guide `. 481 | 482 | Parameters 483 | ---------- 484 | X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \ 485 | [n_samples_a, n_features] otherwise 486 | Array of pairwise distances between samples, or a feature array. 487 | 488 | Y : array [n_samples_b, n_features], optional 489 | An optional second feature array. Only allowed if metric != "precomputed". 490 | 491 | metric : string, or callable 492 | The metric to use when calculating distance between instances in a 493 | feature array. If metric is a string, it must be one of the options 494 | allowed by scipy.spatial.distance.pdist for its metric parameter, or 495 | a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. 496 | If metric is "precomputed", X is assumed to be a distance matrix. 497 | Alternatively, if metric is a callable function, it is called on each 498 | pair of instances (rows) and the resulting value recorded. The callable 499 | should take two arrays from X as input and return a value indicating 500 | the distance between them. 501 | 502 | n_jobs : int 503 | The number of jobs to use for the computation. This works by breaking 504 | down the pairwise matrix into n_jobs even slices and computing them in 505 | parallel. 506 | 507 | If -1 all CPUs are used. If 1 is given, no parallel computing code is 508 | used at all, which is useful for debugging. For n_jobs below -1, 509 | (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one 510 | are used. 511 | 512 | **kwds : optional keyword parameters 513 | Any further parameters are passed directly to the distance function. 514 | If using a scipy.spatial.distance metric, the parameters are still 515 | metric dependent. See the scipy docs for usage examples. 516 | 517 | Returns 518 | ------- 519 | D : array [n_samples_a, n_samples_a] or [n_samples_a, n_samples_b] 520 | A distance matrix D such that D_{i, j} is the distance between the 521 | ith and jth vectors of the given matrix X, if Y is None. 522 | If Y is not None, then D_{i, j} is the distance between the ith array 523 | from X and the jth array from Y. 
524 | 525 | """ 526 | if (metric not in _VALID_METRICS and 527 | not callable(metric) and metric != "precomputed"): 528 | raise ValueError("Unknown metric %s. " 529 | "Valid metrics are %s, or 'precomputed', or a " 530 | "callable" % (metric, _VALID_METRICS)) 531 | 532 | if metric == "precomputed": 533 | X, _ = check_pairwise_arrays(X, Y, precomputed=True) 534 | return X 535 | elif metric in PAIRWISE_DISTANCE_FUNCTIONS: 536 | func = PAIRWISE_DISTANCE_FUNCTIONS[metric] 537 | elif callable(metric): 538 | func = partial(_pairwise_callable, metric=metric, **kwds) 539 | else: 540 | if issparse(X) or issparse(Y): 541 | raise TypeError("scipy distance metrics do not" 542 | " support sparse matrices.") 543 | 544 | dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None 545 | 546 | X, Y = check_pairwise_arrays(X, Y, dtype=dtype) 547 | 548 | if n_jobs == 1 and X is Y: 549 | return distance.squareform(distance.pdist(X, metric=metric, 550 | **kwds)) 551 | func = partial(distance.cdist, metric=metric, **kwds) 552 | 553 | return _parallel_pairwise(X, Y, func, n_jobs, **kwds) 554 | 555 | 556 | def _return_float_dtype(X, Y): 557 | """ 558 | 1. If dtype of X and Y is float32, then dtype float32 is returned. 559 | 2. Else dtype float is returned. 560 | """ 561 | if not issparse(X) and not isinstance(X, np.ndarray): 562 | X = np.asarray(X) 563 | 564 | if Y is None: 565 | Y_dtype = X.dtype 566 | elif not issparse(Y) and not isinstance(Y, np.ndarray): 567 | Y = np.asarray(Y) 568 | Y_dtype = Y.dtype 569 | else: 570 | Y_dtype = Y.dtype 571 | 572 | if X.dtype == Y_dtype == np.float32: 573 | dtype = np.float32 574 | else: 575 | dtype = np.float 576 | 577 | return X, Y, dtype 578 | 579 | 580 | def _parallel_pairwise(X, Y, func, n_jobs, **kwds): 581 | """Break the pairwise matrix in n_jobs even slices 582 | and compute them in parallel""" 583 | if n_jobs < 0: 584 | n_jobs = max(cpu_count() + 1 + n_jobs, 1) 585 | 586 | if Y is None: 587 | Y = X 588 | 589 | if n_jobs == 1: 590 | # Special case to avoid picklability checks in delayed 591 | return func(X, Y, **kwds) 592 | 593 | # TODO: in some cases, backend='threading' may be appropriate 594 | fd = delayed(func) 595 | ret = Parallel(n_jobs=n_jobs, verbose=0)( 596 | fd(X, Y[s], **kwds) 597 | for s in gen_even_slices(Y.shape[0], n_jobs)) 598 | 599 | return np.hstack(ret) 600 | 601 | 602 | def _pairwise_callable(X, Y, metric, **kwds): 603 | """Handle the callable case for pairwise_{distances,kernels} 604 | """ 605 | X, Y = check_pairwise_arrays(X, Y) 606 | 607 | if X is Y: 608 | # Only calculate metric for upper triangle 609 | out = np.zeros((X.shape[0], Y.shape[0]), dtype='float') 610 | iterator = itertools.combinations(range(X.shape[0]), 2) 611 | for i, j in iterator: 612 | out[i, j] = metric(X[i], Y[j], **kwds) 613 | 614 | # Make symmetric 615 | # NB: out += out.T will produce incorrect results 616 | out = out + out.T 617 | 618 | # Calculate diagonal 619 | # NB: nonzero diagonals are allowed for both metrics and kernels 620 | for i in range(X.shape[0]): 621 | x = X[i] 622 | out[i, i] = metric(x, x, **kwds) 623 | 624 | else: 625 | # Calculate all cells 626 | out = np.empty((X.shape[0], Y.shape[0]), dtype='float') 627 | iterator = itertools.product(range(X.shape[0]), range(Y.shape[0])) 628 | for i, j in iterator: 629 | out[i, j] = metric(X[i], Y[j], **kwds) 630 | 631 | return out 632 | 633 | 634 | PAIRWISE_BOOLEAN_FUNCTIONS = [ 635 | 'dice', 636 | 'jaccard', 637 | 'kulsinski', 638 | 'matching', 639 | 'rogerstanimoto', 640 | 'russellrao', 641 | 'sokalmichener', 642 
|     'sokalsneath',
643 |     'yule',
644 | ]
645 | 
646 | 
647 | def cosine_similarity(X, Y=None, dense_output=True):
648 |     """Compute cosine similarity between samples in X and Y.
649 | 
650 |     Cosine similarity, or the cosine kernel, computes similarity as the
651 |     normalized dot product of X and Y:
652 | 
653 |         K(X, Y) = <X, Y> / (||X||*||Y||)
654 | 
655 |     On L2-normalized data, this function is equivalent to linear_kernel.
656 | 
657 |     Read more in the :ref:`User Guide `.
658 | 
659 |     Parameters
660 |     ----------
661 |     X : ndarray or sparse array, shape: (n_samples_X, n_features)
662 |         Input data.
663 | 
664 |     Y : ndarray or sparse array, shape: (n_samples_Y, n_features)
665 |         Input data. If ``None``, the output will be the pairwise
666 |         similarities between all samples in ``X``.
667 | 
668 |     dense_output : boolean (optional), default True
669 |         Whether to return dense output even when the input is sparse. If
670 |         ``False``, the output is sparse if both input arrays are sparse.
671 | 
672 |         .. versionadded:: 0.17
673 |            parameter ``dense_output`` for dense output.
674 | 
675 |     Returns
676 |     -------
677 |     kernel matrix : array
678 |         An array with shape (n_samples_X, n_samples_Y).
679 |     """
680 |     # to avoid recursive import
681 | 
682 |     X, Y = check_pairwise_arrays(X, Y)
683 | 
684 |     X_normalized = normalize(X, copy=True)
685 |     if X is Y:
686 |         Y_normalized = X_normalized
687 |     else:
688 |         Y_normalized = normalize(Y, copy=True)
689 | 
690 |     K = safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output)
691 | 
692 |     return K
693 | 
694 | 
695 | _VALID_METRICS = ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock',
696 |                   'braycurtis', 'canberra', 'chebyshev', 'correlation',
697 |                   'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski',
698 |                   'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
699 |                   'russellrao', 'seuclidean', 'sokalmichener',
700 |                   'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"]
701 | 
--------------------------------------------------------------------------------
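The Notes section of `KMeansConstrained` above formulates the size-constrained assignment as a Minimum Cost Flow problem solved with Google OR-Tools' `SimpleMinCostFlow`. The sketch below is only an illustration of that idea on a toy dataset: it uses the plain scalar OR-Tools API rather than the vectorized Cython wrapper used by the package, and it fixes each cluster to an exact size, whereas the package's graph also has to honour a `size_min`/`size_max` range. The data, the `PRECISION` scaling factor and all variable names are made up here, and the `ortools.graph.pywrapgraph` import assumes the classic OR-Tools Python API.

import numpy as np
from scipy.spatial.distance import cdist
from ortools.graph import pywrapgraph  # classic API; newer OR-Tools releases expose ortools.graph.python.min_cost_flow instead

X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])   # 4 samples
C = np.array([[0.0, 0.0], [5.0, 5.0]])                           # 2 tentative centres
SIZE_EXACT = 2                                                    # toy constraint: each cluster receives exactly 2 points
PRECISION = 1000                                                  # SimpleMinCostFlow works on integer costs

costs = (cdist(X, C) * PRECISION).astype('int32')
n_X, n_C = costs.shape

mcf = pywrapgraph.SimpleMinCostFlow()
for i in range(n_X):
    mcf.SetNodeSupply(i, 1)                                       # every sample ships one unit of flow
    for j in range(n_C):
        # one arc per (sample, cluster) pair: capacity 1, cost = scaled distance
        mcf.AddArcWithCapacityAndUnitCost(i, n_X + j, 1, int(costs[i, j]))
for j in range(n_C):
    mcf.SetNodeSupply(n_X + j, -SIZE_EXACT)                       # each cluster absorbs exactly SIZE_EXACT units

assert mcf.Solve() == mcf.OPTIMAL
labels = np.empty(n_X, dtype='int32')
for arc in range(mcf.NumArcs()):
    if mcf.Flow(arc) > 0:
        labels[mcf.Tail(arc)] = mcf.Head(arc) - n_X
print(labels)   # -> [0 0 1 1]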
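The vendored `euclidean_distances` and the `metric == 'euclidean'` fast path inside `pairwise_distances_argmin_min` both rely on the expansion documented above, `dist(x, y)**2 = dot(x, x) - 2 * dot(x, y) + dot(y, y)`, so that a single matrix product replaces explicit pairwise differences. A self-contained NumPy sketch of that trick, checked against SciPy; the helper name `squared_euclidean` is made up here and is not part of the package.

import numpy as np
from scipy.spatial.distance import cdist

def squared_euclidean(X, Y):
    XX = (X * X).sum(axis=1)[:, np.newaxis]    # ||x||^2, shape (n_x, 1)
    YY = (Y * Y).sum(axis=1)[np.newaxis, :]    # ||y||^2, shape (1, n_y)
    D = XX - 2.0 * (X @ Y.T) + YY              # one matrix product instead of explicit differences
    np.maximum(D, 0, out=D)                    # clip tiny negatives caused by rounding
    return D

rng = np.random.RandomState(0)
X, Y = rng.rand(5, 3), rng.rand(4, 3)
assert np.allclose(np.sqrt(squared_euclidean(X, Y)), cdist(X, Y))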
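`pairwise_distances_argmin_min` documents that it matches `pairwise_distances(X, Y).argmin(axis)` / `.min(axis)` while scanning `Y` in `batch_size` blocks to bound memory. The following simplified sketch shows that batching idea only; it is not the vendored implementation, and `argmin_min_chunked` plus the tiny `batch_size` are for illustration.

import numpy as np
from scipy.spatial.distance import cdist

def argmin_min_chunked(X, Y, batch_size=2):
    best_idx = np.zeros(len(X), dtype=np.intp)
    best_val = np.full(len(X), np.inf)
    for start in range(0, len(Y), batch_size):
        D = cdist(X, Y[start:start + batch_size])    # distances to one block of Y
        idx = D.argmin(axis=1)
        val = D[np.arange(len(X)), idx]
        better = val < best_val                      # keep the best candidate seen so far
        best_idx[better] = idx[better] + start
        best_val[better] = val[better]
    return best_idx, best_val

rng = np.random.RandomState(1)
X, Y = rng.rand(6, 3), rng.rand(5, 3)
idx, val = argmin_min_chunked(X, Y)
D_full = cdist(X, Y)
assert np.array_equal(idx, D_full.argmin(axis=1))
assert np.allclose(val, D_full.min(axis=1))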