├── jqmcvi ├── __init__.py ├── basec.pyx └── base.py ├── tests ├── __init__.py ├── ps.pkl ├── ccs.pkl ├── lbls.pkl └── cvi_dev.py ├── theory.pdf ├── .gitignore ├── setup.py ├── LICENSE └── README.md /jqmcvi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /theory.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jqmviegas/jqm_cvi/HEAD/theory.pdf -------------------------------------------------------------------------------- /tests/ps.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jqmviegas/jqm_cvi/HEAD/tests/ps.pkl -------------------------------------------------------------------------------- /tests/ccs.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jqmviegas/jqm_cvi/HEAD/tests/ccs.pkl -------------------------------------------------------------------------------- /tests/lbls.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jqmviegas/jqm_cvi/HEAD/tests/lbls.pkl -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/lib.win-amd64-3.5/jqmcvi/__init__.py 2 | build/lib.win-amd64-3.5/jqmcvi/base.py -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """A setuptools based setup module. 
2 | See: 3 | https://packaging.python.org/en/latest/distributing.html 4 | https://github.com/pypa/sampleproject 5 | """ 6 | 7 | from distutils.core import setup 8 | from distutils.extension import Extension 9 | from Cython.Distutils import build_ext 10 | import numpy 11 | 12 | setup( 13 | name="jqmcvi", 14 | 15 | version="1.0", 16 | 17 | author="Joaquim L. Viegas", 18 | author_email = "jqmviegas@gmail.com", 19 | 20 | license="MIT", 21 | 22 | cmdclass = {'build_ext': build_ext}, 23 | ext_modules = [ 24 | Extension("jqmcvi.basec", ["jqmcvi/basec.pyx"], 25 | include_dirs=[numpy.get_include()]), 26 | ], 27 | packages=["jqmcvi"] 28 | ) 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Joaquim L. Viegas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE -------------------------------------------------------------------------------- /tests/cvi_dev.py: -------------------------------------------------------------------------------- 1 | #! D:\Anaconda3 2 | # -*- coding: utf-8 -*- 3 | 4 | __author__ = "Joaquim Viegas" 5 | 6 | #============================================================================== 7 | # Description 8 | #============================================================================== 9 | 10 | import jqmcvi.basec as jqmcvi 11 | import jqmcvi.base as jqmcvin 12 | 13 | import pandas as pd 14 | import numpy as np 15 | import pickle 16 | from timeit import timeit 17 | from sklearn.metrics import silhouette_score as Sil 18 | 19 | if __name__ == "__main__": 20 | ccs = pickle.load(open('ccs.pkl', 'rb')) 21 | ps = pickle.load(open('ps.pkl', 'rb')) 22 | lbls = pickle.load(open('lbls.pkl', 'rb')) 23 | 24 | cps = [] 25 | for i in range(0, len(ccs)): 26 | cps.append([]) 27 | 28 | i = 0 29 | for lbl in lbls: 30 | cps[lbl].append(ps[i]) 31 | i +=1 32 | 33 | # cps[0] = cps[0][0:500] 34 | # cps[1] = cps[1][0:500] 35 | cps[0] = np.array(cps[0]) 36 | cps[1] = np.array(cps[1]) 37 | 38 | print(timeit("Sil(ps, lbls, metric='euclidean')", setup="from __main__ import Sil, ps, lbls", number=1)) 39 | print(timeit("jqmcvi.dunn(cps)", setup="from __main__ import jqmcvi, cps", number=1)) 40 | print(timeit("jqmcvin.dunn_fast(ps, lbls)", setup="from __main__ import jqmcvin, ps, lbls", number=1)) 41 | print(timeit("jqmcvi.davisbouldin(cps, ccs)", setup="from __main__ import jqmcvi, cps, ccs", number=1)) 42 | print(timeit("jqmcvin.davisbouldin(cps, ccs)", setup="from __main__ import jqmcvin, cps, ccs", number=1)) 43 | 44 | print(Sil(ps, lbls, 
metric='euclidean')) 45 | print(jqmcvi.dunn(cps)) 46 | print(jqmcvin.dunn_fast(ps, lbls)) 47 | print(jqmcvi.davisbouldin(cps, ccs)) 48 | print(jqmcvin.davisbouldin(cps, ccs)) 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | jqmcvi 2 | ======= 3 | 4 | Small module with Cluster Validity Indices (CVI) 5 | ------------------------------------------------ 6 | 7 | The Dunn and Davies-Bouldin indices are implemented. They follow the equations presented in theory.pdf. 8 | 9 | > base.py : Python + NumPy 10 | > 11 | > basec.pyx : Python + NumPy optimized with Cython 12 | > 13 | > basec.pyx tested in Windows 8.1 x64, Python 3.4 and compiled with VS2010 (python setup.py build_ext -i) 14 | 15 | Functions: 16 | ---------- 17 | 18 | **dunn(k_list)**: 19 | > Slow implementation of Dunn index that depends on numpy 20 | > 21 | > -- basec.pyx Cython implementation is much faster but slower than dunn_fast() 22 | 23 | ```python 24 | """ Dunn index [CVI] 25 | 26 | Parameters 27 | ---------- 28 | k_list : list of np.arrays 29 | A list containing a numpy array for each cluster |c| = number of clusters 30 | c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension) 31 | """ 32 | ``` 33 | 34 | **dunn_fast(points, labels)**: 35 | > Fast implementation of Dunn index that depends on numpy and sklearn.metrics.pairwise 36 | > 37 | > -- No Cython implementation 38 | 39 | ```python 40 | """ Dunn index - FAST (using sklearn pairwise euclidean_distance function) 41 | 42 | Parameters 43 | ---------- 44 | points : np.array 45 | np.array([N, p]) of all points 46 | labels: np.array 47 | np.array([N]) labels of all points 48 | """ 49 | ``` 50 | 51 | **davisbouldin(k_list, k_centers)**: 52 | > Implementation of Davies-Bouldin index that depends on numpy 53 | > 54 | > -- basec.pyx Cython implementation is much faster 55 | 56 | ```python 57 | """ Davis Bouldin
Index 58 | 59 | Parameters 60 | ---------- 61 | k_list : list of np.arrays 62 | A list containing a numpy array for each cluster |c| = number of clusters 63 | c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension) 64 | k_centers : np.array 65 | The array of the cluster centers (prototypes) of type np.array([K, p]) 66 | """ 67 | ``` 68 | 69 | Installation 70 | 71 | > python setup.py install 72 | > 73 | -------------------------------------------------------------------------------- /jqmcvi/basec.pyx: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #cython: language_level=3, boundscheck=False 3 | __author__ = "Joaquim Viegas" 4 | 5 | """ JQM_CV - Cython implementations of Dunn and Davies-Bouldin clustering validity indices 6 | 7 | dunn(k_list): 8 | Slow implementation of Dunn index that depends on numpy 9 | -- basec.pyx Cython implementation is much faster but slower than dunn_fast() 10 | dunn_fast(points, labels): 11 | Fast implementation of Dunn index that depends on numpy and sklearn.metrics.pairwise 12 | -- No Cython implementation 13 | davisbouldin(k_list, k_centers): 14 | Implementation of Davies-Bouldin index that depends on numpy 15 | -- basec.pyx Cython implementation is much faster 16 | """ 17 | 18 | import numpy as np 19 | cimport numpy as np 20 | from cython.view cimport array as cvarray 21 | from sklearn.metrics.pairwise import euclidean_distances 22 | 23 | cdef double d_euc(double[:] arr1, double[:] arr2): 24 | """ Euclidean distance 25 | ...
26 | """ 27 | cdef double total = 0 28 | 29 | for i in range(arr1.shape[0]): 30 | total += (arr1[i] - arr2[i])*(arr1[i] - arr2[i]) 31 | 32 | return total**.5 33 | 34 | cdef double dunn_delta(double [:, :] ck, double [:, :] cl): 35 | cdef: 36 | double [:, :] values = np.zeros([ck.shape[0], cl.shape[0]], dtype=np.float64) 37 | Py_ssize_t i, j 38 | 39 | for i in range(ck.shape[0]): 40 | for j in range(cl.shape[0]): 41 | values[i, j] = d_euc(ck[i], cl[j]) 42 | 43 | return np.min(values) 44 | 45 | cdef double dunn_big_delta(double [:, :] ci): 46 | cdef: 47 | double [:,:] values = np.zeros([ci.shape[0], ci.shape[0]], dtype=np.float64) 48 | Py_ssize_t i, j 49 | 50 | for i in range(ci.shape[0]): 51 | for j in range(ci.shape[0]): 52 | values[i, j] = d_euc(ci[i], ci[j]) 53 | 54 | return np.max(values) 55 | 56 | def dunn(k_list): 57 | """ Dunn index [CVI] 58 | 59 | Parameters 60 | ---------- 61 | k_list : list of np.arrays 62 | A list containing a numpy array for each cluster |c| = number of clusters 63 | c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension) 64 | """ 65 | cdef: 66 | Py_ssize_t len_k_list = len(k_list) 67 | double [:,:] deltas = np.ones([len_k_list, len_k_list], dtype=np.float64)*100000 68 | double [:,:] big_deltas = np.zeros([len_k_list, 1], dtype=np.float64) 69 | Py_ssize_t k, l 70 | 71 | for k in range(0, len_k_list): 72 | for l in range(0, k): 73 | deltas[k, l] = dunn_delta(k_list[k], k_list[l]) 74 | for l in range(k+1, len_k_list): 75 | deltas[k, l] = dunn_delta(k_list[k], k_list[l]) 76 | 77 | big_deltas[k] = dunn_big_delta(k_list[k]) 78 | res = np.min(deltas)/np.max(big_deltas)*1 79 | return res 80 | 81 | cdef double big_s(double [:, :] x, double [:] center): 82 | cdef: 83 | Py_ssize_t len_x = x.shape[0] 84 | double total = 0 85 | 86 | for i in range(len_x): 87 | total += d_euc(x[i], center) 88 | 89 | return total/len_x 90 | 91 | def davisbouldin(k_list, k_centers): 92 | """ Davis Bouldin Index 93 | 94 | Parameters 95 | 
---------- 96 | k_list : list of np.arrays 97 | A list containing a numpy array for each cluster |c| = number of clusters 98 | c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension) 99 | k_centers : np.array 100 | The array of the cluster centers (prototypes) of type np.array([K, p]) 101 | """ 102 | cdef: 103 | Py_ssize_t len_k_list = len(k_list) 104 | Py_ssize_t k, j 105 | double [:] big_ss = np.zeros([len_k_list], dtype=np.float64) 106 | double [:, :] d_eucs = np.zeros([len_k_list, len_k_list], dtype=np.float64) 107 | double db = 0 108 | double [:] values = np.zeros([len_k_list-1], dtype=np.float64) 109 | 110 | for k in range(len_k_list): 111 | big_ss[k] = big_s(k_list[k], k_centers[k]) 112 | 113 | for k in range(len_k_list): 114 | for l in range(0, len_k_list): 115 | d_eucs[k, l] = d_euc(k_centers[k], k_centers[l]) 116 | 117 | for k in range(len_k_list): 118 | for l in range(0, k): 119 | values[l] = (big_ss[k] + big_ss[l])/d_eucs[k, l] 120 | for l in range(k+1, len_k_list): 121 | values[l-1] = (big_ss[k] + big_ss[l])/d_eucs[k, l] 122 | 123 | db += np.max(values) 124 | res = db/len_k_list 125 | return res 126 | -------------------------------------------------------------------------------- /jqmcvi/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | __author__ = "Joaquim Viegas" 3 | 4 | """ JQM_CV - Python implementations of Dunn and Davies-Bouldin clustering validity indices 5 | 6 | dunn(k_list): 7 | Slow implementation of Dunn index that depends on numpy 8 | -- basec.pyx Cython implementation is much faster but slower than dunn_fast() 9 | dunn_fast(points, labels): 10 | Fast implementation of Dunn index that depends on numpy and sklearn.metrics.pairwise 11 | -- No Cython implementation 12 | davisbouldin(k_list, k_centers): 13 | Implementation of Davies-Bouldin index that depends on numpy 14 | -- basec.pyx Cython implementation is much faster 15 | """ 16 | 17 | import numpy
import numpy as np

try:
    from sklearn.metrics.pairwise import euclidean_distances
except ImportError:  # scikit-learn is optional; only dunn_fast() benefits from it
    euclidean_distances = None


def _pairwise_euclidean(points):
    """Return the full (N, N) Euclidean distance matrix of *points*.

    Delegates to scikit-learn's ``euclidean_distances`` when it could be
    imported; otherwise falls back to NumPy, computing one row at a time so
    that no (N, N, p) temporary is ever allocated.
    """
    if euclidean_distances is not None:
        return euclidean_distances(points)
    points = np.asarray(points, dtype=float)
    return np.array([np.linalg.norm(points - point, axis=1) for point in points])


def delta(ck, cl):
    """Return the smallest Euclidean distance between a point of *ck* and a point of *cl*.

    Parameters
    ----------
    ck, cl : array-like of shape (N, p)
        The samples of two clusters.

    Raises
    ------
    ValueError
        If either cluster is empty.
    """
    ck = np.asarray(ck, dtype=float)
    cl = np.asarray(cl, dtype=float)
    # One vectorised row per point of ck instead of a Python loop per pair.
    nearest = [np.linalg.norm(cl - point, axis=1).min() for point in ck]
    return np.min(nearest)


def big_delta(ci):
    """Return the diameter of cluster *ci* (largest pairwise Euclidean distance).

    Parameters
    ----------
    ci : array-like of shape (N, p)
        The samples of one cluster.

    Raises
    ------
    ValueError
        If the cluster is empty.
    """
    ci = np.asarray(ci, dtype=float)
    farthest = [np.linalg.norm(ci - point, axis=1).max() for point in ci]
    return np.max(farthest)


def dunn(k_list):
    """ Dunn index [CVI]

    Ratio of the smallest inter-cluster distance to the largest cluster
    diameter.

    Parameters
    ----------
    k_list : list of np.arrays
        A list containing a numpy array for each cluster |c| = number of clusters
        c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)

    Returns
    -------
    float
        The Dunn index. With a single cluster there is no inter-cluster
        distance, so the result is ``inf``.

    Raises
    ------
    ValueError
        If *k_list* is empty or contains an empty cluster.
    """
    clusters = [np.asarray(cluster, dtype=float) for cluster in k_list]
    n_clusters = len(clusters)

    # inf (not a finite magic number) so that unset cells can never win the
    # minimum, however far apart the clusters really are.
    separations = np.full((n_clusters, n_clusters), np.inf)
    diameters = np.zeros(n_clusters)

    for k, cluster in enumerate(clusters):
        diameters[k] = big_delta(cluster)
        # delta() is symmetric, so only the upper triangle is needed.
        for other in range(k + 1, n_clusters):
            separations[k, other] = delta(cluster, clusters[other])

    return np.min(separations) / np.max(diameters)


def delta_fast(ck, cl, distances):
    """Return the smallest distance between two clusters using a precomputed matrix.

    Zero distances are kept: two different clusters that share a coordinate
    really are at distance 0, which is also what ``delta`` reports.

    Parameters
    ----------
    ck, cl : array-like of bool, shape (N,)
        Membership masks of the two clusters.
    distances : array-like of shape (N, N)
        Pairwise distances between all points.

    Raises
    ------
    ValueError
        If either mask selects no point.
    """
    rows = np.asarray(ck, dtype=bool)
    cols = np.asarray(cl, dtype=bool)
    return np.min(np.asarray(distances)[np.ix_(rows, cols)])


def big_delta_fast(ci, distances):
    """Return the diameter of a cluster using a precomputed distance matrix.

    Parameters
    ----------
    ci : array-like of bool, shape (N,)
        Membership mask of the cluster.
    distances : array-like of shape (N, N)
        Pairwise distances between all points.

    Raises
    ------
    ValueError
        If the mask selects no point.
    """
    members = np.asarray(ci, dtype=bool)
    return np.max(np.asarray(distances)[np.ix_(members, members)])


def dunn_fast(points, labels):
    """ Dunn index - FAST (using sklearn pairwise euclidean_distance function)

    Parameters
    ----------
    points : np.array
        np.array([N, p]) of all points
    labels: np.array
        np.array([N]) labels of all points (any array-like is accepted)

    Returns
    -------
    float
        The Dunn index; ``inf`` when only one distinct label is present.

    Raises
    ------
    ValueError
        If *labels* is empty.
    """
    # Without this, a plain list would make `labels == value` a scalar.
    labels = np.asarray(labels).ravel()
    distances = _pairwise_euclidean(points)
    masks = [labels == value for value in np.unique(labels)]
    n_clusters = len(masks)

    separations = np.full((n_clusters, n_clusters), np.inf)
    diameters = np.zeros(n_clusters)

    for k, mask in enumerate(masks):
        diameters[k] = big_delta_fast(mask, distances)
        for other in range(k + 1, n_clusters):
            separations[k, other] = delta_fast(mask, masks[other], distances)

    return np.min(separations) / np.max(diameters)


def big_s(x, center):
    """Return the mean Euclidean distance from the samples in *x* to *center*.

    Parameters
    ----------
    x : array-like of shape (N, p)
        The samples of one cluster.
    center : array-like of shape (p,)
        The cluster prototype.

    Raises
    ------
    ZeroDivisionError
        If *x* is empty.
    """
    samples = np.asarray(x, dtype=float)
    n_samples = len(samples)
    if n_samples == 0:
        raise ZeroDivisionError("big_s() requires a non-empty cluster")
    offsets = samples - np.asarray(center, dtype=float)
    return np.linalg.norm(offsets, axis=1).sum() / n_samples


def davisbouldin(k_list, k_centers):
    """ Davis Bouldin Index

    Average, over clusters, of the worst ratio between the summed
    within-cluster scatters and the distance between the two centers.

    Parameters
    ----------
    k_list : list of np.arrays
        A list containing a numpy array for each cluster |c| = number of clusters
        c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
    k_centers : np.array
        The array of the cluster centers (prototypes) of type np.array([K, p])

    Returns
    -------
    float
        The index value (lower means better separated clusters).

    Raises
    ------
    ValueError
        If fewer than two clusters are given (the index is undefined).
    """
    n_clusters = len(k_list)
    if n_clusters < 2:
        raise ValueError("davisbouldin() needs at least two clusters")

    centers = np.asarray(k_centers, dtype=float)
    scatters = np.array(
        [big_s(cluster, centers[k]) for k, cluster in enumerate(k_list)]
    )
    centers = centers[:n_clusters]

    gaps = np.linalg.norm(centers[:, None, :] - centers[None, :, :], axis=-1)
    # A cluster is never compared with itself: an infinite self-distance turns
    # the diagonal ratios into 0, which cannot beat any non-negative ratio.
    np.fill_diagonal(gaps, np.inf)

    ratios = (scatters[:, None] + scatters[None, :]) / gaps
    return np.sum(ratios.max(axis=1)) / n_clusters