├── jqmcvi
    ├── __init__.py
    ├── basec.pyx
    └── base.py
├── tests
    ├── __init__.py
    ├── ps.pkl
    ├── ccs.pkl
    ├── lbls.pkl
    └── cvi_dev.py
├── theory.pdf
├── .gitignore
├── setup.py
├── LICENSE
└── README.md


/jqmcvi/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/theory.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jqmviegas/jqm_cvi/HEAD/theory.pdf


--------------------------------------------------------------------------------
/tests/ps.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jqmviegas/jqm_cvi/HEAD/tests/ps.pkl


--------------------------------------------------------------------------------
/tests/ccs.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jqmviegas/jqm_cvi/HEAD/tests/ccs.pkl


--------------------------------------------------------------------------------
/tests/lbls.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jqmviegas/jqm_cvi/HEAD/tests/lbls.pkl


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | build/lib.win-amd64-3.5/jqmcvi/__init__.py
2 | build/lib.win-amd64-3.5/jqmcvi/base.py


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | """A setuptools based setup module.
 2 | See:
 3 | https://packaging.python.org/en/latest/distributing.html
 4 | https://github.com/pypa/sampleproject
 5 | """
 6 | 
 7 | from distutils.core import setup
 8 | from distutils.extension import Extension
 9 | from Cython.Distutils import build_ext
10 | import numpy
11 | 
12 | setup(
13 |     name="jqmcvi",
14 | 
15 |     version="1.0",
16 | 
17 |     author="Joaquim L. Viegas",
18 |     author_email = "jqmviegas@gmail.com",
19 | 
20 |     license="MIT",
21 | 
22 |     cmdclass = {'build_ext': build_ext},
23 |     ext_modules = [
24 |         Extension("jqmcvi.basec", ["jqmcvi/basec.pyx"],
25 |                   include_dirs=[numpy.get_include()]),
26 |     ],
27 |     packages=["jqmcvi"]
28 | )
29 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2016 Joaquim L. Viegas
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE


--------------------------------------------------------------------------------
/tests/cvi_dev.py:
--------------------------------------------------------------------------------
 1 | #! D:\Anaconda3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | __author__ = "Joaquim Viegas"
 5 | 
 6 | #==============================================================================
 7 | # Description
 8 | #==============================================================================
 9 | 
10 | import jqmcvi.basec as jqmcvi
11 | import jqmcvi.base as jqmcvin
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | import pickle
16 | from timeit import timeit
17 | from sklearn.metrics import silhouette_score as Sil
18 | 
19 | if __name__ == "__main__":  
20 |     ccs = pickle.load(open('ccs.pkl', 'rb'))
21 |     ps = pickle.load(open('ps.pkl', 'rb'))
22 |     lbls = pickle.load(open('lbls.pkl', 'rb'))
23 |     
24 |     cps = []
25 |     for i in range(0, len(ccs)):
26 |         cps.append([])
27 |     
28 |     i = 0
29 |     for lbl in lbls:
30 |         cps[lbl].append(ps[i])
31 |         i +=1
32 |         
33 | #    cps[0] = cps[0][0:500]
34 | #    cps[1] = cps[1][0:500]
35 |     cps[0] = np.array(cps[0])
36 |     cps[1] = np.array(cps[1])
37 |     
38 |     print(timeit("Sil(ps, lbls, metric='euclidean')", setup="from __main__ import Sil, ps, lbls", number=1))
39 |     print(timeit("jqmcvi.dunn(cps)", setup="from __main__ import jqmcvi, cps", number=1))
40 |     print(timeit("jqmcvin.dunn_fast(ps, lbls)", setup="from __main__ import jqmcvin, ps, lbls", number=1))
41 |     print(timeit("jqmcvi.davisbouldin(cps, ccs)", setup="from __main__ import jqmcvi, cps, ccs", number=1))       
42 |     print(timeit("jqmcvin.davisbouldin(cps, ccs)", setup="from __main__ import jqmcvin, cps, ccs", number=1))       
43 | 
44 |     print(Sil(ps, lbls, metric='euclidean'))
45 |     print(jqmcvi.dunn(cps))
46 |     print(jqmcvin.dunn_fast(ps, lbls))
47 |     print(jqmcvi.davisbouldin(cps, ccs))
48 |     print(jqmcvin.davisbouldin(cps, ccs))
49 |     


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | jqmcvi
 2 | =======
 3 | 
 4 | Small module with Cluster Validity Indices (CVI)
 5 | ------------------------------------------------
 6 | 
 7 | Dunn and Davies-Bouldin indices are implemented. The implementations follow the equations presented in theory.pdf.
 8 | 
 9 | > base.py : Python + NumPy
10 | >
11 | > basec.pyx : Python + NumPy optimized with Cython
12 | >
13 | > basec.pyx tested in Windows 8.1 x64, Python 3.4 and compiled with VS2010 (python setup.py build_ext -i)
14 | 
15 | Functions:
16 | ----------
17 | 
18 | **dunn(k_list)**:
19 | > Slow implementation of Dunn index that depends on numpy
20 | >
21 | > -- basec.pyx Cython implementation is much faster but slower than dunn_fast()
22 | 
23 | ```python
24 | 	""" Dunn index [CVI]
25 |     
26 |     Parameters
27 |     ----------
28 |     k_list : list of np.arrays
29 |         A list containing a numpy array for each cluster |c| = number of clusters
30 |         c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
31 |     """
32 | ```
33 | 
34 | **dunn_fast(points, labels)**:
35 | > Fast implementation of Dunn index that depends on numpy and sklearn.pairwise
36 | >
37 | > -- No Cython implementation
38 | 
39 | ```python
40 | 	""" Dunn index - FAST (using sklearn pairwise euclidean_distance function)
41 |     
42 |     Parameters
43 |     ----------
44 |     points : np.array
45 |         np.array([N, p]) of all points
46 |     labels: np.array
47 |         np.array([N]) labels of all points
48 |     """
49 | ```
50 | 
51 | **davisbouldin(k_list, k_centers)**:
52 | > Implementation of the Davies-Bouldin index that depends on numpy
53 | > 
54 | > -- basec.pyx Cython implementation is much faster
55 | 
56 | ```python
57 | 	""" Davis Bouldin Index
58 | 	
59 | 	Parameters
60 |     ----------
61 |     k_list : list of np.arrays
62 |         A list containing a numpy array for each cluster |c| = number of clusters
63 |         c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
64 |     k_centers : np.array
65 |         The array of the cluster centers (prototypes) of type np.array([K, p])
66 |     """
67 | ```
68 | 
69 | Installation 
70 | 
71 | > python setup.py install
72 | >
73 | 


--------------------------------------------------------------------------------
/jqmcvi/basec.pyx:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #cython: language_level=3, boundscheck=False
  3 | __author__ = "Joaquim Viegas"
  4 | 
  5 | """ JQM_CV - Cython implementations of Dunn and Davis Bouldin clustering validity indices
  6 | 
  7 | dunn(k_list):
  8 |     Slow implementation of Dunn index that depends on numpy
  9 |     -- basec.pyx Cython implementation is much faster but slower than dunn_fast()
 10 | dunn_fast(points, labels):
 11 |     Fast implementation of Dunn index that depends on numpy and sklearn.pairwise
 12 |     -- No Cython implementation
 13 | davisbouldin(k_list, k_centers):
 14 |     Implementation of the Davies-Bouldin index that depends on numpy
 15 |     -- basec.pyx Cython implementation is much faster
 16 | """
 17 | 
 18 | import numpy as np
 19 | cimport numpy as np
 20 | from cython.view cimport array as cvarray
 21 | from sklearn.metrics.pairwise import euclidean_distances
 22 | 
 23 | cdef double d_euc(double[:] arr1, double[:] arr2):
 24 |     """ Euclidean distance
 25 |         ...
 26 |     """
 27 |     cdef double total = 0
 28 |     
 29 |     for i in range(arr1.shape[0]):
 30 |         total += (arr1[i] - arr2[i])*(arr1[i] - arr2[i])
 31 |         
 32 |     return total**.5
 33 | 
 34 | cdef double dunn_delta(double [:, :] ck, double [:, :] cl):
 35 |     cdef: 
 36 |         double [:, :] values = np.zeros([ck.shape[0], cl.shape[0]], dtype=np.float64)
 37 |         Py_ssize_t i, j
 38 |         
 39 |     for i in range(ck.shape[0]):
 40 |         for j in range(cl.shape[0]):
 41 |             values[i, j] = d_euc(ck[i], cl[j])
 42 |             
 43 |     return np.min(values)
 44 | 
 45 | cdef double dunn_big_delta(double [:, :] ci):
 46 |     cdef: 
 47 |         double [:,:] values = np.zeros([ci.shape[0], ci.shape[0]], dtype=np.float64)
 48 |         Py_ssize_t i, j
 49 |         
 50 |     for i in range(ci.shape[0]):
 51 |         for j in range(ci.shape[0]):
 52 |             values[i, j] = d_euc(ci[i], ci[j])
 53 |     
 54 |     return np.max(values)
 55 | 
 56 | def dunn(k_list):
 57 |     """ Dunn index [CVI]
 58 |     
 59 |     Parameters
 60 |     ----------
 61 |     k_list : list of np.arrays
 62 |         A list containing a numpy array for each cluster |c| = number of clusters
 63 |         c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
 64 |     """
 65 |     cdef: 
 66 |         Py_ssize_t len_k_list = len(k_list)
 67 |         double [:,:] deltas = np.ones([len_k_list, len_k_list], dtype=np.float64)*100000
 68 |         double [:,:] big_deltas = np.zeros([len_k_list, 1], dtype=np.float64)
 69 |         Py_ssize_t k, l
 70 |         
 71 |     for k in range(0, len_k_list):
 72 |         for l in range(0, k):
 73 |             deltas[k, l] = dunn_delta(k_list[k], k_list[l])
 74 |         for l in range(k+1, len_k_list):
 75 |             deltas[k, l] = dunn_delta(k_list[k], k_list[l])
 76 |         
 77 |         big_deltas[k] = dunn_big_delta(k_list[k])
 78 |     res = np.min(deltas)/np.max(big_deltas)*1
 79 |     return res
 80 |     
 81 | cdef double big_s(double [:, :] x, double [:] center):
 82 |     cdef:
 83 |         Py_ssize_t len_x = x.shape[0]
 84 |         double total = 0
 85 |         
 86 |     for i in range(len_x):
 87 |         total += d_euc(x[i], center)    
 88 |     
 89 |     return total/len_x
 90 | 
 91 | def davisbouldin(k_list, k_centers):
 92 |     """ Davis Bouldin Index
 93 | 	
 94 | 	Parameters
 95 |     ----------
 96 |     k_list : list of np.arrays
 97 |         A list containing a numpy array for each cluster |c| = number of clusters
 98 |         c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
 99 |     k_centers : np.array
100 |         The array of the cluster centers (prototypes) of type np.array([K, p])
101 |     """
102 |     cdef: 
103 |         Py_ssize_t len_k_list = len(k_list)
104 |         Py_ssize_t k, j
105 |         double [:] big_ss = np.zeros([len_k_list], dtype=np.float64)
106 |         double [:, :] d_eucs = np.zeros([len_k_list, len_k_list], dtype=np.float64)
107 |         double db = 0
108 |         double [:] values = np.zeros([len_k_list-1], dtype=np.float64)
109 | 
110 |     for k in range(len_k_list):
111 |         big_ss[k] = big_s(k_list[k], k_centers[k])
112 | 
113 |     for k in range(len_k_list):
114 |         for l in range(0, len_k_list):
115 |             d_eucs[k, l] = d_euc(k_centers[k], k_centers[l])
116 | 
117 |     for k in range(len_k_list):
118 |         for l in range(0, k):
119 |             values[l] = (big_ss[k] + big_ss[l])/d_eucs[k, l]
120 |         for l in range(k+1, len_k_list):
121 |             values[l-1] = (big_ss[k] + big_ss[l])/d_eucs[k, l]
122 | 
123 |         db += np.max(values)
124 |     res = db/len_k_list
125 |     return res
126 |     


--------------------------------------------------------------------------------
/jqmcvi/base.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | __author__ = "Joaquim Viegas"
  3 | 
  4 | """ JQM_CV - Python implementations of Dunn and Davis Bouldin clustering validity indices
  5 | 
  6 | dunn(k_list):
  7 |     Slow implementation of Dunn index that depends on numpy
  8 |     -- basec.pyx Cython implementation is much faster but slower than dunn_fast()
  9 | dunn_fast(points, labels):
 10 |     Fast implementation of Dunn index that depends on numpy and sklearn.pairwise
 11 |     -- No Cython implementation
 12 | davisbouldin(k_list, k_centers):
 13 |     Implementation of the Davies-Bouldin index that depends on numpy
 14 |     -- basec.pyx Cython implementation is much faster
 15 | """
 16 | 
 17 | import numpy as np
 18 | from sklearn.metrics.pairwise import euclidean_distances
 19 | 
 20 | def delta(ck, cl):
 21 |     values = np.ones([len(ck), len(cl)])*10000
 22 |     
 23 |     for i in range(0, len(ck)):
 24 |         for j in range(0, len(cl)):
 25 |             values[i, j] = np.linalg.norm(ck[i]-cl[j])
 26 |             
 27 |     return np.min(values)
 28 |     
 29 | def big_delta(ci):
 30 |     values = np.zeros([len(ci), len(ci)])
 31 |     
 32 |     for i in range(0, len(ci)):
 33 |         for j in range(0, len(ci)):
 34 |             values[i, j] = np.linalg.norm(ci[i]-ci[j])
 35 |             
 36 |     return np.max(values)
 37 |     
 38 | def dunn(k_list):
 39 |     """ Dunn index [CVI]
 40 |     
 41 |     Parameters
 42 |     ----------
 43 |     k_list : list of np.arrays
 44 |         A list containing a numpy array for each cluster |c| = number of clusters
 45 |         c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
 46 |     """
 47 |     deltas = np.ones([len(k_list), len(k_list)])*1000000
 48 |     big_deltas = np.zeros([len(k_list), 1])
 49 |     l_range = list(range(0, len(k_list)))
 50 |     
 51 |     for k in l_range:
 52 |         for l in (l_range[0:k]+l_range[k+1:]):
 53 |             deltas[k, l] = delta(k_list[k], k_list[l])
 54 |         
 55 |         big_deltas[k] = big_delta(k_list[k])
 56 | 
 57 |     di = np.min(deltas)/np.max(big_deltas)
 58 |     return di
 59 | 
 60 | def delta_fast(ck, cl, distances):
 61 |     values = distances[np.where(ck)][:, np.where(cl)]
 62 |     values = values[np.nonzero(values)]
 63 | 
 64 |     return np.min(values)
 65 |     
 66 | def big_delta_fast(ci, distances):
 67 |     values = distances[np.where(ci)][:, np.where(ci)]
 68 |     #values = values[np.nonzero(values)]
 69 |             
 70 |     return np.max(values)
 71 | 
 72 | def dunn_fast(points, labels):
 73 |     """ Dunn index - FAST (using sklearn pairwise euclidean_distance function)
 74 |     
 75 |     Parameters
 76 |     ----------
 77 |     points : np.array
 78 |         np.array([N, p]) of all points
 79 |     labels: np.array
 80 |         np.array([N]) labels of all points
 81 |     """
 82 |     distances = euclidean_distances(points)
 83 |     ks = np.sort(np.unique(labels))
 84 |     
 85 |     deltas = np.ones([len(ks), len(ks)])*1000000
 86 |     big_deltas = np.zeros([len(ks), 1])
 87 |     
 88 |     l_range = list(range(0, len(ks)))
 89 |     
 90 |     for k in l_range:
 91 |         for l in (l_range[0:k]+l_range[k+1:]):
 92 |             deltas[k, l] = delta_fast((labels == ks[k]), (labels == ks[l]), distances)
 93 |         
 94 |         big_deltas[k] = big_delta_fast((labels == ks[k]), distances)
 95 | 
 96 |     di = np.min(deltas)/np.max(big_deltas)
 97 |     return di
 98 |     
 99 |     
def big_s(x, center):
    """ Mean Euclidean distance between the points of x and center

    Parameters
    ----------
    x : np.array or sequence of np.arrays
        Points of one cluster, one point per row.
    center : np.array
        Centre (prototype) of that cluster.

    Returns
    -------
    np.float64
        Sum of the point-to-centre distances divided by ``len(x)``.

    Raises
    ------
    ZeroDivisionError
        If ``x`` is empty.
    """
    # Iterate over the points directly instead of indexing through
    # range(len(x)); the summation order is unchanged.
    return sum(np.linalg.norm(point - center) for point in x)/len(x)
108 | 
def davisbouldin(k_list, k_centers):
    """ Davies-Bouldin Index

    For every cluster, takes the largest value of
    (scatter_k + scatter_l) / distance(center_k, center_l) over all other
    clusters l, and averages these maxima over the clusters.

    Parameters
    ----------
    k_list : list of np.arrays
        A list containing a numpy array for each cluster |c| = number of clusters
        c[K] is np.array([N, p]) (N : number of samples in cluster K, p : sample dimension)
    k_centers : np.array
        The array of the cluster centers (prototypes) of type np.array([K, p])

    Returns
    -------
    np.float64
        The Davies-Bouldin index.

    Raises
    ------
    ValueError
        If fewer than two clusters are given.
    """
    len_k_list = len(k_list)
    if len_k_list < 2:
        # Without this guard the failure is an opaque "zero-size array"
        # reduction error from NumPy.
        raise ValueError("davisbouldin() requires at least two clusters")

    centers = np.asarray(k_centers, dtype=np.float64)

    # Scatter of every cluster around its own centre.
    big_ss = np.array(
        [big_s(k_list[k], centers[k]) for k in range(len_k_list)],
        dtype=np.float64,
    )

    db = 0.0
    for k in range(len_k_list):
        # Compare cluster k against every other cluster in one vectorised
        # step; the cluster itself is excluded.
        others = [l for l in range(len_k_list) if l != k]
        d_eucs = np.linalg.norm(centers[others] - centers[k], axis=1)
        db += np.max((big_ss[k] + big_ss[others])/d_eucs)

    return db/len_k_list
142 | 


--------------------------------------------------------------------------------