├── .gitignore
├── LICENSE
├── README.md
├── main.py
└── mcp
    ├── __init__.py
    └── mcp.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Christoforos Anagnostopoulos

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# hmeasure-python
Measuring Classification Performance: the hmeasure package for Python.

The H-measure package implements a large number of classification performance metrics, such as the AUC, error rate, sensitivity and specificity. It also extends standard libraries by incorporating recent advances in this area, most notably the H-measure, proposed by David Hand as a coherent alternative to the AUC.
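A minimal usage sketch, mirroring `main.py` (it assumes you run Python from the repository root so that the `mcp` package is importable; at the moment `h_measure_single` returns only the AUC):

```python
import numpy as np
import mcp.mcp as mcp

true_labels = np.asarray([0, 0, 1, 0, 0, 1, 1, 0, 1, 1])
scores = np.asarray([0, 0.1, 0.2, 0.3, 0.3, 0.5, 0.6, 0.8, 0.8, 0.9])

mcp.plot_single_roc(true_labels, scores)                # add the ROC curve to the current matplotlib figure
auc = mcp.h_measure_single(true_labels, scores)['auc']  # AUC computed from the grouped score distributions
print(auc)
```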
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from importlib import reload  # reload() is no longer a builtin in Python 3

import mcp.mcp as mcp
reload(mcp)  # pick up edits to mcp.py when re-running interactively

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve


true_labels = np.asarray([0, 0, 1, 0, 0, 1, 1, 0, 1, 1])
scores1 = np.asarray([0, 0.1, 0.2, 0.3, 0.3, 0.5, 0.6, 0.8, 0.8, 0.9])
scores2 = np.asarray([0.4, 0.1, 0.5, 0.2, 0.3, 0.0, 0.5, 0.2, 0.2, 0.15])
scores = scores1


mcp.get_score_distributions(true_labels, scores1)  # ours
precision_recall_curve(true_labels, scores1)       # scikit-learn counterpart, for comparison
roc_curve(true_labels, scores1)                    # scikit-learn counterpart, for comparison


mcp.plot_single_roc(true_labels, scores1)
mcp.plot_single_roc(true_labels, scores2)

our_auc = mcp.h_measure_single(true_labels, scores1)['auc']
sk_auc = roc_auc_score(true_labels, scores1)       # scikit-learn reference value
assert np.isclose(our_auc, sk_auc)                 # float comparison rather than exact equality

--------------------------------------------------------------------------------
/mcp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/canagnos/hmeasure-python/49e04ff1df3c961d21b09756e5757123b72e7b6e/mcp/__init__.py
--------------------------------------------------------------------------------
/mcp/mcp.py:
--------------------------------------------------------------------------------
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve


# Calculate the raw ROC, replacing any tied sequences by a diagonal.
# The raw ROC starts at f0[0] = f1[0] = 0 and ends at f0[-1] = f1[-1] = 1.
def get_score_distributions(true_labels, scores):

    n1 = np.sum(true_labels)
    n0 = np.sum(1 - true_labels)
    n = n1 + n0

    # Count the instances of each unique score within each class, ranked by score
    df = pd.DataFrame([
        np.asarray(true_labels).reshape(n),
        np.asarray(1 - true_labels).reshape(n),
        np.asarray(scores).reshape(n1 + n0)
    ]).transpose()
    df.columns = ['true_labels', 'true_labels_inverted', 'scores']

    s1 = np.asarray(df.groupby('scores')['true_labels'].sum() / n1)
    s0 = np.asarray(df.groupby('scores')['true_labels_inverted'].sum() / n0)
    # unique scores, sorted so they line up with the groupby order of s1 and s0
    ind = np.sort(df['scores'].unique())

    # Pad with a leading zero and a trailing remainder so that the cumulative
    # distributions start at 0 and end exactly at 1, even if those endpoints
    # already appear in the data
    s1 = np.concatenate(([0], s1, [1 - np.sum(s1)]))
    s0 = np.concatenate(([0], s0, [1 - np.sum(s0)]))
    ind = np.concatenate(([0], ind, [1]))
    s = len(ind)
    f1 = np.cumsum(s1)
    f0 = np.cumsum(s0)

    return {'f1': f1, 'f0': f0, 's1': s1, 's0': s0, 's': s}


def plot_single_roc(true_labels, scores):
    # Plot the empirical ROC implied by the grouped score distributions
    # (class-1 cumulative distribution on the x-axis, class-0 on the y-axis)
    out = get_score_distributions(true_labels, scores)
    plt.plot(out['f1'], out['f0'])
    return True
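
# Illustrative sketch, not part of the original module: a self-contained sanity
# check for get_score_distributions. The padded cumulative distributions should
# both end at 1, and the trapezoidal AUC they imply should agree with
# scikit-learn's roc_auc_score. The function name below is hypothetical.
def _sanity_check_score_distributions():
    from sklearn.metrics import roc_auc_score

    labels = np.asarray([0, 0, 1, 0, 1, 1])
    scores = np.asarray([0.1, 0.4, 0.35, 0.8, 0.65, 0.9])

    out = get_score_distributions(labels, scores)
    assert np.isclose(out['f0'][-1], 1.0)
    assert np.isclose(out['f1'][-1], 1.0)

    # Same trapezoidal formula as in h_measure_single below
    auc = 1.0 - np.sum(out['s0'] * (out['f1'] - 0.5 * out['s1']))
    assert np.isclose(auc, roc_auc_score(labels, scores))  # both come out to 2/3 here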

def h_measure_single(
        true_labels,
        scores,
        severity_ratio=0.5,
        threshold=0.5,
        level=0.5,
        verbose=True
):
    assert np.all(true_labels.shape == scores.shape)
    assert len(true_labels.shape) == 1 or true_labels.shape[1] == 1
    assert type(true_labels) is np.matrix or type(true_labels) is np.ndarray
    assert type(scores) is np.matrix or type(scores) is np.ndarray

    # class counts and empirical priors
    n1 = np.sum(true_labels)
    n0 = np.sum(1 - true_labels)
    n = n1 + n0
    pi0 = n0 / n
    pi1 = n1 / n

    # retrieve severity ratio - set to default if absent
    if severity_ratio is None:
        severity_ratio = pi1 / pi0

    # order data into increasing scores
    scores_sorted = np.sort(scores)
    scores_order = np.argsort(scores)

    out = get_score_distributions(true_labels, scores)

    # now compute statistics - start with the AUC, via the trapezoidal rule
    # over the grouped score distributions
    auc = 1.0 - np.sum(out['s0'] * (out['f1'] - 0.5 * out['s1']))
    if auc < 0.5 and verbose:
        print('ROC curve lying below the diagonal. Double-check scores.')

    # Scalar misclassification statistics and the H-measure itself are not
    # wired up yet; only the AUC is returned for now.
    # conf_matrix = confusion_matrix(true_labels, scores > threshold)
    # metrics = confusion_matrix_metrics(conf_matrix)
    # misclass_out = misclass_counts((s > threshold), true_class)
    # misclass_metrics = misclass_out['metrics']
    # temp = misclass_out['conf_matrix']
    # misclass_conf = dataFrame(
    #     TP=temp[1, 1], FP=temp[2, 1],
    #     TN=temp[2, 2], FN=temp[1, 2])
    return {'auc': auc}


def confusion_matrix_metrics(conf_matrix):

    # note the semantics of conf_matrix (scikit-learn convention):
    # rows are true classes, columns are predicted classes
    TN = conf_matrix[0][0]
    FN = conf_matrix[1][0]
    FP = conf_matrix[0][1]
    TP = conf_matrix[1][1]

    return {'TP': TP, 'FP': FP, 'TN': TN, 'FN': FN}


def h_measure(
        true_labels,
        scores,
        severity_ratio=None,
        threshold=None,
        level=None
):

    # true_labels is expected to be a 1xn np.matrix and scores a kxn np.matrix
    assert true_labels.shape[1] == scores.shape[1]
    assert true_labels.shape[0] == 1
    assert type(true_labels) is np.matrix
    assert type(scores) is np.matrix

    # make sure the only two class labels are 0 and 1
    assert np.all(np.sort(np.unique(np.array(true_labels)[0])) == np.array([0, 1]))

    # validate optional arguments, only when they are supplied
    if severity_ratio is not None:
        assert 0 <= severity_ratio <= 1
    if level is not None:
        assert 0 <= level <= 1
    if threshold is not None:
        assert 0 <= threshold

    return True

--------------------------------------------------------------------------------