├── README.md └── auc.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # metrics 2 | Python implementation of machine learning metrics 3 | -------------------------------------------------------------------------------- /auc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Fast computation of roc auc metric in Python" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Python code to compute the roc auc metric. That code runs about twcie as fast as the corresponding scikit-learn function." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np \n", 26 | "from numba import jit\n", 27 | "\n", 28 | "@jit\n", 29 | "def fast_auc(y_true, y_prob):\n", 30 | " y_true = np.asarray(y_true)\n", 31 | " y_true = y_true[np.argsort(y_prob)]\n", 32 | " nfalse = 0\n", 33 | " auc = 0\n", 34 | " n = len(y_true)\n", 35 | " for i in range(n):\n", 36 | " y_i = y_true[i]\n", 37 | " nfalse += (1 - y_i)\n", 38 | " auc += y_i * nfalse\n", 39 | " auc /= (nfalse * (n - nfalse))\n", 40 | " return auc" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Let's create a random example." 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "y_true = np.random.randint(0,2,1000000)\n", 59 | "y_pred = np.random.rand(1000000)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "The roc auc should be close to 0.5 for random prediction." 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "0.501004845745664" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "fast_auc(y_true, y_pred)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "It is the case. Let's see what scikit-learn code does here." 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "0.50100484574566395" 105 | ] 106 | }, 107 | "execution_count": 4, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "from sklearn.metrics import roc_auc_score\n", 114 | "\n", 115 | "roc_auc_score(y_true, y_pred)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "Seems we are in good shape as the result is very close.\n", 123 | "\n", 124 | "A little sanity check." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "1.0" 136 | ] 137 | }, 138 | "execution_count": 5, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "fast_auc(y_true, y_true)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Which one is faster?" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 6, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "10 loops, best of 3: 130 ms per loop\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "%timeit fast_auc(y_true, y_pred)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 7, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "1 loop, best of 3: 275 ms per loop\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "%timeit roc_auc_score(y_true, y_pred)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "My code is more than twice as fast." 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": true 200 | }, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "anaconda-cloud": {}, 207 | "kernelspec": { 208 | "display_name": "Python [conda root]", 209 | "language": "python", 210 | "name": "conda-root-py" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.5.1" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 1 227 | } 228 | --------------------------------------------------------------------------------