├── .DS_Store ├── .ipynb_checkpoints ├── LogisticRegScikitlearn-checkpoint.ipynb └── Statistics-checkpoint.ipynb ├── LICENSE ├── README.md ├── deck-17.pdf └── notebooks ├── LogisticRegScikitlearn.ipynb └── Statistics.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/PyDataLondonTutorial/b6461a26fbca6b2404f905cf7472b329a631f8a9/.DS_Store -------------------------------------------------------------------------------- /.ipynb_checkpoints/LogisticRegScikitlearn-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Logistic regression in Scikitlearn\n", 8 | "* We'll explore a Logistic Regression model in Scikitlearn\n", 9 | "* We'll talk about how to debug models etc. \n", 10 | "* We'll do some feature engineering etc." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "from sklearn import linear_model\n", 24 | "from sklearn.model_selection import train_test_split" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "data = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\", header=None, names=['age', 'workclass', 'fnlwgt', \n", 36 | " 'education-categorical', 'educ', \n", 37 | " 'marital-status', 'occupation',\n", 38 | " 'relationship', 'race', 'sex', \n", 39 | " 'capital-gain', 'capital-loss', \n", 40 | " 'hours', 'native-country', \n", 41 | " 'income'])" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | 
"source": [ 52 | "income = 1 * (data['income'] == \" >50K\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "# Let's explore the data a bit. " 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 1, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "ename": "NameError", 71 | "evalue": "name 'income' is not defined", 72 | "output_type": "error", 73 | "traceback": [ 74 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 75 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 76 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mincome\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 77 | "\u001b[0;31mNameError\u001b[0m: name 'income' is not defined" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "income.value_counts()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "# Exploring the data\n", 90 | "* Let us get a feel for the parameters.\n", 91 | "* We see that age is a tailed distribution.\n", 92 | "* Certainly not Gaussian! We don't see much of a correlation between many of the features, with the exception of Age and Age2.\n", 93 | "* Hours worked has some interesting behaviour. How would one describe this distribution?" 
94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 3, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "ename": "NameError", 105 | "evalue": "name 'data' is not defined", 106 | "output_type": "error", 107 | "traceback": [ 108 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 109 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 110 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mseaborn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mseaborn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mseaborn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpairplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 111 | "\u001b[0;31mNameError\u001b[0m: name 'data' is not defined" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "import seaborn as seaborn\n", 117 | "g = seaborn.pairplot(data)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 4, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "LogisticRegression(C=100000.0, class_weight=None, dual=False,\n", 131 | " fit_intercept=True, intercept_scaling=1, max_iter=100,\n", 132 | " multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n", 133 | " solver='liblinear', tol=0.0001, verbose=0, warm_start=False)" 134 | ] 135 | }, 136 | "execution_count": 4, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "logreg = linear_model.LogisticRegression(C=1e5)\n", 143 | "\n", 144 | "age2 = np.square(data['age'])\n", 145 | "data = data[['age', 'educ', 'hours']]\n", 146 | "data['age2'] = age2\n", 147 | "data['income'] = income\n", 148 | "X = 
data[['age', 'age2', 'educ', 'hours']]\n", 149 | "Y = data['income']\n", 150 | "logreg.fit(X, Y)\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 5, 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "0.79303461195909219" 164 | ] 165 | }, 166 | "execution_count": 5, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "# check the accuracy on the training set\n", 173 | "logreg.score(X, Y)\n", 174 | "\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 6, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "0.24080955744602439" 188 | ] 189 | }, 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "\n", 197 | "Y.mean()\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "So we've decent predictions but not great ones. Only 24% of the class earns more than 50k, which means that you could obtain 76% accuracy by always predicting \"no\". So we're doing better than the null error rate but not by much. \n", 205 | "Let's examine the coefficients and see what we learn. " 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 7, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/html": [ 218 | "
\n", 219 | "\n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | "
01
0age[0.162458514116]
1age2[-0.00138241828468]
2educ[0.283606412852]
3hours[0.0290797158473]
\n", 250 | "
" 251 | ], 252 | "text/plain": [ 253 | " 0 1\n", 254 | "0 age [0.162458514116]\n", 255 | "1 age2 [-0.00138241828468]\n", 256 | "2 educ [0.283606412852]\n", 257 | "3 hours [0.0290797158473]" 258 | ] 259 | }, 260 | "execution_count": 7, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "g = np.transpose(logreg.coef_)\n", 267 | "pd.DataFrame(list(zip(X.columns, g )))" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "# Classical Machine Learning technique - using a training set and testing set. " 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 8, 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 288 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 289 | " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", 290 | " verbose=0, warm_start=False)" 291 | ] 292 | }, 293 | "execution_count": 8, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "# evaluate the model by splitting into train and test sets\n", 300 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)\n", 301 | "model2 = linear_model.LogisticRegression()\n", 302 | "model2.fit(X_train, y_train)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 9, 308 | "metadata": { 309 | "collapsed": false 310 | }, 311 | "outputs": [ 312 | { 313 | "name": "stdout", 314 | "output_type": "stream", 315 | "text": [ 316 | "[0 0 0 ..., 1 0 0]\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "# predict class labels for the test set\n", 322 | "predicted = model2.predict(X_test)\n", 323 | "print(predicted)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 10, 329 | 
"metadata": { 330 | "collapsed": false 331 | }, 332 | "outputs": [ 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "[[ 0.85986473 0.14013527]\n", 338 | " [ 0.75614576 0.24385424]\n", 339 | " [ 0.82441467 0.17558533]\n", 340 | " ..., \n", 341 | " [ 0.48120856 0.51879144]\n", 342 | " [ 0.79467429 0.20532571]\n", 343 | " [ 0.92966606 0.07033394]]\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "\n", 349 | "# generate class probabilities\n", 350 | "probs = model2.predict_proba(X_test)\n", 351 | "print(probs)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": { 357 | "collapsed": true 358 | }, 359 | "source": [ 360 | "# Model evaluation.\n", 361 | "* We can look at the model as a black box.\n", 362 | "* We can evaluate it and score it.\n", 363 | "* We can also probably use something like Hyperparameter tuning or something like a Grid search to improve our results. " 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": { 370 | "collapsed": true 371 | }, 372 | "outputs": [], 373 | "source": [] 374 | } 375 | ], 376 | "metadata": { 377 | "kernelspec": { 378 | "display_name": "Python 3", 379 | "language": "python", 380 | "name": "python3" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 3 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython3", 392 | "version": "3.5.1" 393 | } 394 | }, 395 | "nbformat": 4, 396 | "nbformat_minor": 0 397 | } 398 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated 
documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyDataLondonTutorial 2 | PyDataLondonTutorial on statistics - and how to use Python to do these things :) 3 | -------------------------------------------------------------------------------- /deck-17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/PyDataLondonTutorial/b6461a26fbca6b2404f905cf7472b329a631f8a9/deck-17.pdf --------------------------------------------------------------------------------