├── .DS_Store ├── .ipynb_checkpoints ├── LogisticRegScikitlearn-checkpoint.ipynb └── Statistics-checkpoint.ipynb ├── LICENSE ├── README.md ├── deck-17.pdf └── notebooks ├── LogisticRegScikitlearn.ipynb └── Statistics.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/PyDataLondonTutorial/b6461a26fbca6b2404f905cf7472b329a631f8a9/.DS_Store -------------------------------------------------------------------------------- /.ipynb_checkpoints/LogisticRegScikitlearn-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Logistic regression in Scikitlearn\n", 8 | "* We'll explore a Logistic Regression model in Scikitlearn\n", 9 | "* We'll talk about how to debug models etc. \n", 10 | "* We'll do some feature engineering etc." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "from sklearn import linear_model\n", 24 | "from sklearn.model_selection import train_test_split" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "data = pd.read_csv(\"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\", header=None, names=['age', 'workclass', 'fnlwgt', \n", 36 | " 'education-categorical', 'educ', \n", 37 | " 'marital-status', 'occupation',\n", 38 | " 'relationship', 'race', 'sex', \n", 39 | " 'capital-gain', 'capital-loss', \n", 40 | " 'hours', 'native-country', \n", 41 | " 'income'])" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | 
"source": [ 52 | "income = 1 * (data['income'] == \" >50K\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "# Let's explore the data a bit. " 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 1, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "ename": "NameError", 71 | "evalue": "name 'income' is not defined", 72 | "output_type": "error", 73 | "traceback": [ 74 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 75 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 76 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mincome\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue_counts\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 77 | "\u001b[0;31mNameError\u001b[0m: name 'income' is not defined" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "income.value_counts()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "# Exploring the data\n", 90 | "* Let us get a feel for the parameters.\n", 91 | "* We see that age is a tailed distribution.\n", 92 | "* Certainly not Gaussian! We don't see much of a correlation between many of the features, with the exception of Age and Age2.\n", 93 | "* Hours worked has some interesting behaviour. How would one describe this distribution?" 
94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 3, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "ename": "NameError", 105 | "evalue": "name 'data' is not defined", 106 | "output_type": "error", 107 | "traceback": [ 108 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 109 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 110 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mseaborn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mseaborn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mseaborn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpairplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 111 | "\u001b[0;31mNameError\u001b[0m: name 'data' is not defined" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "import seaborn as seaborn\n", 117 | "g = seaborn.pairplot(data)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 4, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "LogisticRegression(C=100000.0, class_weight=None, dual=False,\n", 131 | " fit_intercept=True, intercept_scaling=1, max_iter=100,\n", 132 | " multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n", 133 | " solver='liblinear', tol=0.0001, verbose=0, warm_start=False)" 134 | ] 135 | }, 136 | "execution_count": 4, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "logreg = linear_model.LogisticRegression(C=1e5)\n", 143 | "\n", 144 | "age2 = np.square(data['age'])\n", 145 | "data = data[['age', 'educ', 'hours']]\n", 146 | "data['age2'] = age2\n", 147 | "data['income'] = income\n", 148 | "X = 
data[['age', 'age2', 'educ', 'hours']]\n", 149 | "Y = data['income']\n", 150 | "logreg.fit(X, Y)\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 5, 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "0.79303461195909219" 164 | ] 165 | }, 166 | "execution_count": 5, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "# check the accuracy on the training set\n", 173 | "logreg.score(X, Y)\n", 174 | "\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 6, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "0.24080955744602439" 188 | ] 189 | }, 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "\n", 197 | "Y.mean()\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "So we've decent predictions but not great ones. Only 24% of the class earns more than 50k, which means that you could obtain 76% accuracy by always predicting \"no\". So we're doing better than the null error rate but not by much. \n", 205 | "Let's examine the coefficients and see what we learn. " 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 7, 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/html": [ 218 | "
\n", 219 | "\n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | "
01
0age[0.162458514116]
1age2[-0.00138241828468]
2educ[0.283606412852]
3hours[0.0290797158473]
\n", 250 | "
" 251 | ], 252 | "text/plain": [ 253 | " 0 1\n", 254 | "0 age [0.162458514116]\n", 255 | "1 age2 [-0.00138241828468]\n", 256 | "2 educ [0.283606412852]\n", 257 | "3 hours [0.0290797158473]" 258 | ] 259 | }, 260 | "execution_count": 7, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "g = np.transpose(logreg.coef_)\n", 267 | "pd.DataFrame(list(zip(X.columns, g )))" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "# Classical Machine Learning technique - using a training set and testing set. " 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 8, 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 288 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 289 | " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", 290 | " verbose=0, warm_start=False)" 291 | ] 292 | }, 293 | "execution_count": 8, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "# evaluate the model by splitting into train and test sets\n", 300 | "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)\n", 301 | "model2 = linear_model.LogisticRegression()\n", 302 | "model2.fit(X_train, y_train)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 9, 308 | "metadata": { 309 | "collapsed": false 310 | }, 311 | "outputs": [ 312 | { 313 | "name": "stdout", 314 | "output_type": "stream", 315 | "text": [ 316 | "[0 0 0 ..., 1 0 0]\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "# predict class labels for the test set\n", 322 | "predicted = model2.predict(X_test)\n", 323 | "print(predicted)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 10, 329 | 
"metadata": { 330 | "collapsed": false 331 | }, 332 | "outputs": [ 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "[[ 0.85986473 0.14013527]\n", 338 | " [ 0.75614576 0.24385424]\n", 339 | " [ 0.82441467 0.17558533]\n", 340 | " ..., \n", 341 | " [ 0.48120856 0.51879144]\n", 342 | " [ 0.79467429 0.20532571]\n", 343 | " [ 0.92966606 0.07033394]]\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "\n", 349 | "# generate class probabilities\n", 350 | "probs = model2.predict_proba(X_test)\n", 351 | "print(probs)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": { 357 | "collapsed": true 358 | }, 359 | "source": [ 360 | "# Model evaluation.\n", 361 | "* We can look at the model as a black box.\n", 362 | "* We can evaluate it and score it.\n", 363 | "* We can also probably use something like Hyperparameter tuning or something like a Grid search to improve our results. " 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": { 370 | "collapsed": true 371 | }, 372 | "outputs": [], 373 | "source": [] 374 | } 375 | ], 376 | "metadata": { 377 | "kernelspec": { 378 | "display_name": "Python 3", 379 | "language": "python", 380 | "name": "python3" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 3 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython3", 392 | "version": "3.5.1" 393 | } 394 | }, 395 | "nbformat": 4, 396 | "nbformat_minor": 0 397 | } 398 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated 
documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PyDataLondonTutorial 2 | PyDataLondonTutorial on statistics - and how to use Python to do these things :) 3 | -------------------------------------------------------------------------------- /deck-17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/springcoil/PyDataLondonTutorial/b6461a26fbca6b2404f905cf7472b329a631f8a9/deck-17.pdf --------------------------------------------------------------------------------