├── README.md └── ipython ├── Labs_Student ├── .ipynb_checkpoints │ ├── Lab1_Python_Fundamentals-checkpoint.ipynb │ └── NumPyBasics-checkpoint.ipynb ├── Lab1_Python_Fundamentals.ipynb ├── Lab2_NumPy_Vectorization_Student.ipynb ├── Lab3_Pandas_Exploration_Student.ipynb ├── Lab4_Survey_Questions_part1_Student.ipynb ├── Lab4_intro_regex.ipynb ├── Lab_6_FeatureRanking_AUC_Student.ipynb ├── Lab_7_sklearn_magic_student.ipynb ├── NumPyBasics.ipynb ├── SimpleiPythonExample.ipynb ├── ads_dataset_cut.txt ├── lab_5_student.ipynb ├── lab_8_text.ipynb └── test.txt ├── Labs_complete ├── Lab1_Python_Fundamentals.ipynb ├── Lab2_NumPy_Vectorization.ipynb ├── Lab3_Pandas_Exploration.ipynb ├── Lab4_Survey_Questions_part1.ipynb ├── Lab_6_FeatureRanking_AUC.ipynb └── lab_7_sklearn_complete.ipynb ├── README.md ├── data ├── Cell2Cell_data.csv ├── Cell2Cell_info.pdf ├── ads_dataset.txt ├── ads_dataset_cut.txt ├── advertising_events.csv ├── boson_testing_cut.csv ├── boson_training_cut_2000.csv ├── loansData.csv ├── osquery_contributors.html ├── spam_ham.csv └── survey_responses_2016.csv ├── hw ├── hw_1 │ ├── Homework1.ipynb │ ├── data │ │ ├── ads_dataset.tsv │ │ ├── advertising_events.csv │ │ └── osquery_contributors.html │ └── images │ │ └── osquery_contributors.png ├── hw_2 │ ├── data │ │ └── cell2cell_data.csv │ └── hw_2.ipynb ├── hw_3 │ ├── Homework_3.ipynb │ └── data │ │ ├── boson_testing_cut.csv │ │ └── boson_training_cut_2000.csv └── hw_4 │ ├── data │ └── imdb.csv │ └── hw_4.ipynb ├── references ├── Syllabus_2016.pdf ├── churn_architecture.png ├── churn_dataset_info.pdf └── churn_sampling_scheme.png └── utils ├── ClassifierBakeoff.py ├── ClassifierBakeoff.pyc ├── bias_variance.py ├── bias_variance.pyc ├── churn_analysis.py ├── course_utils.py ├── course_utils.pyc ├── eval_plots.py └── eval_plots.pyc /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/README.md -------------------------------------------------------------------------------- /ipython/Labs_Student/.ipynb_checkpoints/NumPyBasics-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "'''\n", 12 | "The core data type in Numpy is the ndarray, which enables fast and space-efficient multidimensional array processing.\n", 13 | "Note: This notebook is adapted from chapter 4 Python for Data Analysis by Wes McKinney and O'Reilly publishing. NumPy has many, \n", 14 | "many features that won't be covered here. 
The following snippets are just to illustrate basic data types and operations within\n", 15 | "numpy.\n", 16 | "\n", 17 | "Another good resource for learning more about ndarrays is here:\n", 18 | "http://docs.scipy.org/doc/numpy/reference/arrays.html\n", 19 | "'''\n", 20 | "\n", 21 | "#First, import NumPy\n", 22 | "import numpy as np\n", 23 | "\n", 24 | "#It is easy to create Nx1 and NxM arrays from standard Python lists\n", 25 | "l1 = [0,1,2]\n", 26 | "l2 = [3,4,5]\n", 27 | "\n", 28 | "nd1 = np.array(l1)\n", 29 | "nd2 = np.array([l1, l2])" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "The ndarray has dimension n=3 and m=1\n", 44 | "The ndarray has elements of type=int64\n", 45 | "The ndarray has dimension n=2 and m=3\n", 46 | "The ndarray has elements of type=int64\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "#Now, we can get ask for some basic info to describe the ndarray\n", 52 | "def desc_ndarray(nd):\n", 53 | " try:\n", 54 | " print \"The ndarray has dimension n=%s and m=%s\" % (nd.shape[0],nd.shape[1])\n", 55 | " except:\n", 56 | " print \"The ndarray has dimension n=%s and m=1\" % nd.shape[0]\n", 57 | " print \"The ndarray has elements of type=%s\" % nd.dtype\n", 58 | "\n", 59 | "desc_ndarray(nd1)\n", 60 | "\n", 61 | "desc_ndarray(nd2)\n", 62 | "\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": { 69 | "collapsed": false 70 | }, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "[array([ 0., 0., 0., 0.]),\n", 76 | " array([ 1., 1., 1., 1.]),\n", 77 | " array([ 0.47121338, 1.83328779, 0.4438019 , -0.52309325])]" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "#There are short cuts for creating certain frequently used special ndarrays, i.e.,\n", 87 | "\n", 88 | "k=4\n", 89 | "\n", 90 | "#1. an ndarray of all zeros\n", 91 | "zero = np.zeros(k)\n", 92 | "\n", 93 | "#2. an ndarray of all ones\n", 94 | "one = np.ones(k)\n", 95 | "\n", 96 | "#3. an ndarray of random elements (this one is standard normal, but there are many distributions to choose from)\n", 97 | "rand = np.random.randn(k)\n", 98 | "\n", 99 | "[zero, one, rand]" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 4, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "[array([[ 0.69394907, 0.85723722],\n", 113 | " [-0.16779156, 0.41709003],\n", 114 | " [-0.94008249, -0.21591983],\n", 115 | " [-0.61305106, 0.41435495]]),\n", 116 | " array([-0.16779156, 0.41709003]),\n", 117 | " 0.41709003439166575]" 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "'''\n", 127 | "For indexing an array:\n", 128 | "1. If nx1 array, follow the same protocol as a regular Python list\n", 129 | "2. 
If nxm array use the following examples\n", 130 | "'''\n", 131 | "\n", 132 | "arr2d = np.random.randn(4,2)\n", 133 | "\n", 134 | "#A single index gets a full row\n", 135 | "\n", 136 | "#2 indexes returns a value\n", 137 | "[arr2d, arr2d[1], arr2d[1,1]]" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 5, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "[array([-0.4386254 , -0.67720483, -1.19775067, -0.21300288]),\n", 151 | " array([-0.8772508 , -1.35440967, -2.39550135, -0.42600575]),\n", 152 | " array([-0.8772508 , -1.35440967, -2.39550135, -0.42600575]),\n", 153 | " array([-0., -0., -0., -0.])]" 154 | ] 155 | }, 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "'''\n", 163 | "Operations between Arrays and Scalars\n", 164 | "An important feature of ndarrays is they allow batch operations on data without writing any for loops. \n", 165 | "This is called vectorization.\n", 166 | "Any arithmetic operations between equal-size arrays applies the operation elementwise. \n", 167 | "'''\n", 168 | "\n", 169 | "#examples\n", 170 | "\n", 171 | "k = 4\n", 172 | "rand = np.random.randn(k)\n", 173 | "[rand, rand + rand, 2*rand, rand*np.zeros(4)]\n", 174 | "\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 7, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "[array([ 0.19631415, 0.41059714, 4.26249299]),\n", 188 | " array([-1.46310809, 1.15559786, 0.10690073]),\n", 189 | " array([-1.26679394, 1.566195 , 4.36939372])]" 190 | ] 191 | }, 192 | "execution_count": 7, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "'''\n", 199 | "Matrix operations\n", 200 | "It is easy to do matrix operations with Nd arrays. The standard arithmetic operators don't work here though. 
And it is important \n", 201 | "to make sure matrix shapes are compatible\n", 202 | "'''\n", 203 | "\n", 204 | "k = 3\n", 205 | "r1 = np.random.randn(k)\n", 206 | "r2 = np.random.randn(k)\n", 207 | "\n", 208 | "#Matrix addition is the standard matrix operator\n", 209 | "[r1, r2 , r1 + r2]\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 8, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "[array([[ 0.19631415, 0.41059714, 4.26249299],\n", 223 | " [-1.46310809, 1.15559786, 0.10690073]]),\n", 224 | " array([[ 0.19631415, -1.46310809],\n", 225 | " [ 0.41059714, 1.15559786],\n", 226 | " [ 4.26249299, 0.10690073]])]" 227 | ] 228 | }, 229 | "execution_count": 8, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "#The Transpose can be taken with the attribute T\n", 236 | "arr2d = np.array([r1, r2])\n", 237 | "[arr2d, arr2d.T]" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 9, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "[array([[ 0.19631415, 0.41059714, 4.26249299],\n", 251 | " [-1.46310809, 1.15559786, 0.10690073]]),\n", 252 | " array([[ 3.85392468e-02, 1.68590015e-01, 1.81688465e+01],\n", 253 | " [ 2.14068529e+00, 1.33540642e+00, 1.14277663e-02]]),\n", 254 | " array([[ 18.37597578, 0.64291997],\n", 255 | " [ 0.64291997, 3.48751947]])]" 256 | ] 257 | }, 258 | "execution_count": 9, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "'''\n", 265 | "Matrix multiplication, like inner products can be done on arrays.\n", 266 | "Just remember that the standard multiplication operator does elementwise multiplication (provided they are the same shape).\n", 267 | "We need the dot method in order to do an inner product\n", 268 | "\n", 269 | "Numpy has a linalg library that can run most matrix operations on ndarrays:\n", 270 | "http://docs.scipy.org/doc/numpy/reference/routines.linalg.html\n", 271 | "\n", 272 | "One can also create a matrix object and use the methods in numpy.matrix to achieve the same thing:\n", 273 | "http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.html\n", 274 | "'''\n", 275 | "\n", 276 | "[arr2d, arr2d * arr2d, arr2d.dot(arr2d.T)]" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 11, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "10000 loops, best of 3: 119 µs per loop\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "'''\n", 296 | "One important feature of vectorization is that it allows elementwise processing that is much faster than writing a traditional\n", 297 | "loop.\n", 298 | "'''\n", 299 | "import math\n", 300 | "\n", 301 | "#show an example and profile i\n", 302 | "%timeit [math.sqrt(x) for x in range(1000)]" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 12, 308 | "metadata": { 309 | "collapsed": false 310 | }, 311 | "outputs": [ 312 | { 313 | "name": "stdout", 314 | "output_type": "stream", 315 | "text": [ 316 | "The slowest run took 9.83 times longer than the fastest. 
This could mean that an intermediate result is being cached \n", 317 | "100000 loops, best of 3: 5.19 µs per loop\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "%timeit np.sqrt(np.arange(1000))" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 16, 328 | "metadata": { 329 | "collapsed": false 330 | }, 331 | "outputs": [ 332 | { 333 | "name": "stderr", 334 | "output_type": "stream", 335 | "text": [ 336 | "ERROR: Line magic function `%inline` not found.\n" 337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "'''\n", 342 | "The last thing we'll cover in this module is the numpy.random library. In general, it is advised to use numpy for\n", 343 | "random number generation as opposed to python's built in random module.\n", 344 | "\n", 345 | "Random number generation has many uses. One common use is generating fake (i.e. random) data to test modeling procedures\n", 346 | "or to do Monte Carlo Simulations\n", 347 | "'''\n", 348 | "import matplotlib.pyplot as plt\n", 349 | "%inline\n", 350 | "\n", 351 | "\n", 352 | "#Generate random pairs that have a multivariate normal distribution\n", 353 | "N = 1000\n", 354 | "mu = np.array([0,0])\n", 355 | "cov = 0.5\n", 356 | "sig = np.array([[1, cov],[cov, 1]]) #Must be square, symmetric and non-negative definite\n", 357 | "x, y = np.random.multivariate_normal(mu, sig, N).T\n", 358 | "#Now let's plot and see what that looks like\n", 359 | "\n", 360 | "\n", 361 | "plt.plot(x, y,'x'); plt.axis('equal'); plt.show()\n", 362 | "\n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 18, 368 | "metadata": { 369 | "collapsed": false 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "'''\n", 374 | "One final example (taken from Wes Mckinney's book):\n", 375 | "\n", 376 | "Let's generate a random walk and visualize it\n", 377 | "'''\n", 378 | "import matplotlib.pyplot as plt\n", 379 | "\n", 380 | "nsteps = 1000\n", 381 | "draws = np.random.randint(0, 2, size = nsteps) #Randint let's us generate random integers in a range\n", 382 | "steps = np.where(draws>0, 1, -1) #there function let's us do boolean logic on a conditional applied to an entire array\n", 383 | "walk = steps.cumsum() #Cumsum returns an array with the same size as steps, that has cum sum of steps up to index i\n", 384 | "plt.plot(np.arange(len(walk)), walk);plt.show()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 30, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [], 394 | "source": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": { 400 | "collapsed": false 401 | }, 402 | "outputs": [], 403 | "source": [] 404 | } 405 | ], 406 | "metadata": { 407 | "kernelspec": { 408 | "display_name": "Python 3", 409 | "language": "python", 410 | "name": "python3" 411 | }, 412 | "language_info": { 413 | "codemirror_mode": { 414 | "name": "ipython", 415 | "version": 3 416 | }, 417 | "file_extension": ".py", 418 | "mimetype": "text/x-python", 419 | "name": "python", 420 | "nbconvert_exporter": "python", 421 | "pygments_lexer": "ipython3", 422 | "version": "3.6.0" 423 | } 424 | }, 425 | "nbformat": 4, 426 | "nbformat_minor": 0 427 | } 428 | -------------------------------------------------------------------------------- /ipython/Labs_Student/Lab2_NumPy_Vectorization_Student.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | 
"collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import warnings\n", 12 | "warnings.filterwarnings('ignore')\n", 13 | "import numpy as np\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "%matplotlib inline" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "First we'll generate a random matrix" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 5, 28 | "metadata": { 29 | "collapsed": false 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "#Number of columns (features)\n", 34 | "K = 5\n", 35 | "\n", 36 | "#Number of records\n", 37 | "N = 1000\n", 38 | "\n", 39 | "#Generate an NxK matrix of uniform random variables\n", 40 | "X = #Student: generate a uniform random matrix here" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Let's peak at our data to confirm it looks as we expect it" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 6, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "#Student - Put in a command to view the first 100 rows\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 8, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "#Student - put in a command to see the dimensions of X\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "This exercise is about designing a scoring function for a logistic regression. As we are not concerned with fitting a model to data, we can just make up a logistic regression.

\n", 77 | "\n", 78 | "For quick intro, the Logistic Regression takes the form of $\\hat{Y} = f(x * \\beta^T)$, where $x$ is the $1xK$ vector of features and $\\beta$ is the $1xK$ vector of weights. The function $f$, called a 'link' function, is the inverse logit:

\n", 79 | "\n", 80 | "
$f(a)=\\frac{1}{1+e^{-a}}$


\n", 81 | "\n", 82 | "In this notebook we'll write a function that, given inputs of $X$ and $\\beta$, returns a value for $\\hat{Y}$.\n", 83 | "

\n", 84 | "First let's generate a random set of weights to represent $\\beta$.\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 9, 90 | "metadata": { 91 | "collapsed": false 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "#Student - generate a K dimensional vector of uniform random variables in the interval [-1, 1]\n", 96 | "beta = #input command here\n", 97 | "beta" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Notice how we applied a neat NumPy trick here. The numpy.random.random() function returns an array, yet we applied what appears to be a scalar operation on the vector. This is an example of what NumPy calls vectorization (a major point of this tutorial), which offers us both a very fast way to do run vector computations as well as a clean and concise method of coding. \n", 105 | "\n", 106 | "

\n", 107 | "\n", 108 | "Question: we designed the above $beta$ vector such that $E[\\beta_i]=0$. How can we confirm that we did this correctly?" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 8, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "#start by taking the mean of the beta we already calculated\n", 120 | "\n", 121 | "#Student - fill in command here\n", 122 | "\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "#It is likely the above is not equal to zero. Let's simulate this 100k times and see what the distribution of means is\n", 134 | "#Student input code here\n", 135 | "means = []\n" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "Now let's use matplotlibs hist function to plot the histogram of means here. " 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 12, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "plt.hist(means)\n", 154 | "plt.show()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "We should expect the distribution to be centered around zero. Is it?" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "Now let's write our scoring function. Let's try to use as much of Numpy's inner optimization as possible (hint, this can be done in two lines and without writing any loops)." 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 45, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "def score_logistic_regression(X, beta):\n", 180 | " '''\n", 181 | " This function takes in an NxK matrix X and 1xK vector beta.\n", 182 | " The function should apply the logistic scoring function to each record of X.\n", 183 | " The output should be an Nx1 vector of scores\n", 184 | " '''\n", 185 | " \n", 186 | " #First let's calculate X*beta - make sure to use numpy's 'dot' method\n", 187 | " #student - put in code here\n", 188 | " \n", 189 | " #Now let's input this into the link function\n", 190 | " #student - put in code here\n", 191 | " \n", 192 | " return prob_score" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "So how much faster is it by using Numpy? We can test this be writing the same function that uses no Numpy and executes via loops." 
200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 44, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "def score_logistic_regression_NoNumpy(X, beta):\n", 211 | " '''\n", 212 | " This function takes in an NxK matrix X and 1xK vector beta.\n", 213 | " The function should apply the logistic scoring function to each record of X.\n", 214 | " The output should be an Nx1 vector of scores\n", 215 | " '''\n", 216 | " #Let's calculate xbeta using loops\n", 217 | " xbeta = []\n", 218 | " for row in X:\n", 219 | " \n", 220 | " xb = 0\n", 221 | " for i, el in enumerate(row):\n", 222 | " xb += el * beta[i]\n", 223 | " \n", 224 | " xbeta.append(xb)\n", 225 | " \n", 226 | " #Now let's apply the link function to each xbeta\n", 227 | " prob_score = []\n", 228 | " for xb in xbeta:\n", 229 | " prob_score.append(1 / (1 + np.exp(-1 * xb)))\n", 230 | " \n", 231 | " return prob_score" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "Before doing any analysis, let's test the output of each to make sure they equal" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 14, 244 | "metadata": { 245 | "collapsed": false 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "#Student - write a unit test that calls each function with the same inputs and checks to see they return the same values. " 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "If they equal then we can proceed with timing analysis" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 15, 262 | "metadata": { 263 | "collapsed": false 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "%timeit score_logistic_regression_NoNumpy(X, beta)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 16, 273 | "metadata": { 274 | "collapsed": false 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "%timeit score_logistic_regression(X, beta)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": { 290 | "collapsed": true 291 | }, 292 | "outputs": [], 293 | "source": [] 294 | } 295 | ], 296 | "metadata": { 297 | "anaconda-cloud": {}, 298 | "kernelspec": { 299 | "display_name": "Python [py35]", 300 | "language": "python", 301 | "name": "Python [py35]" 302 | }, 303 | "language_info": { 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 3 307 | }, 308 | "file_extension": ".py", 309 | "mimetype": "text/x-python", 310 | "name": "python", 311 | "nbconvert_exporter": "python", 312 | "pygments_lexer": "ipython3", 313 | "version": "3.5.2" 314 | } 315 | }, 316 | "nbformat": 4, 317 | "nbformat_minor": 0 318 | } 319 | -------------------------------------------------------------------------------- /ipython/Labs_Student/Lab3_Pandas_Exploration_Student.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "In this lab we're going to do simple data exploration using Pandas. Our objective is to learn basic operations that aid in visual data exploration.\n", 8 | "\n", 9 | "
\n", 10 | "\n", 11 | "First, let's import our required libraries and read in the data." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "import numpy as np\n", 25 | "import os\n", 26 | "\n", 27 | "\n", 28 | "cwd = os.getcwd()\n", 29 | "\n", 30 | "#If on MAC, this will likely work\n", 31 | "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n", 32 | "#If on window's machine, explicitly put in data dir\n", 33 | "#datadir = \n", 34 | "\n", 35 | "%matplotlib inline\n", 36 | "\n", 37 | "\n", 38 | "#Now read in the dataset loansdata.csv\n", 39 | "loansData = " 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "The full schema can be found here, but let's use native Pandas methods to also explore the data. Although not specified above, this csv has row headers and the read_csv function implicitly knows this.
\n", 47 | "\n", 48 | "Let's take a look at the column names, in a nicely readable way:" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 1, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "print(\"Column Names Are:\")\n", 60 | "print(\"\")\n", 61 | "\n", 62 | "for column_name in loansData.columns.values:\n", 63 | " print(column_name)\n", 64 | "\n", 65 | "print(\"\")\n", 66 | "print('Total # of Columns = {}'.format(len(loansData.columns)))" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "It is oftentimes helpful to take a quick glance at the first few records (when possible). This is an easy way to discover basic data flaws (i.e., all nulls, misaligned fields, etc.). We can do this in Pandas very easily." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 2, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "loansData.head().transpose() #We transpose it so it will fit in the display window" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Next, let's run a quick line to get summary statistics of the numeric fields." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 3, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "loansData.describe().transpose()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "Let's now focus on a single column, 'Monthly.Income.' First things first, let's display the first five records of just this field." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "#Student input line here\n" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Let's try and understand the distribution of this field. We can do this using the hist() method and matplotlib." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "#plt.figure()\n", 139 | "#hist_inc = loansData['Monthly.Income'].hist()\n", 140 | "#plt.title('Histogram of Monthly Income')\n", 141 | "#plt.show()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "How would you characterize the shape of this distribution? Is there anything we can do to the income variable to make the distribution more bell curved? Let's create a new column in the dataframe called 'Monthly.LogIncome' and print a histogram of it. What might be some advantages of making such a transformation?\n", 149 | "
\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "#Student: Add a new column Monthly.LogIncome to the dataset that is the log of the Monthly.Income columne" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "#Now look at the distribution\n", 172 | "plt.figure()\n", 173 | "h = loansData['Monthly.LogIncome'].hist()\n", 174 | "plt.title('Histogram of Log(Monthly Income)')\n", 175 | "plt.show()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "source": [ 184 | "Now let's answer some questions about the data.\n", 185 | "\n", 186 | "Q1: What is the cardinality (i.e., # of distinct values) for 'Interest.Rate' and 'FICO.Range'" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 7, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "#Student input code here - hint, the 'describe()' method returns a useful dataframe\n" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "Q2: What data type did Pandas set fo Interest.Rate? Can we create a new field which is stored as a float?" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 8, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "#Student input code here - show the field type\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "In the cell below, create a new field 'Interest.Rate.Num' where 'Interest.Rate' is converted to a float. Hint: this can be done in one line using the .str assessor, strip and astype methods." 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 13, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "#Student input code here - convert field to a float using the '.str' assessor\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "Q3: Can we get a sense of the relationship between monthly income and interest rate? Use the scatter() function from Matplotlib." 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 9, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "#Student input line here\n" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "Given the skewed distribution of Income, is this chart visually helpful? Let's try the Log of income instead." 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 10, 264 | "metadata": { 265 | "collapsed": false 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "#Student input code here\n" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Q4: What is the average interest rate for each FICO range?
\n", 277 | "Hint: use the groupby() method in Pandas." 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 11, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "#Student input code here\n", 289 | "fico_grp = " 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 12, 295 | "metadata": { 296 | "collapsed": false 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "#fico_grp" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "collapsed": true 308 | }, 309 | "outputs": [], 310 | "source": [] 311 | } 312 | ], 313 | "metadata": { 314 | "kernelspec": { 315 | "display_name": "Python [py35]", 316 | "language": "python", 317 | "name": "Python [py35]" 318 | }, 319 | "language_info": { 320 | "codemirror_mode": { 321 | "name": "ipython", 322 | "version": 3 323 | }, 324 | "file_extension": ".py", 325 | "mimetype": "text/x-python", 326 | "name": "python", 327 | "nbconvert_exporter": "python", 328 | "pygments_lexer": "ipython3", 329 | "version": "3.5.2" 330 | } 331 | }, 332 | "nbformat": 4, 333 | "nbformat_minor": 0 334 | } 335 | -------------------------------------------------------------------------------- /ipython/Labs_Student/Lab4_Survey_Questions_part1_Student.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Let's start by reading in the data" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import pandas as pd\n", 28 | "import os\n", 29 | "import numpy as np\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "%matplotlib inline\n", 32 | "\n", 33 | "\n", 34 | "#We assume data is in a parallel directory to this one called 'data'\n", 35 | "cwd = os.getcwd()\n", 36 | "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n", 37 | "#or you can hardcode the directory\n", 38 | "#datadir = \n", 39 | "\n", 40 | "print(datadir)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Now read in the data called survey_responses_2016.csv into a pandas data frame." 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "#Student put in read data command here:" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Let's look at the column headers and use something more descriptive" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 2, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "#Student put in code to look at column names" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Column names like 'profile_1-profile_7' aren't very descriptive. As a quick data maintenance task, let's rename the columns starting with 'profile'. The dictionary in the next cell maps the integer index to a descriptive text.\n", 84 | "\n", 85 | "Tactically, let's loop through each column name. Within the loop let's check whether the column name starts with 'profile.' 
If it does, let's create a new name that swaps the key with the value using profile_mapping dictionary (i.e., profile_1 -> profile_Viz). We then add the new column name to a list. If it doesn't start with 'profile' just add the old column name to the list. " 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 6, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "profile_mapping = {1:'Viz',\n", 97 | " 2:'CS',\n", 98 | " 3:'Math',\n", 99 | " 4:'Stats',\n", 100 | " 5:'ML',\n", 101 | " 6:'Bus',\n", 102 | " 7:'Com'}" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 7, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "#Student put code here to change the header names\n", 114 | "newcols = []\n", 115 | "\n", 116 | "for colname in data.columns:\n", 117 | " #finish the loop \n", 118 | " \n", 119 | "#Now swap the old columns with the values in newcols \n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "Let's use this data to illustrate common data analytic techniques. We have one numeric variable (len_answer) and different categorical variables which may carry some signal of the 'len_answer' variable. \n", 127 | "\n", 128 | "'Len_answer' is the character count of the response to the following question: \"Besides the examples given in lecture 1, discuss a case where data science has created value for some company. Please explain the company's goals and how any sort of data analysis could have helped the company achieve said goals.\" As this is a subjective business question, let's hypothesize that students with more professional experience might be more likely to give longer answers. \n", 129 | "\n", 130 | "In more technical terms, we'll test whether the variance of len_answer can be explained away by the categorical representation of a student's experience. \n", 131 | "\n", 132 | "The first thing we should do is look at the distribution of len_answer." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 3, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "#Student - plot a histogram here for len_answer\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "It looks like we have at least one strong outlier and a thick distribution around 0. Let's also use the Pandas describe() method to get a stronger sense of the distribution." 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 4, 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "data.len_answer.describe()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "Let's consider cleaning up the data. We'll remove the max value as well as those with a length less than 20 (which we think is a generous minimum to communicate a reasonable answer.\n", 169 | "\n", 170 | "Create a new data_frame that removes these outliers." 
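A sketch of one possible filter (it assumes the survey data frame is called data, as above): keep rows whose answer length is below the maximum and at least 20 characters.

outlier_filter = (data.len_answer < data.len_answer.max()) & (data.len_answer >= 20)
data_clean = data[outlier_filter]
data_clean.shape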
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 10, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [ 180 | { 181 | "data": { 182 | "text/plain": [ 183 | "(93, 20)" 184 | ] 185 | }, 186 | "execution_count": 10, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "#Student create a filtered data frame here\n", 193 | "outlier_filter = \n", 194 | "\n", 195 | "#\n", 196 | "data_clean.shape" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "Now that we have cleaned our data, let's run a pairwise t-test on each experience level to see if their difference in len_answer is statistically significant. To run a t-test, we'll need the mean, standard-deviation and count for each group. We can achieve this with a pandas groupby operation." 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 11, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/html": [ 216 | "
\n", 217 | "\n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | "
\n", 263 | "
" 264 | ], 265 | "text/plain": [ 266 | " len_answer \n", 267 | " mean std count\n", 268 | "experience \n", 269 | "2-5 years, I'm getting good at what I do! 732.222222 398.570468 18\n", 270 | "5+ years, I'm a veteran! 717.333333 269.793748 6\n", 271 | "< 2 years, I'm fresh! 489.312500 285.271501 16\n", 272 | "None, I just finished my undergrad! 507.000000 335.536253 53" 273 | ] 274 | }, 275 | "execution_count": 11, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "#Student input code here\n", 282 | "\n", 283 | "#run this to look at the grouped df\n", 284 | "data_clean_grouped" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "Visually, we can see a potential split between the [0, 2] year experience range and the [2+] experience range. Let's be more rigorous and run t-tests. Let's write a function that takes in the necessary statistics and returns a p-value.\n", 292 | "\n", 293 | "Remember, the t-stat for the difference between two means is:\n", 294 | "\n", 295 | "
$t = \\frac{\\hat{\\mu_1} - \\hat{\\mu_2}}{\\sqrt{\\frac{\\hat{\\sigma_1}^2}{n_1} + \\frac{\\hat{\\sigma_2}^2}{n_2}}}$
\n", 296 | "\n", 297 | "The p-value can be found using a t-distribution, but for simplicity, let's approximate this with the normal distribution. For the 2-tailed test, the p-value is: 2 * (1 - Norm.CDF(T))." 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 31, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "#Student complete the function\n", 309 | "from scipy.stats import norm\n", 310 | "def pvalue_diffmeans_twotail(mu1, sig1, n1, mu2, sig2, n2):\n", 311 | " '''\n", 312 | " P-value calculator for the hypothesis test of mu1 != mu2.\n", 313 | " Takes in the approprate inputs to compute the t-statistic for the difference between means\n", 314 | " Outputs a p-value for a two-sample t-test.\n", 315 | " '''\n", 316 | "\n", 317 | " \n", 318 | " return (t, p_value)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "Now loop through all possible pairs in data_clean_grouped and perform a t-test." 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 10, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "#Student put in code here:\n", 337 | "\n", 338 | "\n", 339 | "get distinct values in the data frame for the experience variable\n", 340 | "grps = \n", 341 | "\n", 342 | "#Now loop through each pair\n", 343 | "for i, grp1 in enumerate(grps):\n", 344 | " for grp2 in grps[i + 1:]:\n", 345 | " \n", 346 | " '''\n", 347 | " Also, the result of groupby uses a multi-index. So be sure to index on 'len_answer' as well.\n", 348 | " Then pull out the mean, std, and cnt from that result. \n", 349 | " ''' \n", 350 | "\n", 351 | " #some code should go here\n", 352 | " \n", 353 | " print('Two tailed T-Test between groups: {} and {}'.format(grp1, grp2))\n", 354 | " print('Diff = {} characters'.format(round(row1['mean'] - row2['mean'], 0)))\n", 355 | " print('The t-stat is {} and p-value is {}'.format(round(tstat, 3), round(p_value, 3)))\n", 356 | " print('')" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "What are some observations you might have about the above results? Are there any with large deviances that are not statistically significant at at least a 95% level?\n", 364 | "\n", 365 | "Also, how do the numbers change if you rerun it using the original data, and not the cleaned data. What is the effect of outliers on the results?" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 11, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "#Rerun everything without cleaning outliers\n", 377 | "\n", 378 | "grps = \n", 379 | "\n", 380 | "#Now loop through each pair\n", 381 | "for i, grp1 in enumerate(grps):\n", 382 | " for grp2 in grps[i + 1:]:\n", 383 | " \n", 384 | " '''\n", 385 | " Also, the result of groupby uses a multi-index. So be sure to index on 'len_answer' as well.\n", 386 | " Then pull out the mean, std, and cnt from that result. 
\n", 387 | " ''' \n", 388 | " \n", 389 | " print('Two tailed T-Test between groups: {} and {}'.format(grp1, grp2))\n", 390 | " print('Diff = {} characters'.format(round(row1['mean'] - row2['mean'], 0)))\n", 391 | " print('The t-stat is {} and p-value is {}'.format(round(tstat, 3), round(p_value, 3)))\n", 392 | " print('')" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "collapsed": true 400 | }, 401 | "outputs": [], 402 | "source": [] 403 | } 404 | ], 405 | "metadata": { 406 | "anaconda-cloud": {}, 407 | "kernelspec": { 408 | "display_name": "Python [py35]", 409 | "language": "python", 410 | "name": "Python [py35]" 411 | }, 412 | "language_info": { 413 | "codemirror_mode": { 414 | "name": "ipython", 415 | "version": 3 416 | }, 417 | "file_extension": ".py", 418 | "mimetype": "text/x-python", 419 | "name": "python", 420 | "nbconvert_exporter": "python", 421 | "pygments_lexer": "ipython3", 422 | "version": "3.5.2" 423 | } 424 | }, 425 | "nbformat": 4, 426 | "nbformat_minor": 0 427 | } 428 | -------------------------------------------------------------------------------- /ipython/Labs_Student/Lab4_intro_regex.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Data Science\n", 8 | "## Lab 4: Intro to Regular Expressions" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "1\\. I realize that question 2 on the homework might be a little difficult; let's walk through the problem" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "import re # you might find this package useful\n", 27 | "\n", 28 | "contributors = dict()\n", 29 | "\n", 30 | "# Read through each line of the data\n", 31 | "f = open(\"C:/Users/kevin/Documents/GitHub/DataScienceCourse/ipython/data/osquery_contributors.html\", \"r\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "for line in f:\n", 43 | " print(line)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "https://developers.google.com/edu/python/regular-expressions: useful introduction to regular expressions!" 
60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "match = re.search(pat, str)\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "str = 'an example word:cat!!'\n", 78 | "match = re.search('word:\\w\\w\\w', str)\n" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# If-statement after search() tests if it succeeded\n", 90 | "if match: \n", 91 | " print('found', match.group()) \n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "## Search for pattern 'iii' in string 'piiig'.\n", 103 | "## All of the pattern must match, but it may appear anywhere.\n", 104 | "## On success, match.group() is matched text.\n", 105 | "match = re.search('iii', 'piiig') \n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "match.group()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "match = re.search('igs', 'piiig') \n", 128 | "match.group()\n" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "## . = any char but \\n\n", 140 | "match = re.search('..g', 'piiig') \n", 141 | "match.group()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [], 151 | "source": [ 152 | "## \\d = digit char, \\w = word char\n", 153 | "match = re.search('\\d\\d\\d', 'p123g')\n", 154 | "match.group()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "match = re.search('\\w\\w\\w', '@@abcd!!')\n", 166 | "match.group()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | " ## i+ = one or more i's, as many as possible.\n", 178 | "\n", 179 | "\n", 180 | "match = re.search('pi+', 'piiig')\n", 181 | "match.group()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "## Finds the first/leftmost solution, and within it drives the +\n", 193 | " ## as far as possible (aka 'leftmost and largest').\n", 194 | " ## In this example, note that it does not get to the second set of i's.\n", 195 | "\n", 196 | "match = re.search('i+', 'piigiiii')" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "## \\s* = zero or more whitespace chars\n", 208 | " ## Here look for 3 digits, possibly separated by whitespace.\n", 209 | "match = re.search('\\d\\s*\\d\\s*\\d', 'xx1 2 3xx') \n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | 
"execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "match = re.search('\\d\\s*\\d\\s*\\d', 'xx12 3xx') \n" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "match = re.search('\\d\\s*\\d\\s*\\d', 'xx123xx')" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "## ^ = matches the start of string, so this fails:\n", 243 | "match = re.search('^b\\w+', 'foobar') \n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "## but without the ^ it succeeds:\n", 255 | "match = re.search('b\\w+', 'foobar')\n", 256 | "match.group()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": false 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "str = 'purple alice-b@google.com monkey dishwasher'\n", 268 | "match = re.search( %Insert code here%%, str)\n" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "str = 'purple alice-b@google.com monkey dishwasher'\n", 280 | "match = re.search('([\\w.-]+)@([\\w.-]+)', str)" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "collapsed": false 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "match.group()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": { 298 | "collapsed": true 299 | }, 300 | "outputs": [], 301 | "source": [] 302 | } 303 | ], 304 | "metadata": { 305 | "kernelspec": { 306 | "display_name": "Python 3", 307 | "language": "python", 308 | "name": "python3" 309 | }, 310 | "language_info": { 311 | "codemirror_mode": { 312 | "name": "ipython", 313 | "version": 3 314 | }, 315 | "file_extension": ".py", 316 | "mimetype": "text/x-python", 317 | "name": "python", 318 | "nbconvert_exporter": "python", 319 | "pygments_lexer": "ipython3", 320 | "version": "3.6.0" 321 | } 322 | }, 323 | "nbformat": 4, 324 | "nbformat_minor": 0 325 | } 326 | -------------------------------------------------------------------------------- /ipython/Labs_Student/Lab_6_FeatureRanking_AUC_Student.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "In this lab we'll look at:\n", 10 | "- How to build ROC curves\n", 11 | "- Use two different evaluation metrics to perform feature ranking\n", 12 | "- Compare/contrast the results of feature ranking on different evaluation measures\n", 13 | "- Build models on subsets of the features, using the different methods to select the subset\n", 14 | "- Compare these different models\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "import pandas as pd\n", 26 | "import numpy as np\n", 27 | "import matplotlib.pyplot as plt\n", 28 | "from scipy.stats import entropy\n", 29 | "import os\n", 30 | "\n", 31 | 
"%matplotlib inline" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "First we'll load the dataset and take a quick peak at its columns and size" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "#load dataset\n", 50 | "cwd = os.getcwd()\n", 51 | "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n", 52 | "f = datadir + 'ads_dataset_cut.txt'\n", 53 | "data = pd.read_csv(f, sep = '\\t')\n", 54 | "data.columns, data.shape" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "In the next step we'll use the Decision Tree classifier's built in feature importance attribute to compute the normalized Mutual Information/Information Gain of each feature. Note a few things about this approach: 1). With extremely high dimensional data, one may want to calculate the normalized MI directly for each feature (the code to do that is a bit more complex so we used the DT instead), 2). The DT is a greedy algorithm, so the feature importance ranks it produces may not be equal to the rank of normalized MI calculated individually." 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 88, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "#import the decision tree module from sklearn\n", 73 | "from sklearn.tree import DecisionTreeClassifier\n", 74 | "\n", 75 | "#build a decision tree with max_depth = 20 using entropy\n", 76 | "Y = data['y_buy']\n", 77 | "X = data.drop('y_buy', 1)\n", 78 | "\n", 79 | "#Student - instantiate the DT\n", 80 | "dt = \n", 81 | "#Student - now fit the DT\n", 82 | "\n", 83 | "#Student - Now use built in feature importance attribute to get MI of each feature and Y\n", 84 | "feature_mi = " 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "Now we'll add the feature importances to a dictionary where key-values are: {feature_name:dt_feature_importance}. This can be done in one line using the zip and dict functions." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 89, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "#Student - Add features and their importances to a dictionary\n", 103 | "feature_mi_dict = " 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Now we are going to compute feature ranks using AUC. We can do this without fitting a model, by just seeing how well the individual feature ranks the positives and negatives." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 99, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "#define a function to print ROC curves. \n", 122 | "#It should take in only arrays/lists of predictions and outcomes\n", 123 | "from sklearn.metrics import roc_curve, auc\n", 124 | "\n", 125 | "def plotUnivariateROC(preds, truth, label_string):\n", 126 | " '''\n", 127 | " preds is an nx1 array of predictions\n", 128 | " truth is an nx1 array of truth labels\n", 129 | " label_string is text to go into the plotting label\n", 130 | " '''\n", 131 | " #Student input code here\n", 132 | " #1. call the roc_curve function to get the ROC X and Y values\n", 133 | " fpr, tpr, thresholds = \n", 134 | " #2. 
Input fpr and tpr into the auc function to get the AUC\n", 135 | " roc_auc = \n", 136 | " \n", 137 | " #we are doing this as a special case because we are sending unfitted predictions\n", 138 | " #into the function\n", 139 | " if roc_auc < 0.5:\n", 140 | " fpr, tpr, thresholds = roc_curve(truth, -1 * preds)\n", 141 | " roc_auc = auc(fpr, tpr)\n", 142 | "\n", 143 | " #chooses a random color for plotting\n", 144 | " c = (np.random.rand(), np.random.rand(), np.random.rand())\n", 145 | "\n", 146 | " #create a plot and set some options\n", 147 | " plt.plot(fpr, tpr, color = c, label = label_string + ' (AUC = %0.3f)' % roc_auc)\n", 148 | " \n", 149 | "\n", 150 | " plt.plot([0, 1], [0, 1], 'k--')\n", 151 | " plt.xlim([0.0, 1.0])\n", 152 | " plt.ylim([0.0, 1.0])\n", 153 | " plt.xlabel('FPR')\n", 154 | " plt.ylabel('TPR')\n", 155 | " plt.title('ROC')\n", 156 | " plt.legend(loc=\"lower right\")\n", 157 | " \n", 158 | " return roc_auc" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Next we'll run each feature through the above function to get its invdividual AUC and also plot on a chart. We add some extra lines of matplotlib code to control the formatting and position of the legend. We also want to add each to a dictionary of the format {feature_name:feature_auc}, similar to what we did above (though not using the same one liner). Take some time to review the chart and think about why different features produce differently shaped curves. " 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 6, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "\n", 177 | "fig = plt.figure(figsize = (12, 6))\n", 178 | "ax = plt.subplot(111)\n", 179 | "\n", 180 | "#Plot the univariate AUC on the training data. Store the AUC\n", 181 | "\n", 182 | "feature_auc_dict = {}\n", 183 | "for col in data.drop('y_buy',1).columns:\n", 184 | " #Student put code here\n", 185 | " feature_auc_dict[col] = \n", 186 | "\n", 187 | "\n", 188 | "# Put a legend below current axis\n", 189 | "box = ax.get_position()\n", 190 | "ax.set_position([box.x0, box.y0 + box.height * 0.0 , box.width, box.height * 1])\n", 191 | "ax.legend(loc = 'upper center', bbox_to_anchor = (0.5, -0.15), fancybox = True, \n", 192 | " shadow = True, ncol = 4, prop = {'size':10})" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "Next we want to add both of the dictionaries created above into a data frame." 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 7, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "#Student - Add auc and mi each to a single dataframe\n", 211 | "df_auc = \n", 212 | "df_mi = \n", 213 | "\n", 214 | "#Student - Now merge the two on the feature name\n", 215 | "feat_imp_df = \n", 216 | "feat_imp_df" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "To put the different metrics on the same scale, we'll use pandas rank() method for each feature." 
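
One way the next cell could be filled in is sketched below. This is only an illustration to check your work against, not the single correct answer; it assumes the merged frame from the previous cell is named `feat_imp_df`, with the AUC column first and the tree-importance (MI) column second.

```python
# Illustrative sketch -- feat_imp_df is assumed to be the merged AUC/MI frame built above
feat_ranks = feat_imp_df.rank()          # pandas ranks each metric column independently (1 = lowest)

plt.scatter(feat_ranks.iloc[:, 0], feat_ranks.iloc[:, 1])   # AUC rank vs. MI rank

# y = x reference line: features that both metrics order identically fall on this line
n = feat_ranks.shape[0]
plt.plot([1, n], [1, n], 'k--')
plt.xlabel('AUC rank')
plt.ylabel('MI rank')
```
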
224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 8, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "#Student - Now create a df that holds the ranks of auc and mi \n", 235 | "feat_ranks =\n", 236 | "\n", 237 | "#Student - Plot the two ranks\n", 238 | "\n", 239 | "#Student - Plot a y=x reference line\n" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 10, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "#Student - Now create lists of top 5 features for both auc and mi\n", 251 | "top5_auc = \n", 252 | "top5_mi = \n", 253 | "top5_auc, top5_mi" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "The next step is the conclusive step from all the analysis done above. We want to test which method above can be used to produce the best subset of features. What we'll do is use the top 5 features ranked by both AUC and the decision tree feature importance and compare them against each other with different algorithms." 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 14, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "'''\n", 272 | "Now do the following\n", 273 | "1. Split the data into 80/20 train/test\n", 274 | "2. For each set of features:\n", 275 | "- build a decision trees max_depth = 5 \n", 276 | "- build a logistic regression C = 100\n", 277 | "- get the auc and log-loss on the test set\n", 278 | "'''\n", 279 | "\n", 280 | "\n", 281 | "def getLogLoss(Ps, Ys, eps = 10**-6):\n", 282 | " return ((Ys == 1) * np.log(Ps + eps) + (Ys == 0) * np.log(1 - Ps + eps)).mean()\n", 283 | "\n", 284 | "#Student - Split into train and test randomly without using sklearn package\n", 285 | "#Note, there are many ways to do this:\n", 286 | "\n", 287 | "train_pct = 0.8\n", 288 | "#1. create an array of n random uniform variables drawn on [0,1] range\n", 289 | "rand = \n", 290 | "#2. 
Convert to boolean where True = random number < train_pct\n", 291 | "rand_filt = \n", 292 | "\n", 293 | "#Student - Use the filter to index data into training and test data sets\n", 294 | "train = \n", 295 | "test = \n", 296 | "\n", 297 | "\n", 298 | "fsets = [top5_auc, top5_mi]\n", 299 | "fset_descr = ['auc', 'mi']\n", 300 | "mxdepths = [5]\n", 301 | "Cs = [10**2]\n", 302 | "\n", 303 | "\n", 304 | "#Set up plotting box\n", 305 | "fig = plt.figure(figsize = (15, 8))\n", 306 | "ax = plt.subplot(111)\n", 307 | "\n", 308 | "\n", 309 | "\n", 310 | "for i, fset in enumerate(fsets):\n", 311 | " \n", 312 | " descr = fset_descr[i]\n", 313 | " #set training and testing data\n", 314 | " Y_train = train['y_buy']\n", 315 | " X_train = train[fset]\n", 316 | " Y_test = test['y_buy']\n", 317 | " X_test = test[fset]\n", 318 | " \n", 319 | " \n", 320 | " #Student - for all d in mxdepths and C in Cs, build DT's and LR's respectively\n", 321 | " # get the predictions on the test set and also get the log-loss, then plot\n", 322 | " \n", 323 | " #Student - instantiate the class\n", 324 | " dt = \n", 325 | " #Don't forget to fit the tree\n", 326 | " #Now make a prediction\n", 327 | " preds_dt = \n", 328 | " #Now compute the log-loss\n", 329 | " ll_dt = \n", 330 | " \n", 331 | " plotUnivariateROC(preds_dt, Y_test, '{}:DT:md={}:(LL={})'.format(descr, d, round(ll_dt, 3)))\n", 332 | "\n", 333 | " \n", 334 | " #Student - instantiate the class\n", 335 | " lr = \n", 336 | " #Don't forget to fit the LR\n", 337 | " #Now make a prediction\n", 338 | " preds_lr = \n", 339 | " #Now compute the log-loss\n", 340 | " ll_lr = \n", 341 | "\n", 342 | " plotUnivariateROC(preds_lr, Y_test, '{}:LR:C={}:(LL={})'.format(descr, C, round(ll_lr, 3)))\n", 343 | "\n", 344 | " \n", 345 | "# Put a legend below current axis\n", 346 | "box = ax.get_position()\n", 347 | "ax.set_position([box.x0, box.y0 + box.height * 0.0 , box.width, box.height * 1])\n", 348 | "ax.legend(loc = 'upper center', bbox_to_anchor = (0.5, -0.15), fancybox = True, \n", 349 | " shadow = True, ncol = 2, prop = {'size':10})\n" 350 | ] 351 | } 352 | ], 353 | "metadata": { 354 | "anaconda-cloud": {}, 355 | "kernelspec": { 356 | "display_name": "Python [py35]", 357 | "language": "python", 358 | "name": "Python [py35]" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.5.2" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 0 375 | } 376 | -------------------------------------------------------------------------------- /ipython/Labs_Student/Lab_7_sklearn_magic_student.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "In this lab we'll demonstrate several common techniques and helpful tools used in a model building process:\n", 10 | "\n", 11 | "- Use Sklearn to generate polynomial features and rescale them\n", 12 | "- Create folds for cross-validation\n", 13 | "- Perform a grid search to optimize hyper-parameters using cross-validation\n", 14 | "- Create pipelines to perform grids search in less code\n", 15 | "- Improve upon a baseline model incrementally by adding in more complexity\n", 16 | "\n", 17 | "This lab will require using several Sklearn classes. 
It would be helpful to refer to appropriate documentation:\n", 18 | "- http://scikit-learn.org/stable/modules/preprocessing.html\n", 19 | "- http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler\n", 20 | "- http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html#sklearn.preprocessing.PolynomialFeatures\n", 21 | "- http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV\n", 22 | "- http://scikit-learn.org/stable/modules/pipeline.html#pipeline\n", 23 | "\n", 24 | "Also, here is a helpful tutorial that explains how to use much of the above:\n", 25 | "- https://civisanalytics.com/blog/data-science/2016/01/06/workflows-python-using-pipeline-gridsearchcv-for-compact-code/\n", 26 | "\n", 27 | "Like always, let's first load in the data.\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "import os\n", 39 | "import pandas as pd\n", 40 | "from sklearn.linear_model import LogisticRegression\n", 41 | "from sklearn.grid_search import GridSearchCV\n", 42 | "from sklearn.cross_validation import KFold\n", 43 | "cwd = os.getcwd()\n", 44 | "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n", 45 | "\n", 46 | "data = pd.read_csv(datadir + 'Cell2Cell_data.csv', header=0, sep=',')\n", 47 | "\n", 48 | "#Randomly sort the data:\n", 49 | "data = data.sample(frac = 1)\n", 50 | "data.columns" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Next we're going to prep the data. From prior analysis (Churn Case Study) we learned that we can drop a few variables, as they are either highly redundant or don't carry a strong relationship with the outcome.\n", 58 | "\n", 59 | "After dropping, we're going to use the SkLearn KFold class to set up cross validation fold indexes." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "#Prior analysis (from Churn Case study) has shown that we can drop a few redundant variables\n", 71 | "#We want to drop a few to speed up later calculations\n", 72 | "dropvar_list = ['incalls', 'creditcd', 'marryyes', 'travel', 'pcown']\n", 73 | "data_subset = data.drop(dropvar_list, 1)\n", 74 | "\n", 75 | "#Set up X and Y\n", 76 | "X = data_subset.drop('churndep', 1)\n", 77 | "Y = data_subset['churndep']\n", 78 | "\n", 79 | "#Use Kfold to create 4 folds\n", 80 | "kfolds = KFold(#Student -nput code here)\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "Next let's use cross-validation to build a baseline model. We're going to use LR with no feature pre-processing. We're going to look at both L1 and L2 regularization with different weights. We can do this very succinctly with SkLearns GridSearchCV package." 
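
As a reference point, the completed version of this lab (Labs_complete/lab_7_sklearn_complete.ipynb) fills the next cell in essentially the following way; treat it as one workable grid rather than the only valid choice.

```python
# Baseline: logistic regression with no feature preprocessing.
# Search a log-spaced C grid and both penalties, scored by log-loss over the folds created above.
param_grid_lr = {'C': [10**i for i in range(-3, 3)], 'penalty': ['l1', 'l2']}

lr_grid_search = GridSearchCV(LogisticRegression(), param_grid_lr,
                              cv=kfolds, scoring='log_loss')
lr_grid_search.fit(X, Y)

best_1 = lr_grid_search.best_score_   # negated log-loss (sklearn maximizes scores); about -0.68 in the completed lab
print(best_1)
```
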
88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "#1st, set up a paramater grid\n", 99 | "param_grid_lr = {'C':#Student put code here, \n", 100 | " 'penalty':#Student put code here}\n", 101 | "\n", 102 | "#2nd, call the GridSearchCV class, use LogisticRegression and 'log_loss' for scoring\n", 103 | "lr_grid_search = GridSearchCV(#Student put code here) \n", 104 | "lr_grid_search.fit(X, Y)\n", 105 | "\n", 106 | "#3rd, get the score of the best model and print it\n", 107 | "best_1 = #Student put code here\n", 108 | "print(best_1)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 4, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "#Next let's look at the best-estimator chosen to see what the parameters were\n", 120 | "lr_grid_search.#Student put code here" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Now let's see if we can beat this by standardizing the features. We'll approach this using the GridSearchCV class but also build a pipeline. Later we'll extend the pipeline to allow for feature engineering as well." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "from sklearn.pipeline import Pipeline\n", 139 | "from sklearn.preprocessing import StandardScaler\n", 140 | "\n", 141 | "#Create a set of steps. All but the last step is a transformer (something that processes data). \n", 142 | "#Build a list of steps, where the first is StandardScaler and the second is LogisticRegression\n", 143 | "#The last step should be an estimator.\n", 144 | "\n", 145 | "steps = [('scaler', #Student put code here,\n", 146 | " ('lr', #Student put code here)]\n", 147 | "\n", 148 | "#Now set up the pipeline\n", 149 | "pipeline = Pipeline(#Student put code here)\n", 150 | "\n", 151 | "#Now set up the parameter grid, paying close to the correct convention here\n", 152 | "parameters_scaler = dict(lr__C = #Student put code here,\n", 153 | " lr__penalty = #Student put code here)\n", 154 | "\n", 155 | "#Now run another grid search\n", 156 | "lr_grid_search_scaler = GridSearchCV(#Student put code here)\n", 157 | " \n", 158 | "#Don't forget to fit this GridSearchCV pipeline\n", 159 | "#Student put code here\n", 160 | "\n", 161 | "#Again, print the score of the best model\n", 162 | "best_2 = #Student put code here\n", 163 | "print(best_2)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 6, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "#Let's see the model after scaling. Did the optimal parameters change?\n", 175 | "lr_grid_search_scaler.best_estimator_.steps[-1][1]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Now that we've built a pipeline estimator that performs feature scaling and then logistic regression, let's add to it a feature engineering step. We'll then again use GridSearchCV to find an optimal parameter configuration and see if we can beat our best score above." 
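
For this step too, a sketch may help. The step names ('polyfeat', 'scaler', 'lr') match the completed lab; the particular degree/interaction values below are one reading of the instructions in the next cell and are an assumption, not the official grid.

```python
from sklearn.preprocessing import PolynomialFeatures   # the other classes were imported earlier in the notebook

# Three-step pipeline: polynomial expansion -> standardization -> logistic regression
steps_poly = [('polyfeat', PolynomialFeatures()),
              ('scaler', StandardScaler()),
              ('lr', LogisticRegression())]
pipeline_poly = Pipeline(steps_poly)

# Grid: degree-2 features, with and without interaction-only terms, plus the same LR grid as before
parameters_poly = dict(polyfeat__degree=[2],
                       polyfeat__interaction_only=[True, False],
                       lr__C=[10**i for i in range(-3, 3)],
                       lr__penalty=['l1', 'l2'])

lr_grid_search_poly = GridSearchCV(pipeline_poly, param_grid=parameters_poly,
                                   cv=kfolds, scoring='log_loss')
lr_grid_search_poly.fit(X, Y)
print(lr_grid_search_poly.best_score_)
```
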
183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 7, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "from sklearn.preprocessing import PolynomialFeatures\n", 194 | "\n", 195 | "#Create a set of steps. All but the last step is a transformer (something that processes data). \n", 196 | "# Step 1 - PolynomialFeatures\n", 197 | "# Step 2 - StandardScaler\n", 198 | "# Step 3 - LogisticRegression\n", 199 | "\n", 200 | "steps_poly = [#Student put code here]\n", 201 | "\n", 202 | "#Now set up the pipeline\n", 203 | "pipeline_poly = #Student put code here\n", 204 | "\n", 205 | "#Now set up a new parameter grid, use the same paramaters used above for logistic regression, \n", 206 | "#but add polynomial features up to degree 2 with and without interactions. \n", 207 | "parameters_poly = dict(#Student put code here)\n", 208 | "\n", 209 | "#Now run another grid search\n", 210 | "lr_grid_search_poly = #Student put code here\n", 211 | "lr_grid_search_poly.fit(X, Y)\n", 212 | "\n", 213 | "best_3 = lr_grid_search_poly.best_score_\n", 214 | "print(best_3)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 8, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "#Let's look at the best estimator, stepwise\n", 226 | "lr_grid_search_poly.best_estimator_.steps" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "Now make a bar chart to plot results" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 9, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "import numpy as np\n", 245 | "results = -1 * np.array([best_1, best_2, best_3])\n", 246 | "labs = ['LR', 'Scaler-LR', 'Poly-Scaler-LR']\n", 247 | "\n", 248 | "fig = plt.figure(facecolor = 'w', figsize = (12, 6))\n", 249 | "ax = plt.subplot(111)\n", 250 | "\n", 251 | "width = 0.5\n", 252 | "ind = np.arange(3)\n", 253 | "rec = ax.bar(ind + width, results, width, color='r')\n", 254 | "\n", 255 | "ax.set_xticks(ind + width)\n", 256 | "ax.set_xticklabels(labs, size = 14)\n", 257 | "ax.set_ylim([0.6, 0.7])\n", 258 | "\n", 259 | "plt.plot(np.arange(4), min(results) * np.ones(4))" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "collapsed": true 267 | }, 268 | "outputs": [], 269 | "source": [] 270 | } 271 | ], 272 | "metadata": { 273 | "anaconda-cloud": {}, 274 | "kernelspec": { 275 | "display_name": "Python [py35]", 276 | "language": "python", 277 | "name": "Python [py35]" 278 | }, 279 | "language_info": { 280 | "codemirror_mode": { 281 | "name": "ipython", 282 | "version": 3 283 | }, 284 | "file_extension": ".py", 285 | "mimetype": "text/x-python", 286 | "name": "python", 287 | "nbconvert_exporter": "python", 288 | "pygments_lexer": "ipython3", 289 | "version": "3.5.2" 290 | } 291 | }, 292 | "nbformat": 4, 293 | "nbformat_minor": 0 294 | } 295 | -------------------------------------------------------------------------------- /ipython/Labs_Student/NumPyBasics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "'''\n", 12 | "The core data type in Numpy is the ndarray, which enables fast and space-efficient multidimensional array 
processing.\n", 13 | "Note: This notebook is adapted from chapter 4 Python for Data Analysis by Wes McKinney and O'Reilly publishing. NumPy has many, \n", 14 | "many features that won't be covered here. The following snippets are just to illustrate basic data types and operations within\n", 15 | "numpy.\n", 16 | "\n", 17 | "Another good resource for learning more about ndarrays is here:\n", 18 | "http://docs.scipy.org/doc/numpy/reference/arrays.html\n", 19 | "'''\n", 20 | "\n", 21 | "#First, import NumPy\n", 22 | "import numpy as np\n", 23 | "\n", 24 | "#It is easy to create Nx1 and NxM arrays from standard Python lists\n", 25 | "l1 = [0,1,2]\n", 26 | "l2 = [3,4,5]\n", 27 | "\n", 28 | "nd1 = np.array(l1)\n", 29 | "nd2 = np.array([l1, l2])" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "The ndarray has dimension n=3 and m=1\n", 44 | "The ndarray has elements of type=int64\n", 45 | "The ndarray has dimension n=2 and m=3\n", 46 | "The ndarray has elements of type=int64\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "#Now, we can get ask for some basic info to describe the ndarray\n", 52 | "def desc_ndarray(nd):\n", 53 | " try:\n", 54 | " print \"The ndarray has dimension n=%s and m=%s\" % (nd.shape[0],nd.shape[1])\n", 55 | " except:\n", 56 | " print \"The ndarray has dimension n=%s and m=1\" % nd.shape[0]\n", 57 | " print \"The ndarray has elements of type=%s\" % nd.dtype\n", 58 | "\n", 59 | "desc_ndarray(nd1)\n", 60 | "\n", 61 | "desc_ndarray(nd2)\n", 62 | "\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": { 69 | "collapsed": false 70 | }, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "[array([ 0., 0., 0., 0.]),\n", 76 | " array([ 1., 1., 1., 1.]),\n", 77 | " array([ 0.47121338, 1.83328779, 0.4438019 , -0.52309325])]" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "#There are short cuts for creating certain frequently used special ndarrays, i.e.,\n", 87 | "\n", 88 | "k=4\n", 89 | "\n", 90 | "#1. an ndarray of all zeros\n", 91 | "zero = np.zeros(k)\n", 92 | "\n", 93 | "#2. an ndarray of all ones\n", 94 | "one = np.ones(k)\n", 95 | "\n", 96 | "#3. an ndarray of random elements (this one is standard normal, but there are many distributions to choose from)\n", 97 | "rand = np.random.randn(k)\n", 98 | "\n", 99 | "[zero, one, rand]" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 4, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "[array([[ 0.69394907, 0.85723722],\n", 113 | " [-0.16779156, 0.41709003],\n", 114 | " [-0.94008249, -0.21591983],\n", 115 | " [-0.61305106, 0.41435495]]),\n", 116 | " array([-0.16779156, 0.41709003]),\n", 117 | " 0.41709003439166575]" 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "'''\n", 127 | "For indexing an array:\n", 128 | "1. If nx1 array, follow the same protocol as a regular Python list\n", 129 | "2. 
If nxm array use the following examples\n", 130 | "'''\n", 131 | "\n", 132 | "arr2d = np.random.randn(4,2)\n", 133 | "\n", 134 | "#A single index gets a full row\n", 135 | "\n", 136 | "#2 indexes returns a value\n", 137 | "[arr2d, arr2d[1], arr2d[1,1]]" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 5, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "[array([-0.4386254 , -0.67720483, -1.19775067, -0.21300288]),\n", 151 | " array([-0.8772508 , -1.35440967, -2.39550135, -0.42600575]),\n", 152 | " array([-0.8772508 , -1.35440967, -2.39550135, -0.42600575]),\n", 153 | " array([-0., -0., -0., -0.])]" 154 | ] 155 | }, 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "'''\n", 163 | "Operations between Arrays and Scalars\n", 164 | "An important feature of ndarrays is they allow batch operations on data without writing any for loops. \n", 165 | "This is called vectorization.\n", 166 | "Any arithmetic operations between equal-size arrays applies the operation elementwise. \n", 167 | "'''\n", 168 | "\n", 169 | "#examples\n", 170 | "\n", 171 | "k = 4\n", 172 | "rand = np.random.randn(k)\n", 173 | "[rand, rand + rand, 2*rand, rand*np.zeros(4)]\n", 174 | "\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 7, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "[array([ 0.19631415, 0.41059714, 4.26249299]),\n", 188 | " array([-1.46310809, 1.15559786, 0.10690073]),\n", 189 | " array([-1.26679394, 1.566195 , 4.36939372])]" 190 | ] 191 | }, 192 | "execution_count": 7, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "'''\n", 199 | "Matrix operations\n", 200 | "It is easy to do matrix operations with Nd arrays. The standard arithmetic operators don't work here though. 
And it is important \n", 201 | "to make sure matrix shapes are compatible\n", 202 | "'''\n", 203 | "\n", 204 | "k = 3\n", 205 | "r1 = np.random.randn(k)\n", 206 | "r2 = np.random.randn(k)\n", 207 | "\n", 208 | "#Matrix addition is the standard matrix operator\n", 209 | "[r1, r2 , r1 + r2]\n" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 8, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "[array([[ 0.19631415, 0.41059714, 4.26249299],\n", 223 | " [-1.46310809, 1.15559786, 0.10690073]]),\n", 224 | " array([[ 0.19631415, -1.46310809],\n", 225 | " [ 0.41059714, 1.15559786],\n", 226 | " [ 4.26249299, 0.10690073]])]" 227 | ] 228 | }, 229 | "execution_count": 8, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "#The Transpose can be taken with the attribute T\n", 236 | "arr2d = np.array([r1, r2])\n", 237 | "[arr2d, arr2d.T]" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 9, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "[array([[ 0.19631415, 0.41059714, 4.26249299],\n", 251 | " [-1.46310809, 1.15559786, 0.10690073]]),\n", 252 | " array([[ 3.85392468e-02, 1.68590015e-01, 1.81688465e+01],\n", 253 | " [ 2.14068529e+00, 1.33540642e+00, 1.14277663e-02]]),\n", 254 | " array([[ 18.37597578, 0.64291997],\n", 255 | " [ 0.64291997, 3.48751947]])]" 256 | ] 257 | }, 258 | "execution_count": 9, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "'''\n", 265 | "Matrix multiplication, like inner products can be done on arrays.\n", 266 | "Just remember that the standard multiplication operator does elementwise multiplication (provided they are the same shape).\n", 267 | "We need the dot method in order to do an inner product\n", 268 | "\n", 269 | "Numpy has a linalg library that can run most matrix operations on ndarrays:\n", 270 | "http://docs.scipy.org/doc/numpy/reference/routines.linalg.html\n", 271 | "\n", 272 | "One can also create a matrix object and use the methods in numpy.matrix to achieve the same thing:\n", 273 | "http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.html\n", 274 | "'''\n", 275 | "\n", 276 | "[arr2d, arr2d * arr2d, arr2d.dot(arr2d.T)]" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 11, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "10000 loops, best of 3: 119 µs per loop\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "'''\n", 296 | "One important feature of vectorization is that it allows elementwise processing that is much faster than writing a traditional\n", 297 | "loop.\n", 298 | "'''\n", 299 | "import math\n", 300 | "\n", 301 | "#show an example and profile i\n", 302 | "%timeit [math.sqrt(x) for x in range(1000)]" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 12, 308 | "metadata": { 309 | "collapsed": false 310 | }, 311 | "outputs": [ 312 | { 313 | "name": "stdout", 314 | "output_type": "stream", 315 | "text": [ 316 | "The slowest run took 9.83 times longer than the fastest. 
This could mean that an intermediate result is being cached \n", 317 | "100000 loops, best of 3: 5.19 µs per loop\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "%timeit np.sqrt(np.arange(1000))" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 16, 328 | "metadata": { 329 | "collapsed": false 330 | }, 331 | "outputs": [ 332 | { 333 | "name": "stderr", 334 | "output_type": "stream", 335 | "text": [ 336 | "ERROR: Line magic function `%inline` not found.\n" 337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "'''\n", 342 | "The last thing we'll cover in this module is the numpy.random library. In general, it is advised to use numpy for\n", 343 | "random number generation as opposed to python's built in random module.\n", 344 | "\n", 345 | "Random number generation has many uses. One common use is generating fake (i.e. random) data to test modeling procedures\n", 346 | "or to do Monte Carlo Simulations\n", 347 | "'''\n", 348 | "import matplotlib.pyplot as plt\n", 349 | "%inline\n", 350 | "\n", 351 | "\n", 352 | "#Generate random pairs that have a multivariate normal distribution\n", 353 | "N = 1000\n", 354 | "mu = np.array([0,0])\n", 355 | "cov = 0.5\n", 356 | "sig = np.array([[1, cov],[cov, 1]]) #Must be square, symmetric and non-negative definite\n", 357 | "x, y = np.random.multivariate_normal(mu, sig, N).T\n", 358 | "#Now let's plot and see what that looks like\n", 359 | "\n", 360 | "\n", 361 | "plt.plot(x, y,'x'); plt.axis('equal'); plt.show()\n", 362 | "\n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 18, 368 | "metadata": { 369 | "collapsed": false 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "'''\n", 374 | "One final example (taken from Wes Mckinney's book):\n", 375 | "\n", 376 | "Let's generate a random walk and visualize it\n", 377 | "'''\n", 378 | "import matplotlib.pyplot as plt\n", 379 | "\n", 380 | "nsteps = 1000\n", 381 | "draws = np.random.randint(0, 2, size = nsteps) #Randint let's us generate random integers in a range\n", 382 | "steps = np.where(draws>0, 1, -1) #there function let's us do boolean logic on a conditional applied to an entire array\n", 383 | "walk = steps.cumsum() #Cumsum returns an array with the same size as steps, that has cum sum of steps up to index i\n", 384 | "plt.plot(np.arange(len(walk)), walk);plt.show()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 30, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [], 394 | "source": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": { 400 | "collapsed": false 401 | }, 402 | "outputs": [], 403 | "source": [] 404 | } 405 | ], 406 | "metadata": { 407 | "kernelspec": { 408 | "display_name": "Python 3", 409 | "language": "python", 410 | "name": "python3" 411 | }, 412 | "language_info": { 413 | "codemirror_mode": { 414 | "name": "ipython", 415 | "version": 3 416 | }, 417 | "file_extension": ".py", 418 | "mimetype": "text/x-python", 419 | "name": "python", 420 | "nbconvert_exporter": "python", 421 | "pygments_lexer": "ipython3", 422 | "version": "3.6.0" 423 | } 424 | }, 425 | "nbformat": 4, 426 | "nbformat_minor": 0 427 | } 428 | -------------------------------------------------------------------------------- /ipython/Labs_Student/test.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/Labs_Student/test.txt 
-------------------------------------------------------------------------------- /ipython/Labs_complete/Lab4_Survey_Questions_part1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Let's start by reading in the data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 3, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "/Users/briand/Desktop/ds course/ipython/data/\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "import os\n", 28 | "import numpy as np\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "%matplotlib inline\n", 31 | "\n", 32 | "\n", 33 | "#We assume data is in a parallel directory to this one called 'data'\n", 34 | "cwd = os.getcwd()\n", 35 | "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n", 36 | "print(datadir)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "#Student put in read data command here:\n", 48 | "data = pd.read_csv(datadir + 'survey_responses_2016.csv', header = 0, sep=',')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Let's look at the column headers and use something more descriptive" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 5, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "Index(['id', 'cs_python', 'cs_java', 'cs_c', 'cs_perl', 'cs_javascript',\n", 69 | " 'cs_r', 'cs_sas', 'profile_1', 'profile_2', 'profile_3', 'profile_4',\n", 70 | " 'profile_5', 'profile_6', 'profile_7', 'fruit', 'len_answer', 'season',\n", 71 | " 'experience_coded', 'experience'],\n", 72 | " dtype='object')" 73 | ] 74 | }, 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "#Student put in code to look at column names\n", 82 | "data.columns" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "Column names like 'profile_1-profile_7' aren't very descriptive. As a quick data maintenance task, let's rename the columns starting with 'profile'. The dictionary in the next cell maps the integer index to a descriptive text.\n", 90 | "\n", 91 | "Tactically, let's loop through each column name. Within the loop let's check whether the column name starts with 'profile.' If it does, let's create a new name that swaps the key with the value using profile_mapping dictionary (i.e., profile_1 -> profile_Viz). We then add the new column name to a list. If it doesn't start with 'profile' just add the old column name to the list. 
" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 6, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "profile_mapping = {1:'Viz',\n", 103 | " 2:'CS',\n", 104 | " 3:'Math',\n", 105 | " 4:'Stats',\n", 106 | " 5:'ML',\n", 107 | " 6:'Bus',\n", 108 | " 7:'Com'}" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 7, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "#Student put code here to change the header names\n", 120 | "newcols = []\n", 121 | "\n", 122 | "for colname in data.columns:\n", 123 | " \n", 124 | " if colname[0:7] == 'profile':\n", 125 | " \n", 126 | " newcols.append('profile_{}'.format(profile_mapping[int(colname[-1])]))\n", 127 | " \n", 128 | " else:\n", 129 | " \n", 130 | " newcols.append(colname)\n", 131 | " \n", 132 | "#Now swap the old columns with the values in newcols \n", 133 | "data.columns = newcols " 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Let's use this data to illustrate common data analytic techniques. We have one numeric variable (len_answer) and different categorical variables which may carry some signal of the 'len_answer' variable. \n", 141 | "\n", 142 | "'Len_answer' is the character count of the response to the following question: \"Besides the examples given in lecture 1, discuss a case where data science has created value for some company. Please explain the company's goals and how any sort of data analysis could have helped the company achieve said goals.\" As this is a subjective business question, let's hypothesize that students with more professional experience might be more likely to give longer answers. \n", 143 | "\n", 144 | "In more technical terms, we'll test whether the variance of len_answer can be explained away by the categorical representation of a student's experience. \n", 145 | "\n", 146 | "The first thing we should do is look at the distribution of len_answer." 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 8, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "(array([ 41., 35., 18., 5., 1., 1., 0., 0., 0., 1.]),\n", 160 | " array([ 0. , 368.3, 736.6, 1104.9, 1473.2, 1841.5, 2209.8,\n", 161 | " 2578.1, 2946.4, 3314.7, 3683. 
]),\n", 162 | " )" 163 | ] 164 | }, 165 | "execution_count": 8, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | }, 169 | { 170 | "data": { 171 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEACAYAAAC9Gb03AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAElJJREFUeJzt3X+sZGddx/H3Z1tpCm2vVewuYWVLQ4RCbJaqjaYaF6Hc\nFSJt+INghfJDTf+w2NgEaWvMFmMMNXHRxPCPRbJFkRIitk1Qts0yJKCUanftFpZaowsWudcSmqvY\n8KPs1z/mXJkud3fm3nvmzvTp+5Wc7Jlnzpznu8/sfubMM3PmpKqQJLVr26wLkCRNl0EvSY0z6CWp\ncQa9JDXOoJekxhn0ktS4iYM+ybYkh5Pc1d3el+TRJA90y97plSlJ2qgz17Ht9cDngfNG2vZX1f5+\nS5Ik9WmiI/okO4HXALedfFfvFUmSejXp1M17gXcCJ59Ge12SI0luS7LQb2mSpD6MDfokrwWWq+oI\nTz2Cfx9wUVXtBpYAp3AkaQ5l3G/dJPkD4E3Ak8DZwLnAX1fVNSPb7ALurqpL1ni8P6YjSRtQVb1M\nj489oq+qm6vqBVV1EfBG4FBVXZNkx8hmrwceOs0+5n7Zt2/fzGuwTmu0TutcXfq0nm/dnOwPk+wG\nTgDHgWt7qUiS1Kt1BX1VfQr4VLd+zZjNJUlzwDNjO3v27Jl1CROxzv48HWoE6+zb06XOPo39MHbT\nHSQ17T4kqTVJqK36MFaS9PRm0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BL\nUuMMeklqnEEvSY3bzO/RT2xlZWUrunmKbdu2ce655255v5I0b7bk1yuf9azzptrHWr773W/yuc/9\nA5deeumW9y1Jm9Xnr1dOfESfZBvwj8CjVfW6JOcDdwC7GF5h6g1Vteah+7e/vfVH9AsLizz22GNb\n3q8kzZv1zNFfD3xh5PaNwL1V9WLgEHBTn4VJkvoxUdAn2Qm8BrhtpPlK4EC3fgC4qt/SJEl9mPSI\n/r3AO4HRCf3tVbUMUFVLwAU91yZJ6sHYOfokrwWWq+pIkj2n2fQ0n+reMrK+p1skSasGgwGDwWAq\n+x77rZskfwC8CXgSOBs4F/gY8JPAnqpaTrID+GRVXbzG4+u0rwFTsrCwyB133MDi4uKW9y1Jm7Wl\n14ytqpur6gVVdRHwRuBQVb0ZuBt4a7fZW4A7+yhIktSvzZwZ+x7giiQPA6/sbkuS5sy6zoytqk8B\nn+rWvw68ahpFSZL642/dSFLjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6\nSWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaNzbok5yV5L4kh5McTbKva9+X5NEkD3TL3umX\nK0lar7FXmKqqbyV5RVU9keQM4DNJ/ra7e39V7Z9uiZKkzZho6qaqnuhWz2L44lDd7V6uUC5Jmp6J\ngj7JtiSHgSXgnqq6v7vruiRHktyWZGFqVUqSNixVNX6r1Y2T84CPAe8AHgO+VlWV5PeB51XVr67x\nmIJ9Iy17umW6FhYW2bbtKI8//tWp97WW7dt3sbR0fCZ9S3r6GQwGDAaD/7/97ne/m6rqZdZkXUEP\nkOR3gf8dnZtPsgu4u6ouWWP7+t5Mz9ZZWFhkZeUgs+h7KKx3bCVpVZLegn6Sb908d3VaJsnZwBXA\nF5PsGNns9cBDfRQkSerX2G/dAM8DDiTZxvCF4Y6q+niS25PsBk4Ax4Frp1emJGmjJvl65VHg0jXa\nr5lKRZKkXnlmrCQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS\n1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcZNcSvCsJPclOZzkaJJ9Xfv5SQ4meTjJJ1YvNyhJmi9j\ng76qvgW8oqpeDuwGfjHJZcCNwL1V9WLgEHDTVCuVJG3IRFM3VfVEt3oWw8sPFnAlcKBrPwBc1Xt1\nkqRNmyjok2xLchhYAu6pqvuB7VW1DFBVS8AF0ytTkrRRYy8ODlBVJ4CXJzkP+FiSlzE8qn/KZqfe\nwy0j63u6RZK0ajAYMBgMprLvVJ0mn9d6QPK7wBPArwF7qmo5yQ7gk1V18Rrb12lfA6ZkYWGRlZWD\nzKLvobDesZWkVUmoqvSxr0m+dfPc1W/UJDkbuAI4BtwFvLXb7C3AnX0UJEnq1yRTN88DDiTZxvCF\n4Y6q+niSzwIfSfJ24EvAG6ZYpyRpg8YGfVUdBS5do/3rwKumUZQkqT+eGStJjTPoJalxBr0kNc6g\nl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjJrnC\n1M4kh5J8PsnRJO/o2vcleTTJA92yd/rlSpLWa5IrTD0J3FBVR5KcA/xTknu6+/ZX1f7plSdJ2qxJ\nrjC1BCx1699Icgx4fnd3LxeulSRNz7rm6JNcCOwG7uuarktyJMltqxcQlyTNl4mDvpu2+ShwfVV9\nA3gfcFFV7WZ4xO8UjiTNoUnm6ElyJsOQ/2BV3QlQVY+NbPJnwN2n3sMtI+t7ukWStGowGDAYDKay\n71TV+I2S24GvVdUNI207uvl7kvwW8FNVdfUajy0Y30ffFhYWWVk5yCz6HgqTjK0krSUJVdXL56Bj\nj+iTXA78CnA0yWGGyXkzcHWS3cAJ4DhwbR8FSZL6Ncm3bj4DnLHGXX/XfzmSpL55ZqwkNc6gl6TG\nGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxB\nL0mNM+glqXFjgz7JziSHknw+ydEkv9m1n5/kYJKHk3wiycL0y5UkrdckR/RPAjdU1cuAnwF+I8lL\ngBuBe6vqxcAh4KbplSlJ2qixQV9VS1V1pFv/BnAM2AlcCRzoNjsAXDWtIiVJG7euOfokFwK7gc8C\n26tqGYYvBsAFfRcnSdq8iYM+yTnAR4HruyP7OmmTk29LkubAmZNslORMhiH/waq6s2teTrK9qpaT\n7AD+69R7uGVkfU+3SJJWDQYDBoPBVPadqvEH4kluB75WVTeMtN0KfL2qbk3yLuD8qrpxjcfWLA72\nFxYWWVk5yOzeaIRJxlaS1pKEqkof+xp7RJ/kcuBXgKNJDjNMzpuBW4GPJHk78CXgDX0UJEnq19ig\nr6rPAGec4u5X9VuOJKlvnhkrSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGjfRmbHaiLNIejnXYV22\nb9/F0tLxLe9X0vwy6KfmW8zi
rNzl5a1/cZE035y6kaTGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z\n6CWpcQa9JDVubNAneX+S5SQPjrTtS/Jokge6Ze90y5QkbdQkR/QfABbXaN9fVZd2y9/1XJckqSdj\ng76qPg08vsZdnmsvSU8Dm5mjvy7JkSS3JVnorSJJUq82+qNm7wN+r6oqye8D+4FfPfXmt4ys7+kW\nSdKqwWDAYDCYyr5TNf4XFpPsAu6uqkvWc193f83iVxwXFhZZWTnILPoeyoz6DpM8p5LmWxKqqpcp\n8kmnbsLInHySHSP3vR54qI9iJEn9Gzt1k+RDDOdafjjJl4F9wCuS7AZOAMeBa6dYoyRpE8YGfVVd\nvUbzB6ZQiyRpCjwzVpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1Lj\nDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUuLFBn+T9SZaTPDjSdn6Sg0keTvKJJAvTLVOStFGT\nHNF/AFg8qe1G4N6qejFwCLip78IkSf0YG/RV9Wng8ZOarwQOdOsHgKt6rkuS1JONztFfUFXLAFW1\nBFzQX0mSpD6NvTj4hOr0d98ysr6nWyRJqwaDAYPBYCr7TtWYjAaS7ALurqpLutvHgD1VtZxkB/DJ\nqrr4FI+tsa8DU7CwsMjKykFm0fdQZtR3mOQ5lTTfklBV6WNfk07dpFtW3QW8tVt/C3BnH8VIkvo3\nydcrPwT8PfBjSb6c5G3Ae4ArkjwMvLK7LUmaQ2Pn6Kvq6lPc9aqea5EkTYFnxkpS4wx6SWqcQS9J\njTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4\ng16SGrepi4MnOQ6sACeA71TVZX0UJUnqz6aCnmHA76mqx/soRpLUv81O3aSHfUiSpmizIV3APUnu\nT/LrfRQkSerXZqduLq+qryb5EYaBf6yqPv39m90ysr6nWzQdZ5FkJj1v376LpaXjM+lberobDAYM\nBoOp7DtV1c+Okn3A/1TV/pPaa3jgv7UWFhZZWTnILPoeyoz6nlW/w777+vckPdMloap6OWrb8NRN\nkmcnOadbfw7wauChPoqSJPVnM1M324GPDY/YORP4y6o62E9ZkqS+bDjoq+rfgd091iJJmgK/GilJ\njTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4\ng16SGmfQS1LjNhX0SfYm+WKSf0nyrr6K0tPV8Hq1s1jOOOM5M+t7x44LZz3wzyg7dlzoc71Om7mU\n4DbgT4FF4GXALyd5SV+Fbb3BrAuY0GDWBZzGtxher7aAT46sT385ceKJDTyunxqXl7/Uz/CdwrQu\nGN23rapzON6bec42/rxP+7mels0c0V8GPFJVX6qq7wAfBq7sp6xZGMy6gAkNZl3AhAazLmACg1kX\nMBGDvm+DWRew5TYT9M8H/mPk9qNdmyRpjmzm4uATO++8X9qKbp7im998YMv7lKR5lKra2AOTnwZu\nqaq93e0bgaqqW0/abmMdSNIzXFWlj/1sJujPAB4GXgl8Ffgc8MtVdayPwiRJ/djw1E1VfTfJdcBB\nhnP97zfkJWn+bPiIXpL09DC1M2Pn7WSqJMeT/HOSw0k+17Wdn+RgkoeTfCLJwsj2NyV5JMmxJK+e\nYl3vT7Kc5MGRtnXXleTSJA924/3HW1TnviSPJnmgW/bOss4kO5McSvL5JEeT/GbXPlfjuUad7+ja\n5208z0pyX/d/5miSfV37vI3nqeqcq/Hs9r+tq+Wu7vbWjGVV9b4wfAH5V2AX8APAEeAl0+hrHTX9\nG3D+SW23Ar/drb8LeE+3/lLgMMOprQu7v0umVNfPAruBBzdTF3Af8FPd+seBxS2ocx9wwxrbXjyL\nOoEdwO5u/RyGnyG9ZN7G8zR1ztV4dvt8dvfnGcBnGZ4/M1fjeZo653E8fwv4C+Cu7vaWjOW0jujn\n8WSq8P3vYK4EDnTrB4CruvXXAR+uqier6jjwCMO/U++q6tPA45upK8kO4Nyqur/b7vaRx0yzThiO\n68munEWdVbVUVUe69W8Ax4CdzNl4nqLO1XNQ5mY8u/qe6FbPYhg6xZyN52nqhDkazyQ7gdcAt51U\ny9THclpBP48nUxVwT5L7k/xa17a9qpZh+J8PuKBrP7n+r7C19V+wzrqez3CMV23leF+X5EiS20be\nds68ziQXMnwH8lnW/zzPos77uqa5Gs9uquEwsATc0wXM3I3nKeqE+RrP9wLv5HsvQrBFY/lM+vXK\ny6vqUoavqL+R5Od46oCzxu15Ma91vQ+4qKp2M/wP9kczrgeAJOcAHwWu746Y5/J5XqPOuRvPqjpR\nVS9n+M7osiQvYw7Hc406X8ocjWeS1wLL3Tu50303fipjOa2g/wrwgpHbO7u2mamqr3Z/Pgb8DcOp\nmOUk2wG6t0T/1W3+FeBHRx6+1fWvt66Z1FtVj1U3UQj8Gd+b3ppZnUnOZBieH6yqO7vmuRvPteqc\nx/FcVVX/zfBHYvYyh+O5Vp1zNp6XA69L8m/AXwG/kOSDwNJWjOW0gv5+4EVJdiV5FvBG4K4p9TVW\nkmd3R08keQ7wauBoV9Nbu83eAqwGw13AG5M8K8kLgRcxPCFsaiXy1Ff5ddXVveVbSXJZkgDXjDxm\nanV2/zBXvR54aA7q/HPgC1X1JyNt8zie31fnvI1nkueuTnckORu4guHnCXM1nqeo84vzNJ5VdXNV\nvaCqLmKYh4eq6s3A3WzFWPb5ifJJny7vZfhtgkeAG6fVz4S1vJDhN38OMwz4G7v2HwLu7eo8CPzg\nyGNuYvhJ9zHg1VOs7UPAfzL8jd8vA28Dzl9vXcBPdH+3R4A/2aI6bwce7Mb2bxjON86sToZHTd8d\nea4f6P4drvt5nlGd8zaeP97VdqSr63c2+v9mRnXO1XiO9PHzfO9bN1sylp4wJUmNeyZ9GCtJz0gG\nvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9Jjfs/4GVOfh6ByxoAAAAASUVORK5CYII=\n", 172 | "text/plain": [ 173 | "" 174 | ] 175 | }, 176 | "metadata": {}, 177 | "output_type": "display_data" 178 | } 179 | ], 180 | "source": [ 181 | "#Student - build and plot a histogram here\n", 182 | "plt.hist(data.len_answer)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "It looks like we have at least one strong outlier 
and a thick distribution around 0. Let's also use the Pandas describe() method to get a stronger sense of the distribution." 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 9, 195 | "metadata": { 196 | "collapsed": false 197 | }, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/plain": [ 202 | "count 102.000000\n", 203 | "mean 547.725490\n", 204 | "std 480.267152\n", 205 | "min 0.000000\n", 206 | "25% 262.500000\n", 207 | "50% 460.500000\n", 208 | "75% 745.750000\n", 209 | "max 3683.000000\n", 210 | "Name: len_answer, dtype: float64" 211 | ] 212 | }, 213 | "execution_count": 9, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "data.len_answer.describe()" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "Let's consider cleaning up the data. We'll remove the max value as well as those with a length less than 20 (which we think is a generous minimum to communicate a reasonable answer.\n", 227 | "\n", 228 | "Create a new data_frame that removes these outliers." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 10, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/plain": [ 241 | "(93, 20)" 242 | ] 243 | }, 244 | "execution_count": 10, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "#Student create a filtered data frame here\n", 251 | "outlier_filter = (data.len_answer > 20) & (data.len_answer < data.len_answer.max())\n", 252 | "data_clean = data[outlier_filter]\n", 253 | "data_clean.shape" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "Now that we have cleaned our data, let's run a pairwise t-test on each experience level to see if their difference in len_answer is statistically significant. To run a t-test, we'll need the mean, standard-deviation and count for each group. We can achieve this with a pandas groupby operation." 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 11, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/html": [ 273 | "
[stripped HTML table omitted -- the same groupby summary (mean, std, and count of len_answer by experience level) appears in the text/plain output just below]
" 321 | ], 322 | "text/plain": [ 323 | " len_answer \n", 324 | " mean std count\n", 325 | "experience \n", 326 | "2-5 years, I'm getting good at what I do! 732.222222 398.570468 18\n", 327 | "5+ years, I'm a veteran! 717.333333 269.793748 6\n", 328 | "< 2 years, I'm fresh! 489.312500 285.271501 16\n", 329 | "None, I just finished my undergrad! 507.000000 335.536253 53" 330 | ] 331 | }, 332 | "execution_count": 11, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "#Student input code here\n", 339 | "data_clean_grouped = data_clean[['len_answer', 'experience']].groupby(['experience']).agg(['mean', 'std', 'count'])\n", 340 | "data_clean_grouped" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "Visually, we can see a potential split between the [0, 2] year experience range and the [2+] experience range. Let's be more rigorous and run t-tests. Let's write a function that takes in the necessary statistics and returns a p-value.\n", 348 | "\n", 349 | "Remember, the t-stat for the difference between two means is:\n", 350 | "\n", 351 | "
$t = \\frac{\\hat{\\mu_1} - \\hat{\\mu_2}}{\\sqrt{\\frac{\\hat{\\sigma_1}^2}{n_1} + \\frac{\\hat{\\sigma_2}^2}{n_2}}}$
\n", 352 | "\n", 353 | "The p-value can be found using a t-distribution, but for simplicity, let's approximate this with the normal distribution. For the 2-tailed test, the p-value is: 2 * (1 - Norm.CDF(T))." 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 31, 359 | "metadata": { 360 | "collapsed": false 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "#Student complete the function\n", 365 | "from scipy.stats import norm\n", 366 | "def pvalue_diffmeans_twotail(mu1, sig1, n1, mu2, sig2, n2):\n", 367 | " '''\n", 368 | " P-value calculator for the hypothesis test of mu1 != mu2.\n", 369 | " Takes in the approprate inputs to compute the t-statistic for the difference between means\n", 370 | " Outputs a p-value for a two-sample t-test.\n", 371 | " '''\n", 372 | " diff = mu1 - mu2\n", 373 | " stderror = np.sqrt(sig1**2 / n1 + sig2**2 / n2)\n", 374 | " t = diff / stderror\n", 375 | " \n", 376 | " p_value = 2 * (1- norm.cdf(t))\n", 377 | " \n", 378 | " return (t, p_value)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "Now loop through all possible pairs in data_clean_grouped and perform a t-test." 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 32, 391 | "metadata": { 392 | "collapsed": false 393 | }, 394 | "outputs": [ 395 | { 396 | "name": "stdout", 397 | "output_type": "stream", 398 | "text": [ 399 | "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and 5+ years, I'm a veteran!\n", 400 | "Diff = 15.0 characters\n", 401 | "The t-stat is 0.103 and p-value is 0.918\n", 402 | "\n", 403 | "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and < 2 years, I'm fresh!\n", 404 | "Diff = 243.0 characters\n", 405 | "The t-stat is 2.059 and p-value is 0.039\n", 406 | "\n", 407 | "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and None, I just finished my undergrad!\n", 408 | "Diff = 225.0 characters\n", 409 | "The t-stat is 2.152 and p-value is 0.031\n", 410 | "\n", 411 | "Two tailed T-Test between groups: 5+ years, I'm a veteran! and < 2 years, I'm fresh!\n", 412 | "Diff = 228.0 characters\n", 413 | "The t-stat is 1.738 and p-value is 0.082\n", 414 | "\n", 415 | "Two tailed T-Test between groups: 5+ years, I'm a veteran! and None, I just finished my undergrad!\n", 416 | "Diff = 210.0 characters\n", 417 | "The t-stat is 1.762 and p-value is 0.078\n", 418 | "\n", 419 | "Two tailed T-Test between groups: < 2 years, I'm fresh! and None, I just finished my undergrad!\n", 420 | "Diff = -18.0 characters\n", 421 | "The t-stat is -0.208 and p-value is 1.165\n", 422 | "\n" 423 | ] 424 | } 425 | ], 426 | "source": [ 427 | "#Student put in code here:\n", 428 | "\n", 429 | "#get distinct values in the data frame for the experience variable\n", 430 | "\n", 431 | "#data_grouped = data[['len_answer', 'experience']].groupby(['experience']).agg(['mean', 'std', 'count'])\n", 432 | "#ttest_data = data_grouped\n", 433 | "\n", 434 | "\n", 435 | "ttest_data = data_clean_grouped\n", 436 | "\n", 437 | "\n", 438 | "grps = ttest_data.index.values\n", 439 | "\n", 440 | "#Now loop through each pair\n", 441 | "for i, grp1 in enumerate(grps):\n", 442 | " for grp2 in grps[i + 1:]:\n", 443 | " \n", 444 | " '''\n", 445 | " hint: since the grp name is the index, pull out the record corresponding to that index value. \n", 446 | " Also, the result of groupby uses a multi-index. 
So be sure to index on 'len_answer' as well.\n", 447 | " Then pull out the mean, std, and cnt from that result. \n", 448 | " ''' \n", 449 | " row1 = ttest_data.ix[grp1].ix['len_answer']\n", 450 | " row2 = ttest_data.ix[grp2].ix['len_answer']\n", 451 | " \n", 452 | " tstat, p_value = pvalue_diffmeans_twotail(row1['mean'], row1['std'], row1['count'], row2['mean'], row2['std'], row2['count'])\n", 453 | " \n", 454 | " print('Two tailed T-Test between groups: {} and {}'.format(grp1, grp2))\n", 455 | " print('Diff = {} characters'.format(round(row1['mean'] - row2['mean'], 0)))\n", 456 | " print('The t-stat is {} and p-value is {}'.format(round(tstat, 3), round(p_value, 3)))\n", 457 | " print('')" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "What are some observations you might have about the above results? Are there any with large deviances that are not statistically significant at at least a 95% level?\n", 465 | "\n", 466 | "Also, how do the numbers change if you rerun it using the original data, and not the cleaned data. What is the effect of outliers on the results?" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 33, 472 | "metadata": { 473 | "collapsed": false 474 | }, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and 5+ years, I'm a veteran!\n", 481 | "Diff = 79.0 characters\n", 482 | "The t-stat is 0.466 and p-value is 0.641\n", 483 | "\n", 484 | "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and < 2 years, I'm fresh!\n", 485 | "Diff = 54.0 characters\n", 486 | "The t-stat is 0.252 and p-value is 0.801\n", 487 | "\n", 488 | "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and None, I just finished my undergrad!\n", 489 | "Diff = 230.0 characters\n", 490 | "The t-stat is 2.148 and p-value is 0.032\n", 491 | "\n", 492 | "Two tailed T-Test between groups: 5+ years, I'm a veteran! and < 2 years, I'm fresh!\n", 493 | "Diff = -25.0 characters\n", 494 | "The t-stat is -0.104 and p-value is 1.083\n", 495 | "\n", 496 | "Two tailed T-Test between groups: 5+ years, I'm a veteran! and None, I just finished my undergrad!\n", 497 | "Diff = 152.0 characters\n", 498 | "The t-stat is 1.04 and p-value is 0.298\n", 499 | "\n", 500 | "Two tailed T-Test between groups: < 2 years, I'm fresh! and None, I just finished my undergrad!\n", 501 | "Diff = 176.0 characters\n", 502 | "The t-stat is 0.894 and p-value is 0.372\n", 503 | "\n" 504 | ] 505 | } 506 | ], 507 | "source": [ 508 | "#Rerun everything without cleaning outliers\n", 509 | "data_grouped = data[['len_answer', 'experience']].groupby(['experience']).agg(['mean', 'std', 'count'])\n", 510 | "ttest_data = data_grouped\n", 511 | "\n", 512 | "\n", 513 | "grps = ttest_data.index.values\n", 514 | "\n", 515 | "#Now loop through each pair\n", 516 | "for i, grp1 in enumerate(grps):\n", 517 | " for grp2 in grps[i + 1:]:\n", 518 | " \n", 519 | " '''\n", 520 | " hint: since the grp name is the index, pull out the record corresponding to that index value. \n", 521 | " Also, the result of groupby uses a multi-index. So be sure to index on 'len_answer' as well.\n", 522 | " Then pull out the mean, std, and cnt from that result. 
\n", 523 | " ''' \n", 524 | " row1 = ttest_data.ix[grp1].ix['len_answer']\n", 525 | " row2 = ttest_data.ix[grp2].ix['len_answer']\n", 526 | " \n", 527 | " tstat, p_value = pvalue_diffmeans_twotail(row1['mean'], row1['std'], row1['count'], row2['mean'], row2['std'], row2['count'])\n", 528 | " \n", 529 | " print('Two tailed T-Test between groups: {} and {}'.format(grp1, grp2))\n", 530 | " print('Diff = {} characters'.format(round(row1['mean'] - row2['mean'], 0)))\n", 531 | " print('The t-stat is {} and p-value is {}'.format(round(tstat, 3), round(p_value, 3)))\n", 532 | " print('')" 533 | ] 534 | } 535 | ], 536 | "metadata": { 537 | "anaconda-cloud": {}, 538 | "kernelspec": { 539 | "display_name": "Python [py35]", 540 | "language": "python", 541 | "name": "Python [py35]" 542 | }, 543 | "language_info": { 544 | "codemirror_mode": { 545 | "name": "ipython", 546 | "version": 3 547 | }, 548 | "file_extension": ".py", 549 | "mimetype": "text/x-python", 550 | "name": "python", 551 | "nbconvert_exporter": "python", 552 | "pygments_lexer": "ipython3", 553 | "version": "3.5.2" 554 | } 555 | }, 556 | "nbformat": 4, 557 | "nbformat_minor": 0 558 | } 559 | -------------------------------------------------------------------------------- /ipython/Labs_complete/lab_7_sklearn_complete.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "In this lab we'll demonstrate several common techniques and helpful tools used in a model building process:\n", 10 | "\n", 11 | "- Use Sklearn to generate polynomial features and rescale them\n", 12 | "- Create folds for cross-validation\n", 13 | "- Perform a grid search to optimize hyper-parameters using cross-validation\n", 14 | "- Create pipelines to perform grids search in less code\n", 15 | "- Improve upon a baseline model incrementally by adding in more complexity\n", 16 | "\n", 17 | "This lab will require using several Sklearn classes. 
It would be helpful to refer to appropriate documentation:\n", 18 | "- http://scikit-learn.org/stable/modules/preprocessing.html\n", 19 | "- http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler\n", 20 | "- http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html#sklearn.preprocessing.PolynomialFeatures\n", 21 | "- http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV\n", 22 | "- http://scikit-learn.org/stable/modules/pipeline.html#pipeline\n", 23 | "\n", 24 | "Also, here is a helpful tutorial that explains how to use much of the above:\n", 25 | "- https://civisanalytics.com/blog/data-science/2016/01/06/workflows-python-using-pipeline-gridsearchcv-for-compact-code/\n", 26 | "\n", 27 | "Like always, let's first load in the data.\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 99, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "Index(['revenue', 'outcalls', 'incalls', 'months', 'eqpdays', 'webcap',\n", 41 | " 'marryyes', 'travel', 'pcown', 'creditcd', 'retcalls', 'churndep'],\n", 42 | " dtype='object')" 43 | ] 44 | }, 45 | "execution_count": 99, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "import os\n", 52 | "import pandas as pd\n", 53 | "from sklearn.linear_model import LogisticRegression\n", 54 | "from sklearn.grid_search import GridSearchCV\n", 55 | "from sklearn.cross_validation import KFold\n", 56 | "cwd = os.getcwd()\n", 57 | "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n", 58 | "\n", 59 | "\n", 60 | "\n", 61 | "data = pd.read_csv(datadir + 'Cell2Cell_data.csv', header=0, sep=',')\n", 62 | "\n", 63 | "#Randomly sort the data:\n", 64 | "data = data.sample(frac = 1)\n", 65 | "data.columns" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Next we're going to prep the data. From prior analysis (Churn Case Study) we learned that we can drop a few variables, as they are either highly redundant or don't carry a strong relationship with the outcome.\n", 73 | "\n", 74 | "After dropping, we're going to use the SkLearn KFold class to set up cross validation fold indexes." 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 64, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "#Prior analysis (from Churn Case study) has shown that we can drop a few redundant variables\n", 86 | "#We want to drop a few to speed up later calculations\n", 87 | "dropvar_list = ['incalls', 'creditcd', 'marryyes', 'travel', 'pcown']\n", 88 | "data_subset = data.drop(dropvar_list, 1)\n", 89 | "\n", 90 | "#Set up X and Y\n", 91 | "X = data_subset.drop('churndep', 1)\n", 92 | "Y = data_subset['churndep']\n", 93 | "\n", 94 | "#Use Kfold to create 4 folds\n", 95 | "kfolds = KFold(data_subset.shape[0], n_folds = 4)\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Next let's use cross-validation to build a baseline model. We're going to use LR with no feature pre-processing. We're going to look at both L1 and L2 regularization with different weights. We can do this very succinctly with SkLearns GridSearchCV package." 
103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 91, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "-0.682495178553\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "#1st, set up a paramater grid\n", 122 | "param_grid_lr = {'C':[10**i for i in range(-3, 3)], 'penalty':['l1', 'l2']}\n", 123 | "\n", 124 | "#2nd, call the GridSearchCV class, use LogisticRegression and 'log_loss' for scoring\n", 125 | "lr_grid_search = GridSearchCV(LogisticRegression(), param_grid_lr, cv = kfolds, scoring = 'log_loss') \n", 126 | "lr_grid_search.fit(X, Y)\n", 127 | "\n", 128 | "#3rd, get the score of the best model and print it\n", 129 | "best_1 = lr_grid_search.best_score_\n", 130 | "print(best_1)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 39, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,\n", 144 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 145 | " penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n", 146 | " verbose=0, warm_start=False)" 147 | ] 148 | }, 149 | "execution_count": 39, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "#Next let's look at the best-estimator chosen to see what the parameters were\n", 156 | "lr_grid_search.best_estimator_" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "Now let's see if we can beat this by standardizing the features. We'll approach this using the GridSearchCV class but also build a pipeline. Later we'll extend the pipeline to allow for feature engineering as well." 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 72, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "-0.682490465504\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "from sklearn.pipeline import Pipeline\n", 183 | "from sklearn.preprocessing import StandardScaler\n", 184 | "\n", 185 | "#Create a set of steps. All but the last step is a transformer (something that processes data). 
\n", 186 | "#Build a list of steps, where the first is StandardScaler and the second is LogisticRegression\n", 187 | "#The last step should be an estimator.\n", 188 | "\n", 189 | "steps = [('scaler', StandardScaler()),\n", 190 | " ('lr', LogisticRegression())]\n", 191 | "\n", 192 | "#Now set up the pipeline\n", 193 | "pipeline = Pipeline(steps)\n", 194 | "\n", 195 | "#Now set up the parameter grid, paying close to the correct convention here\n", 196 | "parameters_scaler = dict(lr__C = [10**i for i in range(-3, 3)],\n", 197 | " lr__penalty = ['l1', 'l2'])\n", 198 | "\n", 199 | "#Now run another grid search\n", 200 | "lr_grid_search_scaler = GridSearchCV(pipeline, param_grid = parameters_scaler, cv = kfolds, scoring = 'log_loss')\n", 201 | "lr_grid_search_scaler.fit(X, Y)\n", 202 | "\n", 203 | "\n", 204 | "#Again, print the score of the best model\n", 205 | "best_2 = lr_grid_search_scaler.best_score_\n", 206 | "print(best_2)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 73, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,\n", 220 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 221 | " penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n", 222 | " verbose=0, warm_start=False)" 223 | ] 224 | }, 225 | "execution_count": 73, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "#Let's see the model after scaling. Did the optimal parameters change?\n", 232 | "lr_grid_search_scaler.best_estimator_.steps[-1][1]" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "Now that we've built a pipeline estimator that performs feature scaling and then logistic regression, let's add to it a feature engineering step. We'll then again use GridSearchCV to find an optimal parameter configuration and see if we can beat our best score above." 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 85, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "-0.68035039448\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "from sklearn.preprocessing import PolynomialFeatures\n", 259 | "\n", 260 | "#Create a set of steps. All but the last step is a transformer (something that processes data). \n", 261 | "# Step 1 - PolynomialFeatures\n", 262 | "# Step 2 - StandardScaler\n", 263 | "# Step 3 - LogisticRegression\n", 264 | "\n", 265 | "steps_poly = [('polyfeat', PolynomialFeatures()),\n", 266 | " ('scaler', StandardScaler()),\n", 267 | " ('lr', LogisticRegression())]\n", 268 | "\n", 269 | "#Now set up the pipeline\n", 270 | "pipeline_poly = Pipeline(steps_poly)\n", 271 | "\n", 272 | "#Now set up a new parameter grid, use the same paramaters used above for logistic regression, but add polynomial features up to degree 3. 
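# Note on the parameter-grid naming used in this notebook (illustrative only):
# inside a Pipeline, GridSearchCV reaches a step's arguments through the name
# given to that step plus a double underscore, i.e. '<step_name>__<parameter>'.
# A hypothetical grid for the steps_poly pipeline defined above could be, e.g.:
#     {'polyfeat__degree': [1, 2], 'lr__C': [0.1, 1.0]}
# (the notebook's own grid, parameters_poly, follows next)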
\n", 273 | "parameters_poly = dict(polyfeat__degree = [1, 2],\n", 274 | " polyfeat__interaction_only = [True, False],\n", 275 | " lr__C = [10**i for i in range(-3, 3)],\n", 276 | " lr__penalty = ['l1', 'l2'])\n", 277 | "\n", 278 | "#Now run another grid search\n", 279 | "lr_grid_search_poly = GridSearchCV(pipeline_poly, param_grid = parameters_poly, cv = kfolds, scoring = 'log_loss')\n", 280 | "\n", 281 | "lr_grid_search_poly.fit(X, Y)\n", 282 | "best_3 = lr_grid_search_poly.best_score_\n", 283 | "print(best_3)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 100, 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "[('polyfeat',\n", 297 | " PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)),\n", 298 | " ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)),\n", 299 | " ('lr',\n", 300 | " LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,\n", 301 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 302 | " penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n", 303 | " verbose=0, warm_start=False))]" 304 | ] 305 | }, 306 | "execution_count": 100, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "#Let's look at the best estimator, stepwise\n", 313 | "lr_grid_search_poly.best_estimator_.steps" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "Now make a bar chart to plot results" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 98, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [ 330 | { 331 | "data": { 332 | "text/plain": [ 333 | "[]" 334 | ] 335 | }, 336 | "execution_count": 98, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | }, 340 | { 341 | "data": { 342 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAsYAAAF0CAYAAAAggv9WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X10VOWB+PHvJLwMkAjFQUkm4T3DDC9ZE40QQUxshZa2\nsi2gWFmVAoFSqYXWrShHw3a1ptb60mwPac8etbQQzto9RLs1iw2NuFRhC126rVWKLdCEY3cRxQUR\nJLm/P1rm12hgggTClO/nnJzDZJ4797nEx3xzuXcSCoIgQJIkSTrPZXT1BCRJkqRzgWEsSZIkYRhL\nkiRJgGEsSZIkAYaxJEmSBBjGkiRJEtDBMK6vrycejxOLxaiqqnrf89/4xjcoKiqiuLiYsWPH0q1b\nN958880ObStJkiSdC0Kp3se4tbWVWCxGQ0MDubm5lJSUUFtbSzweb3f8j370Ix5++GF+8pOfnPK2\nkiRJUldJecZ4y5YtFBQUMHjwYLp3786sWbOoq6s74fg1a9Zwww03fKBtJUmSpK6SMoybm5vJz89P\nPs7Ly6O5ubndsYcPH6a+vp7p06ef8raSJElSV+rUm++efvppJk6cSL9+/TrzZSVJkqQzrluqAdFo\nlD179iQfNzU1EY1G2x1bW1ubvIziVLcNhUIdnrQkSZL0QZ3oFruUN9+1tLQwcuRIGhoayMnJ4fLL\nL2fNmjUkEok24w4cOMCwYcNoamqiV69ep7Qt/CmMU0xF56jKykoqKyu7ehrSece1J3Ud11/6Ollz\npjxjnJmZSXV1NZMnT6a1tZW5c+eSSCSoqakhFApRUVEBwLp165gyZUoyik+2rSRJknSuSRnGAB/9\n6Ed55ZVX2nxuwYIFbR7ffPPN3HzzzR3aVpIkSTrX+JvvdNrKysq6egrSecm1J3Ud199fp5TXGJ8t\nXmMsSZKkM+1kzekZY0mSJAnDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwl\nSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYk\nSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIk\nSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIk\nCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIk\nwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIA\n6NbVE1D6GzJwILv/+MeunoY+oMEXX8yu117r6mnoA3DtpTfXnnTuCQVBEHT1JABCoRDnyFR0ikKh\nEH7l0lcIXHtpyrWX3lx7Utc4WXN6KYUkSZJEB8O4vr6eeDxOLBajqqqq3TGNjY0UFRUxZswYysvL\nk59/6KGHGDNmDIWFhdx4440cPXq0c2YuSZIkdaKUl1K0trYSi8VoaGggNzeXkpISamtricfjyTEH\nDhzgiiuuYP369USjUfbt20ckEmHv3r1MnDiRl19+mR49enD99dfz8Y9/nJtuuun9E/FSirTlP+em\nN/85N3259tKba0/qGqd1KcWWLVsoKChg8ODBdO/enVmzZlFXV9dmzOrVq5k+fTrRaBSASCSSfK6l\npYVDhw5x7Ngx3n77bXJzc0/nWCRJkqQzImUYNzc3k5+fn3ycl5dHc3NzmzE7duxg//79lJeXU1JS\nwqpVqwDIzc3lS1/6EoMGDSIajdKvXz8+8pGPdPIhSJIkSaevU96u7dixY2zbto0NGzZw6NAhSktL\nKS0tJRKJUFdXx+7du+nbty8zZsxg9erVfOYzn2n3dUKhzpiNzr4Av3TpzbWXrlx76c61J51bUoZx\nNBplz549ycdNTU3JSyaOy8vLIxKJEA6HCYfDTJo0ie3btxMEAcOGDaN///4AfPrTn+ZnP/vZCcP4\nnnsqk38uKyujrKzsAxySzjavc0xvXueYvlx76c21l95cf+mj8c8fx604ydiUYVxSUsLOnTvZvXs3\nOTk51NbWsmbNmjZjpk2bxuLFi2lpaeHIkSNs3ryZpUuXcvDgQV588UXeeecdevbsSUNDAyUlJSfc\nV2VlZarpSJIkSR1W9ueP404rjDMzM6murmby5Mm0trYyd+5cEokENTU1hEIhKioqiMfjTJkyhcLC\nQjIzM6moqGDUqFEAzJgxg6KiIrp3705RUREVFRWnc2ySJEnSGeFvvtNp85+T0pv/nJu+XHvpzbWX\n3lx/6etka8/ffCdJkiRhGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmS\nBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkS\nYBhLkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmA\nYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGG\nsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmSBBjG\nkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkSYBhL\nkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSUAHw7i+vp54PE4sFqOqqqrdMY2NjRQVFTFmzBjK\ny8uTnz9w4AAzZ84kkUgwevRoNm/e3DkzlyRJkjpRKAiC4GQDWltbicViNDQ0kJubS0lJCbW1tcTj\n8eSYAwcOcMUVV7B+/Xqi0Sj79u0jEokAcMstt3DVVVcxZ84cjh07xttvv80FF1zw/omEQqSYis5R\noVAIv3LpKwSuvTTl2ktvrr305vpLXydbeynPGG/ZsoWCggIGDx5M9+7dmTVrFnV1dW3GrF69munT\npxONRgGSUfzWW2/x/PPPM2fOHAC6devWbhRLkiRJXS1lGDc3N5Ofn598nJeXR3Nzc5sxO3bsYP/+\n/ZSXl1NSUsKqVasA+P3vf08kEmHOnDkUFxdTUVHB4cOHO/kQJEmSpNPXKTffHTt2jG3btvHMM89Q\nX1/PV7/6VXbu3Jn8/Oc//3m2bdtG7969uf/++ztjl5IkSVKn6pZqQDQaZc+ePcnHTU1NyUsmjsvL\nyyMSiRAOhwmHw0yaNInt27czceJE8vPzueyyywCYMWPGCW/eA6isrEz+uaysjLKyslM8HEmSJOn/\na/zzR0ekDOOSkhJ27tzJ7t27ycnJoba2ljVr1rQZM23a
NBYvXkxLSwtHjhxh8+bNLF26lIsvvpj8\n/Hx27NiRvIFv1KhRJ9zXX4axJEmSdLrK/vxx3IqTjE0ZxpmZmVRXVzN58mRaW1uZO3cuiUSCmpoa\nQqEQFRUVxONxpkyZQmFhIZmZmVRUVCQD+NFHH+XGG2/k3XffZdiwYTz22GOnc2ySJEnSGZHy7drO\nFt+uLX35ljXpzbeMSl+uvfTm2ktvrr/0dVpv1yZJkiSdDwxjSZIkCcNYkiRJAgxjSZIkCTCMJUmS\nJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmS\nAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkC\nDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkw\njCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAw\nliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNY\nkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZKADoZxfX098Xic\nWCxGVVVVu2MaGxspKipizJgxlJeXt3mutbWV4uJirr322tOfsSRJknQGdEs1oLW1lVtvvZWGhgZy\nc3MpKSlh2rRpxOPx5JgDBw7w+c9/nvXr1xONRtm3b1+b13jkkUcYNWoUb731VucfgSRJktQJUp4x\n3rJlCwUFBQwePJju3bsza9Ys6urq2oxZvXo106dPJxqNAhCJRJLPNTU18eMf/5h58+Z18tQlSZKk\nzpMyjJubm8nPz08+zsvLo7m5uc2YHTt2sH//fsrLyykpKWHVqlXJ55YsWcIDDzxAKBTqxGlLkiRJ\nnSvlpRQdcezYMbZt28aGDRs4dOgQpaWllJaW8sorr3DxxRdzySWX0NjYSBAEnbE7SZIkqdOlDONo\nNMqePXuSj5uampKXTByXl5dHJBIhHA4TDoeZNGkS27dvZ+vWrTz11FP8+Mc/5vDhw/zf//0fN910\nE9/73vfa3VdlZWXyz2VlZZSVlX2wo5IkSZKAxj9/dEQoSHEat6WlhZEjR9LQ0EBOTg6XX345a9as\nIZFIJMe8/PLLLF68mPr6eo4cOcK4ceNYu3Yto0aNSo557rnnePDBB3nqqafan0go5BnlNBUKhfAr\nl75C4NpLU6699ObaS2+uv/R1srWX8oxxZmYm1dXVTJ48mdbWVubOnUsikaCmpoZQKERFRQXxeJwp\nU6ZQWFhIZmYmFRUVbaJYkiRJOtelPGN8tnjGOH35U3N686xV+nLtpTfXXnpz/aWvk609f/OdJEmS\nhGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQB\nhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQY\nxpIkSRJgGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAY\nS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEs\nSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQBhrEk\nSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIk\nSRJgGEuSJEmAYSxJkiQBHQzj+vp64vE4sViMqqqqdsc0NjZSVFTEmDFjKC8vB6CpqYmrr76a0aNH\nM3bsWB599NHOm7kkSZLUiUJBEAQnG9Da2kosFqOhoYHc3FxKSkqora0lHo8nxxw4cIArrriC9evX\nE41G2bdvH5FIhNdee43XXnuNSy65hIMHD3LppZdSV1fXZtvkREIhUkxF56hQKIRfufQVAtdemnLt\npTfXXnpz/aWvk629lGeMt2zZQkFBAYMHD6Z79+7MmjWLurq6NmNWr17N9OnTiUajAEQiEQAGDhzI\nJZdcAkBWVhaJRILm5ubTORZJkiTpjEgZxs3NzeTn5ycf5+XlvS9ud+zYwf79+ykvL6ekpIRVq1a9\n73V27drFf/3XfzFu3LhOmLYkSZLUubp1xoscO3aMbdu2sWHDBg4dOkRpaSmlpaWMGDECgIMHDzJj\nxgweeeQRsrKyOmOXkiRJUqdKGcbRaJQ9e/YkHzc1NSUvmTguLy+PSCRCOBwmHA4zadIktm/fzogR\nIzh27BgzZszg7/7u75g2bdpJ91VZWZn8c1lZGWVlZad2NJIkSdJfaPzzR0ekvPmupaWFkSNH0tDQ\nQE5ODpdffjlr1qwhkUgkx7z88sssXryY+vp6jhw5wrhx41i7di2jRo3ipptuIhKJ8M1vfvPkE/Hm\nu7TlDQjpzRuA0pdrL7259tKb6y99nWztpTxjnJmZSXV1NZMnT6a1tZW5c+eSSCSoqakhFApRUVFB\nPB5nypQpFBYWkpmZSUVFBaNGjWLTpk384Ac/YOzYsRQVFREKhbjvvvv46Ec/2tnHKEmSJJ2WlGeM\nzxbPGKcvf2pOb561Sl+uvfTm2ktvrr/0dVpv1yZJkiSdDwxjSZIkCcNYkiRJAgxjSZIkCTCMJUmS\nJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmS\nAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkC\nDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkw\njCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAw\nliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNY\nkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZKADoZxfX098Xic\nWCxGVVVVu2MaGxspKipizJgxlJeXn9K2kiRJUlcLBUEQnGxAa2srsViMhoYGcnNzKSkpoba2lng8\nnhxz4MABrrjiCtavX080GmXfvn1EIpEObZucSChEiqnoHBUKhfArl75C4NpLU6699ObaS2+uv/R1\nsrWX8ozxli1
bKCgoYPDgwXTv3p1Zs2ZRV1fXZszq1auZPn060WgUgEgk0uFtJUmSpHNByjBubm4m\nPz8/+TgvL4/m5uY2Y3bs2MH+/fspLy+npKSEVatWdXhbSZIk6VzQrTNe5NixY2zbto0NGzZw6NAh\nSktLKS0t7YyXliRJks6KlGEcjUbZs2dP8nFTU1Pykonj8vLyiEQihMNhwuEwkyZNYvv27R3a9i+F\nQqEPcgw6B/iVS2+uvfTlVy69ufbSm1+9vz4pw7ikpISdO3eye/ducnJyqK2tZc2aNW3GTJs2jcWL\nF9PS0sKRI0fYvHkzS5cuZeTIkSm3Pc4bECRJktSVUoZxZmYm1dXVTJ48mdbWVubOnUsikaCmpoZQ\nKERFRQXxeJwpU6ZQWFhIZmYmFRUVjBo1CqDdbSVJkqRzTcq3a5MkSZLOB/7mO0k6h61YsYKxY8d2\n9TSkc8qcOXO49tpru3oaKT3xxBNkZ2d39TR0CgxjdcjJ/ic0ZMgQMjIyyMjIoHfv3iQSCb7xjW+c\n5RlKXWPfvn0sWrSIoUOHEg6HGThwINdccw0NDQ2dto8zdYPWc889R0ZGBvv372/3+RUrVpCRkUFm\nZiaZmZlEo1Fmz55NU1PTGZmPzh9z5sxJ/rfVo0cPhg8fzu23387bb7991uZw+PBh7rzzTgoKCujV\nqxcDBgxg4sSJrF27tlP3c6bW7+7du8nIyGDbtm3tPv/EE08k/44zMjIYOHAg1157LS+99NIZmc9f\ni055uzad30KhEJWVlSxcuJB33nmHn/zkJyxcuJC+ffsyf/78rp6edEZ9+tOf5p133uGxxx5j+PDh\n/M///A/PPfccr7/+eldPLendd9+le/fu7T6X6pt2PB7nueeeo6WlhVdffZVFixZx/fXXs2nTpjMx\nVZ1HrrnmGr7//e9z9OhRnn/+eebOncvhw4eprq4+K/tfsGABL7zwAo8++iijR4/mjTfe4MUXXzzh\nD4pd5XTWb58+ffjd735Ha2srzc3N3H777XziE59gx44ddOtmArbHM8bqFFlZWVx00UUMGjSIz372\nsxQWFrJ+/fqunpZ0Rh04cID/+I//4P7776esrIz8/HwuvfRSli5dynXXXQf86ZvanXfeyZAhQwiH\nw4wYMSL5jb+1tZV58+YxbNgwevfuTSwW44EHHki538cee4zRo0fTq1cv4vE4Dz/8cJt39snIyODb\n3/4206dPJysri7vuuusDH2O3bt0YMGAAAwcOZMKECcyfP58XX3yRgwcPfuDXlAB69uzJgAEDiEaj\nzJo1i9mzZ7Nu3ToANm7cyPjx4+nVqxcDBw5k6dKlvPvuu+2+zqpVq4hEIu97/sYbb+Rv//ZvT7j/\np59+mmXLlvGxj32MQYMG8Td/8zcsWLCAz33uc23GPfjgg8RiMcLhMIMGDWqznpYtW0Y8Hqd3794M\nHTqUr3zlKxw9evSkx/30009z2WWX0atXL4YPH87y5cvbzH3o0KGsWLGCuXPn8qEPfYjZs2ef8LVS\n3SYWCoUYMGAAF198McXFxSxZsoTdu3fzyiuvnHS785k/LqjTNTY28pvf/IZYLNbVU5HOqKysLLKy\nsnjqqaeYMGECPXv2fN+Ym266iU2bNvHoo49yySWX0NzczK5du4A/hXFeXh5PPvkkkUiELVu2UFFR\nQSQSYc6cOe3u87vf/S6VlZVUV1dTXFzMr371K+bPn0+PHj1YtGhRctw//MM/cN999/Hggw922j/l\nvvbaa/zwhz9MXlohdaaePXty5MgR9u7dy9SpU7n55pt54oknePXVV5k7dy6ZmZnt/uA4c+ZMvvjF\nL1JXV8eMGTMAeOutt1i3bt1JL4sYOHAg9fX1zJgxgwsuuKDdMcuWLaOmpoaHHnqISZMm8frrr7N1\n69bk81lZWTz++OPk5uby0ksvsXDhQsLhMCtWrGj39f793/+d2bNn861vfYtJkyaxe/duFi5cyNGj\nR/n617+eHPfQQw+xfPlytm7d2mlvZ/vmm2/ygx/8AOCEZ6AFBFIH3HLLLcEnP/nJdp8bMmRIEA6H\ng6ysrKBHjx5BKBQKevfuHbz44otneZbS2fev//qvwYUXXhiEw+GgtLQ0+PKXvxxs3rw5CIIg+O1v\nfxuEQqFg/fr1HX69O+64I7jmmmuSjysrK4OxY8cmHw8aNCj4/ve/32abhx9+OBg1alTycSgUCm67\n7baU+2psbAwyMjKC119/vd3nKysrg8zMzCArKyvo3bt3EAqFgoyMjGDJkiUdPh6pPe/9nrJ58+bg\nwgsvDGbNmhXcddddQSwWazP+8ccfD8LhcHD48OF2t7/11luDj33sY8nH3/72t4OcnJygpaXlhHPY\nuHFjMGjQoKB79+5BcXFxcOuttwbPPvts8vmDBw8G4XA4+M53vtPh41q5cmVQUFDQZt7Z2dnJx5Mm\nTQr+8R//sc0269atC7KyspKPhwwZElx77bUp97Vr164gFAoFW7dubff5xx9/PAiFQkF2dnbQp0+f\nIBQKBaFQKPjUpz7V4eM5H3kphTrF0qVL2b59Oxs3buTqq6/mnnvuYdy4cV09LemM+9SnPsXevXv5\n0Y9+xNSpU3nhhRcYP348X/va1/jFL35BZmYmZWVlJ9x+5cqVlJSUcNFFF5Gdnc1DDz3U5jeG/qV9\n+/bxhz/8gQULFpCdnZ38uOOOO/j973/fZuyll17a5vGYMWOS4z/+8Y93+PhGjBjBL3/5S37+859z\n3333UVxczL333tvh7aUTeeaZZ8jOzqZXr15MmDCB8vJyvvWtb/Gb3/yG8ePHtxk7ceJEjh49ys6d\nO9t9rfnz5/Pss8+yd+9e4E+XG91yyy1kZGTwhz/8Ifnf/gUXXMD9998PwJVXXsnvfvc7fvrTn3L9\n9dfz29/+lsmTJycvpXjppZc4evQoV1999QmP4cknn+TKK68kJyeH7OxslixZcsL1C7B161buvffe\nNuv3M5/5DIcPH+aPf/xjctxll13WZrupU6cmx5/Ku9T06dOH7du3s23bNr7zne8Qi8VYuXJlh7c/\nH3kphTrFhRdeyLBhwxg2bBhPPvkkBQUFjBs3jquuuqqrpyadcT169ODDH/4wH/7wh1m+fDnz589n\nxYoVrFq16qTbrV27liVLlvDNb36T0tJSLrjgAqqrq5PXWb5Xa2srADU1NZSWlp70tfv06dPm8TPP\nPJO8jrFXr14dPTR69OjB0KFDAUgkEuzYsYNFixbx2GOPdfg1pPZcddVVfPe736Vbt27k5uamvDwn\nCIITXhZUWFhIUVERjz/+ONOmTePnP/958rKB3Nxctm/fnhzbv3//5J8zMzOZMGECEyZM4O///u+5\n9957ufvuu1m2bFnK+W/evJkbbriBFStWMGXKFPr160ddXR233377CbdpbW3lnnvuYebMme97bsCA\nAck/v3f9/vM//zOHDx8GTu0yiFAolFy/sViMvXv3MmvWLDZs2NDh1zjf
GMbqdP369ePWW2/li1/8\nIr/4xS+6ejrSWZdIJDh27BiJRIKWlhZ++tOfMnny5PeN27RpE+PHj29zs8+JzogBXHTRReTm5rJz\n505uvPHGU5pTfn7+KY0/keXLlzNy5Ei+8IUvUFRU1CmvqfPT8RvW3iuRSPAv//IvbT73/PPP07Nn\nT4YPH37C15s/fz5f//rX+d///V8mTpxIQUEB8Kf4HTZsWIfmdPy38x48eJBEIkGPHj1oaGhod7+b\nNm0iLy+PO++8M/m54/cPnEhxcTEvv/xyh+dzXE5OzimNP5HjP4ivW7fupDcmns8MY3XYW2+91ean\nboC+ffu2O3bRokVUVVXx5JNPJm+GkP7a7N+/n5kzZybfiSU7O5v//M//5IEHHuAjH/kIY8aM4brr\nrmPevHk8/PDDFBcX09TUxK5du5g9ezaxWIwnnniC+vp6RowYwZo1a9i4cWObM1rvtWLFCr7whS/Q\nt29fpk6dyrvvvsu2bdtobm7mjjvuOOVjCIKA//7v/6Zfv35tPl9YWNju+GHDhjFt2jSWL1/Ov/3b\nv53y/qRUFi1axCOPPMLnPvc5brvtNl599VWWLVvG4sWLCYfDJ9zuhhtuYOnSpaxcuZKampqU+ykv\nL+eGG27qaEpeAAACmklEQVTgsssu48ILL+TXv/41d911F4lEgkQiQSgU4rbbbmPZsmX06NGjzc13\nCxcuJBaL0dzczOrVqyktLaW+vp7a2tqT7vPuu+/mk5/8JIMGDeK6666jW7du/OpXv2LLli1UVVWd\n8t8VwCuvvPK+s+3xeLzdsdnZ2cybN4+7777bMD6Rrr7IWenhlltuCTIyMt73MXPmzGDo0KHBgw8+\n+L5tKioqgtGjR3fBbKWz48iRI8Fdd90VXH755UH//v2DPn36BLFYLPjyl78cvPHGG0EQBMHRo0eD\nr3zlK0FeXl4QDoeDESNGBP/0T/+UfG7evHlB//79gw996EPBvHnzgq9+9avB0KFDk/t47813QRAE\ntbW1waWXXhr06tUr6N+/f3DllVcGa9euTT6fkZER/PCHP0w5/+M33/3lx/Eb7A4dOtTuvoMgCH72\ns58FGRkZwQsvvPCB/t6kk93QHQRB8Pzzzwfjx48PwuFwMHDgwOBLX/pScPTo0ZTbf/aznw369u0b\nvP322ynncP/99wdXXnllMGDAgKBXr17B0KFDgwULFgRNTU1txlVVVQXDhw8PevbsGQwaNChYvnx5\n8rk777wzuOiii4Ls7Oxg+vTpwcqVK4OMjIzk8++9+S4IguDZZ58NJk2aFPTp0yfo27dvUFJSkvx/\nQhAEJ/ye+l67du1q9/tyRkZG8Otf/7rdfQdBEOzZsyfo0aNHsGbNmpT7OB+FgqCT3gdEkiSpC02d\nOpX8/PwOnTGW2uOlFJIkKa29+eabbNy4kWeffZZf/vKXXT0dpTHDWJIkpbWioiLeeOMNvva1ryVv\noJM+CC+lkCRJkgB/wYckSZKEYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAfD/AHeNuVNj\ny6ctAAAAAElFTkSuQmCC\n", 343 | "text/plain": [ 344 | "" 345 | ] 346 | }, 347 | "metadata": {}, 348 | "output_type": "display_data" 349 | } 350 | ], 351 | "source": [ 352 | "import numpy as np\n", 353 | "results = -1 * np.array([best_1, best_2, best_3])\n", 354 | "labs = ['LR', 'Scaler-LR', 'Poly-Scaler-LR']\n", 355 | "\n", 356 | "fig = plt.figure(facecolor = 'w', figsize = (12, 6))\n", 357 | "ax = plt.subplot(111)\n", 358 | "\n", 359 | "width = 0.5\n", 360 | "ind = np.arange(3)\n", 361 | "rec = ax.bar(ind + width, results, width, color='r')\n", 362 | "\n", 363 | "ax.set_xticks(ind + width)\n", 364 | "ax.set_xticklabels(labs, size = 14)\n", 365 | "ax.set_ylim([0.6, 0.7])\n", 366 | "\n", 367 | "plt.plot(np.arange(4), min(results) * np.ones(4))" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": { 374 | "collapsed": true 375 | }, 376 | "outputs": [], 377 | "source": [] 378 | } 379 | ], 380 | "metadata": { 381 | "anaconda-cloud": {}, 382 | "kernelspec": { 383 | "display_name": "Python [py35]", 384 | "language": "python", 385 | "name": "Python [py35]" 386 | }, 387 | "language_info": { 388 | "codemirror_mode": { 389 | "name": "ipython", 390 | "version": 3 391 | }, 392 | "file_extension": ".py", 393 | "mimetype": "text/x-python", 394 | "name": "python", 395 | "nbconvert_exporter": "python", 396 | "pygments_lexer": "ipython3", 397 | "version": "3.5.2" 398 | } 399 | }, 400 | "nbformat": 4, 401 | "nbformat_minor": 0 402 | } 403 | -------------------------------------------------------------------------------- /ipython/README.md: -------------------------------------------------------------------------------- 1 | Intro to Data Science Recitation 2 | =================== 3 | 4 | Materials for the Intro to Data Science course, Spring 2017 5 | 6 | You will probably want to fork this repository and keep it up to date. 
This will ensure that you have the latest versions of all of my example code, and any necessary data or supplementary material that comes with it. 7 | -------------------------------------------------------------------------------- /ipython/data/Cell2Cell_info.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/data/Cell2Cell_info.pdf -------------------------------------------------------------------------------- /ipython/data/survey_responses_2016.csv: -------------------------------------------------------------------------------- 1 | id,cs_python,cs_java,cs_c,cs_perl,cs_javascript,cs_r,cs_sas,profile_1,profile_2,profile_3,profile_4,profile_5,profile_6,profile_7,fruit,len_answer,season,experience_coded,experience 2 | 1,1,1,0,0,0,1,0,1,5,6,5,1,6,5,Oranges,136,Fall,2,"< 2 years, I'm fresh!" 3 | 2,0,1,0,0,0,0,0,1,3,6,4,1,5,5,Apples,112,Summer,1,"None, I just finished my undergrad!" 4 | 3,1,0,1,0,0,1,1,1,4,4,7,1,1,5,Oranges,97,Spring,1,"None, I just finished my undergrad!" 5 | 4,0,0,1,0,0,1,0,2,1,8,7,2,5,5,Oranges,1879,Winter,1,"None, I just finished my undergrad!" 6 | 5,1,0,0,0,1,1,0,5,4,6,6,6,1,7,Apples,699,Spring,2,"< 2 years, I'm fresh!" 7 | 6,0,0,0,0,0,0,0,2,2,7,2,2,3,5,Apples,340,Fall,1,"None, I just finished my undergrad!" 8 | 7,0,0,0,1,0,1,0,8,5,6,5,3,7,5,Oranges,874,Winter,4,"5+ years, I'm a veteran!" 9 | 8,0,0,0,0,0,1,0,3,2,10,7,1,5,5,Apples,118,Fall,1,"None, I just finished my undergrad!" 10 | 9,1,1,1,0,0,1,1,1,3,8,8,1,1,1,Oranges,0,Winter,1,"None, I just finished my undergrad!" 11 | 10,1,1,1,0,0,0,0,0,6,8,6,0,0,3,Oranges,310,Spring,1,"None, I just finished my undergrad!" 12 | 11,1,0,0,0,0,1,0,6,5,8,8,1,3,5,Oranges,947,Spring,1,"None, I just finished my undergrad!" 13 | 12,0,0,1,0,0,1,1,2,2,2,3,1,1,3,Oranges,379,Fall,2,"< 2 years, I'm fresh!" 14 | 13,0,1,0,0,0,0,0,3,7,10,9,1,1,5,Oranges,863,Winter,1,"None, I just finished my undergrad!" 15 | 14,0,0,0,0,0,0,1,7,2,6,6,1,6,7,Oranges,1033,Summer,4,"5+ years, I'm a veteran!" 16 | 15,1,1,0,0,0,0,0,1,8,5,5,6,6,8,Apples,144,Summer,1,"None, I just finished my undergrad!" 17 | 16,1,0,0,0,0,1,0,1,2,7,5,1,1,3,Apples,331,Fall,1,"None, I just finished my undergrad!" 18 | 17,0,1,1,0,0,1,0,5,5,5,5,1,1,3,Apples,261,Summer,1,"None, I just finished my undergrad!" 19 | 18,1,1,1,0,0,0,0,2,6,6,4,3,7,7,Oranges,752,Spring,1,"None, I just finished my undergrad!" 20 | 19,1,1,0,0,0,1,1,4,4,6,6,4,2,3,Oranges,829,Fall,1,"None, I just finished my undergrad!" 21 | 20,0,0,0,0,0,1,1,4,5,6,8,3,3,8,Oranges,269,Fall,1,"None, I just finished my undergrad!" 22 | 21,1,0,0,0,0,1,0,5,8,8,7,5,3,5,Oranges,271,Fall,3,"2-5 years, I'm getting good at what I do!" 23 | 22,1,0,0,0,0,0,0,6,1,8,9,1,7,9,Apples,556,Fall,1,"None, I just finished my undergrad!" 24 | 23,1,1,0,0,0,0,0,1,4,8,7,2,2,9,Apples,148,Summer,2,"< 2 years, I'm fresh!" 25 | 24,1,1,1,0,0,1,0,3,7,5,2,2,3,7,Oranges,639,Summer,2,"< 2 years, I'm fresh!" 26 | 25,1,1,1,0,0,1,0,1,4,5,6,2,6,4,Apples,486,Spring,1,"None, I just finished my undergrad!" 27 | 26,1,0,0,0,0,1,0,8,5,7,8,6,6,6,Apples,399,Summer,1,"None, I just finished my undergrad!" 28 | 27,0,1,0,0,0,1,0,1,2,4,3,1,2,2,Apples,492,Fall,1,"None, I just finished my undergrad!" 29 | 28,0,1,0,0,0,1,1,2,3,8,8,3,4,5,Oranges,1172,Fall,1,"None, I just finished my undergrad!" 30 | 29,1,1,1,0,0,1,0,4,4,3,3,1,7,8,Apples,340,Spring,4,"5+ years, I'm a veteran!" 
31 | 30,0,0,0,0,0,0,0,5,3,7,7,3,5,5,Apples,0,Winter,1,"None, I just finished my undergrad!" 32 | 31,1,0,1,0,0,1,0,6,3,4,8,1,2,3,Oranges,548,Fall,1,"None, I just finished my undergrad!" 33 | 32,1,0,0,0,0,0,0,3,5,8,8,3,9,9,Apples,3683,Spring,2,"< 2 years, I'm fresh!" 34 | 33,1,0,0,0,0,0,1,1,1,3,6,1,4,5,Oranges,0,Fall,2,"< 2 years, I'm fresh!" 35 | 34,0,0,1,0,0,1,0,1,3,8,7,5,4,6,Oranges,712,Winter,1,"None, I just finished my undergrad!" 36 | 35,1,0,0,0,0,1,1,2,2,7,8,5,3,5,Apples,243,Summer,1,"None, I just finished my undergrad!" 37 | 36,1,1,1,0,0,1,0,6,7,4,5,2,4,7,Apples,1141,Fall,1,"None, I just finished my undergrad!" 38 | 37,1,1,0,0,0,1,0,3,5,9,9,1,5,7,Oranges,440,Summer,1,"None, I just finished my undergrad!" 39 | 38,1,1,0,0,0,1,0,3,5,9,9,1,5,7,Oranges,0,Summer,1,"None, I just finished my undergrad!" 40 | 39,1,0,1,0,0,1,1,1,3,7,7,1,3,5,Oranges,334,Fall,1,"None, I just finished my undergrad!" 41 | 40,1,0,1,0,0,1,1,1,4,7,7,1,7,5,Oranges,0,Fall,1,"None, I just finished my undergrad!" 42 | 41,1,0,0,0,0,1,1,8,3,9,9,8,3,6,Oranges,844,Fall,3,"2-5 years, I'm getting good at what I do!" 43 | 42,1,1,0,0,1,0,0,6,6,8,7,3,5,5,Apples,267,Summer,2,"< 2 years, I'm fresh!" 44 | 43,1,0,1,0,0,0,0,5,5,8,4,6,7,8,Apples,1155,Spring,3,"2-5 years, I'm getting good at what I do!" 45 | 44,0,0,1,0,0,1,1,7,5,7,7,1,7,7,Apples,371,Winter,2,"< 2 years, I'm fresh!" 46 | 45,0,1,1,0,0,0,0,4,7,5,5,4,3,4,Apples,618,Fall,1,"None, I just finished my undergrad!" 47 | 46,1,1,1,0,0,1,0,1,8,9,7,5,1,5,Oranges,418,Winter,1,"None, I just finished my undergrad!" 48 | 47,0,0,1,0,0,0,1,3,3,7,8,1,6,5,Apples,780,Fall,1,"None, I just finished my undergrad!" 49 | 48,0,0,1,0,0,1,0,3,5,8,6,5,2,4,Apples,286,Fall,1,"None, I just finished my undergrad!" 50 | 49,1,1,0,0,1,1,1,5,3,8,8,3,3,6,Apples,303,Fall,1,"None, I just finished my undergrad!" 51 | 50,1,1,0,0,0,1,0,7,5,9,9,10,5,6,Apples,163,Summer,1,"None, I just finished my undergrad!" 52 | 51,0,1,0,0,0,1,0,6,5,7,7,5,6,7,Oranges,529,Fall,1,"None, I just finished my undergrad!" 53 | 52,0,1,0,0,0,1,0,1,2,9,9,1,4,5,Apples,613,Fall,1,"None, I just finished my undergrad!" 54 | 53,1,0,0,0,0,1,0,7,4,7,7,3,7,9,Apples,248,Fall,2,"< 2 years, I'm fresh!" 55 | 54,0,0,0,0,0,0,0,5,2,4,4,1,8,8,Apples,657,Summer,3,"2-5 years, I'm getting good at what I do!" 56 | 55,1,0,0,0,0,1,0,5,5,7,7,1,7,7,Oranges,459,Spring,4,"5+ years, I'm a veteran!" 57 | 56,0,1,0,0,0,1,0,2,8,6,5,5,4,8,Apples,1032,Fall,1,"None, I just finished my undergrad!" 58 | 57,0,0,1,0,0,0,0,1,1,8,8,1,5,8,Apples,252,Summer,3,"2-5 years, I'm getting good at what I do!" 59 | 58,1,1,1,0,0,1,1,6,5,7,8,6,4,4,Apples,185,Fall,1,"None, I just finished my undergrad!" 60 | 59,1,0,0,0,0,1,1,6,3,7,8,5,3,3,Apples,396,Spring,1,"None, I just finished my undergrad!" 61 | 60,1,0,1,0,0,1,1,7,5,9,9,3,7,7,Apples,720,Summer,1,"None, I just finished my undergrad!" 62 | 61,1,0,1,0,0,0,0,1,5,6,1,1,1,2,Oranges,451,Winter,1,"None, I just finished my undergrad!" 63 | 62,0,0,0,0,0,1,1,3,5,8,8,1,1,7,Oranges,1370,Spring,3,"2-5 years, I'm getting good at what I do!" 64 | 63,1,1,0,0,0,1,0,3,6,7,6,3,2,2,Apples,418,Fall,2,"< 2 years, I'm fresh!" 65 | 64,1,0,0,0,0,0,0,6,3,8,7,1,2,4,Apples,233,Fall,1,"None, I just finished my undergrad!" 66 | 65,1,1,0,0,0,1,0,7,7,8,7,6,2,2,Oranges,531,Fall,2,"< 2 years, I'm fresh!" 67 | 66,0,1,1,0,0,1,0,8,7,8,7,3,3,7,Oranges,835,Fall,3,"2-5 years, I'm getting good at what I do!" 68 | 67,0,1,1,0,0,1,1,5,5,7,8,3,8,7,Oranges,765,Spring,3,"2-5 years, I'm getting good at what I do!" 
69 | 68,1,0,0,0,0,0,0,2,6,4,6,5,6,4,Oranges,727,0,3,"2-5 years, I'm getting good at what I do!" 70 | 69,1,1,1,0,1,0,0,4,7,5,5,6,3,5,Oranges,513,Fall,3,"2-5 years, I'm getting good at what I do!" 71 | 70,1,1,1,0,1,0,0,4,7,5,5,6,3,5,Oranges,909,Fall,3,"2-5 years, I'm getting good at what I do!" 72 | 71,0,0,0,0,0,0,1,4,2,5,5,1,6,5,Oranges,907,Fall,3,"2-5 years, I'm getting good at what I do!" 73 | 72,1,1,1,0,0,1,0,2,9,10,8,4,1,5,Oranges,0,Winter,1,"None, I just finished my undergrad!" 74 | 73,1,1,1,0,0,1,0,4,7,5,5,5,6,4,Apples,470,Fall,1,"None, I just finished my undergrad!" 75 | 74,0,0,0,0,0,1,0,2,2,6,6,3,8,6,Oranges,395,Spring,2,"< 2 years, I'm fresh!" 76 | 75,0,0,0,0,1,0,0,1,6,10,9,1,1,5,Oranges,549,Winter,1,"None, I just finished my undergrad!" 77 | 76,1,1,1,0,0,1,1,6,6,7,9,4,1,4,Oranges,354,Spring,1,"None, I just finished my undergrad!" 78 | 77,1,0,0,0,0,1,0,1,4,8,8,3,5,5,Apples,1273,Fall,2,"< 2 years, I'm fresh!" 79 | 78,1,0,0,0,0,0,0,1,5,8,8,1,8,5,Apples,982,Fall,3,"2-5 years, I'm getting good at what I do!" 80 | 79,0,0,0,0,0,1,1,1,1,5,7,5,6,4,Oranges,942,Spring,1,"None, I just finished my undergrad!" 81 | 80,1,1,1,0,0,1,0,3,8,8,6,6,6,7,Apples,308,Spring,3,"2-5 years, I'm getting good at what I do!" 82 | 81,1,0,0,0,0,0,0,3,3,4,4,3,4,5,Apples,1604,Spring,3,"2-5 years, I'm getting good at what I do!" 83 | 82,0,0,1,0,1,0,0,2,6,3,2,1,6,10,Apples,706,Fall,4,"5+ years, I'm a veteran!" 84 | 83,1,1,1,0,0,0,0,4,4,7,7,2,5,6,Oranges,236,Fall,1,"None, I just finished my undergrad!" 85 | 84,0,0,1,0,0,1,1,5,3,7,8,6,4,6,Oranges,462,Fall,1,"None, I just finished my undergrad!" 86 | 85,1,1,0,0,1,1,0,4,7,4,6,5,6,9,Oranges,761,Winter,2,"< 2 years, I'm fresh!" 87 | 86,1,0,0,0,0,0,0,2,2,5,4,1,1,7,Oranges,642,Fall,2,"< 2 years, I'm fresh!" 88 | 87,1,0,1,0,0,1,0,3,5,7,7,5,1,5,Oranges,567,Summer,1,"None, I just finished my undergrad!" 89 | 88,1,1,1,1,0,0,0,6,10,8,6,6,2,6,Oranges,633,Winter,1,"None, I just finished my undergrad!" 90 | 89,1,0,1,0,0,1,0,3,5,7,8,5,3,3,Apples,320,Fall,1,"None, I just finished my undergrad!" 91 | 90,0,0,0,0,0,1,0,3,2,4,7,2,6,6,Oranges,352,Spring,3,"2-5 years, I'm getting good at what I do!" 92 | 91,1,0,0,0,0,1,1,2,2,6,8,3,5,4,Oranges,620,Fall,2,"< 2 years, I'm fresh!" 93 | 92,1,0,0,0,0,1,0,5,5,5,7,7,3,5,Oranges,604,Winter,3,"2-5 years, I'm getting good at what I do!" 94 | 93,1,1,0,0,0,1,0,5,7,5,7,4,3,5,Oranges,125,Winter,3,"2-5 years, I'm getting good at what I do!" 95 | 94,1,0,1,0,0,1,0,5,3,7,3,3,4,7,Oranges,0,Summer,3,"2-5 years, I'm getting good at what I do!" 96 | 95,0,0,0,0,0,1,0,4,1,6,8,1,5,7,Apples,892,Spring,4,"5+ years, I'm a veteran!" 97 | 96,1,1,1,0,0,1,1,4,4,7,8,7,5,2,Apples,93,Fall,1,"None, I just finished my undergrad!" 98 | 97,1,1,1,0,1,1,0,5,7,6,6,6,4,5,Apples,302,Fall,2,"< 2 years, I'm fresh!" 99 | 98,0,0,0,0,0,1,0,2,2,7,5,1,3,3,Apples,832,Spring,1,"None, I just finished my undergrad!" 100 | 99,1,1,1,0,1,0,0,2,6,4,4,3,3,3,Apples,255,Spring,1,"None, I just finished my undergrad!" 101 | 100,1,0,1,0,0,1,1,7,8,6,6,8,4,7,Apples,130,Spring,1,"None, I just finished my undergrad!" 102 | 101,0,0,0,0,0,1,1,5,3,3,7,2,6,6,Apples,526,Spring,1,"None, I just finished my undergrad!" 103 | 102,1,0,0,0,0,0,1,1,1,1,1,1,1,1,Oranges,1,Summer,4,"5+ years, I'm a veteran!" 
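For orientation, here is a minimal sketch of how a per-group summary like the `ttest_data` table indexed in the t-test cell at the top of this section might be built from this file. The grouping column (`experience`) and the aggregation are assumptions made for illustration, not the survey notebook's actual code:

```python
import pandas as pd

survey = pd.read_csv('survey_responses_2016.csv')
ttest_data = survey.groupby('experience').agg({'len_answer': ['mean', 'std', 'count']})

# each group row now carries the mean/std/count that pvalue_diffmeans_twotail expects
row = ttest_data.loc["5+ years, I'm a veteran!", 'len_answer']
print(row['mean'], row['std'], row['count'])
```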
-------------------------------------------------------------------------------- /ipython/hw/hw_1/Homework1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Data Science\n", 8 | "## Homework 1: Due Midnight, March 4th. 1/3 of a Grade Deducted for each day late" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "Student Name: \n", 16 | "\n", 17 | "Student Netid: \n", 18 | "***" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Part 1: Case study\n", 26 | "- Read [this article](http://www.nytimes.com/2012/02/19/magazine/shopping-habits.html) in the New York Times.\n", 27 | "- Use what we've learned in class and from the book to describe how one could set Target's problem up as a predictive modeling problem, such that they could have gotten the results that they did. Formulate your solution as a proposed plan using our data science terminology. Include all the aspects of the formulation that you see as relevant to solving the problem. Be precise but concise." 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "Place your answer here!" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Part 2: Dealing with messy data\n", 42 | "Not all data you will deal with is going to be clean. In fact, much of it will be very messy! For example, we have the HTML page that lists the contributors to Facebook's [osquery](https://github.com/facebook/osquery) project that is hosted on [Github.com](https://github.com). In this case, all we are interested in are the contributors and how many commits each of them has. Given the HTML page in `\"data/osquery_contributors.html\"` you will sift through tons of irrelevant data so that you can build a useful data structure.\n", 43 | "\n", 44 | "Notice that the first six (out of 59 total) contributors are named \"theopolis\", \"marpaia\", \"javuto\", \"jedi22\", \"unixist\", and \"mofarrell\". They have 553, 477, 104, 49, 30, 25 commits respectively.\n", 45 | "\n", 46 | "![Screenshot](images/osquery_contributors.png)\n", 47 | "\n", 48 | "To get a better of understanding of how this data is stored in the file, try searching through the raw data file for these usernames to look for any patterns. Your final dictionary should have 59 elements!" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "1\\. Turn this data into a Python dictionary called `contributors` where the keys are the contributor names and the values are the number of commits that each contributor has." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "import re # you might find this package useful\n", 67 | "\n", 68 | "contributors = dict()\n", 69 | "\n", 70 | "# Place your code here\n", 71 | " \n", 72 | "# This line will print your dictionary for grading purposed. 
Do not remove this line!!!\n", 73 | "print contributors" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### Part 3: Dealing with data Pythonically" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "# You might find these packages useful. You may import any others you want!\n", 92 | "import pandas as pd\n", 93 | "import numpy as np" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "1\\. Load the data set `\"data/ads_dataset.tsv\"` into a Python Pandas data frame called `ads`." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "# Place your code here" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "2\\. Write a Python function called `getDfSummary()` that does the following:\n", 119 | "- Takes as input a data frame\n", 120 | "- For each variable in the data frame calculates the following features:\n", 121 | " - `number_nan` to count the number of missing not-a-number values\n", 122 | " - Ignoring missing, NA, and Null values:\n", 123 | " - `number_distinct` to count the number of distinct values a variable can take on\n", 124 | " - `mean`, `max`, `min`, `std` (standard deviation), and `25%`, `50%`, `75%` to correspond to the appropriate percentiles\n", 125 | "- All of these new features should be loaded in a new data frame. Each row of the data frame should be a variable from the input data frame, and the columns should be the new summary features.\n", 126 | "- Returns this new data frame containing all of the summary information\n", 127 | "\n", 128 | "Hint: The pandas `describe()` [(manual page)](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html) method returns a useful series of values that can be used here." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "def getDfSummary(input_data):\n", 140 | " # Place your code here\n", 141 | " \n", 142 | " return output_data" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "3\\. How long does it take for your `getDfSummary()` function to work on your `ads` data frame? Show us the results below.\n", 150 | "\n", 151 | "Hint: `%timeit getDfSummary(ads)`" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": true 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "# Place your code here" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "4\\. Using the results returned from `getDfSummary()`, which fields, if any, contain missing `NaN` values?" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "# Place your code here" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "5\\. For the fields with missing values, does it look like the data is missing at random? 
Are there any other fields that correlate perfectly, or predict that the data is missing? If missing, what should the data value be?\n", 188 | "\n", 189 | "Hint: create another data frame that has just the records with a missing value. Get a summary of this data frame using `getDfSummary()` and compare the differences. Do some feature distributions change dramatically?" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": { 196 | "collapsed": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "# Place your code here" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "6\\. Which variables are binary?" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "# Place your code here" 219 | ] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.5.2" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 0 243 | } 244 | -------------------------------------------------------------------------------- /ipython/hw/hw_1/images/osquery_contributors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/hw/hw_1/images/osquery_contributors.png -------------------------------------------------------------------------------- /ipython/hw/hw_2/hw_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Data Science\n", 8 | "## Homework 2" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "Student Name: \n", 16 | "\n", 17 | "Student Netid: \n", 18 | "***" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### Preparing a Training Set and Training a Decision Tree\n", 26 | "This is a hands-on task where we build a predictive model using Decision Trees discussed in class. For this part, we will be using the data in `cell2cell_data.csv`.\n", 27 | "\n", 28 | "These historical data consist of 39,859 customers: 19,901 customers that churned (i.e., left the company) and 19,958 that did not churn (see the `\"churndep\"` variable). Here are the data set's 11 possible predictor variables for churning behavior: \n", 29 | "\n", 30 | "```\n", 31 | "Pos. Var. Name Var. 
Description\n", 32 | "----- ---------- --------------------------------------------------------------\n", 33 | "1 revenue Mean monthly revenue in dollars\n", 34 | "2 outcalls Mean number of outbound voice calls\n", 35 | "3 incalls Mean number of inbound voice calls\n", 36 | "4 months Months in Service\n", 37 | "5 eqpdays Number of days the customer has had his/her current equipment\n", 38 | "6 webcap Handset is web capable\n", 39 | "7 marryyes Married (1=Yes; 0=No)\n", 40 | "8 travel Has traveled to non-US country (1=Yes; 0=No)\n", 41 | "9 pcown Owns a personal computer (1=Yes; 0=No)\n", 42 | "10 creditcd Possesses a credit card (1=Yes; 0=No)\n", 43 | "11 retcalls Number of calls previously made to retention team\n", 44 | "```\n", 45 | "\n", 46 | "The 12th column, the dependent variable `\"churndep\"`, equals 1 if the customer churned, and 0 otherwise. " 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "1\\. Load the data and prepare it for modeling. Note that the features are already processed for you, so the only thing needed here is split the data into training and testing. Use pandas to create two data frames: train_df and test_df, where train_df has 80% of the data chosen uniformly at random without replacement (test_df should have the other 20%). Also, make sure to write your own code to do the splits. You may use any random() function numpy but DO NOT use the data splitting functions from Sklearn." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "import pandas as pd\n", 65 | "\n", 66 | "# Code here" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "2\\. If we had to, how would we prove to ourselves or a colleague that our data was indeed randomly sampled on X? And by prove, I mean empirically, not just showing this person our code. Don't actually do the work, just describe in your own words a test you could here. Hint: think about this in terms of selection bias and use notes from our 2nd lecture." 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "3\\. Now build and train a decision tree classifier using `DecisionTreeClassifier()` [(manual page)](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html) on train_df to predict the `\"churndep\"` target variable. Make sure to use `criterion='entropy'` when instantiating an instance of `DecisionTreeClassifier()`. For all other settings you should use all of the default options." 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "import sklearn\n", 97 | "\n", 98 | "# Code here" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "4\\. Using the resulting model from 2.2, show a bar plot of feature names and their feature importance (hint: check the attributes of the `DecisionTreeClassifier()` object directly in IPython or check the manual!)." 
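Purely as an illustration of the attribute this hint points at (synthetic data and made-up feature names, not a solution to the exercise):

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X_demo = rng.randn(300, 3)                                   # three synthetic features
y_demo = (X_demo[:, 0] + 0.3 * rng.randn(300) > 0).astype(int)

tree = DecisionTreeClassifier(criterion='entropy').fit(X_demo, y_demo)

names = ['feat_a', 'feat_b', 'feat_c']
plt.bar(range(len(names)), tree.feature_importances_)        # importances sum to 1
plt.xticks(range(len(names)), names)
plt.ylabel('feature importance')
```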
106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "import matplotlib.pyplot as plt\n", 117 | "%matplotlib inline\n", 118 | "\n", 119 | "# Code here" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "5\\. Is the relationship between the top 3 most important features (as measured here) negative or positive? If your marketing director asked you to explain the top 3 drivers of churn, how would you interpret the relationship between these 3 features and the churn outcome? What \"real-life\" connection can you draw between each variable and churn?" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "collapsed": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "# Code/answer here" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "6\\. Using the classifier built in 2.2, try predicting `\"churndep\"` on both the train_df and test_df data sets. What is the accuracy on each?" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "# Code here" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "### Part 2 - Finding a Good Decision Tree\n", 163 | "The default options for your decision tree may not be optimal. We need to analyze whether tuning the parameters can improve the accuracy of the classifier. For the following options `min_samples_split` and `min_samples_leaf`:" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "1\\. Generate a list of 10 values of each for the parameters mim_samples_split and min_samples_leaf. " 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "collapsed": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "# Code here\n", 182 | "\n", 183 | "min_samples_split_values = None\n", 184 | "min_samples_leaf_values = None" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "2\\. Explain in words your reasoning for choosing the above ranges." 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "3\\. For each combination of values in 3.1 (there should be 100), build a new classifier and check the classifier's accuracy on the test data. Plot the test set accuracy for these options. Use the values of `min_samples_split` as the x-axis and generate a new series (line) for each of `min_samples_leaf`." 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": true 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "import matplotlib.pyplot as plt\n", 215 | "%matplotlib inline\n", 216 | "\n", 217 | "# Code here" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "4\\. Which configuration returns the best accuracy? What is this accuracy? 
(Note, if you don't see much variation in the test set accuracy across values of min_samples_split or min_samples_leaf, try redoing the above steps with a different range of values)." 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "# Code here" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "5\\. If you were working for a marketing department, how would you use your churn production model in a real business environment? Explain why churn prediction might be good for the business and how one might improve churn by using this model." 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "Answer here!" 250 | ] 251 | } 252 | ], 253 | "metadata": { 254 | "kernelspec": { 255 | "display_name": "Python 3", 256 | "language": "python", 257 | "name": "python3" 258 | }, 259 | "language_info": { 260 | "codemirror_mode": { 261 | "name": "ipython", 262 | "version": 3 263 | }, 264 | "file_extension": ".py", 265 | "mimetype": "text/x-python", 266 | "name": "python", 267 | "nbconvert_exporter": "python", 268 | "pygments_lexer": "ipython3", 269 | "version": "3.6.0" 270 | } 271 | }, 272 | "nbformat": 4, 273 | "nbformat_minor": 0 274 | } 275 | -------------------------------------------------------------------------------- /ipython/hw/hw_3/Homework_3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Introduction to Data Science\n", 8 | "## Homework 3: Due 5pm to My Mailbox (2nd Floor, 19 W 4th St) Wednesday April 19th" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "Student Name: \n", 16 | "\n", 17 | "Student Netid:\n", 18 | "***" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "In this assignment we will be looking at data generated by particle physicists to test whether machine learning can help classify whether certain particle decay experiments identify the presence of a Higgs Boson. One does not need to know anything about particle physics to do well here, but if you are curious, full feature and data descriptions can be found here:\n", 26 | "\n", 27 | "- https://www.kaggle.com/c/higgs-boson/data\n", 28 | "- http://higgsml.lal.in2p3.fr/files/2014/04/documentation_v1.8.pdf\n", 29 | "\n", 30 | "The goal of this assignment is to learn to use cross-validation for model selection. We’ll also use learning curve analysis to understand how well different algorithms make use of limited data. 
For more documentation on cross-validation with Python, you can consult the following:\n", 31 | "\n", 32 | "- http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "### Part 1: Data preparation\n", 40 | "Create a data preparation and cleaning function that does the following:\n", 41 | "- Has a single input that is a file name string\n", 42 | "- Reads data (the data is comma separated, has a row header and the first column `EventID` is the index) into a pandas `dataframe`\n", 43 | "- Cleans the data\n", 44 | " - Convert the feature `Label` to numeric (choose the minority class to be equal to 1)\n", 45 | " - Create a feature `Y` with numeric label\n", 46 | " - Drop the feature `Label`\n", 47 | " - If a feature has missing values (i.e., `-999`): \n", 48 | " - Create a dummy variable for the missing value\n", 49 | " - Call the variable `orig_var_name` + `_mv` where `orig_var_name` is the name of the actual var with a missing value\n", 50 | " - Give this new variable a 1 if the original variable is missing\n", 51 | " - Replace the missing value with the average of the feature (make sure to compute the mean on records where the value isn't missing). You may find pandas' `.replace()` function useful.\n", 52 | "- After the above is done, rescales the data so that each feature has zero mean and unit variance (hint: look up sklearn.preprocessing)\n", 53 | "- Returns the cleaned and rescaled dataset\n", 54 | "\n", 55 | "Hint: as a guide, this function can easily be done in less than 15 lines." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "def cleanBosonData(infile_name):\n", 67 | " # Code here\n", 68 | " return data_clean" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Part 2: Basic evaluations\n", 76 | "In this part you will build an out-of-the box logistic regression (LR) model and support vector machine (SVM). You will then plot ROC for the LR and SVM model." 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "1\\. Clean the two data files included in this assignment (`data/boson_training_cut_2000.csv` and `data/boson_testing_cut.csv`) and use them as training and testing data sets." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "data_train = cleanBosonData(\"data/boson_training_cut_2000.csv\")\n", 95 | "data_test = cleanBosonData(\"data/boson_testing_cut.csv\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "2\\. On the training set, build the following models:\n", 103 | "\n", 104 | "- A logistic regression using sklearn's `linear_model.LogisticRegression()`. For this model, use `C=1e30`.\n", 105 | "- An SVM using sklearn's `svm.svc()`. For this model, specify that `kernel=\"linear\"`.\n", 106 | "\n", 107 | "For each model above, plot the ROC curve of both models on the same plot. Make sure to use the test set for computing and plotting. In the legend, also print out the Area Under the ROC (AUC) for reference." 
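A generic sketch of the ROC-plus-AUC pattern this question asks for; it assumes a fitted classifier and a test split, and is not tied to the Higgs data:

```python
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_roc(model, X_te, y_te, label):
    # both LogisticRegression and a linear SVC expose decision_function, and an
    # ROC curve only needs a ranking score, not calibrated probabilities
    scores = model.decision_function(X_te)
    fpr, tpr, _ = roc_curve(y_te, scores)
    plt.plot(fpr, tpr, label='{} (AUC = {:.3f})'.format(label, auc(fpr, tpr)))
    plt.legend(loc='lower right')
```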
108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "import matplotlib\n", 119 | "import matplotlib.pyplot as plt\n", 120 | "%matplotlib inline\n", 121 | "\n", 122 | "# Code here" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "3\\. Which of the two models is generally better at ranking the test set? Are there any classification thresholds where the model identified above as \"better\" would underperform the other in a classification metric (such as TPR)?" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "Answer here!" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "### Part 3: Model selection with cross-validation\n", 144 | "We think we might be able to improve the performance of the SVM if we perform a grid search on the hyper-parameter $C$. Because we only have 1000 instances, we will have to use cross-validation to find the optimal $C$." 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "1\\. Write a cross-validation function that does the following:\n", 152 | "- Takes as inputs a dataset, a label name, # of splits/folds (`k`), a sequence of values for $C$ (`cs`)\n", 153 | "- Performs two loops\n", 154 | " - Outer Loop: `for each f in range(k)`:\n", 155 | " - Splits the data into `data_train` & `data_validate` according to cross-validation logic\n", 156 | " - Inner Loop: `for each c in cs`:\n", 157 | " - Trains an SVM on training split with `C=c, kernel=\"linear\"`\n", 158 | " - Computes AUC_c_k on validation data\n", 159 | " - Stores AUC_c_k in a dictionary of values\n", 160 | "- Returns a dictionary, where each key-value pair is: `c:[auc-c1,auc-c2,..auc-ck]`" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "# Code here\n", 172 | "def xValSVM(dataset, label_name, k, cs):\n", 173 | " \n", 174 | " return aucs" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "2\\. Using the function written above, do the following:\n", 182 | "- Generate a sequence of 10 $C$ values in the interval `[10^(-8), ..., 10^1]` (i.e., do all powers of 10 from -8 to 1).\n", 183 | "2.\tCall aucs = xValSVM(train, ‘Y’, 10, cs)\n", 184 | "3.\tFor each c in cs, get mean(AUC) and StdErr(AUC) \n", 185 | "4.\tCompute the value for max(meanAUC-StdErr(AUC)) across all values of c.\n", 186 | "5.\tGenerate a plot with the following:\n", 187 | "a.\tLog10(c) on the x-axis\n", 188 | "b.\t1 series with mean(AUC) for each c\n", 189 | "c.\t1 series with mean(AUC)-stderr(AUC) for each c (use ‘k+’ as color pattern)\n", 190 | "d.\t1 series with mean(AUC)+stderr(AUC) for each c (use ‘k--‘ as color pattern)\n", 191 | "e.\ta reference line for max(AUC-StdErr(AUC)) (use ‘r’ as color pattern)\n", 192 | "\n", 193 | "Then answer the question: Did the model parameters selected beat the out-of-the-box model for SVM? 
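One way to reduce the dictionary returned by `xValSVM` (assumed shape `{c: [auc_fold_1, ..., auc_fold_k]}`) to the mean and standard-error series that the plot described above calls for; this is a sketch, not the only valid approach:

```python
import numpy as np
import matplotlib.pyplot as plt

cs_sorted = sorted(aucs.keys())
mean_auc = np.array([np.mean(aucs[c]) for c in cs_sorted])
stderr_auc = np.array([np.std(aucs[c]) / np.sqrt(len(aucs[c])) for c in cs_sorted])

best_conservative = (mean_auc - stderr_auc).max()    # max(mean AUC - std. err.) over c
plt.plot(np.log10(cs_sorted), mean_auc)
plt.plot(np.log10(cs_sorted), mean_auc - stderr_auc, 'k+')
plt.plot(np.log10(cs_sorted), mean_auc + stderr_auc, 'k--')
plt.axhline(best_conservative, color='r')
plt.xlabel('log10(C)')
```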
" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "collapsed": true 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "#Code here" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "answer here:" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "3\\. Which of the two algorithms are more suitable for smaller sample sizes, given the set of features? If it costs twice the investment to run enough experiments to double the data, do you think it is a worthy investment?\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "answer here:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "4\\. Is there a reason why cross-validation might be biased? If so, in what direction is it biased? (Hint: refer to ESL figure 7.8)?\n", 233 | "\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "answer here:" 241 | ] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.5.2" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 0 265 | } 266 | -------------------------------------------------------------------------------- /ipython/hw/hw_4/hw_4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Data Science\n", 8 | "## Homework 4: Due printed out in Kevin's Mailbox 5pm May 1st\n", 9 | "## This assignment is OPTIONAL!! If you do it, your HW grade will be the average of all 4" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Student Name: \n", 17 | "\n", 18 | "Student Netid:\n", 19 | "***" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Part 1: Naive Bayes" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "1\\. From your reading you know that the naive Bayes classifier works by calculating the conditional probabilities of each feature, $e_i$, occuring with each class $c$ and treating them independently. This results in the probability of a certain class occuring given a set of features, or a piece of evidence, $E$, as\n", 34 | "\n", 35 | "$$P(c \\mid E) = \\frac{p(e_1 \\mid c) \\cdot p(e_2 \\mid c) \\cdot \\cdot \\cdot p(e_k \\mid c) \\cdot p(c)}{p(E)}.$$\n", 36 | "\n", 37 | "The conditional probability of each piece of evidence occuring with a given class is given by\n", 38 | "\n", 39 | "$$P(e_i \\mid c) = \\frac{\\text{count}(e_i, c)}{\\text{count}(c)}.$$\n", 40 | "\n", 41 | "In the above equation $\\text{count}(e_i, c)$ is the number of documents in a given class that contain feature $e_i$ and $\\text{count}(c)$ is the number of documents that belong to class $c$. \n", 42 | "\n", 43 | "A common variation of the above is to use Laplace (sometimes called +1) smoothing. 
Recall the use of Laplace smoothing introduced toward the end of Chapter 3 in the section Probability Estimation. This is done in sklearn by setting `alpha=1` in the `BernoulliNB()` function (this is also the default behavior). The result of Laplace smoothing will slightly change the conditional probabilities,\n", 44 | "\n", 45 | "$$P(e_i \\mid c) = \\frac{\\text{count}(e_i, c) + 1}{\\text{count}(c) + 2}.$$\n", 46 | "\n", 47 | "In no more than **one paragraph**, describe why this is useful. Try to think of a case when not using Laplace smoothing would result in \"bad\" models. Try to give an example. Be precise." 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "Answer here!" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "### Part 2: Text classification for sentiment analysis\n", 62 | "For this part of the assignment, we are going to use a data set of movie ratings from IMDB.com. The data consists of the text of a movie review and a target variable which tells us whether the reviewer had a positive feeling towards the movie (equivalent to rating the movie between 7 and 10) or a negative feeling (rating the movie between 1 and 4). Neutral reactions are not included in the data.\n", 63 | "\n", 64 | "The data are located in \"`data/imdb.csv`\". The first column is the review text; the second is the text label 'P' for positive or 'N' for negative." 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "1\\. Load the data into a pandas `DataFrame()`." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "import pandas as pd\n", 83 | "data = None" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "2\\. Code the target variable to be numeric: use the value `1` to represent 'P' and `0` to represent 'N'." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "# Code here" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "3\\. Put all of the text into a data frame called `X` and the target variable in a data frame called `Y`. Make a train/test split where you give 75% of the data to training." 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "from sklearn.cross_validation import train_test_split\n", 120 | "\n", 121 | "X = None\n", 122 | "Y = None\n", 123 | "\n", 124 | "X_train, X_test, Y_train, Y_test = None" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "4\\. Create a binary `CountVectorizer()` and `TfidfVectorizer()`. Use the original single words as well as bigrams. Also, use an \"english\" stop word list. Fit these to the training data to extract a vocabulary and then transform both the train and test data." 
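One way to set these up, assuming `X_train` and `X_test` hold the raw review strings (the vectorizers expect an iterable of documents, so pass the text column itself rather than a one-column DataFrame):

```python
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Binary bag-of-words over unigrams and bigrams, with English stop words removed
binary_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2), stop_words='english')
X_train_binary = binary_vectorizer.fit_transform(X_train)   # learn the vocabulary on train only
X_test_binary = binary_vectorizer.transform(X_test)

# Same vocabulary settings, but tf-idf weighted
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
```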
132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "from sklearn.feature_extraction.text import CountVectorizer\n", 143 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 144 | "\n", 145 | "binary_vectorizer = None\n", 146 | "X_train_binary = None\n", 147 | "X_test_binary = None\n", 148 | "\n", 149 | "tfidf_vectorizer = None\n", 150 | "X_train_tfidf = None\n", 151 | "X_test_tfidf = None" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "4\\. Create `LogisticRegression()` and `BernoulliNB()` models. For all settings, keep the default values. In a single plot, show the AUC curve for both classifiers and both the binary and tfidf feature sets. In the legend, include the area under the ROC curve (AUC). Do not forget to label your axes. Your final plot will be a single window with 4 curves.\n", 159 | "\n", 160 | "Which model do you think does a better job? Why? Explain in no more than a paragraph." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "# Run this so your plots show properly\n", 172 | "import matplotlib.pyplot as plt\n", 173 | "%matplotlib inline\n", 174 | "plt.rcParams['figure.figsize'] = 12, 12" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "from sklearn.linear_model import LogisticRegression\n", 186 | "from sklearn.naive_bayes import BernoulliNB\n", 187 | "from sklearn import metrics\n", 188 | "\n", 189 | "# Code here" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "Explanation here!" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "source": [ 205 | "5\\. Use the model from question 4 that you think did the best job and predict the rating of the test data. Find 5 examples the should have been positive, but were incorrectly classified as negative. List the text below and include an explanation as to why you think it may have been incorrectly classified. You can pick any 5. They do not have to be at random." 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | "collapsed": true 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "# Code here to display 5 incorrect reviews." 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "Explanation for the 5 reviews chosen here!" 
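A sketch of the four-curve comparison, assuming the vectorized matrices from the previous step and numeric labels (`np.ravel` is used defensively in case `Y_train`/`Y_test` are single-column DataFrames):

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_curve, auc

feature_sets = {'binary': (X_train_binary, X_test_binary),
                'tfidf': (X_train_tfidf, X_test_tfidf)}

for feat_name, (Xtr, Xte) in feature_sets.items():
    for Model in (LogisticRegression, BernoulliNB):
        clf = Model().fit(Xtr, np.ravel(Y_train))          # default settings throughout
        probs = clf.predict_proba(Xte)[:, 1]
        fpr, tpr, _ = roc_curve(np.ravel(Y_test), probs)
        plt.plot(fpr, tpr,
                 label='%s / %s (AUC = %0.3f)' % (Model.__name__, feat_name, auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()
```

For question 5, the predictions of whichever of these four combinations you judge best (via its `predict` method on the test matrix) can be compared against `Y_test` to pull out reviews whose true label is 1 but whose predicted label is 0.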
224 | ] 225 | } 226 | ], 227 | "metadata": { 228 | "kernelspec": { 229 | "display_name": "Python 3", 230 | "language": "python", 231 | "name": "python3" 232 | }, 233 | "language_info": { 234 | "codemirror_mode": { 235 | "name": "ipython", 236 | "version": 3 237 | }, 238 | "file_extension": ".py", 239 | "mimetype": "text/x-python", 240 | "name": "python", 241 | "nbconvert_exporter": "python", 242 | "pygments_lexer": "ipython3", 243 | "version": "3.5.2" 244 | } 245 | }, 246 | "nbformat": 4, 247 | "nbformat_minor": 0 248 | } 249 | -------------------------------------------------------------------------------- /ipython/references/Syllabus_2016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/references/Syllabus_2016.pdf -------------------------------------------------------------------------------- /ipython/references/churn_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/references/churn_architecture.png -------------------------------------------------------------------------------- /ipython/references/churn_dataset_info.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/references/churn_dataset_info.pdf -------------------------------------------------------------------------------- /ipython/references/churn_sampling_scheme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/references/churn_sampling_scheme.png -------------------------------------------------------------------------------- /ipython/utils/ClassifierBakeoff.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.linear_model import LogisticRegression 4 | from sklearn.metrics import roc_auc_score 5 | from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier 6 | from sklearn.tree import DecisionTreeClassifier 7 | 8 | 9 | def liftTable(pred, truth, b): 10 | df = pd.DataFrame({'p':pred + np.random.rand(len(pred))*0.000001, 'y':truth}) 11 | df['b'] = b - pd.qcut(df['p'], b, labels=False) 12 | df['n'] = np.ones(df.shape[0]) 13 | df_grp = df.groupby(['b']).sum() 14 | base = np.sum(df_grp['y'])/float(df.shape[0]) 15 | df_grp['n_cum'] = np.cumsum(df_grp['n'])/float(df.shape[0]) 16 | df_grp['y_cum'] = np.cumsum(df_grp['y']) 17 | df_grp['p_y_b'] = df_grp['y']/df_grp['n'] 18 | df_grp['lift_b'] = df_grp['p_y_b']/base 19 | df_grp['cum_lift_b'] = (df_grp['y_cum']/(float(df.shape[0])*df_grp['n_cum']))/base 20 | return df_grp 21 | 22 | 23 | def getMetrics(preds, labels): 24 | ''' 25 | Takes in non-binary predictions and labels and returns AUC, and several Lifts 26 | ''' 27 | auc = roc_auc_score(labels, preds) 28 | ltab = liftTable(preds, labels, 100) 29 | 30 | lift1 = ltab.ix[1].cum_lift_b 31 | lift5 = ltab.ix[5].cum_lift_b 32 | lift10 = ltab.ix[10].cum_lift_b 33 | lift25 = ltab.ix[25].cum_lift_b 34 | 35 | return [auc, lift1, lift5, lift10, lift25] 36 | 37 | 38 | def dToString(d, dm1, dm2): 39 | ''' 40 | Takes key-values and 
makes a string, d1 seprates k:v, d2 separates pairs 41 | ''' 42 | arg_str = '' 43 | for k in sorted(d.keys()): 44 | if len(arg_str) == 0: 45 | arg_str = '{}{}{}'.format(k, dm1, d[k]) 46 | else: 47 | arg_str = arg_str + '{}{}{}{}'.format(dm2, k, dm1, d[k]) 48 | return arg_str 49 | 50 | def getArgCombos(arg_lists): 51 | ''' 52 | Takes every combination and returns an iterable of dicts 53 | ''' 54 | keys = sorted(arg_lists.keys()) 55 | #Initialize the final iterable 56 | tot = 1 57 | for k in keys: 58 | tot = tot * len(arg_lists[k]) 59 | iter = [] 60 | #Fill it with empty dicts 61 | for i in range(tot): 62 | iter.append({}) 63 | #Now fill each dictionary 64 | kpass = 1 65 | for k in keys: 66 | klist = arg_lists[k] 67 | ktot = len(klist) 68 | for i in range(tot): 69 | iter[i][k] = klist[(i/kpass) % ktot] 70 | kpass = ktot * kpass 71 | return iter 72 | 73 | 74 | class LRAdaptor(object): 75 | ''' 76 | This adapts the LogisticRegression() Classifier so that LR can be used as an init for GBT 77 | This just overwrites the predict method to be predict_proba 78 | ''' 79 | def __init__(self, est): 80 | self.est = est 81 | 82 | def predict(self, X): 83 | return self.est.predict_proba(X)[:,1][:, np.newaxis] 84 | 85 | def fit(self, X, y): 86 | self.est.fit(X, y) 87 | 88 | class GenericClassifier(object): 89 | 90 | def __init__(self, modclass, dictargs): 91 | self.classifier = modclass(**dictargs) 92 | 93 | def fit(self, X, Y): 94 | self.classifier.fit(X,Y) 95 | 96 | def predict_proba(self, Xt): 97 | return self.classifier.predict_proba(Xt) 98 | 99 | 100 | class GenericClassifierOptimizer(object): 101 | 102 | def __init__(self, classtype, arg_lists): 103 | self.name = classtype.__name__ 104 | self.classtype = classtype 105 | self.arg_lists = arg_lists 106 | self.results = self._initDict() 107 | 108 | def _initDict(self): 109 | return {'alg':[], 'opt':[], 'auc':[], 'lift1':[], 'lift5':[], 'lift10':[], 'lift25':[]} 110 | 111 | def _updateResDict(self, opt, perf): 112 | self.results['alg'].append(self.name) 113 | self.results['opt'].append(opt) 114 | self.results['auc'].append(perf[0]) 115 | self.results['lift1'].append(perf[1]) 116 | self.results['lift5'].append(perf[2]) 117 | self.results['lift10'].append(perf[3]) 118 | self.results['lift25'].append(perf[4]) 119 | 120 | def runClassBake(self, X_train, Y_train, X_test, Y_test): 121 | 122 | arg_loop = getArgCombos(self.arg_lists) 123 | 124 | for d in arg_loop: 125 | 126 | mod = GenericClassifier(self.classtype, d) 127 | mod.fit(X_train, Y_train) 128 | 129 | perf = getMetrics(mod.predict_proba(X_test)[:,1], Y_test) 130 | self._updateResDict(dToString(d, ':', '|'), perf) 131 | 132 | 133 | 134 | class ClassifierBakeoff(object): 135 | 136 | def __init__(self, X_train, Y_train, X_test, Y_test, setup): 137 | self.instructions = setup 138 | self.X_train = X_train 139 | self.Y_train = Y_train 140 | self.X_test = X_test 141 | self.Y_test = Y_test 142 | self.results = self._initDict() 143 | 144 | def _initDict(self): 145 | return {'alg':[], 'opt':[], 'auc':[], 'lift1':[], 'lift5':[], 'lift10':[], 'lift25':[]} 146 | 147 | def _updateResDict(self, clfr_results): 148 | self.results['alg'] = self.results['alg'] + clfr_results['alg'] 149 | self.results['opt'] = self.results['opt'] + clfr_results['opt'] 150 | self.results['auc'] = self.results['auc'] + clfr_results['auc'] 151 | self.results['lift1'] = self.results['lift1'] + clfr_results['lift1'] 152 | self.results['lift5'] = self.results['lift5'] + clfr_results['lift5'] 153 | self.results['lift10'] = self.results['lift10'] 
+ clfr_results['lift10'] 154 | self.results['lift25'] = self.results['lift25'] + clfr_results['lift25'] 155 | 156 | 157 | def bake(self): 158 | 159 | for clfr in self.instructions: 160 | 161 | classifierBake = GenericClassifierOptimizer(clfr, self.instructions[clfr]) 162 | classifierBake.runClassBake(self.X_train, self.Y_train, self.X_test, self.Y_test) 163 | self._updateResDict(classifierBake.results) 164 | 165 | 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /ipython/utils/ClassifierBakeoff.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/utils/ClassifierBakeoff.pyc -------------------------------------------------------------------------------- /ipython/utils/bias_variance.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | from matplotlib import pyplot as plt 5 | import sklearn.metrics as skm 6 | import warnings 7 | warnings.filterwarnings('ignore') 8 | from sklearn import linear_model 9 | 10 | def simPolynomial(sigma = 0, betas = [0, 0], n = 100): 11 | 12 | x = np.random.uniform(0, 100, n) 13 | e = np.random.normal(0, sigma, n) 14 | 15 | d = pd.DataFrame(x, columns=['x']) 16 | y = e 17 | for i, b in enumerate(betas): 18 | y = y + b*(x**i) 19 | d['y'] = y 20 | return d 21 | 22 | 23 | def fitLinReg(d, mn, mx, inter): 24 | ''' 25 | Runs a linear regression and fits it on a grid 26 | ''' 27 | 28 | regr = linear_model.LinearRegression(fit_intercept = inter) 29 | regr.fit(d.drop('y', 1), d['y']) 30 | yhat = regr.predict(pd.DataFrame(np.arange(mn, mx, 1))) 31 | 32 | return yhat 33 | 34 | def makePolyFeat(d, deg): 35 | ''' 36 | Goal: Generate features up to X**deg 37 | 1. a data frame with two features X and Y 38 | 4. a degree 'deg' (from which we make polynomial features 39 | 40 | ''' 41 | #Generate Polynomial terms 42 | for i in range(2, deg+1): 43 | d['x'+str(i)] = d['x']**i 44 | return d 45 | 46 | def fitFullReg(d, mn, mx, betas, inter): 47 | ''' 48 | Runs a linear regression and fits it on a grid. 
Creates polynomial features using the dimension of betas 49 | ''' 50 | 51 | regr = linear_model.LinearRegression(fit_intercept = inter) 52 | regr.fit(makePolyFeat(d.drop('y', 1), len(betas)), d['y']) 53 | dt = pd.DataFrame(np.arange(mn, mx, 1), columns = ['x']) 54 | yhat = regr.predict(makePolyFeat(dt, len(betas))) 55 | 56 | return yhat 57 | 58 | 59 | 60 | def plotLinearBiasStage(sigma, betas, ns, fs): 61 | 62 | mn = 0 63 | mx = 101 64 | 65 | d = simPolynomial(sigma, betas, 10000) 66 | plt.figure(figsize = fs) 67 | plt.plot(d['x'], d['y'], 'b.', markersize = 0.75) 68 | 69 | 70 | x = np.arange(mn, mx, 1) 71 | y_real = np.zeros(len(x)) 72 | for i, b in enumerate(betas): 73 | y_real += b*(x**i) 74 | 75 | #plt.plot(x, y_real + 2*sigma, 'k+') 76 | #plt.plot(x, y_real - 2*sigma, 'k--') 77 | plt.plot(x, y_real, 'k*') 78 | 79 | for n in ns: 80 | dn = simPolynomial(sigma, betas, n) 81 | yhat = fitLinReg(dn, mn, mx, True) 82 | plt.plot(x, yhat, label = 'n={}'.format(n)) 83 | 84 | 85 | plt.legend(loc = 4, ncol = 3) 86 | 87 | 88 | 89 | def plotVariance(sigma, betas, ns, fs): 90 | 91 | mn = 0 92 | mx = 101 93 | nworlds = 100 94 | 95 | d = simPolynomial(sigma, betas, 10000) 96 | x = np.arange(mn, mx, 1) 97 | 98 | fig = plt.figure(figsize = fs) 99 | for pos, n in enumerate(ns): 100 | 101 | #First model each world 102 | yhat_lin = [] 103 | yhat_non = [] 104 | for i in range(nworlds): 105 | 106 | dn = simPolynomial(sigma, betas, n) 107 | 108 | yhat_lin.append(fitLinReg(dn, mn, mx, True)) 109 | yhat_non.append(fitFullReg(dn, mn, mx, betas, True)) 110 | 111 | #Now compute appropriate stats and plot 112 | 113 | lin_df = pd.DataFrame(yhat_lin) 114 | non_df = pd.DataFrame(yhat_non) 115 | 116 | lin_sig = lin_df.apply(np.std, axis=0).values 117 | non_sig = non_df.apply(np.std, axis=0).values 118 | lin_mu = lin_df.apply(np.mean, axis=0).values 119 | non_mu = non_df.apply(np.mean, axis=0).values 120 | 121 | #Need to continue from here 122 | 123 | for i in range(nworlds): 124 | 125 | ax1 = fig.add_subplot(2, 3, pos + 1) 126 | plt.title('n={}'.format(n)) 127 | plt.plot(x, yhat_lin[i], '.', color = '0.75') 128 | 129 | if i == nworlds - 1: 130 | plt.plot(x, lin_mu, 'r-') 131 | plt.title('E[std|X] = {}'.format(round(lin_sig.mean(),1))) 132 | 133 | ax1.axes.get_xaxis().set_visible(False) 134 | ax1.set_ylim((-40, 80)) 135 | 136 | ax2 = fig.add_subplot(2, 3, pos + 4) 137 | plt.plot(x, yhat_non[i], '--', color = '0.75') 138 | 139 | if i == nworlds - 1: 140 | plt.plot(x, non_mu, 'r-') 141 | plt.title('E[std|X] = {}'.format(round(non_sig.mean(),1))) 142 | 143 | ax2.set_ylim((-40, 80)) 144 | 145 | if pos != 0: 146 | ax1.axes.get_yaxis().set_visible(False) 147 | ax2.axes.get_yaxis().set_visible(False) 148 | 149 | plt.legend() 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | def getVarianceTrend(sigma, betas): 158 | 159 | mn = 50 160 | mx = 51 161 | nworlds = 100 162 | ns = np.logspace(4, 16, num = 10, base = 2) 163 | 164 | res_dict = {'n':[], 'lin':[], 'quad':[], 'non':[]} 165 | 166 | for pos, n in enumerate(ns): 167 | 168 | yhat_lin = []; yhat_quad = []; yhat_non = [] 169 | 170 | for i in range(nworlds): 171 | 172 | dn = simPolynomial(sigma, betas, n) 173 | 174 | #yhat_lin.append(fitLinReg(dn, mn, mx, True)[0]) 175 | yhat_lin.append(fitFullReg(dn, mn, mx, betas[0:1], True)[0]) 176 | yhat_quad.append(fitFullReg(dn, mn, mx, betas[0:2], True)[0]) 177 | yhat_non.append(fitFullReg(dn, mn, mx, betas, True)[0]) 178 | 179 | res_dict['lin'].append(np.array(yhat_lin).std()) 180 | res_dict['quad'].append(np.array(yhat_quad).std()) 181 | 
res_dict['non'].append(np.array(yhat_non).std()) 182 | res_dict['n'].append(n) 183 | 184 | 185 | return res_dict 186 | 187 | def plotVarianceTrend(res_dict, fs): 188 | 189 | fig = plt.figure(figsize = fs) 190 | 191 | ax1 = fig.add_subplot(2, 1, 1) 192 | x = np.log2(res_dict['n']) 193 | plt.plot(x, np.power(res_dict['lin'], 2), 'b-', label = 'd = 1') 194 | plt.plot(x, np.power(res_dict['quad'], 2), 'r-', label = 'd = 2') 195 | plt.plot(x, np.power(res_dict['non'], 2), 'g-', label = 'd = 4') 196 | 197 | ax1.set_ylim((0, 100)) 198 | 199 | plt.title('Model Variance by Polynomial Order (d) and Sample Size (n)') 200 | plt.legend(loc = 1) 201 | plt.ylabel('Var( E_d[Y|X = 50] )') 202 | 203 | ax2 = fig.add_subplot(2, 1, 2) 204 | filt = (x > 0) 205 | plt.plot(x[filt], 2*np.log2(res_dict['lin']), 'b-', label = 'd = 1') 206 | plt.plot(x[filt], 2*np.log2(res_dict['quad']), 'r-', label = 'd = 2') 207 | plt.plot(x[filt], 2*np.log2(res_dict['non']), 'g-', label = 'd = 4') 208 | 209 | ax2.set_xlim((x[filt].min(), x.max())) 210 | plt.xlabel('Log2(Sample Size)') 211 | plt.ylabel('Log [ Var( E_d[Y|X = 50] ) ]') 212 | plt.legend(loc = 1) 213 | -------------------------------------------------------------------------------- /ipython/utils/bias_variance.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/utils/bias_variance.pyc -------------------------------------------------------------------------------- /ipython/utils/churn_analysis.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This script has a set of reference functions for performing analysis of the churn dataset 3 | ''' 4 | import sys 5 | import pandas as pd 6 | import numpy as np 7 | from matplotlib import pyplot as plt 8 | import sklearn.metrics as skm 9 | sys.path.append("./utils/") 10 | from ClassifierBakeoff import * 11 | 12 | import warnings 13 | warnings.filterwarnings('ignore') 14 | 15 | def getDfSummary(dat): 16 | ''' 17 | Get descriptive stats 18 | ''' 19 | #Get the names of the columns 20 | cols = dat.columns.values 21 | 22 | c_summ = [] 23 | #Outer Loop for the cols 24 | for c in cols: 25 | #Count the NAs 26 | missing = sum(pd.isnull(dat[c])) 27 | #Use describe to get summary statistics, and also drop the 'count' row 28 | sumval = dat[c].describe().drop(['count']) 29 | #Now count distinct values...note that nunique removes missing values for you 30 | distinct = dat[c].nunique() 31 | #Append missing and distinct to sumval 32 | sumval = sumval.append(pd.Series([missing, distinct], index=['missing', 'distinct'])) 33 | #Add each sumval to a list and then convert the entire thing to a DS 34 | c_summ.append(sumval) 35 | 36 | return pd.DataFrame(c_summ, index=cols) 37 | 38 | 39 | 40 | 41 | 42 | def plotCorr(dat, lab, h, w): 43 | ''' 44 | Do a heatmap to visualize the correlation matrix, dropping the label 45 | ''' 46 | 47 | dat = dat.drop(lab, 1) 48 | #Get correlation and 0 out the diagonal (for plotting purposes) 49 | c_dat = dat.corr() 50 | for i in range(c_dat.shape[0]): 51 | c_dat.iloc[i,i] = 0 52 | 53 | c_mat = c_dat.as_matrix() 54 | #c_mat = c_mat[:-1, :-1] 55 | fig, ax = plt.subplots() 56 | heatmap = plt.pcolor(c_mat, cmap = plt.cm.RdBu) 57 | 58 | #Set the tick labels and center them 59 | ax.set_xticks(np.arange(c_dat.shape[0]) + 0.5, minor = False) 60 | ax.set_yticks(np.arange(c_dat.shape[1]) + 0.5, minor = False) 61 | 
ax.set_xticklabels(c_dat.index.values, minor = False, rotation = 45) 62 | ax.set_yticklabels(c_dat.index.values, minor = False) 63 | heatmap.axes.set_ylim(0, len(c_dat.index)) 64 | heatmap.axes.set_xlim(0, len(c_dat.index)) 65 | plt.colorbar(heatmap, ax = ax) 66 | 67 | #plt.figure(figsize = (h, w)) 68 | fig = plt.gcf() 69 | fig.set_size_inches(h, w) 70 | 71 | 72 | def makeBar(df, h, lab, width): 73 | ''' 74 | Contains 75 | ''' 76 | df_s = df.sort(columns = [h], ascending = False) 77 | 78 | #Get a barplot 79 | ind = np.arange(df_s.shape[0]) 80 | labs = df_s[[lab]].values.ravel() 81 | 82 | fig = plt.figure(facecolor = 'w', figsize = (12, 6)) 83 | ax = plt.subplot(111) 84 | plt.subplots_adjust(bottom = 0.25) 85 | 86 | rec = ax.bar(ind + width, df_s[[h]].values, width, color='r') 87 | 88 | ax.set_xticks(ind + getTickAdj(labs, width)) 89 | ax.set_xticklabels(labs, rotation = 45, size = 14) 90 | 91 | 92 | def getTickAdj(labs, width): 93 | lens = map(len, labs) 94 | lens = -1 * width * (lens - np.mean(lens)) / np.max(lens) 95 | return lens 96 | 97 | def plotMI(dat, lab, width = 0.35, signed = 0): 98 | ''' 99 | Draw a bar chart of the normalized MI between each X and Y 100 | ''' 101 | X = dat.drop(lab, 1) 102 | Y = dat[[lab]].values 103 | cols = X.columns.values 104 | mis = [] 105 | 106 | #Start by getting MI 107 | for c in cols: 108 | mis.append(skm.normalized_mutual_info_score(Y.ravel(), X[[c]].values.ravel())) 109 | 110 | #Get signs by correlation 111 | corrs = dat.corr()[lab] 112 | corrs[corrs.index != lab] 113 | df = pd.DataFrame(zip(mis, cols), columns = ['MI', 'Lab']) 114 | df = pd.merge(df, pd.DataFrame(corrs, columns = ['corr']), how = 'inner', left_on = 'Lab', right_index=True) 115 | 116 | if signed == 0: 117 | makeBar(df, 'MI', 'Lab', width) 118 | 119 | else: 120 | makeBarSigned(df, 'MI', 'Lab', width) 121 | 122 | 123 | def makeBarSigned(df, h, lab, width): 124 | ''' 125 | Contains 126 | ''' 127 | df_s = df.sort(columns = [h], ascending = False) 128 | 129 | #Get a barplot 130 | ind = np.arange(df_s.shape[0]) 131 | labs = df_s[[lab]].values.ravel() 132 | h_pos = (df_s[['corr']].values.ravel() > 0) * df_s.MI 133 | h_neg = (df_s[['corr']].values.ravel() < 0) * df_s.MI 134 | 135 | fig = plt.figure(facecolor = 'w', figsize = (12, 6)) 136 | ax = plt.subplot(111) 137 | plt.subplots_adjust(bottom = 0.25) 138 | 139 | rec = ax.bar(ind + width, h_pos, width, color='r', label = 'Positive') 140 | rec = ax.bar(ind + width, h_neg, width, color='b', label = 'Negative') 141 | 142 | ax.set_xticks(ind + getTickAdj(labs, width)) 143 | ax.set_xticklabels(labs, rotation = 45, size = 14) 144 | 145 | plt.legend() 146 | 147 | 148 | 149 | def makeGS_Tup(ent, getmin = True): 150 | 151 | ostr = dToString(ent.parameters, ':', '|') 152 | if len(ostr.split('|')) > 2: 153 | sp = ostr.split('|') 154 | if len(sp) == 3: 155 | ostr = '{}|{}\n{}'.format(sp[0], sp[1], sp[2]) 156 | else: 157 | ostr = '{}|{}\n{}|{}'.format(sp[0], sp[1], sp[2], sp[3]) 158 | 159 | #ostr = dToString(ent.parameters, ':', '|') 160 | mu = np.abs(ent.mean_validation_score) #Log-Loss comes in at negative value 161 | sig = ent.cv_validation_scores.std() 162 | stderr = sig/np.sqrt(len(ent.cv_validation_scores)) 163 | 164 | if getmin: 165 | return (mu, ostr, mu + stderr, sig, stderr) #Note, this assumes minimization, thus adding stderr 166 | else: 167 | return (mu, ostr, mu - stderr, sig, stderr) 168 | 169 | 170 | def rankGS_Params(gs_obj_list, getmin = True): 171 | ''' 172 | Takes in the .grid_scores_ attributes of a GridSearchCV object 173 | ''' 174 | 
tup_list = [] 175 | 176 | for k in gs_obj_list: 177 | tup_list.append(makeGS_Tup(k, getmin)) 178 | 179 | tup_list.sort() 180 | 181 | if not getmin: 182 | tup_list.reverse() 183 | 184 | return tup_list 185 | 186 | 187 | 188 | def processGsObjList(gs_obj_list, getmin = True): 189 | 190 | rank_list = rankGS_Params(gs_obj_list, getmin) 191 | hts = [] 192 | desc = [] 193 | errs = [] 194 | std1 = rank_list[0][4] 195 | 196 | for tup in rank_list: 197 | hts.append(tup[0]) 198 | desc.append(tup[1]) 199 | errs.append(2 * tup[4]) 200 | 201 | return [hts, desc, errs, std1] 202 | 203 | def plotGridSearchSingle(gs_obj_list, getmin = True): 204 | 205 | hts, desc, errs, std1 = processGsObjList(gs_obj_list, getmin = True) 206 | 207 | gridBarH(hts, desc, errs, std1) 208 | 209 | 210 | 211 | def plotGridSearchMulti(tup_list, getmin = True): 212 | ''' 213 | Loop through a list of gs_obj_lists. The Obj list is in the 1 slot of each value in the dict 214 | ''' 215 | m_ht = [] 216 | m_desc = [] 217 | m_errs = [] 218 | 219 | best_min = 1000 #This assumes we are minimizing 220 | 221 | for tup in tup_list: 222 | lab = tup[0] 223 | gs_dict = tup[1] 224 | 225 | for k in gs_dict: 226 | clf = type(k).__name__.split('Classifier')[0] 227 | 228 | hts, desc, errs, std1 = processGsObjList(gs_dict[k][1], getmin = True) 229 | for i, d in enumerate(desc): 230 | desc[i] = '{} {} {}'.format(clf, lab, d) 231 | 232 | if hts[0] < best_min: 233 | best_std1 = std1 234 | 235 | m_ht = m_ht + hts 236 | m_desc = m_desc + desc 237 | m_errs = m_errs + errs 238 | 239 | gridBarH(m_ht, m_desc, m_errs, best_std1, int(len(m_ht)), 12) 240 | 241 | 242 | 243 | def gridBarH(hts, desc, errs, std1, h = 6, w = 12): 244 | 245 | fig = plt.figure(facecolor = 'w', figsize = (w, h)) 246 | ax = plt.subplot(111) 247 | plt.subplots_adjust(bottom = 0.25) 248 | 249 | width = 0.5 250 | 251 | pos = np.arange(len(hts)) 252 | 253 | rec = ax.barh(pos, np.array(hts), width, yerr = np.array(errs), color='r') 254 | 255 | ax.set_yticks(pos + width/2) 256 | ax.set_yticklabels(desc, size = 14) 257 | 258 | tmp = list(hts) 259 | tmp.sort() 260 | 261 | x_min = np.array(hts).min() - 2*np.array(hts).std() 262 | x_max = tmp[-2] + 2*np.array(hts).std() 263 | plt.xlim(x_min, x_max) 264 | 265 | 266 | plt.plot(tmp[0] * np.ones(len(tmp)), pos) 267 | plt.plot((tmp[0] + std1) * np.ones(len(tmp)), pos) 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | -------------------------------------------------------------------------------- /ipython/utils/course_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import sklearn 5 | import math 6 | from sklearn.metrics import roc_curve, auc 7 | import pickle 8 | 9 | def evenSplit(dat,fld): 10 | ''' 11 | Evenly splits the data on a given binary field, returns a shuffled dataframe 12 | ''' 13 | pos=dat[(dat[fld]==1)] 14 | neg=dat[(dat[fld]==0)] 15 | neg_shuf=neg.reindex(np.random.permutation(neg.index)) 16 | fin_temp=pos.append(neg_shuf[:pos.shape[0]],ignore_index=True) 17 | fin_temp=fin_temp.reindex(np.random.permutation(fin_temp.index)) 18 | return fin_temp 19 | 20 | 21 | def trainTest(dat, pct): 22 | ''' 23 | Randomly splits data into train and test 24 | ''' 25 | dat_shuf = dat.reindex(np.random.permutation(dat.index)) 26 | trn = dat_shuf[:int(np.floor(dat_shuf.shape[0]*pct))] 27 | tst = dat_shuf[int(np.floor(dat_shuf.shape[0]*pct)):] 28 | return [trn, tst] 29 | 30 | def 
downSample(dat,fld,mult): 31 | ''' 32 | Evenly splits the data on a given binary field, returns a shuffled dataframe 33 | ''' 34 | pos=dat[(dat[fld]==1)] 35 | neg=dat[(dat[fld]==0)] 36 | neg_shuf=neg.reindex(np.random.permutation(neg.index)) 37 | tot=min(pos.shape[0]*mult,neg.shape[0]) 38 | fin_temp=pos.append(neg_shuf[:tot],ignore_index=True) 39 | fin_temp=fin_temp.reindex(np.random.permutation(fin_temp.index)) 40 | return fin_temp 41 | 42 | 43 | def scaleData(d): 44 | ''' 45 | This function takes data and normalizes it to have the same scale (num-min)/(max-min) 46 | ''' 47 | #Note, by creating df_scale like this we preserve the index 48 | df_scale=pd.DataFrame(d.iloc[:,1],columns=['temp']) 49 | for c in d.columns.values: 50 | df_scale[c]=(d[c]-d[c].min())/(d[c].max()-d[c].min()) 51 | return df_scale.drop('temp',1) 52 | 53 | 54 | def plot_dec_line(mn,mx,b0,b1,a,col,lab): 55 | ''' 56 | This function plots a line in a 2 dim space 57 | ''' 58 | x = np.random.uniform(mn,mx,100) 59 | dec_line = map(lambda x_i: -1*(x_i*b0/b1+a/b1),x) 60 | plt.plot(x,dec_line,col,label=lab) 61 | 62 | 63 | 64 | def plotSVM(X, Y, my_svm): 65 | ''' 66 | Plots the separating line along with SV's and margin lines 67 | Code here derived or taken from this example http://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane.html 68 | ''' 69 | # get the separating hyperplane 70 | w = my_svm.coef_[0] 71 | a = -w[0] / w[1] 72 | xx = np.linspace(X.iloc[:,0].min(), X.iloc[:,1].max()) 73 | yy = a * xx - (my_svm.intercept_[0]) / w[1] 74 | # plot the parallels to the separating hyperplane that pass through the 75 | # support vectors 76 | b = my_svm.support_vectors_[0] 77 | yy_down = a * xx + (b[1] - a * b[0]) 78 | b = my_svm.support_vectors_[-1] 79 | yy_up = a * xx + (b[1] - a * b[0]) 80 | # plot the line, the points, and the nearest vectors to the plane 81 | plt.plot(xx, yy, 'k-') 82 | plt.plot(xx, yy_down, 'k--') 83 | plt.plot(xx, yy_up, 'k--') 84 | plt.scatter(my_svm.support_vectors_[:, 0], my_svm.support_vectors_[:, 1], s=80, facecolors='none') 85 | plt.plot(X[(Y==-1)].iloc[:,0], X[(Y==-1)].iloc[:,1],'r.') 86 | plt.plot(X[(Y==1)].iloc[:,0], X[(Y==1)].iloc[:,1],'b+') 87 | #plt.axis('tight') 88 | #plt.show() 89 | 90 | 91 | def getP(val): 92 | ''' 93 | Get f(x) where f is the logistic function 94 | ''' 95 | return (1+math.exp(-1*val))**-1 96 | 97 | def getY(val): 98 | ''' 99 | Return a binary indicator based on a binomial draw with prob=f(val). f the logistic function. 
100 | ''' 101 | return (int(getP(val)>np.random.uniform(0,1,1)[0])) 102 | 103 | def gen_logistic_dataframe(n,alpha,betas): 104 | ''' 105 | Aa function that generates a random logistic dataset 106 | n is the number of samples 107 | alpha, betas are the logistic truth 108 | ''' 109 | X = np.random.random([n,len(betas)]) 110 | Y = map(getY,X.dot(betas)+alpha) 111 | d = pd.DataFrame(X,columns=['f'+str(j) for j in range(X.shape[1])]) 112 | d['Y'] = Y 113 | return d 114 | 115 | 116 | def plotAUC(truth, pred, lab): 117 | fpr, tpr, thresholds = roc_curve(truth, pred) 118 | roc_auc = auc(fpr, tpr) 119 | c = (np.random.rand(), np.random.rand(), np.random.rand()) 120 | plt.plot(fpr, tpr, color=c, label= lab+' (AUC = %0.2f)' % roc_auc) 121 | plt.plot([0, 1], [0, 1], 'k--') 122 | plt.xlim([0.0, 1.0]) 123 | plt.ylim([0.0, 1.0]) 124 | plt.xlabel('FPR') 125 | plt.ylabel('TPR') 126 | plt.title('ROC') 127 | plt.legend(loc="lower right") 128 | 129 | 130 | 131 | def LogLoss(dat, beta, alpha): 132 | X = dat.drop('Y',1) 133 | Y = dat['Y'] 134 | XB=X.dot(np.array(beta))+alpha*np.ones(len(Y)) 135 | P=(1+np.exp(-1*XB))**-1 136 | return ((Y==1)*np.log(P)+(Y==0)*np.log(1-P)).mean() 137 | 138 | 139 | def LogLossP(Y, P): 140 | return ((Y==1)*np.log(P)+(Y==0)*np.log(1-P)).mean() 141 | 142 | 143 | 144 | 145 | def plotSVD(sig): 146 | norm = math.sqrt(sum(sig*sig)) 147 | energy_k = [math.sqrt(k)/norm for k in np.cumsum(sig*sig)] 148 | 149 | plt.figure() 150 | ax1 = plt.subplot(211) 151 | ax1.bar(range(len(sig+1)), [0]+sig, 0.35) 152 | plt.title('Kth Singular Value') 153 | plt.tick_params(axis='x',which='both',bottom='off',top='off',labelbottom='off') 154 | 155 | ax2 = plt.subplot(212) 156 | plt.plot(range(len(sig)+1), [0]+energy_k) 157 | plt.title('Normalized Sum-of-Squares of Kth Singular Value') 158 | 159 | ax2.set_xlabel('Kth Singular Value') 160 | ax2.set_ylim([0, 1]) 161 | 162 | 163 | def genY(x, err, betas): 164 | ''' 165 | Goal: generate a Y variable as Y=XB+e 166 | Input 167 | 1. an np array x of length n 168 | 2. a random noise vector r of length n 169 | 3. a (d+1) x 1 vector of coefficients b - each represents ith degree of x 170 | ''' 171 | d = pd.DataFrame(x, columns=['x']) 172 | y = err 173 | for i,b in enumerate(betas): 174 | y = y + b*x**i 175 | d['y'] = y 176 | return d 177 | 178 | 179 | def makePolyFeat(d, deg): 180 | ''' 181 | Goal: Generate features up to X**deg 182 | 1. a data frame with two features X and Y 183 | 4. 
a degree 'deg' (from which we make polynomial features 184 | 185 | ''' 186 | #Generate Polynomial terms 187 | for i in range(2, deg+1): 188 | d['x'+str(i)] = d['x']**i 189 | return d 190 | 191 | 192 | def save_obj(obj, name ): 193 | with open(name + '.pkl', 'wb') as f: 194 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 195 | 196 | def load_obj(name ): 197 | with open(name + '.pkl', 'r') as f: 198 | return pickle.load(f) 199 | 200 | 201 | 202 | 203 | def happyClass(sig, n): 204 | ''' 205 | sig is the noise parameter and n is sample size 206 | ''' 207 | eye1 = [(0.7, 0.75), 0.1] 208 | eye2 = [(0.3, 0.75), 0.1] 209 | 210 | X1 = np.random.random(n) 211 | X2 = np.random.random(n) 212 | Y1 = 1*(((X1 - eye1[0][0])**2 + (X2 - eye1[0][1])**2 + np.random.randn(n)*sig) < eye1[1]**2) 213 | Y2 = 1*(((X1 - eye2[0][0])**2 + (X2 - eye2[0][1])**2 + np.random.randn(n)*sig) < eye2[1]**2) 214 | Y3 = 1*(abs(X2 - 0.1 - 4*(X1 - 0.5)**2) + np.random.randn(n)*5*sig < 0.05) * 1*(X2 < 0.5) 215 | 216 | Y = 1*((Y1 + Y2 + Y3) > 0) 217 | D = pd.DataFrame({'X1':X1, 'X2':X2}) 218 | D['Y'] = Y 219 | 220 | return D 221 | 222 | 223 | def plotZgen(clf, dat, pc, t, fig): 224 | ''' 225 | This plots a 2d decision boundary given a trained classifier 226 | Note the data must have two fields X1 and X2 to work 227 | ''' 228 | plot_step = 0.02 229 | x_min, x_max = dat['X1'].min(), dat['X1'].max() 230 | y_min, y_max = dat['X2'].min(), dat['X2'].max() 231 | xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),np.arange(y_min, y_max, plot_step)) 232 | Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 233 | Z = Z.reshape(xx.shape) 234 | ax = fig.add_subplot(pc[0], pc[1], pc[2]) 235 | cs = plt.contourf(xx, yy, Z, cmap=plt.cm.cool) 236 | plt.plot(dat['X1'][(dat.Y==1)], dat['X2'][(noisy_test.Y==1)], 'r.', markersize = 2) 237 | plt.title(t) 238 | ax.axes.get_xaxis().set_visible(False) 239 | ax.axes.get_yaxis().set_visible(False) 240 | 241 | -------------------------------------------------------------------------------- /ipython/utils/course_utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/utils/course_utils.pyc -------------------------------------------------------------------------------- /ipython/utils/eval_plots.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | 5 | def getMAE(pred, truth): 6 | return np.abs(truth - pred).mean() 7 | 8 | def getLL(pred, truth): 9 | ll_sum = 0 10 | for i in range(len(pred)): 11 | if (pred[i] == 0): 12 | p = 0.0001 13 | elif (pred[i] == 1): 14 | p = 0.9999 15 | else: 16 | p = pred[i] 17 | ll_sum += truth[i]*np.log(p)+(1-truth[i])*np.log(1-p) 18 | return (ll_sum)/len(pred) 19 | 20 | 21 | def plotCalib(truth, pred, bins = 100, f = 0, l = '', w = 8, h = 8, fig_i = 1, fig_j = 1, fig_k = 1): 22 | mae = np.round(getMAE(pred, truth),3) 23 | ll = np.round(getLL(pred, truth), 3) 24 | 25 | d = pd.DataFrame({'p':pred, 'y':truth}) 26 | d['p_bin'] = np.floor(d['p']*bins)/bins 27 | d_bin = d.groupby(['p_bin']).agg([np.mean, len]) 28 | filt = (d_bin['p']['len']>f) 29 | 30 | 31 | if fig_k == 1: 32 | fig = plt.figure(facecolor = 'w', figsize = (w, h)) 33 | 34 | x = d_bin['p']['mean'][filt] 35 | y = d_bin['y']['mean'][filt] 36 | n = d_bin['y']['len'][filt] 37 | 38 | stderr = np.sqrt(y * (1 - y)/n) 39 | 40 | ax = plt.subplot(fig_i, fig_j, fig_k) 
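# The bin-level means below are plotted with ~95% binomial error bars (1.96 * stderr)
# against a y = x reference line; a well-calibrated model tracks that diagonal, and the
# title reports the MAE and mean log-likelihood computed above.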
41 | #plt.plot(x, y, 'b.', markersize = 9) 42 | plt.errorbar(x, y, yerr = 1.96 * stderr, fmt = 'o') 43 | plt.plot([0.0, 1.0], [0.0, 1.0], 'k-') 44 | plt.title(l + ':' + ' MAE = {}, LL = {}'.format(mae, ll)) 45 | 46 | plt.xlim([0.0, 1.0]) 47 | plt.ylim([0.0, 1.0]) 48 | plt.xlabel('prediction P(Y|X)') 49 | plt.ylabel('actual P(Y|X)') 50 | #plt.legend(loc=4) 51 | 52 | 53 | 54 | def liftTable(pred, truth, b): 55 | df = pd.DataFrame({'p':pred + np.random.rand(len(pred))*0.000001, 'y':truth}) 56 | df['b'] = b - pd.qcut(df['p'], b, labels=False) 57 | df['n'] = np.ones(df.shape[0]) 58 | df_grp = df.groupby(['b']).sum() 59 | tot_y = float(np.sum(df_grp['y'])) 60 | base = tot_y/float(df.shape[0]) 61 | df_grp['n_cum'] = np.cumsum(df_grp['n'])/float(df.shape[0]) 62 | df_grp['y_cum'] = np.cumsum(df_grp['y']) 63 | df_grp['p_y_b'] = df_grp['y']/df_grp['n'] 64 | df_grp['lift_b'] = df_grp['p_y_b']/base 65 | df_grp['cum_lift_b'] = (df_grp['y_cum']/(float(df.shape[0])*df_grp['n_cum']))/base 66 | df_grp['recall'] = df_grp['y_cum']/tot_y 67 | return df_grp 68 | 69 | 70 | def liftRecallCurve(pred, truth, b, h = 6, w = 12, title = ''): 71 | 72 | #Get the lift table 73 | lt = liftTable(pred, truth, b) 74 | 75 | fig, ax1 = plt.subplots(figsize = (w, h)) 76 | 77 | ax1.plot(lt['n_cum'], lt['cum_lift_b'], 'b-') 78 | 79 | ax1.set_xlabel('Quantile') 80 | # Make the y-axis label and tick labels match the line color. 81 | ax1.set_ylabel('Lift', color='b') 82 | for tl in ax1.get_yticklabels(): 83 | tl.set_color('b') 84 | 85 | ax2 = ax1.twinx() 86 | ax2.plot(lt['n_cum'], lt['recall'], 'r.') 87 | ax2.set_ylabel('Recall', color='r') 88 | for tl in ax2.get_yticklabels(): 89 | tl.set_color('r') 90 | 91 | plt.title(title) 92 | 93 | plt.show() 94 | 95 | -------------------------------------------------------------------------------- /ipython/utils/eval_plots.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/utils/eval_plots.pyc --------------------------------------------------------------------------------
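A note on the `utils` scripts above: they target the course's Python 2 / older-pandas environment (for example, `getArgCombos` relies on integer division and `getMetrics` on `.ix` indexing), and `ClassifierBakeoff` ships without a usage example. Under those environment assumptions, and with toy stand-in data in place of the prepared churn features, a hypothetical driver might look like this:

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from ClassifierBakeoff import ClassifierBakeoff   # assumes ipython/utils is on sys.path

# Toy stand-in data; in the course this would be the prepared churn feature matrix
rng = np.random.RandomState(0)
X = pd.DataFrame(rng.randn(1000, 5), columns=['f%d' % j for j in range(5)])
Y = (X.sum(axis=1) + rng.randn(1000) > 0).astype(int)
X_train, X_test = X[:800], X[800:]
Y_train, Y_test = Y[:800].values, Y[800:].values

# Each classifier class maps to lists of hyperparameter values; every combination
# in the grid is fit by GenericClassifierOptimizer and scored on the held-out set.
setup = {
    LogisticRegression: {'C': [0.01, 0.1, 1.0, 10.0]},
    RandomForestClassifier: {'n_estimators': [100, 300], 'max_depth': [4, 8]},
}

bakeoff = ClassifierBakeoff(X_train, Y_train, X_test, Y_test, setup)
bakeoff.bake()

# results collects AUC plus cumulative lift at the 1%, 5%, 10% and 25% quantiles
results = pd.DataFrame(bakeoff.results).sort_values('auc', ascending=False)
print(results.head())
```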