├── README.md
└── ipython
    ├── Labs_Student
        ├── .ipynb_checkpoints
        │   ├── Lab1_Python_Fundamentals-checkpoint.ipynb
        │   └── NumPyBasics-checkpoint.ipynb
        ├── Lab1_Python_Fundamentals.ipynb
        ├── Lab2_NumPy_Vectorization_Student.ipynb
        ├── Lab3_Pandas_Exploration_Student.ipynb
        ├── Lab4_Survey_Questions_part1_Student.ipynb
        ├── Lab4_intro_regex.ipynb
        ├── Lab_6_FeatureRanking_AUC_Student.ipynb
        ├── Lab_7_sklearn_magic_student.ipynb
        ├── NumPyBasics.ipynb
        ├── SimpleiPythonExample.ipynb
        ├── ads_dataset_cut.txt
        ├── lab_5_student.ipynb
        ├── lab_8_text.ipynb
        └── test.txt
    ├── Labs_complete
        ├── Lab1_Python_Fundamentals.ipynb
        ├── Lab2_NumPy_Vectorization.ipynb
        ├── Lab3_Pandas_Exploration.ipynb
        ├── Lab4_Survey_Questions_part1.ipynb
        ├── Lab_6_FeatureRanking_AUC.ipynb
        └── lab_7_sklearn_complete.ipynb
    ├── README.md
    ├── data
        ├── Cell2Cell_data.csv
        ├── Cell2Cell_info.pdf
        ├── ads_dataset.txt
        ├── ads_dataset_cut.txt
        ├── advertising_events.csv
        ├── boson_testing_cut.csv
        ├── boson_training_cut_2000.csv
        ├── loansData.csv
        ├── osquery_contributors.html
        ├── spam_ham.csv
        └── survey_responses_2016.csv
    ├── hw
        ├── hw_1
        │   ├── Homework1.ipynb
        │   ├── data
        │   │   ├── ads_dataset.tsv
        │   │   ├── advertising_events.csv
        │   │   └── osquery_contributors.html
        │   └── images
        │   │   └── osquery_contributors.png
        ├── hw_2
        │   ├── data
        │   │   └── cell2cell_data.csv
        │   └── hw_2.ipynb
        ├── hw_3
        │   ├── Homework_3.ipynb
        │   └── data
        │   │   ├── boson_testing_cut.csv
        │   │   └── boson_training_cut_2000.csv
        └── hw_4
        │   ├── data
        │       └── imdb.csv
        │   └── hw_4.ipynb
    ├── references
        ├── Syllabus_2016.pdf
        ├── churn_architecture.png
        ├── churn_dataset_info.pdf
        └── churn_sampling_scheme.png
    └── utils
        ├── ClassifierBakeoff.py
        ├── ClassifierBakeoff.pyc
        ├── bias_variance.py
        ├── bias_variance.pyc
        ├── churn_analysis.py
        ├── course_utils.py
        ├── course_utils.pyc
        ├── eval_plots.py
        └── eval_plots.pyc


/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/README.md


--------------------------------------------------------------------------------
/ipython/Labs_Student/.ipynb_checkpoints/NumPyBasics-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "'''\n",
 12 |     "The core data type in Numpy is the ndarray, which enables fast and space-efficient multidimensional array processing.\n",
 13 |     "Note: This notebook is adapted from chapter 4 Python for Data Analysis by Wes McKinney and O'Reilly publishing. NumPy has many, \n",
 14 |     "many features that won't be covered here. The following snippets are just to illustrate basic data types and operations within\n",
 15 |     "numpy.\n",
 16 |     "\n",
 17 |     "Another good resource for learning more about ndarrays is here:\n",
 18 |     "http://docs.scipy.org/doc/numpy/reference/arrays.html\n",
 19 |     "'''\n",
 20 |     "\n",
 21 |     "#First, import NumPy\n",
 22 |     "import numpy as np\n",
 23 |     "\n",
 24 |     "#It is easy to create Nx1 and NxM arrays from standard Python lists\n",
 25 |     "l1 = [0,1,2]\n",
 26 |     "l2 = [3,4,5]\n",
 27 |     "\n",
 28 |     "nd1 = np.array(l1)\n",
 29 |     "nd2 = np.array([l1,  l2])"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 2,
 35 |    "metadata": {
 36 |     "collapsed": false
 37 |    },
 38 |    "outputs": [
 39 |     {
 40 |      "name": "stdout",
 41 |      "output_type": "stream",
 42 |      "text": [
 43 |       "The ndarray has dimension n=3 and m=1\n",
 44 |       "The ndarray has elements of type=int64\n",
 45 |       "The ndarray has dimension n=2 and m=3\n",
 46 |       "The ndarray has elements of type=int64\n"
 47 |      ]
 48 |     }
 49 |    ],
 50 |    "source": [
 51 |     "#Now, we can get ask for some basic info to describe the ndarray\n",
 52 |     "def desc_ndarray(nd):\n",
 53 |     "    try:\n",
 54 |     "        print \"The ndarray has dimension n=%s and m=%s\" % (nd.shape[0],nd.shape[1])\n",
 55 |     "    except:\n",
 56 |     "        print \"The ndarray has dimension n=%s and m=1\" % nd.shape[0]\n",
 57 |     "    print \"The ndarray has elements of type=%s\" % nd.dtype\n",
 58 |     "\n",
 59 |     "desc_ndarray(nd1)\n",
 60 |     "\n",
 61 |     "desc_ndarray(nd2)\n",
 62 |     "\n"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 3,
 68 |    "metadata": {
 69 |     "collapsed": false
 70 |    },
 71 |    "outputs": [
 72 |     {
 73 |      "data": {
 74 |       "text/plain": [
 75 |        "[array([ 0.,  0.,  0.,  0.]),\n",
 76 |        " array([ 1.,  1.,  1.,  1.]),\n",
 77 |        " array([ 0.47121338,  1.83328779,  0.4438019 , -0.52309325])]"
 78 |       ]
 79 |      },
 80 |      "execution_count": 3,
 81 |      "metadata": {},
 82 |      "output_type": "execute_result"
 83 |     }
 84 |    ],
 85 |    "source": [
 86 |     "#There are short cuts for creating certain frequently used special ndarrays, i.e.,\n",
 87 |     "\n",
 88 |     "k=4\n",
 89 |     "\n",
 90 |     "#1. an ndarray of all zeros\n",
 91 |     "zero = np.zeros(k)\n",
 92 |     "\n",
 93 |     "#2. an ndarray of all ones\n",
 94 |     "one = np.ones(k)\n",
 95 |     "\n",
 96 |     "#3. an ndarray of random elements (this one is standard normal, but there are many distributions to choose from)\n",
 97 |     "rand = np.random.randn(k)\n",
 98 |     "\n",
 99 |     "[zero, one, rand]"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 4,
105 |    "metadata": {
106 |     "collapsed": false
107 |    },
108 |    "outputs": [
109 |     {
110 |      "data": {
111 |       "text/plain": [
112 |        "[array([[ 0.69394907,  0.85723722],\n",
113 |        "        [-0.16779156,  0.41709003],\n",
114 |        "        [-0.94008249, -0.21591983],\n",
115 |        "        [-0.61305106,  0.41435495]]),\n",
116 |        " array([-0.16779156,  0.41709003]),\n",
117 |        " 0.41709003439166575]"
118 |       ]
119 |      },
120 |      "execution_count": 4,
121 |      "metadata": {},
122 |      "output_type": "execute_result"
123 |     }
124 |    ],
125 |    "source": [
126 |     "'''\n",
127 |     "For indexing an array:\n",
128 |     "1. If nx1 array, follow the same protocol as a regular Python list\n",
129 |     "2. If nxm array use the following examples\n",
130 |     "'''\n",
131 |     "\n",
132 |     "arr2d = np.random.randn(4,2)\n",
133 |     "\n",
134 |     "#A single index gets a full row\n",
135 |     "\n",
136 |     "#2 indexes returns a value\n",
137 |     "[arr2d, arr2d[1],  arr2d[1,1]]"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 5,
143 |    "metadata": {
144 |     "collapsed": false
145 |    },
146 |    "outputs": [
147 |     {
148 |      "data": {
149 |       "text/plain": [
150 |        "[array([-0.4386254 , -0.67720483, -1.19775067, -0.21300288]),\n",
151 |        " array([-0.8772508 , -1.35440967, -2.39550135, -0.42600575]),\n",
152 |        " array([-0.8772508 , -1.35440967, -2.39550135, -0.42600575]),\n",
153 |        " array([-0., -0., -0., -0.])]"
154 |       ]
155 |      },
156 |      "execution_count": 5,
157 |      "metadata": {},
158 |      "output_type": "execute_result"
159 |     }
160 |    ],
161 |    "source": [
162 |     "'''\n",
163 |     "Operations between Arrays and Scalars\n",
164 |     "An important feature of ndarrays is they allow batch operations on data without writing any for loops.  \n",
165 |     "This is called vectorization.\n",
166 |     "Any arithmetic operations between equal-size arrays applies the operation elementwise. \n",
167 |     "'''\n",
168 |     "\n",
169 |     "#examples\n",
170 |     "\n",
171 |     "k = 4\n",
172 |     "rand = np.random.randn(k)\n",
173 |     "[rand, rand + rand, 2*rand, rand*np.zeros(4)]\n",
174 |     "\n"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": 7,
180 |    "metadata": {
181 |     "collapsed": false
182 |    },
183 |    "outputs": [
184 |     {
185 |      "data": {
186 |       "text/plain": [
187 |        "[array([ 0.19631415,  0.41059714,  4.26249299]),\n",
188 |        " array([-1.46310809,  1.15559786,  0.10690073]),\n",
189 |        " array([-1.26679394,  1.566195  ,  4.36939372])]"
190 |       ]
191 |      },
192 |      "execution_count": 7,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     }
196 |    ],
197 |    "source": [
198 |     "'''\n",
199 |     "Matrix operations\n",
200 |     "It is easy to do matrix operations with Nd arrays. The standard arithmetic operators don't work here though. And it is important \n",
201 |     "to make sure matrix shapes are compatible\n",
202 |     "'''\n",
203 |     "\n",
204 |     "k = 3\n",
205 |     "r1 = np.random.randn(k)\n",
206 |     "r2 = np.random.randn(k)\n",
207 |     "\n",
208 |     "#Matrix addition is the standard matrix operator\n",
209 |     "[r1, r2 , r1 + r2]\n"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 8,
215 |    "metadata": {
216 |     "collapsed": false
217 |    },
218 |    "outputs": [
219 |     {
220 |      "data": {
221 |       "text/plain": [
222 |        "[array([[ 0.19631415,  0.41059714,  4.26249299],\n",
223 |        "        [-1.46310809,  1.15559786,  0.10690073]]),\n",
224 |        " array([[ 0.19631415, -1.46310809],\n",
225 |        "        [ 0.41059714,  1.15559786],\n",
226 |        "        [ 4.26249299,  0.10690073]])]"
227 |       ]
228 |      },
229 |      "execution_count": 8,
230 |      "metadata": {},
231 |      "output_type": "execute_result"
232 |     }
233 |    ],
234 |    "source": [
235 |     "#The Transpose can be taken with the attribute T\n",
236 |     "arr2d = np.array([r1, r2])\n",
237 |     "[arr2d, arr2d.T]"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 9,
243 |    "metadata": {
244 |     "collapsed": false
245 |    },
246 |    "outputs": [
247 |     {
248 |      "data": {
249 |       "text/plain": [
250 |        "[array([[ 0.19631415,  0.41059714,  4.26249299],\n",
251 |        "        [-1.46310809,  1.15559786,  0.10690073]]),\n",
252 |        " array([[  3.85392468e-02,   1.68590015e-01,   1.81688465e+01],\n",
253 |        "        [  2.14068529e+00,   1.33540642e+00,   1.14277663e-02]]),\n",
254 |        " array([[ 18.37597578,   0.64291997],\n",
255 |        "        [  0.64291997,   3.48751947]])]"
256 |       ]
257 |      },
258 |      "execution_count": 9,
259 |      "metadata": {},
260 |      "output_type": "execute_result"
261 |     }
262 |    ],
263 |    "source": [
264 |     "'''\n",
265 |     "Matrix multiplication, like inner products can be done on arrays.\n",
266 |     "Just remember that the standard multiplication operator does elementwise multiplication (provided they are the same shape).\n",
267 |     "We need the dot method in order to do an inner product\n",
268 |     "\n",
269 |     "Numpy has a linalg library that can run most matrix operations on ndarrays:\n",
270 |     "http://docs.scipy.org/doc/numpy/reference/routines.linalg.html\n",
271 |     "\n",
272 |     "One can also create a matrix object and use the methods in numpy.matrix to achieve the same thing:\n",
273 |     "http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.html\n",
274 |     "'''\n",
275 |     "\n",
276 |     "[arr2d, arr2d * arr2d, arr2d.dot(arr2d.T)]"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": 11,
282 |    "metadata": {
283 |     "collapsed": false
284 |    },
285 |    "outputs": [
286 |     {
287 |      "name": "stdout",
288 |      "output_type": "stream",
289 |      "text": [
290 |       "10000 loops, best of 3: 119 µs per loop\n"
291 |      ]
292 |     }
293 |    ],
294 |    "source": [
295 |     "'''\n",
296 |     "One important feature of vectorization is that it allows elementwise processing that is much faster than writing a traditional\n",
297 |     "loop.\n",
298 |     "'''\n",
299 |     "import math\n",
300 |     "\n",
301 |     "#show an example and profile i\n",
302 |     "%timeit [math.sqrt(x) for x in range(1000)]"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": 12,
308 |    "metadata": {
309 |     "collapsed": false
310 |    },
311 |    "outputs": [
312 |     {
313 |      "name": "stdout",
314 |      "output_type": "stream",
315 |      "text": [
316 |       "The slowest run took 9.83 times longer than the fastest. This could mean that an intermediate result is being cached \n",
317 |       "100000 loops, best of 3: 5.19 µs per loop\n"
318 |      ]
319 |     }
320 |    ],
321 |    "source": [
322 |     "%timeit np.sqrt(np.arange(1000))"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": 16,
328 |    "metadata": {
329 |     "collapsed": false
330 |    },
331 |    "outputs": [
332 |     {
333 |      "name": "stderr",
334 |      "output_type": "stream",
335 |      "text": [
336 |       "ERROR: Line magic function `%inline` not found.\n"
337 |      ]
338 |     }
339 |    ],
340 |    "source": [
341 |     "'''\n",
342 |     "The last thing we'll cover in this module is the numpy.random library. In general, it is advised to use numpy for\n",
343 |     "random number generation as opposed to python's built in random module.\n",
344 |     "\n",
345 |     "Random number generation has many uses. One common use is generating fake (i.e. random) data to test modeling procedures\n",
346 |     "or to do Monte Carlo Simulations\n",
347 |     "'''\n",
348 |     "import matplotlib.pyplot as plt\n",
349 |     "%inline\n",
350 |     "\n",
351 |     "\n",
352 |     "#Generate random pairs that have a multivariate normal distribution\n",
353 |     "N = 1000\n",
354 |     "mu = np.array([0,0])\n",
355 |     "cov = 0.5\n",
356 |     "sig = np.array([[1, cov],[cov, 1]]) #Must be square, symmetric and non-negative definite\n",
357 |     "x, y = np.random.multivariate_normal(mu, sig, N).T\n",
358 |     "#Now let's plot and see what that looks like\n",
359 |     "\n",
360 |     "\n",
361 |     "plt.plot(x, y,'x'); plt.axis('equal'); plt.show()\n",
362 |     "\n"
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": 18,
368 |    "metadata": {
369 |     "collapsed": false
370 |    },
371 |    "outputs": [],
372 |    "source": [
373 |     "'''\n",
374 |     "One final example (taken from Wes Mckinney's book):\n",
375 |     "\n",
376 |     "Let's generate a random walk and visualize it\n",
377 |     "'''\n",
378 |     "import matplotlib.pyplot as plt\n",
379 |     "\n",
380 |     "nsteps = 1000\n",
381 |     "draws = np.random.randint(0, 2, size = nsteps) #Randint let's us generate random integers in a range\n",
382 |     "steps = np.where(draws>0, 1, -1) #there function let's us do boolean logic on a conditional applied to an entire array\n",
383 |     "walk = steps.cumsum() #Cumsum returns an array with the same size as steps, that has cum sum of steps up to index i\n",
384 |     "plt.plot(np.arange(len(walk)), walk);plt.show()"
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": 30,
390 |    "metadata": {
391 |     "collapsed": false
392 |    },
393 |    "outputs": [],
394 |    "source": []
395 |   },
396 |   {
397 |    "cell_type": "code",
398 |    "execution_count": null,
399 |    "metadata": {
400 |     "collapsed": false
401 |    },
402 |    "outputs": [],
403 |    "source": []
404 |   }
405 |  ],
406 |  "metadata": {
407 |   "kernelspec": {
408 |    "display_name": "Python 3",
409 |    "language": "python",
410 |    "name": "python3"
411 |   },
412 |   "language_info": {
413 |    "codemirror_mode": {
414 |     "name": "ipython",
415 |     "version": 3
416 |    },
417 |    "file_extension": ".py",
418 |    "mimetype": "text/x-python",
419 |    "name": "python",
420 |    "nbconvert_exporter": "python",
421 |    "pygments_lexer": "ipython3",
422 |    "version": "3.6.0"
423 |   }
424 |  },
425 |  "nbformat": 4,
426 |  "nbformat_minor": 0
427 | }
428 | 


--------------------------------------------------------------------------------
/ipython/Labs_Student/Lab2_NumPy_Vectorization_Student.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import warnings\n",
 12 |     "warnings.filterwarnings('ignore')\n",
 13 |     "import numpy as np\n",
 14 |     "import matplotlib.pyplot as plt\n",
 15 |     "%matplotlib inline"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "metadata": {},
 21 |    "source": [
 22 |     "First we'll generate a random matrix"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 5,
 28 |    "metadata": {
 29 |     "collapsed": false
 30 |    },
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "#Number of columns (features)\n",
 34 |     "K = 5\n",
 35 |     "\n",
 36 |     "#Number of records\n",
 37 |     "N = 1000\n",
 38 |     "\n",
 39 |     "#Generate an NxK matrix of uniform random variables\n",
 40 |     "X = #Student: generate a uniform random matrix here"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "markdown",
 45 |    "metadata": {},
 46 |    "source": [
 47 |     "Let's peak at our data to confirm it looks as we expect it"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 6,
 53 |    "metadata": {
 54 |     "collapsed": false
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "#Student - Put in a command to view the first 100 rows\n"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 8,
 64 |    "metadata": {
 65 |     "collapsed": false
 66 |    },
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "#Student - put in a command to see the dimensions of X\n"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "This exercise is about designing a scoring function for a logistic regression. As we are not concerned with fitting a model to data, we can just make up a logistic regression. <br> <br>\n",
 77 |     "\n",
 78 |     "For quick intro, the Logistic Regression takes the form of $\\hat{Y} = f(x * \\beta^T)$, where $x$ is the $1xK$ vector of features and $\\beta$ is the $1xK$ vector of weights. The function $f$, called a 'link' function, is the inverse logit: <br><br>\n",
 79 |     "\n",
 80 |     "<center>$f(a)=\\frac{1}{1+e^{-a}}$</center> <br><br>\n",
 81 |     "\n",
 82 |     "In this notebook we'll write a function that, given inputs of $X$ and $\\beta$, returns a value for $\\hat{Y}$.\n",
 83 |     "<br><br>\n",
 84 |     "First let's generate a random set of weights to represent $\\beta$.\n"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 9,
 90 |    "metadata": {
 91 |     "collapsed": false
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "#Student - generate a K dimensional vector of uniform random variables in the interval [-1, 1]\n",
 96 |     "beta = #input command here\n",
 97 |     "beta"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "markdown",
102 |    "metadata": {},
103 |    "source": [
104 |     "Notice how we applied a neat NumPy trick here. The numpy.random.random() function returns an array, yet we applied what appears to be a scalar operation on the vector. This is an example of what NumPy calls vectorization (a major point of this tutorial), which offers us both a very fast way to do run vector computations as well as a clean and concise method of coding. \n",
105 |     "\n",
106 |     "<br><br>\n",
107 |     "\n",
108 |     "<b>Question: we designed the above $beta$ vector such that $E[\\beta_i]=0$. How can we confirm that we did this correctly?</b>"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 8,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [],
118 |    "source": [
119 |     "#start by taking the mean of the beta we already calculated\n",
120 |     "\n",
121 |     "#Student - fill in command here\n",
122 |     "\n"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 10,
128 |    "metadata": {
129 |     "collapsed": false
130 |    },
131 |    "outputs": [],
132 |    "source": [
133 |     "#It is likely the above is not equal to zero. Let's simulate this 100k times and see what the distribution of means is\n",
134 |     "#Student input code here\n",
135 |     "means = []\n"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "markdown",
140 |    "metadata": {},
141 |    "source": [
142 |     "Now let's use matplotlibs hist function to plot the histogram of means here. "
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": 12,
148 |    "metadata": {
149 |     "collapsed": false
150 |    },
151 |    "outputs": [],
152 |    "source": [
153 |     "plt.hist(means)\n",
154 |     "plt.show()"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "markdown",
159 |    "metadata": {},
160 |    "source": [
161 |     "We should expect the distribution to be centered around zero. Is it?"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "Now let's write our scoring function. Let's try to use as much of Numpy's inner optimization as possible (hint, this can be done in two lines and without writing any loops)."
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": 45,
174 |    "metadata": {
175 |     "collapsed": true
176 |    },
177 |    "outputs": [],
178 |    "source": [
179 |     "def score_logistic_regression(X, beta):\n",
180 |     "    '''\n",
181 |     "    This function takes in an NxK matrix X and 1xK vector beta.\n",
182 |     "    The function should apply the logistic scoring function to each record of X.\n",
183 |     "    The output should be an Nx1 vector of scores\n",
184 |     "    '''\n",
185 |     "    \n",
186 |     "    #First let's calculate X*beta - make sure to use numpy's 'dot' method\n",
187 |     "    #student - put in code here\n",
188 |     "    \n",
189 |     "    #Now let's input this into the link function\n",
190 |     "    #student - put in code here\n",
191 |     "    \n",
192 |     "    return prob_score"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "So how much faster is it by using Numpy? We can test this be writing the same function that uses no Numpy and executes via loops."
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 44,
205 |    "metadata": {
206 |     "collapsed": false
207 |    },
208 |    "outputs": [],
209 |    "source": [
210 |     "def score_logistic_regression_NoNumpy(X, beta):\n",
211 |     "    '''\n",
212 |     "    This function takes in an NxK matrix X and 1xK vector beta.\n",
213 |     "    The function should apply the logistic scoring function to each record of X.\n",
214 |     "    The output should be an Nx1 vector of scores\n",
215 |     "    '''\n",
216 |     "    #Let's calculate xbeta using loops\n",
217 |     "    xbeta = []\n",
218 |     "    for row in X:\n",
219 |     "        \n",
220 |     "        xb = 0\n",
221 |     "        for i, el in enumerate(row):\n",
222 |     "            xb += el * beta[i]\n",
223 |     "        \n",
224 |     "        xbeta.append(xb)\n",
225 |     "        \n",
226 |     "    #Now let's apply the link function to each xbeta\n",
227 |     "    prob_score = []\n",
228 |     "    for xb in xbeta:\n",
229 |     "        prob_score.append(1 / (1 + np.exp(-1 * xb)))\n",
230 |     "        \n",
231 |     "    return prob_score"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "markdown",
236 |    "metadata": {},
237 |    "source": [
238 |     "Before doing any analysis, let's test the output of each to make sure they equal"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": 14,
244 |    "metadata": {
245 |     "collapsed": false
246 |    },
247 |    "outputs": [],
248 |    "source": [
249 |     "#Student - write a unit test that calls each function with the same inputs and checks to see they return the same values. "
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "markdown",
254 |    "metadata": {},
255 |    "source": [
256 |     "If they equal then we can proceed with timing analysis"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": 15,
262 |    "metadata": {
263 |     "collapsed": false
264 |    },
265 |    "outputs": [],
266 |    "source": [
267 |     "%timeit score_logistic_regression_NoNumpy(X, beta)"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": 16,
273 |    "metadata": {
274 |     "collapsed": false
275 |    },
276 |    "outputs": [],
277 |    "source": [
278 |     "%timeit score_logistic_regression(X, beta)"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "markdown",
283 |    "metadata": {},
284 |    "source": []
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": null,
289 |    "metadata": {
290 |     "collapsed": true
291 |    },
292 |    "outputs": [],
293 |    "source": []
294 |   }
295 |  ],
296 |  "metadata": {
297 |   "anaconda-cloud": {},
298 |   "kernelspec": {
299 |    "display_name": "Python [py35]",
300 |    "language": "python",
301 |    "name": "Python [py35]"
302 |   },
303 |   "language_info": {
304 |    "codemirror_mode": {
305 |     "name": "ipython",
306 |     "version": 3
307 |    },
308 |    "file_extension": ".py",
309 |    "mimetype": "text/x-python",
310 |    "name": "python",
311 |    "nbconvert_exporter": "python",
312 |    "pygments_lexer": "ipython3",
313 |    "version": "3.5.2"
314 |   }
315 |  },
316 |  "nbformat": 4,
317 |  "nbformat_minor": 0
318 | }
319 | 


--------------------------------------------------------------------------------
/ipython/Labs_Student/Lab3_Pandas_Exploration_Student.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "In this lab we're going to do simple data exploration using Pandas. Our objective is to learn basic operations that aid in visual data exploration.\n",
  8 |     "\n",
  9 |     "<br>\n",
 10 |     "\n",
 11 |     "First, let's import our required libraries and read in the data."
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 2,
 17 |    "metadata": {
 18 |     "collapsed": false
 19 |    },
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "import pandas as pd\n",
 23 |     "import matplotlib.pyplot as plt\n",
 24 |     "import numpy as np\n",
 25 |     "import os\n",
 26 |     "\n",
 27 |     "\n",
 28 |     "cwd = os.getcwd()\n",
 29 |     "\n",
 30 |     "#If on MAC, this will likely work\n",
 31 |     "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n",
 32 |     "#If on window's machine, explicitly put in data dir\n",
 33 |     "#datadir = \n",
 34 |     "\n",
 35 |     "%matplotlib inline\n",
 36 |     "\n",
 37 |     "\n",
 38 |     "#Now read in the dataset loansdata.csv\n",
 39 |     "loansData = "
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "The full schema can be found <a href=\"https://github.com/herrfz/dataanalysis/blob/master/assignment1/Assignment1.pdf\">here</a>, but let's use native Pandas methods to also explore the data. Although not specified above, this csv has row headers and the read_csv function implicitly knows this. <br>\n",
 47 |     "\n",
 48 |     "Let's take a look at the column names, in a nicely readable way:"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 1,
 54 |    "metadata": {
 55 |     "collapsed": false
 56 |    },
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "print(\"Column Names Are:\")\n",
 60 |     "print(\"\")\n",
 61 |     "\n",
 62 |     "for column_name in loansData.columns.values:\n",
 63 |     "    print(column_name)\n",
 64 |     "\n",
 65 |     "print(\"\")\n",
 66 |     "print('Total # of Columns = {}'.format(len(loansData.columns)))"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "It is oftentimes helpful to take a quick glance at the first few records (when possible). This is an easy way to discover basic data flaws (i.e., all nulls, misaligned fields, etc.). We can do this in Pandas very easily."
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 2,
 79 |    "metadata": {
 80 |     "collapsed": false
 81 |    },
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "loansData.head().transpose() #We transpose it so it will fit in the display window"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "markdown",
 89 |    "metadata": {},
 90 |    "source": [
 91 |     "Next, let's run a quick line to get summary statistics of the numeric fields."
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 3,
 97 |    "metadata": {
 98 |     "collapsed": false
 99 |    },
100 |    "outputs": [],
101 |    "source": [
102 |     "loansData.describe().transpose()"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "Let's now focus on a single column, 'Monthly.Income.' First things first, let's display the first five records of just this field."
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 4,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [],
119 |    "source": [
120 |     "#Student input line here\n"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "metadata": {},
126 |    "source": [
127 |     "Let's try and understand the distribution of this field. We can do this using the hist() method and matplotlib."
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 5,
133 |    "metadata": {
134 |     "collapsed": false
135 |    },
136 |    "outputs": [],
137 |    "source": [
138 |     "#plt.figure()\n",
139 |     "#hist_inc = loansData['Monthly.Income'].hist()\n",
140 |     "#plt.title('Histogram of Monthly Income')\n",
141 |     "#plt.show()"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "markdown",
146 |    "metadata": {},
147 |    "source": [
148 |     "How would you characterize the shape of this distribution? Is there anything we can do to the income variable to make the distribution more bell curved? Let's create a new column in the dataframe called 'Monthly.LogIncome' and print a histogram of it. What might be some advantages of making such a transformation?\n",
149 |     "<br>\n"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 6,
155 |    "metadata": {
156 |     "collapsed": false
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "#Student: Add a new column Monthly.LogIncome to the dataset that is the log of the Monthly.Income columne"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {
167 |     "collapsed": true
168 |    },
169 |    "outputs": [],
170 |    "source": [
171 |     "#Now look at the distribution\n",
172 |     "plt.figure()\n",
173 |     "h = loansData['Monthly.LogIncome'].hist()\n",
174 |     "plt.title('Histogram of Log(Monthly Income)')\n",
175 |     "plt.show()"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "markdown",
180 |    "metadata": {
181 |     "collapsed": false
182 |    },
183 |    "source": [
184 |     "Now let's answer some questions about the data.\n",
185 |     "\n",
186 |     "<b>Q1: What is the cardinality (i.e., # of distinct values) for 'Interest.Rate' and 'FICO.Range'</b>"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": 7,
192 |    "metadata": {
193 |     "collapsed": false
194 |    },
195 |    "outputs": [],
196 |    "source": [
197 |     "#Student input code here - hint, the 'describe()' method returns a useful dataframe\n"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "markdown",
202 |    "metadata": {},
203 |    "source": [
204 |     "<b>Q2: What data type did Pandas set fo Interest.Rate? Can we create a new field which is stored as a float?</b>"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 8,
210 |    "metadata": {
211 |     "collapsed": false
212 |    },
213 |    "outputs": [],
214 |    "source": [
215 |     "#Student input code here - show the field type\n"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "markdown",
220 |    "metadata": {},
221 |    "source": [
222 |     "In the cell below, create a new field 'Interest.Rate.Num' where 'Interest.Rate' is converted to a float. Hint: this can be done in one line using the .str assessor, strip and astype methods."
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": 13,
228 |    "metadata": {
229 |     "collapsed": false
230 |    },
231 |    "outputs": [],
232 |    "source": [
233 |     "#Student input code here - convert field to a float using the '.str' assessor\n"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "markdown",
238 |    "metadata": {},
239 |    "source": [
240 |     "<b>Q3: Can we get a sense of the relationship between monthly income and interest rate? Use the scatter() function from Matplotlib.</b>"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 9,
246 |    "metadata": {
247 |     "collapsed": false
248 |    },
249 |    "outputs": [],
250 |    "source": [
251 |     "#Student input line here\n"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "markdown",
256 |    "metadata": {},
257 |    "source": [
258 |     "Given the skewed distribution of Income, is this chart visually helpful? Let's try the Log of income instead."
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": 10,
264 |    "metadata": {
265 |     "collapsed": false
266 |    },
267 |    "outputs": [],
268 |    "source": [
269 |     "#Student input code here\n"
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "markdown",
274 |    "metadata": {},
275 |    "source": [
276 |     "<b>Q4: What is the average interest rate for each FICO range?</b> <br>\n",
277 |     "Hint: use the groupby() method in Pandas."
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": 11,
283 |    "metadata": {
284 |     "collapsed": false
285 |    },
286 |    "outputs": [],
287 |    "source": [
288 |     "#Student input code here\n",
289 |     "fico_grp = "
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": 12,
295 |    "metadata": {
296 |     "collapsed": false
297 |    },
298 |    "outputs": [],
299 |    "source": [
300 |     "#fico_grp"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": null,
306 |    "metadata": {
307 |     "collapsed": true
308 |    },
309 |    "outputs": [],
310 |    "source": []
311 |   }
312 |  ],
313 |  "metadata": {
314 |   "kernelspec": {
315 |    "display_name": "Python [py35]",
316 |    "language": "python",
317 |    "name": "Python [py35]"
318 |   },
319 |   "language_info": {
320 |    "codemirror_mode": {
321 |     "name": "ipython",
322 |     "version": 3
323 |    },
324 |    "file_extension": ".py",
325 |    "mimetype": "text/x-python",
326 |    "name": "python",
327 |    "nbconvert_exporter": "python",
328 |    "pygments_lexer": "ipython3",
329 |    "version": "3.5.2"
330 |   }
331 |  },
332 |  "nbformat": 4,
333 |  "nbformat_minor": 0
334 | }
335 | 


--------------------------------------------------------------------------------
/ipython/Labs_Student/Lab4_Survey_Questions_part1_Student.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": []
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "Let's start by reading in the data"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 1,
 22 |    "metadata": {
 23 |     "collapsed": false
 24 |    },
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "import pandas as pd\n",
 28 |     "import os\n",
 29 |     "import numpy as np\n",
 30 |     "import matplotlib.pyplot as plt\n",
 31 |     "%matplotlib inline\n",
 32 |     "\n",
 33 |     "\n",
 34 |     "#We assume data is in a parallel directory to this one called 'data'\n",
 35 |     "cwd = os.getcwd()\n",
 36 |     "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n",
 37 |     "#or you can hardcode the directory\n",
 38 |     "#datadir = \n",
 39 |     "\n",
 40 |     "print(datadir)"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "markdown",
 45 |    "metadata": {},
 46 |    "source": [
 47 |     "Now read in the data called survey_responses_2016.csv into a pandas data frame."
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 4,
 53 |    "metadata": {
 54 |     "collapsed": true
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "#Student put in read data command here:"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "Let's look at the column headers and use something more descriptive"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 2,
 71 |    "metadata": {
 72 |     "collapsed": false
 73 |    },
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "#Student put in code to look at column names"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "Column names like 'profile_1-profile_7' aren't very descriptive. As a quick data maintenance task, let's rename the columns starting with 'profile'. The dictionary in the next cell maps the integer index to a descriptive text.\n",
 84 |     "\n",
 85 |     "Tactically, let's loop through each column name. Within the loop let's check whether the column name starts with 'profile.' If it does, let's create a new name that swaps the key with the value using profile_mapping dictionary (i.e., profile_1 -> profile_Viz). We then add the new column name to a list. If it doesn't start with 'profile' just add the old column name to the list. "
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 6,
 91 |    "metadata": {
 92 |     "collapsed": true
 93 |    },
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "profile_mapping = {1:'Viz',\n",
 97 |     "                   2:'CS',\n",
 98 |     "                   3:'Math',\n",
 99 |     "                   4:'Stats',\n",
100 |     "                   5:'ML',\n",
101 |     "                   6:'Bus',\n",
102 |     "                   7:'Com'}"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 7,
108 |    "metadata": {
109 |     "collapsed": false
110 |    },
111 |    "outputs": [],
112 |    "source": [
113 |     "#Student put code here to change the header names\n",
114 |     "newcols = []\n",
115 |     "\n",
116 |     "for colname in data.columns:\n",
117 |     "    #finish the loop    \n",
118 |     "    \n",
119 |     "#Now swap the old columns with the values in newcols    \n"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "markdown",
124 |    "metadata": {},
125 |    "source": [
126 |     "Let's use this data to illustrate common data analytic techniques. We have one numeric variable (len_answer) and different categorical variables which may carry some signal of the 'len_answer' variable. \n",
127 |     "\n",
128 |     "'Len_answer' is the character count of the response to the following question: \"Besides the examples given in lecture 1, discuss a case where data science has created value for some company. Please explain the company's goals and how any sort of data analysis could have helped the company achieve said goals.\" As this is a subjective business question, let's hypothesize that students with more professional experience might be more likely to give longer answers. \n",
129 |     "\n",
130 |     "In more technical terms, we'll test whether the variance of len_answer can be explained away by the categorical representation of a student's experience. \n",
131 |     "\n",
132 |     "The first thing we should do is look at the distribution of len_answer."
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 3,
138 |    "metadata": {
139 |     "collapsed": false
140 |    },
141 |    "outputs": [],
142 |    "source": [
143 |     "#Student - plot a histogram here for len_answer\n"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "It looks like we have at least one strong outlier and a thick distribution around 0. Let's also use the Pandas describe() method to get a stronger sense of the distribution."
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 4,
156 |    "metadata": {
157 |     "collapsed": false
158 |    },
159 |    "outputs": [],
160 |    "source": [
161 |     "data.len_answer.describe()"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "metadata": {},
167 |    "source": [
168 |     "Let's consider cleaning up the data. We'll remove the max value as well as those with a length less than 20 (which we think is a generous minimum to communicate a reasonable answer.\n",
169 |     "\n",
170 |     "Create a new data_frame that removes these outliers."
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 10,
176 |    "metadata": {
177 |     "collapsed": false
178 |    },
179 |    "outputs": [
180 |     {
181 |      "data": {
182 |       "text/plain": [
183 |        "(93, 20)"
184 |       ]
185 |      },
186 |      "execution_count": 10,
187 |      "metadata": {},
188 |      "output_type": "execute_result"
189 |     }
190 |    ],
191 |    "source": [
192 |     "#Student create a filtered data frame here\n",
193 |     "outlier_filter = \n",
194 |     "\n",
195 |     "#\n",
196 |     "data_clean.shape"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "markdown",
201 |    "metadata": {},
202 |    "source": [
203 |     "Now that we have cleaned our data, let's run a pairwise t-test on each experience level to see if their difference in len_answer is statistically significant. To run a t-test, we'll need the mean, standard-deviation and count for each group. We can achieve this with a pandas groupby operation."
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 11,
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [
213 |     {
214 |      "data": {
215 |       "text/html": [
216 |        "<div>\n",
217 |        "<table border=\"1\" class=\"dataframe\">\n",
218 |        "  <thead>\n",
219 |        "    <tr>\n",
220 |        "      <th></th>\n",
221 |        "      <th colspan=\"3\" halign=\"left\">len_answer</th>\n",
222 |        "    </tr>\n",
223 |        "    <tr>\n",
224 |        "      <th></th>\n",
225 |        "      <th>mean</th>\n",
226 |        "      <th>std</th>\n",
227 |        "      <th>count</th>\n",
228 |        "    </tr>\n",
229 |        "    <tr>\n",
230 |        "      <th>experience</th>\n",
231 |        "      <th></th>\n",
232 |        "      <th></th>\n",
233 |        "      <th></th>\n",
234 |        "    </tr>\n",
235 |        "  </thead>\n",
236 |        "  <tbody>\n",
237 |        "    <tr>\n",
238 |        "      <th>2-5 years, I'm getting good at what I do!</th>\n",
239 |        "      <td>732.222222</td>\n",
240 |        "      <td>398.570468</td>\n",
241 |        "      <td>18</td>\n",
242 |        "    </tr>\n",
243 |        "    <tr>\n",
244 |        "      <th>5+ years, I'm a veteran!</th>\n",
245 |        "      <td>717.333333</td>\n",
246 |        "      <td>269.793748</td>\n",
247 |        "      <td>6</td>\n",
248 |        "    </tr>\n",
249 |        "    <tr>\n",
250 |        "      <th>&lt; 2 years, I'm fresh!</th>\n",
251 |        "      <td>489.312500</td>\n",
252 |        "      <td>285.271501</td>\n",
253 |        "      <td>16</td>\n",
254 |        "    </tr>\n",
255 |        "    <tr>\n",
256 |        "      <th>None, I just finished my undergrad!</th>\n",
257 |        "      <td>507.000000</td>\n",
258 |        "      <td>335.536253</td>\n",
259 |        "      <td>53</td>\n",
260 |        "    </tr>\n",
261 |        "  </tbody>\n",
262 |        "</table>\n",
263 |        "</div>"
264 |       ],
265 |       "text/plain": [
266 |        "                                           len_answer                  \n",
267 |        "                                                 mean         std count\n",
268 |        "experience                                                             \n",
269 |        "2-5 years, I'm getting good at what I do!  732.222222  398.570468    18\n",
270 |        "5+ years, I'm a veteran!                   717.333333  269.793748     6\n",
271 |        "< 2 years, I'm fresh!                      489.312500  285.271501    16\n",
272 |        "None, I just finished my undergrad!        507.000000  335.536253    53"
273 |       ]
274 |      },
275 |      "execution_count": 11,
276 |      "metadata": {},
277 |      "output_type": "execute_result"
278 |     }
279 |    ],
280 |    "source": [
281 |     "#Student input code here\n",
282 |     "\n",
283 |     "#run this to look at the grouped df\n",
284 |     "data_clean_grouped"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "markdown",
289 |    "metadata": {},
290 |    "source": [
291 |     "Visually, we can see a potential split between the [0, 2] year experience range and the [2+] experience range. Let's be more rigorous and run t-tests. Let's write a function that takes in the necessary statistics and returns a p-value.\n",
292 |     "\n",
293 |     "Remember, the t-stat for the difference between two means is:\n",
294 |     "\n",
295 |     "<center>$t = \\frac{\\hat{\\mu_1} - \\hat{\\mu_2}}{\\sqrt{\\frac{\\hat{\\sigma_1}^2}{n_1} + \\frac{\\hat{\\sigma_2}^2}{n_2}}}$</center>\n",
296 |     "\n",
297 |     "The p-value can be found using a t-distribution, but for simplicity, let's approximate this with the normal distribution. For the 2-tailed test, the p-value is: 2 * (1 - Norm.CDF(T))."
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": 31,
303 |    "metadata": {
304 |     "collapsed": false
305 |    },
306 |    "outputs": [],
307 |    "source": [
308 |     "#Student complete the function\n",
309 |     "from scipy.stats import norm\n",
310 |     "def pvalue_diffmeans_twotail(mu1, sig1, n1, mu2, sig2, n2):\n",
311 |     "    '''\n",
312 |     "    P-value calculator for the hypothesis test of mu1 != mu2.\n",
313 |     "    Takes in the approprate inputs to compute the t-statistic for the difference between means\n",
314 |     "    Outputs a p-value for a two-sample t-test.\n",
315 |     "    '''\n",
316 |     "\n",
317 |     "    \n",
318 |     "    return (t, p_value)"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "markdown",
323 |    "metadata": {},
324 |    "source": [
325 |     "Now loop through all possible pairs in data_clean_grouped and perform a t-test."
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 10,
331 |    "metadata": {
332 |     "collapsed": false
333 |    },
334 |    "outputs": [],
335 |    "source": [
336 |     "#Student put in code here:\n",
337 |     "\n",
338 |     "\n",
339 |     "get distinct values in the data frame for the experience variable\n",
340 |     "grps = \n",
341 |     "\n",
342 |     "#Now loop through each pair\n",
343 |     "for i, grp1 in enumerate(grps):\n",
344 |     "    for grp2 in grps[i + 1:]:\n",
345 |     "    \n",
346 |     "        '''\n",
347 |     "        Also, the result of groupby uses a multi-index. So be sure to index on 'len_answer' as well.\n",
348 |     "        Then pull out the mean, std, and cnt from that result.   \n",
349 |     "        '''        \n",
350 |     "\n",
351 |     "        #some code should go here\n",
352 |     "        \n",
353 |     "        print('Two tailed T-Test between groups: {} and {}'.format(grp1, grp2))\n",
354 |     "        print('Diff = {} characters'.format(round(row1['mean'] - row2['mean'], 0)))\n",
355 |     "        print('The t-stat is {} and p-value is {}'.format(round(tstat, 3), round(p_value, 3)))\n",
356 |     "        print('')"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "markdown",
361 |    "metadata": {},
362 |    "source": [
363 |     "What are some observations you might have about the above results? Are there any with large deviances that are not statistically significant at at least a 95% level?\n",
364 |     "\n",
365 |     "Also, how do the numbers change if you rerun it using the original data, and not the cleaned data. What is the effect of outliers on the results?"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "code",
370 |    "execution_count": 11,
371 |    "metadata": {
372 |     "collapsed": false
373 |    },
374 |    "outputs": [],
375 |    "source": [
376 |     "#Rerun everything without cleaning outliers\n",
377 |     "\n",
378 |     "grps = \n",
379 |     "\n",
380 |     "#Now loop through each pair\n",
381 |     "for i, grp1 in enumerate(grps):\n",
382 |     "    for grp2 in grps[i + 1:]:\n",
383 |     "    \n",
384 |     "        '''\n",
385 |     "        Also, the result of groupby uses a multi-index. So be sure to index on 'len_answer' as well.\n",
386 |     "        Then pull out the mean, std, and cnt from that result.   \n",
387 |     "        '''        \n",
388 |     "    \n",
389 |     "        print('Two tailed T-Test between groups: {} and {}'.format(grp1, grp2))\n",
390 |     "        print('Diff = {} characters'.format(round(row1['mean'] - row2['mean'], 0)))\n",
391 |     "        print('The t-stat is {} and p-value is {}'.format(round(tstat, 3), round(p_value, 3)))\n",
392 |     "        print('')"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": null,
398 |    "metadata": {
399 |     "collapsed": true
400 |    },
401 |    "outputs": [],
402 |    "source": []
403 |   }
404 |  ],
405 |  "metadata": {
406 |   "anaconda-cloud": {},
407 |   "kernelspec": {
408 |    "display_name": "Python [py35]",
409 |    "language": "python",
410 |    "name": "Python [py35]"
411 |   },
412 |   "language_info": {
413 |    "codemirror_mode": {
414 |     "name": "ipython",
415 |     "version": 3
416 |    },
417 |    "file_extension": ".py",
418 |    "mimetype": "text/x-python",
419 |    "name": "python",
420 |    "nbconvert_exporter": "python",
421 |    "pygments_lexer": "ipython3",
422 |    "version": "3.5.2"
423 |   }
424 |  },
425 |  "nbformat": 4,
426 |  "nbformat_minor": 0
427 | }
428 | 


--------------------------------------------------------------------------------
/ipython/Labs_Student/Lab4_intro_regex.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Introduction to Data Science\n",
  8 |     "## Lab 4: Intro to Regular Expressions"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "metadata": {},
 14 |    "source": [
 15 |     "1\\. I realize that question 2 on the homework might be a little difficult; let's walk through the problem"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": null,
 21 |    "metadata": {
 22 |     "collapsed": false
 23 |    },
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "import re # you might find this package useful\n",
 27 |     "\n",
 28 |     "contributors = dict()\n",
 29 |     "\n",
 30 |     "# Read through each line of the data\n",
 31 |     "f = open(\"C:/Users/kevin/Documents/GitHub/DataScienceCourse/ipython/data/osquery_contributors.html\", \"r\")"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": null,
 37 |    "metadata": {
 38 |     "collapsed": false
 39 |    },
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "for line in f:\n",
 43 |     "    print(line)"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "metadata": {
 50 |     "collapsed": true
 51 |    },
 52 |    "outputs": [],
 53 |    "source": []
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "https://developers.google.com/edu/python/regular-expressions: useful introduction to regular expressions!"
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "markdown",
 64 |    "metadata": {},
 65 |    "source": [
 66 |     "match = re.search(pat, str)\n"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": null,
 72 |    "metadata": {
 73 |     "collapsed": false
 74 |    },
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "str = 'an example word:cat!!'\n",
 78 |     "match = re.search('word:\\w\\w\\w', str)\n"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": null,
 84 |    "metadata": {
 85 |     "collapsed": false
 86 |    },
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "# If-statement after search() tests if it succeeded\n",
 90 |     "if match:                      \n",
 91 |     "    print('found', match.group()) \n"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {
 98 |     "collapsed": false
 99 |    },
100 |    "outputs": [],
101 |    "source": [
102 |     "## Search for pattern 'iii' in string 'piiig'.\n",
103 |     "## All of the pattern must match, but it may appear anywhere.\n",
104 |     "## On success, match.group() is matched text.\n",
105 |     "match = re.search('iii', 'piiig') \n"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "metadata": {
112 |     "collapsed": false
113 |    },
114 |    "outputs": [],
115 |    "source": [
116 |     "match.group()"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {
123 |     "collapsed": false
124 |    },
125 |    "outputs": [],
126 |    "source": [
127 |     "match = re.search('igs', 'piiig') \n",
128 |     "match.group()\n"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {
135 |     "collapsed": false
136 |    },
137 |    "outputs": [],
138 |    "source": [
139 |     "## . = any char but \\n\n",
140 |     "match = re.search('..g', 'piiig') \n",
141 |     "match.group()"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "metadata": {
148 |     "collapsed": false
149 |    },
150 |    "outputs": [],
151 |    "source": [
152 |     "## \\d = digit char, \\w = word char\n",
153 |     "match = re.search('\\d\\d\\d', 'p123g')\n",
154 |     "match.group()"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": null,
160 |    "metadata": {
161 |     "collapsed": false
162 |    },
163 |    "outputs": [],
164 |    "source": [
165 |     "match = re.search('\\w\\w\\w', '@@abcd!!')\n",
166 |     "match.group()"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {
173 |     "collapsed": false
174 |    },
175 |    "outputs": [],
176 |    "source": [
177 |     "  ## i+ = one or more i's, as many as possible.\n",
178 |     "\n",
179 |     "\n",
180 |     "match = re.search('pi+', 'piiig')\n",
181 |     "match.group()"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {
188 |     "collapsed": true
189 |    },
190 |    "outputs": [],
191 |    "source": [
192 |     "## Finds the first/leftmost solution, and within it drives the +\n",
193 |     "  ## as far as possible (aka 'leftmost and largest').\n",
194 |     "  ## In this example, note that it does not get to the second set of i's.\n",
195 |     "\n",
196 |     "match = re.search('i+', 'piigiiii')"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": null,
202 |    "metadata": {
203 |     "collapsed": true
204 |    },
205 |    "outputs": [],
206 |    "source": [
207 |     "## \\s* = zero or more whitespace chars\n",
208 |     "  ## Here look for 3 digits, possibly separated by whitespace.\n",
209 |     "match = re.search('\\d\\s*\\d\\s*\\d', 'xx1 2   3xx') \n"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": null,
215 |    "metadata": {
216 |     "collapsed": true
217 |    },
218 |    "outputs": [],
219 |    "source": [
220 |     "match = re.search('\\d\\s*\\d\\s*\\d', 'xx12  3xx') \n"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": null,
226 |    "metadata": {
227 |     "collapsed": true
228 |    },
229 |    "outputs": [],
230 |    "source": [
231 |     "match = re.search('\\d\\s*\\d\\s*\\d', 'xx123xx')"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": null,
237 |    "metadata": {
238 |     "collapsed": true
239 |    },
240 |    "outputs": [],
241 |    "source": [
242 |     "## ^ = matches the start of string, so this fails:\n",
243 |     "match = re.search('^b\\w+', 'foobar') \n"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {
250 |     "collapsed": false
251 |    },
252 |    "outputs": [],
253 |    "source": [
254 |     "## but without the ^ it succeeds:\n",
255 |     "match = re.search('b\\w+', 'foobar')\n",
256 |     "match.group()"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": null,
262 |    "metadata": {
263 |     "collapsed": false
264 |    },
265 |    "outputs": [],
266 |    "source": [
267 |     "str = 'purple alice-b@google.com monkey dishwasher'\n",
268 |     "match = re.search( %Insert code here%%, str)\n"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": null,
274 |    "metadata": {
275 |     "collapsed": true
276 |    },
277 |    "outputs": [],
278 |    "source": [
279 |     "str = 'purple alice-b@google.com monkey dishwasher'\n",
280 |     "match = re.search('([\\w.-]+)@([\\w.-]+)', str)"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "code",
285 |    "execution_count": null,
286 |    "metadata": {
287 |     "collapsed": false
288 |    },
289 |    "outputs": [],
290 |    "source": [
291 |     "match.group()"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": null,
297 |    "metadata": {
298 |     "collapsed": true
299 |    },
300 |    "outputs": [],
301 |    "source": []
302 |   }
303 |  ],
304 |  "metadata": {
305 |   "kernelspec": {
306 |    "display_name": "Python 3",
307 |    "language": "python",
308 |    "name": "python3"
309 |   },
310 |   "language_info": {
311 |    "codemirror_mode": {
312 |     "name": "ipython",
313 |     "version": 3
314 |    },
315 |    "file_extension": ".py",
316 |    "mimetype": "text/x-python",
317 |    "name": "python",
318 |    "nbconvert_exporter": "python",
319 |    "pygments_lexer": "ipython3",
320 |    "version": "3.6.0"
321 |   }
322 |  },
323 |  "nbformat": 4,
324 |  "nbformat_minor": 0
325 | }
326 | 


--------------------------------------------------------------------------------
/ipython/Labs_Student/Lab_6_FeatureRanking_AUC_Student.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "In this lab we'll look at:\n",
 10 |     "- How to build ROC curves\n",
 11 |     "- Use two different evaluation metrics to perform feature ranking\n",
 12 |     "- Compare/contrast the results of feature ranking on different evaluation measures\n",
 13 |     "- Build models on subsets of the features, using the different methods to select the subset\n",
 14 |     "- Compare these different models\n"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "import pandas as pd\n",
 26 |     "import numpy as np\n",
 27 |     "import matplotlib.pyplot as plt\n",
 28 |     "from scipy.stats import entropy\n",
 29 |     "import os\n",
 30 |     "\n",
 31 |     "%matplotlib inline"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "markdown",
 36 |    "metadata": {},
 37 |    "source": [
 38 |     "First we'll load the dataset and take a quick peak at its columns and size"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": 3,
 44 |    "metadata": {
 45 |     "collapsed": false
 46 |    },
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "#load dataset\n",
 50 |     "cwd = os.getcwd()\n",
 51 |     "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n",
 52 |     "f = datadir + 'ads_dataset_cut.txt'\n",
 53 |     "data = pd.read_csv(f, sep = '\\t')\n",
 54 |     "data.columns, data.shape"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "markdown",
 59 |    "metadata": {},
 60 |    "source": [
 61 |     "In the next step we'll use the Decision Tree classifier's built in feature importance attribute to compute the normalized Mutual Information/Information Gain of each feature. Note a few things about this approach: 1). With extremely high dimensional data, one may want to calculate the normalized MI directly for each feature (the code to do that is a bit more complex so we used the DT instead), 2). The DT is a greedy algorithm, so the feature importance ranks it produces may not be equal to the rank of normalized MI calculated individually."
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 88,
 67 |    "metadata": {
 68 |     "collapsed": false
 69 |    },
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "#import the decision tree module from sklearn\n",
 73 |     "from sklearn.tree import DecisionTreeClassifier\n",
 74 |     "\n",
 75 |     "#build a decision tree with max_depth = 20 using entropy\n",
 76 |     "Y = data['y_buy']\n",
 77 |     "X = data.drop('y_buy', 1)\n",
 78 |     "\n",
 79 |     "#Student - instantiate the DT\n",
 80 |     "dt = \n",
 81 |     "#Student - now fit the DT\n",
 82 |     "\n",
 83 |     "#Student - Now use built in feature importance attribute to get MI of each feature and Y\n",
 84 |     "feature_mi = "
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "markdown",
 89 |    "metadata": {},
 90 |    "source": [
 91 |     "Now we'll add the feature importances to a dictionary where key-values are: {feature_name:dt_feature_importance}. This can be done in one line using the zip and dict functions."
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 89,
 97 |    "metadata": {
 98 |     "collapsed": false
 99 |    },
100 |    "outputs": [],
101 |    "source": [
102 |     "#Student - Add features and their importances to a dictionary\n",
103 |     "feature_mi_dict = "
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "markdown",
108 |    "metadata": {},
109 |    "source": [
110 |     "Now we are going to compute feature ranks using AUC. We can do this without fitting a model, by just seeing how well the individual feature ranks the positives and negatives."
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": 99,
116 |    "metadata": {
117 |     "collapsed": false
118 |    },
119 |    "outputs": [],
120 |    "source": [
121 |     "#define a function to print ROC curves. \n",
122 |     "#It should take in only arrays/lists of predictions and outcomes\n",
123 |     "from sklearn.metrics import roc_curve, auc\n",
124 |     "\n",
125 |     "def plotUnivariateROC(preds, truth, label_string):\n",
126 |     "    '''\n",
127 |     "    preds is an nx1 array of predictions\n",
128 |     "    truth is an nx1 array of truth labels\n",
129 |     "    label_string is text to go into the plotting label\n",
130 |     "    '''\n",
131 |     "    #Student input code here\n",
132 |     "    #1. call the roc_curve function to get the ROC X and Y values\n",
133 |     "    fpr, tpr, thresholds = \n",
134 |     "    #2. Input fpr and tpr into the auc function to get the AUC\n",
135 |     "    roc_auc = \n",
136 |     "    \n",
137 |     "    #we are doing this as a special case because we are sending unfitted predictions\n",
138 |     "    #into the function\n",
139 |     "    if roc_auc < 0.5:\n",
140 |     "        fpr, tpr, thresholds = roc_curve(truth, -1 * preds)\n",
141 |     "        roc_auc = auc(fpr, tpr)\n",
142 |     "\n",
143 |     "    #chooses a random color for plotting\n",
144 |     "    c = (np.random.rand(), np.random.rand(), np.random.rand())\n",
145 |     "\n",
146 |     "    #create a plot and set some options\n",
147 |     "    plt.plot(fpr, tpr, color = c, label = label_string + ' (AUC = %0.3f)' % roc_auc)\n",
148 |     "    \n",
149 |     "\n",
150 |     "    plt.plot([0, 1], [0, 1], 'k--')\n",
151 |     "    plt.xlim([0.0, 1.0])\n",
152 |     "    plt.ylim([0.0, 1.0])\n",
153 |     "    plt.xlabel('FPR')\n",
154 |     "    plt.ylabel('TPR')\n",
155 |     "    plt.title('ROC')\n",
156 |     "    plt.legend(loc=\"lower right\")\n",
157 |     "    \n",
158 |     "    return roc_auc"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "markdown",
163 |    "metadata": {},
164 |    "source": [
165 |     "Next we'll run each feature through the above function to get its invdividual AUC and also plot on a chart. We add some extra lines of matplotlib code to control the formatting and position of the legend. We also want to add each to a dictionary of the format {feature_name:feature_auc}, similar to what we did above (though not using the same one liner). Take some time to review the chart and think about why different features produce differently shaped curves. "
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 6,
171 |    "metadata": {
172 |     "collapsed": false
173 |    },
174 |    "outputs": [],
175 |    "source": [
176 |     "\n",
177 |     "fig = plt.figure(figsize = (12, 6))\n",
178 |     "ax = plt.subplot(111)\n",
179 |     "\n",
180 |     "#Plot the univariate AUC on the training data. Store the AUC\n",
181 |     "\n",
182 |     "feature_auc_dict = {}\n",
183 |     "for col in data.drop('y_buy',1).columns:\n",
184 |     "    #Student put code here\n",
185 |     "    feature_auc_dict[col] = \n",
186 |     "\n",
187 |     "\n",
188 |     "# Put a legend below current axis\n",
189 |     "box = ax.get_position()\n",
190 |     "ax.set_position([box.x0, box.y0 + box.height * 0.0 , box.width, box.height * 1])\n",
191 |     "ax.legend(loc = 'upper center', bbox_to_anchor = (0.5, -0.15), fancybox = True, \n",
192 |     "              shadow = True, ncol = 4, prop = {'size':10})"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "Next we want to add both of the dictionaries created above into a data frame."
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 7,
205 |    "metadata": {
206 |     "collapsed": false
207 |    },
208 |    "outputs": [],
209 |    "source": [
210 |     "#Student - Add auc and mi each to a single dataframe\n",
211 |     "df_auc = \n",
212 |     "df_mi = \n",
213 |     "\n",
214 |     "#Student - Now merge the two on the feature name\n",
215 |     "feat_imp_df = \n",
216 |     "feat_imp_df"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "markdown",
221 |    "metadata": {},
222 |    "source": [
223 |     "To put the different metrics on the same scale, we'll use pandas rank() method for each feature."
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": 8,
229 |    "metadata": {
230 |     "collapsed": false
231 |    },
232 |    "outputs": [],
233 |    "source": [
234 |     "#Student - Now create a df that holds the ranks of auc and mi \n",
235 |     "feat_ranks =\n",
236 |     "\n",
237 |     "#Student - Plot the two ranks\n",
238 |     "\n",
239 |     "#Student - Plot a y=x reference line\n"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": 10,
245 |    "metadata": {
246 |     "collapsed": false
247 |    },
248 |    "outputs": [],
249 |    "source": [
250 |     "#Student - Now create lists of top 5 features for both auc and mi\n",
251 |     "top5_auc = \n",
252 |     "top5_mi = \n",
253 |     "top5_auc, top5_mi"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "markdown",
258 |    "metadata": {},
259 |    "source": [
260 |     "The next step is the conclusive step from all the analysis done above. We want to test which method above can be used to produce the best subset of features. What we'll do is use the top 5 features ranked by both AUC and the decision tree feature importance and compare them against each other with different algorithms."
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": 14,
266 |    "metadata": {
267 |     "collapsed": false
268 |    },
269 |    "outputs": [],
270 |    "source": [
271 |     "'''\n",
272 |     "Now do the following\n",
273 |     "1. Split the data into 80/20 train/test\n",
274 |     "2. For each set of features:\n",
275 |     "- build a decision trees max_depth = 5 \n",
276 |     "- build a logistic regression C = 100\n",
277 |     "- get the auc and log-loss on the test set\n",
278 |     "'''\n",
279 |     "\n",
280 |     "\n",
281 |     "def getLogLoss(Ps, Ys, eps = 10**-6):\n",
282 |     "    return ((Ys == 1) * np.log(Ps + eps) + (Ys == 0) * np.log(1 - Ps + eps)).mean()\n",
283 |     "\n",
284 |     "#Student - Split into train and test randomly without using sklearn package\n",
285 |     "#Note, there are many ways to do this:\n",
286 |     "\n",
287 |     "train_pct = 0.8\n",
288 |     "#1. create an array of n random uniform variables drawn on [0,1] range\n",
289 |     "rand = \n",
290 |     "#2. Convert to boolean where True = random number < train_pct\n",
291 |     "rand_filt = \n",
292 |     "\n",
293 |     "#Student - Use the filter to index data into training and test data sets\n",
294 |     "train = \n",
295 |     "test = \n",
296 |     "\n",
297 |     "\n",
298 |     "fsets = [top5_auc, top5_mi]\n",
299 |     "fset_descr = ['auc', 'mi']\n",
300 |     "mxdepths = [5]\n",
301 |     "Cs = [10**2]\n",
302 |     "\n",
303 |     "\n",
304 |     "#Set up plotting box\n",
305 |     "fig = plt.figure(figsize = (15, 8))\n",
306 |     "ax = plt.subplot(111)\n",
307 |     "\n",
308 |     "\n",
309 |     "\n",
310 |     "for i, fset in enumerate(fsets):\n",
311 |     "    \n",
312 |     "    descr = fset_descr[i]\n",
313 |     "    #set training and testing data\n",
314 |     "    Y_train = train['y_buy']\n",
315 |     "    X_train = train[fset]\n",
316 |     "    Y_test = test['y_buy']\n",
317 |     "    X_test = test[fset]\n",
318 |     "    \n",
319 |     "    \n",
320 |     "    #Student - for all d in mxdepths and C in Cs, build DT's and LR's respectively\n",
321 |     "    # get the predictions on the test set and also get the log-loss, then plot\n",
322 |     "    \n",
323 |     "    #Student - instantiate the class\n",
324 |     "    dt = \n",
325 |     "    #Don't forget to fit the tree\n",
326 |     "    #Now make a prediction\n",
327 |     "    preds_dt = \n",
328 |     "    #Now compute the log-loss\n",
329 |     "    ll_dt = \n",
330 |     "        \n",
331 |     "    plotUnivariateROC(preds_dt, Y_test, '{}:DT:md={}:(LL={})'.format(descr, d, round(ll_dt, 3)))\n",
332 |     "\n",
333 |     "        \n",
334 |     "    #Student - instantiate the class\n",
335 |     "    lr = \n",
336 |     "    #Don't forget to fit the LR\n",
337 |     "    #Now make a prediction\n",
338 |     "    preds_lr = \n",
339 |     "    #Now compute the log-loss\n",
340 |     "    ll_lr = \n",
341 |     "\n",
342 |     "    plotUnivariateROC(preds_lr, Y_test, '{}:LR:C={}:(LL={})'.format(descr, C, round(ll_lr, 3)))\n",
343 |     "\n",
344 |     "    \n",
345 |     "# Put a legend below current axis\n",
346 |     "box = ax.get_position()\n",
347 |     "ax.set_position([box.x0, box.y0 + box.height * 0.0 , box.width, box.height * 1])\n",
348 |     "ax.legend(loc = 'upper center', bbox_to_anchor = (0.5, -0.15), fancybox = True, \n",
349 |     "              shadow = True, ncol = 2, prop = {'size':10})\n"
350 |    ]
351 |   }
352 |  ],
353 |  "metadata": {
354 |   "anaconda-cloud": {},
355 |   "kernelspec": {
356 |    "display_name": "Python [py35]",
357 |    "language": "python",
358 |    "name": "Python [py35]"
359 |   },
360 |   "language_info": {
361 |    "codemirror_mode": {
362 |     "name": "ipython",
363 |     "version": 3
364 |    },
365 |    "file_extension": ".py",
366 |    "mimetype": "text/x-python",
367 |    "name": "python",
368 |    "nbconvert_exporter": "python",
369 |    "pygments_lexer": "ipython3",
370 |    "version": "3.5.2"
371 |   }
372 |  },
373 |  "nbformat": 4,
374 |  "nbformat_minor": 0
375 | }
376 | 


--------------------------------------------------------------------------------
/ipython/Labs_Student/Lab_7_sklearn_magic_student.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "In this lab we'll demonstrate several common techniques and helpful tools used in a model building process:\n",
 10 |     "\n",
 11 |     "- Use Sklearn to generate polynomial features and rescale them\n",
 12 |     "- Create folds for cross-validation\n",
 13 |     "- Perform a grid search to optimize hyper-parameters using cross-validation\n",
 14 |     "- Create pipelines to perform grids search in less code\n",
 15 |     "- Improve upon a baseline model incrementally by adding in more complexity\n",
 16 |     "\n",
 17 |     "This lab will require using several Sklearn classes. It would be helpful to refer to appropriate documentation:\n",
 18 |     "- http://scikit-learn.org/stable/modules/preprocessing.html\n",
 19 |     "- http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler\n",
 20 |     "- http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html#sklearn.preprocessing.PolynomialFeatures\n",
 21 |     "- http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV\n",
 22 |     "- http://scikit-learn.org/stable/modules/pipeline.html#pipeline\n",
 23 |     "\n",
 24 |     "Also, here is a helpful tutorial that explains how to use much of the above:\n",
 25 |     "- https://civisanalytics.com/blog/data-science/2016/01/06/workflows-python-using-pipeline-gridsearchcv-for-compact-code/\n",
 26 |     "\n",
 27 |     "Like always, let's first load in the data.\n"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 1,
 33 |    "metadata": {
 34 |     "collapsed": false
 35 |    },
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "import os\n",
 39 |     "import pandas as pd\n",
 40 |     "from sklearn.linear_model import LogisticRegression\n",
 41 |     "from sklearn.grid_search import GridSearchCV\n",
 42 |     "from sklearn.cross_validation import KFold\n",
 43 |     "cwd = os.getcwd()\n",
 44 |     "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n",
 45 |     "\n",
 46 |     "data = pd.read_csv(datadir + 'Cell2Cell_data.csv', header=0, sep=',')\n",
 47 |     "\n",
 48 |     "#Randomly sort the data:\n",
 49 |     "data = data.sample(frac = 1)\n",
 50 |     "data.columns"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "markdown",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "Next we're going to prep the data. From prior analysis (Churn Case Study) we learned that we can drop a few variables, as they are either highly redundant or don't carry a strong relationship with the outcome.\n",
 58 |     "\n",
 59 |     "After dropping, we're going to use the SkLearn KFold class to set up cross validation fold indexes."
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": 2,
 65 |    "metadata": {
 66 |     "collapsed": false
 67 |    },
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "#Prior analysis (from Churn Case study) has shown that we can drop a few redundant variables\n",
 71 |     "#We want to drop a few to speed up later calculations\n",
 72 |     "dropvar_list = ['incalls', 'creditcd', 'marryyes', 'travel', 'pcown']\n",
 73 |     "data_subset = data.drop(dropvar_list, 1)\n",
 74 |     "\n",
 75 |     "#Set up X and Y\n",
 76 |     "X = data_subset.drop('churndep', 1)\n",
 77 |     "Y = data_subset['churndep']\n",
 78 |     "\n",
 79 |     "#Use Kfold to create 4 folds\n",
 80 |     "kfolds = KFold(#Student -nput code here)\n"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "Next let's use cross-validation to build a baseline model. We're going to use LR with no feature pre-processing. We're going to look at both L1 and L2 regularization with different weights. We can do this very succinctly with SkLearns GridSearchCV package."
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 3,
 93 |    "metadata": {
 94 |     "collapsed": false
 95 |    },
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "#1st, set up a paramater grid\n",
 99 |     "param_grid_lr = {'C':#Student put code here, \n",
100 |     "                 'penalty':#Student put code here}\n",
101 |     "\n",
102 |     "#2nd, call the GridSearchCV class, use LogisticRegression and 'log_loss' for scoring\n",
103 |     "lr_grid_search = GridSearchCV(#Student put code here) \n",
104 |     "lr_grid_search.fit(X, Y)\n",
105 |     "\n",
106 |     "#3rd, get the score of the best model and print it\n",
107 |     "best_1 = #Student put code here\n",
108 |     "print(best_1)"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 4,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [],
118 |    "source": [
119 |     "#Next let's look at the best-estimator chosen to see what the parameters were\n",
120 |     "lr_grid_search.#Student put code here"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "metadata": {},
126 |    "source": [
127 |     "Now let's see if we can beat this by standardizing the features. We'll approach this using the GridSearchCV class but also build a pipeline. Later we'll extend the pipeline to allow for feature engineering as well."
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 5,
133 |    "metadata": {
134 |     "collapsed": false
135 |    },
136 |    "outputs": [],
137 |    "source": [
138 |     "from sklearn.pipeline import Pipeline\n",
139 |     "from sklearn.preprocessing import StandardScaler\n",
140 |     "\n",
141 |     "#Create a set of steps. All but the last step is a transformer (something that processes data). \n",
142 |     "#Build a list of steps, where the first is StandardScaler and the second is LogisticRegression\n",
143 |     "#The last step should be an estimator.\n",
144 |     "\n",
145 |     "steps = [('scaler', #Student put code here,\n",
146 |     "         ('lr', #Student put code here)]\n",
147 |     "\n",
148 |     "#Now set up the pipeline\n",
149 |     "pipeline = Pipeline(#Student put code here)\n",
150 |     "\n",
151 |     "#Now set up the parameter grid, paying close to the correct convention here\n",
152 |     "parameters_scaler = dict(lr__C = #Student put code here,\n",
153 |     "                         lr__penalty = #Student put code here)\n",
154 |     "\n",
155 |     "#Now run another grid search\n",
156 |     "lr_grid_search_scaler = GridSearchCV(#Student put code here)\n",
157 |     "                        \n",
158 |     "#Don't forget to fit this GridSearchCV pipeline\n",
159 |     "#Student put code here\n",
160 |     "\n",
161 |     "#Again, print the score of the best model\n",
162 |     "best_2 = #Student put code here\n",
163 |     "print(best_2)"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 6,
169 |    "metadata": {
170 |     "collapsed": false
171 |    },
172 |    "outputs": [],
173 |    "source": [
174 |     "#Let's see the model after scaling. Did the optimal parameters change?\n",
175 |     "lr_grid_search_scaler.best_estimator_.steps[-1][1]"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "markdown",
180 |    "metadata": {},
181 |    "source": [
182 |     "Now that we've built a pipeline estimator that performs feature scaling and then logistic regression, let's add to it a feature engineering step. We'll then again use GridSearchCV to find an optimal parameter configuration and see if we can beat our best score above."
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 7,
188 |    "metadata": {
189 |     "collapsed": false
190 |    },
191 |    "outputs": [],
192 |    "source": [
193 |     "from sklearn.preprocessing import PolynomialFeatures\n",
194 |     "\n",
195 |     "#Create a set of steps. All but the last step is a transformer (something that processes data). \n",
196 |     "# Step 1 - PolynomialFeatures\n",
197 |     "# Step 2 - StandardScaler\n",
198 |     "# Step 3 - LogisticRegression\n",
199 |     "\n",
200 |     "steps_poly = [#Student put code here]\n",
201 |     "\n",
202 |     "#Now set up the pipeline\n",
203 |     "pipeline_poly = #Student put code here\n",
204 |     "\n",
205 |     "#Now set up a new parameter grid, use the same paramaters used above for logistic regression, \n",
206 |     "#but add polynomial features up to degree 2 with and without interactions. \n",
207 |     "parameters_poly = dict(#Student put code here)\n",
208 |     "\n",
209 |     "#Now run another grid search\n",
210 |     "lr_grid_search_poly = #Student put code here\n",
211 |     "lr_grid_search_poly.fit(X, Y)\n",
212 |     "\n",
213 |     "best_3 = lr_grid_search_poly.best_score_\n",
214 |     "print(best_3)"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 8,
220 |    "metadata": {
221 |     "collapsed": false
222 |    },
223 |    "outputs": [],
224 |    "source": [
225 |     "#Let's look at the best estimator, stepwise\n",
226 |     "lr_grid_search_poly.best_estimator_.steps"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "markdown",
231 |    "metadata": {},
232 |    "source": [
233 |     "Now make a bar chart to plot results"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": 9,
239 |    "metadata": {
240 |     "collapsed": false
241 |    },
242 |    "outputs": [],
243 |    "source": [
244 |     "import numpy as np\n",
245 |     "results = -1 * np.array([best_1, best_2, best_3])\n",
246 |     "labs = ['LR', 'Scaler-LR', 'Poly-Scaler-LR']\n",
247 |     "\n",
248 |     "fig = plt.figure(facecolor = 'w', figsize = (12, 6))\n",
249 |     "ax = plt.subplot(111)\n",
250 |     "\n",
251 |     "width = 0.5\n",
252 |     "ind = np.arange(3)\n",
253 |     "rec = ax.bar(ind + width, results, width, color='r')\n",
254 |     "\n",
255 |     "ax.set_xticks(ind + width)\n",
256 |     "ax.set_xticklabels(labs, size = 14)\n",
257 |     "ax.set_ylim([0.6, 0.7])\n",
258 |     "\n",
259 |     "plt.plot(np.arange(4), min(results) * np.ones(4))"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": null,
265 |    "metadata": {
266 |     "collapsed": true
267 |    },
268 |    "outputs": [],
269 |    "source": []
270 |   }
271 |  ],
272 |  "metadata": {
273 |   "anaconda-cloud": {},
274 |   "kernelspec": {
275 |    "display_name": "Python [py35]",
276 |    "language": "python",
277 |    "name": "Python [py35]"
278 |   },
279 |   "language_info": {
280 |    "codemirror_mode": {
281 |     "name": "ipython",
282 |     "version": 3
283 |    },
284 |    "file_extension": ".py",
285 |    "mimetype": "text/x-python",
286 |    "name": "python",
287 |    "nbconvert_exporter": "python",
288 |    "pygments_lexer": "ipython3",
289 |    "version": "3.5.2"
290 |   }
291 |  },
292 |  "nbformat": 4,
293 |  "nbformat_minor": 0
294 | }
295 | 


--------------------------------------------------------------------------------
/ipython/Labs_Student/NumPyBasics.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "'''\n",
 12 |     "The core data type in Numpy is the ndarray, which enables fast and space-efficient multidimensional array processing.\n",
 13 |     "Note: This notebook is adapted from chapter 4 Python for Data Analysis by Wes McKinney and O'Reilly publishing. NumPy has many, \n",
 14 |     "many features that won't be covered here. The following snippets are just to illustrate basic data types and operations within\n",
 15 |     "numpy.\n",
 16 |     "\n",
 17 |     "Another good resource for learning more about ndarrays is here:\n",
 18 |     "http://docs.scipy.org/doc/numpy/reference/arrays.html\n",
 19 |     "'''\n",
 20 |     "\n",
 21 |     "#First, import NumPy\n",
 22 |     "import numpy as np\n",
 23 |     "\n",
 24 |     "#It is easy to create Nx1 and NxM arrays from standard Python lists\n",
 25 |     "l1 = [0,1,2]\n",
 26 |     "l2 = [3,4,5]\n",
 27 |     "\n",
 28 |     "nd1 = np.array(l1)\n",
 29 |     "nd2 = np.array([l1,  l2])"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 2,
 35 |    "metadata": {
 36 |     "collapsed": false
 37 |    },
 38 |    "outputs": [
 39 |     {
 40 |      "name": "stdout",
 41 |      "output_type": "stream",
 42 |      "text": [
 43 |       "The ndarray has dimension n=3 and m=1\n",
 44 |       "The ndarray has elements of type=int64\n",
 45 |       "The ndarray has dimension n=2 and m=3\n",
 46 |       "The ndarray has elements of type=int64\n"
 47 |      ]
 48 |     }
 49 |    ],
 50 |    "source": [
 51 |     "#Now, we can get ask for some basic info to describe the ndarray\n",
 52 |     "def desc_ndarray(nd):\n",
 53 |     "    try:\n",
 54 |     "        print \"The ndarray has dimension n=%s and m=%s\" % (nd.shape[0],nd.shape[1])\n",
 55 |     "    except:\n",
 56 |     "        print \"The ndarray has dimension n=%s and m=1\" % nd.shape[0]\n",
 57 |     "    print \"The ndarray has elements of type=%s\" % nd.dtype\n",
 58 |     "\n",
 59 |     "desc_ndarray(nd1)\n",
 60 |     "\n",
 61 |     "desc_ndarray(nd2)\n",
 62 |     "\n"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 3,
 68 |    "metadata": {
 69 |     "collapsed": false
 70 |    },
 71 |    "outputs": [
 72 |     {
 73 |      "data": {
 74 |       "text/plain": [
 75 |        "[array([ 0.,  0.,  0.,  0.]),\n",
 76 |        " array([ 1.,  1.,  1.,  1.]),\n",
 77 |        " array([ 0.47121338,  1.83328779,  0.4438019 , -0.52309325])]"
 78 |       ]
 79 |      },
 80 |      "execution_count": 3,
 81 |      "metadata": {},
 82 |      "output_type": "execute_result"
 83 |     }
 84 |    ],
 85 |    "source": [
 86 |     "#There are short cuts for creating certain frequently used special ndarrays, i.e.,\n",
 87 |     "\n",
 88 |     "k=4\n",
 89 |     "\n",
 90 |     "#1. an ndarray of all zeros\n",
 91 |     "zero = np.zeros(k)\n",
 92 |     "\n",
 93 |     "#2. an ndarray of all ones\n",
 94 |     "one = np.ones(k)\n",
 95 |     "\n",
 96 |     "#3. an ndarray of random elements (this one is standard normal, but there are many distributions to choose from)\n",
 97 |     "rand = np.random.randn(k)\n",
 98 |     "\n",
 99 |     "[zero, one, rand]"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 4,
105 |    "metadata": {
106 |     "collapsed": false
107 |    },
108 |    "outputs": [
109 |     {
110 |      "data": {
111 |       "text/plain": [
112 |        "[array([[ 0.69394907,  0.85723722],\n",
113 |        "        [-0.16779156,  0.41709003],\n",
114 |        "        [-0.94008249, -0.21591983],\n",
115 |        "        [-0.61305106,  0.41435495]]),\n",
116 |        " array([-0.16779156,  0.41709003]),\n",
117 |        " 0.41709003439166575]"
118 |       ]
119 |      },
120 |      "execution_count": 4,
121 |      "metadata": {},
122 |      "output_type": "execute_result"
123 |     }
124 |    ],
125 |    "source": [
126 |     "'''\n",
127 |     "For indexing an array:\n",
128 |     "1. If nx1 array, follow the same protocol as a regular Python list\n",
129 |     "2. If nxm array use the following examples\n",
130 |     "'''\n",
131 |     "\n",
132 |     "arr2d = np.random.randn(4,2)\n",
133 |     "\n",
134 |     "#A single index gets a full row\n",
135 |     "\n",
136 |     "#2 indexes returns a value\n",
137 |     "[arr2d, arr2d[1],  arr2d[1,1]]"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 5,
143 |    "metadata": {
144 |     "collapsed": false
145 |    },
146 |    "outputs": [
147 |     {
148 |      "data": {
149 |       "text/plain": [
150 |        "[array([-0.4386254 , -0.67720483, -1.19775067, -0.21300288]),\n",
151 |        " array([-0.8772508 , -1.35440967, -2.39550135, -0.42600575]),\n",
152 |        " array([-0.8772508 , -1.35440967, -2.39550135, -0.42600575]),\n",
153 |        " array([-0., -0., -0., -0.])]"
154 |       ]
155 |      },
156 |      "execution_count": 5,
157 |      "metadata": {},
158 |      "output_type": "execute_result"
159 |     }
160 |    ],
161 |    "source": [
162 |     "'''\n",
163 |     "Operations between Arrays and Scalars\n",
164 |     "An important feature of ndarrays is they allow batch operations on data without writing any for loops.  \n",
165 |     "This is called vectorization.\n",
166 |     "Any arithmetic operations between equal-size arrays applies the operation elementwise. \n",
167 |     "'''\n",
168 |     "\n",
169 |     "#examples\n",
170 |     "\n",
171 |     "k = 4\n",
172 |     "rand = np.random.randn(k)\n",
173 |     "[rand, rand + rand, 2*rand, rand*np.zeros(4)]\n",
174 |     "\n"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": 7,
180 |    "metadata": {
181 |     "collapsed": false
182 |    },
183 |    "outputs": [
184 |     {
185 |      "data": {
186 |       "text/plain": [
187 |        "[array([ 0.19631415,  0.41059714,  4.26249299]),\n",
188 |        " array([-1.46310809,  1.15559786,  0.10690073]),\n",
189 |        " array([-1.26679394,  1.566195  ,  4.36939372])]"
190 |       ]
191 |      },
192 |      "execution_count": 7,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     }
196 |    ],
197 |    "source": [
198 |     "'''\n",
199 |     "Matrix operations\n",
200 |     "It is easy to do matrix operations with Nd arrays. The standard arithmetic operators don't work here though. And it is important \n",
201 |     "to make sure matrix shapes are compatible\n",
202 |     "'''\n",
203 |     "\n",
204 |     "k = 3\n",
205 |     "r1 = np.random.randn(k)\n",
206 |     "r2 = np.random.randn(k)\n",
207 |     "\n",
208 |     "#Matrix addition is the standard matrix operator\n",
209 |     "[r1, r2 , r1 + r2]\n"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 8,
215 |    "metadata": {
216 |     "collapsed": false
217 |    },
218 |    "outputs": [
219 |     {
220 |      "data": {
221 |       "text/plain": [
222 |        "[array([[ 0.19631415,  0.41059714,  4.26249299],\n",
223 |        "        [-1.46310809,  1.15559786,  0.10690073]]),\n",
224 |        " array([[ 0.19631415, -1.46310809],\n",
225 |        "        [ 0.41059714,  1.15559786],\n",
226 |        "        [ 4.26249299,  0.10690073]])]"
227 |       ]
228 |      },
229 |      "execution_count": 8,
230 |      "metadata": {},
231 |      "output_type": "execute_result"
232 |     }
233 |    ],
234 |    "source": [
235 |     "#The Transpose can be taken with the attribute T\n",
236 |     "arr2d = np.array([r1, r2])\n",
237 |     "[arr2d, arr2d.T]"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 9,
243 |    "metadata": {
244 |     "collapsed": false
245 |    },
246 |    "outputs": [
247 |     {
248 |      "data": {
249 |       "text/plain": [
250 |        "[array([[ 0.19631415,  0.41059714,  4.26249299],\n",
251 |        "        [-1.46310809,  1.15559786,  0.10690073]]),\n",
252 |        " array([[  3.85392468e-02,   1.68590015e-01,   1.81688465e+01],\n",
253 |        "        [  2.14068529e+00,   1.33540642e+00,   1.14277663e-02]]),\n",
254 |        " array([[ 18.37597578,   0.64291997],\n",
255 |        "        [  0.64291997,   3.48751947]])]"
256 |       ]
257 |      },
258 |      "execution_count": 9,
259 |      "metadata": {},
260 |      "output_type": "execute_result"
261 |     }
262 |    ],
263 |    "source": [
264 |     "'''\n",
265 |     "Matrix multiplication, like inner products can be done on arrays.\n",
266 |     "Just remember that the standard multiplication operator does elementwise multiplication (provided they are the same shape).\n",
267 |     "We need the dot method in order to do an inner product\n",
268 |     "\n",
269 |     "Numpy has a linalg library that can run most matrix operations on ndarrays:\n",
270 |     "http://docs.scipy.org/doc/numpy/reference/routines.linalg.html\n",
271 |     "\n",
272 |     "One can also create a matrix object and use the methods in numpy.matrix to achieve the same thing:\n",
273 |     "http://docs.scipy.org/doc/numpy/reference/generated/numpy.matrix.html\n",
274 |     "'''\n",
275 |     "\n",
276 |     "[arr2d, arr2d * arr2d, arr2d.dot(arr2d.T)]"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": 11,
282 |    "metadata": {
283 |     "collapsed": false
284 |    },
285 |    "outputs": [
286 |     {
287 |      "name": "stdout",
288 |      "output_type": "stream",
289 |      "text": [
290 |       "10000 loops, best of 3: 119 µs per loop\n"
291 |      ]
292 |     }
293 |    ],
294 |    "source": [
295 |     "'''\n",
296 |     "One important feature of vectorization is that it allows elementwise processing that is much faster than writing a traditional\n",
297 |     "loop.\n",
298 |     "'''\n",
299 |     "import math\n",
300 |     "\n",
301 |     "#show an example and profile i\n",
302 |     "%timeit [math.sqrt(x) for x in range(1000)]"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": 12,
308 |    "metadata": {
309 |     "collapsed": false
310 |    },
311 |    "outputs": [
312 |     {
313 |      "name": "stdout",
314 |      "output_type": "stream",
315 |      "text": [
316 |       "The slowest run took 9.83 times longer than the fastest. This could mean that an intermediate result is being cached \n",
317 |       "100000 loops, best of 3: 5.19 µs per loop\n"
318 |      ]
319 |     }
320 |    ],
321 |    "source": [
322 |     "%timeit np.sqrt(np.arange(1000))"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": 16,
328 |    "metadata": {
329 |     "collapsed": false
330 |    },
331 |    "outputs": [
332 |     {
333 |      "name": "stderr",
334 |      "output_type": "stream",
335 |      "text": [
336 |       "ERROR: Line magic function `%inline` not found.\n"
337 |      ]
338 |     }
339 |    ],
340 |    "source": [
341 |     "'''\n",
342 |     "The last thing we'll cover in this module is the numpy.random library. In general, it is advised to use numpy for\n",
343 |     "random number generation as opposed to python's built in random module.\n",
344 |     "\n",
345 |     "Random number generation has many uses. One common use is generating fake (i.e. random) data to test modeling procedures\n",
346 |     "or to do Monte Carlo Simulations\n",
347 |     "'''\n",
348 |     "import matplotlib.pyplot as plt\n",
349 |     "%inline\n",
350 |     "\n",
351 |     "\n",
352 |     "#Generate random pairs that have a multivariate normal distribution\n",
353 |     "N = 1000\n",
354 |     "mu = np.array([0,0])\n",
355 |     "cov = 0.5\n",
356 |     "sig = np.array([[1, cov],[cov, 1]]) #Must be square, symmetric and non-negative definite\n",
357 |     "x, y = np.random.multivariate_normal(mu, sig, N).T\n",
358 |     "#Now let's plot and see what that looks like\n",
359 |     "\n",
360 |     "\n",
361 |     "plt.plot(x, y,'x'); plt.axis('equal'); plt.show()\n",
362 |     "\n"
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": 18,
368 |    "metadata": {
369 |     "collapsed": false
370 |    },
371 |    "outputs": [],
372 |    "source": [
373 |     "'''\n",
374 |     "One final example (taken from Wes Mckinney's book):\n",
375 |     "\n",
376 |     "Let's generate a random walk and visualize it\n",
377 |     "'''\n",
378 |     "import matplotlib.pyplot as plt\n",
379 |     "\n",
380 |     "nsteps = 1000\n",
381 |     "draws = np.random.randint(0, 2, size = nsteps) #Randint let's us generate random integers in a range\n",
382 |     "steps = np.where(draws>0, 1, -1) #there function let's us do boolean logic on a conditional applied to an entire array\n",
383 |     "walk = steps.cumsum() #Cumsum returns an array with the same size as steps, that has cum sum of steps up to index i\n",
384 |     "plt.plot(np.arange(len(walk)), walk);plt.show()"
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": 30,
390 |    "metadata": {
391 |     "collapsed": false
392 |    },
393 |    "outputs": [],
394 |    "source": []
395 |   },
396 |   {
397 |    "cell_type": "code",
398 |    "execution_count": null,
399 |    "metadata": {
400 |     "collapsed": false
401 |    },
402 |    "outputs": [],
403 |    "source": []
404 |   }
405 |  ],
406 |  "metadata": {
407 |   "kernelspec": {
408 |    "display_name": "Python 3",
409 |    "language": "python",
410 |    "name": "python3"
411 |   },
412 |   "language_info": {
413 |    "codemirror_mode": {
414 |     "name": "ipython",
415 |     "version": 3
416 |    },
417 |    "file_extension": ".py",
418 |    "mimetype": "text/x-python",
419 |    "name": "python",
420 |    "nbconvert_exporter": "python",
421 |    "pygments_lexer": "ipython3",
422 |    "version": "3.6.0"
423 |   }
424 |  },
425 |  "nbformat": 4,
426 |  "nbformat_minor": 0
427 | }
428 | 


--------------------------------------------------------------------------------
/ipython/Labs_Student/test.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/Labs_Student/test.txt


--------------------------------------------------------------------------------
/ipython/Labs_complete/Lab4_Survey_Questions_part1.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Let's start by reading in the data"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 3,
 13 |    "metadata": {
 14 |     "collapsed": false
 15 |    },
 16 |    "outputs": [
 17 |     {
 18 |      "name": "stdout",
 19 |      "output_type": "stream",
 20 |      "text": [
 21 |       "/Users/briand/Desktop/ds course/ipython/data/\n"
 22 |      ]
 23 |     }
 24 |    ],
 25 |    "source": [
 26 |     "import pandas as pd\n",
 27 |     "import os\n",
 28 |     "import numpy as np\n",
 29 |     "import matplotlib.pyplot as plt\n",
 30 |     "%matplotlib inline\n",
 31 |     "\n",
 32 |     "\n",
 33 |     "#We assume data is in a parallel directory to this one called 'data'\n",
 34 |     "cwd = os.getcwd()\n",
 35 |     "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n",
 36 |     "print(datadir)"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 4,
 42 |    "metadata": {
 43 |     "collapsed": true
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "#Student put in read data command here:\n",
 48 |     "data = pd.read_csv(datadir + 'survey_responses_2016.csv', header = 0, sep=',')"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "Let's look at the column headers and use something more descriptive"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 5,
 61 |    "metadata": {
 62 |     "collapsed": false
 63 |    },
 64 |    "outputs": [
 65 |     {
 66 |      "data": {
 67 |       "text/plain": [
 68 |        "Index(['id', 'cs_python', 'cs_java', 'cs_c', 'cs_perl', 'cs_javascript',\n",
 69 |        "       'cs_r', 'cs_sas', 'profile_1', 'profile_2', 'profile_3', 'profile_4',\n",
 70 |        "       'profile_5', 'profile_6', 'profile_7', 'fruit', 'len_answer', 'season',\n",
 71 |        "       'experience_coded', 'experience'],\n",
 72 |        "      dtype='object')"
 73 |       ]
 74 |      },
 75 |      "execution_count": 5,
 76 |      "metadata": {},
 77 |      "output_type": "execute_result"
 78 |     }
 79 |    ],
 80 |    "source": [
 81 |     "#Student put in code to look at column names\n",
 82 |     "data.columns"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "Column names like 'profile_1-profile_7' aren't very descriptive. As a quick data maintenance task, let's rename the columns starting with 'profile'. The dictionary in the next cell maps the integer index to a descriptive text.\n",
 90 |     "\n",
 91 |     "Tactically, let's loop through each column name. Within the loop let's check whether the column name starts with 'profile.' If it does, let's create a new name that swaps the key with the value using profile_mapping dictionary (i.e., profile_1 -> profile_Viz). We then add the new column name to a list. If it doesn't start with 'profile' just add the old column name to the list. "
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 6,
 97 |    "metadata": {
 98 |     "collapsed": true
 99 |    },
100 |    "outputs": [],
101 |    "source": [
102 |     "profile_mapping = {1:'Viz',\n",
103 |     "                   2:'CS',\n",
104 |     "                   3:'Math',\n",
105 |     "                   4:'Stats',\n",
106 |     "                   5:'ML',\n",
107 |     "                   6:'Bus',\n",
108 |     "                   7:'Com'}"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 7,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [],
118 |    "source": [
119 |     "#Student put code here to change the header names\n",
120 |     "newcols = []\n",
121 |     "\n",
122 |     "for colname in data.columns:\n",
123 |     "    \n",
124 |     "    if colname[0:7] == 'profile':\n",
125 |     "        \n",
126 |     "        newcols.append('profile_{}'.format(profile_mapping[int(colname[-1])]))\n",
127 |     "        \n",
128 |     "    else:\n",
129 |     "        \n",
130 |     "        newcols.append(colname)\n",
131 |     "    \n",
132 |     "#Now swap the old columns with the values in newcols    \n",
133 |     "data.columns = newcols    "
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "Let's use this data to illustrate common data analytic techniques. We have one numeric variable (len_answer) and different categorical variables which may carry some signal of the 'len_answer' variable. \n",
141 |     "\n",
142 |     "'Len_answer' is the character count of the response to the following question: \"Besides the examples given in lecture 1, discuss a case where data science has created value for some company. Please explain the company's goals and how any sort of data analysis could have helped the company achieve said goals.\" As this is a subjective business question, let's hypothesize that students with more professional experience might be more likely to give longer answers. \n",
143 |     "\n",
144 |     "In more technical terms, we'll test whether the variance of len_answer can be explained away by the categorical representation of a student's experience. \n",
145 |     "\n",
146 |     "The first thing we should do is look at the distribution of len_answer."
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 8,
152 |    "metadata": {
153 |     "collapsed": false
154 |    },
155 |    "outputs": [
156 |     {
157 |      "data": {
158 |       "text/plain": [
159 |        "(array([ 41.,  35.,  18.,   5.,   1.,   1.,   0.,   0.,   0.,   1.]),\n",
160 |        " array([    0. ,   368.3,   736.6,  1104.9,  1473.2,  1841.5,  2209.8,\n",
161 |        "         2578.1,  2946.4,  3314.7,  3683. ]),\n",
162 |        " <a list of 10 Patch objects>)"
163 |       ]
164 |      },
165 |      "execution_count": 8,
166 |      "metadata": {},
167 |      "output_type": "execute_result"
168 |     },
169 |     {
170 |      "data": {
171 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEACAYAAAC9Gb03AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAElJJREFUeJzt3X+sZGddx/H3Z1tpCm2vVewuYWVLQ4RCbJaqjaYaF6Hc\nFSJt+INghfJDTf+w2NgEaWvMFmMMNXHRxPCPRbJFkRIitk1Qts0yJKCUanftFpZaowsWudcSmqvY\n8KPs1z/mXJkud3fm3nvmzvTp+5Wc7Jlnzpznu8/sfubMM3PmpKqQJLVr26wLkCRNl0EvSY0z6CWp\ncQa9JDXOoJekxhn0ktS4iYM+ybYkh5Pc1d3el+TRJA90y97plSlJ2qgz17Ht9cDngfNG2vZX1f5+\nS5Ik9WmiI/okO4HXALedfFfvFUmSejXp1M17gXcCJ59Ge12SI0luS7LQb2mSpD6MDfokrwWWq+oI\nTz2Cfx9wUVXtBpYAp3AkaQ5l3G/dJPkD4E3Ak8DZwLnAX1fVNSPb7ALurqpL1ni8P6YjSRtQVb1M\nj489oq+qm6vqBVV1EfBG4FBVXZNkx8hmrwceOs0+5n7Zt2/fzGuwTmu0TutcXfq0nm/dnOwPk+wG\nTgDHgWt7qUiS1Kt1BX1VfQr4VLd+zZjNJUlzwDNjO3v27Jl1CROxzv48HWoE6+zb06XOPo39MHbT\nHSQ17T4kqTVJqK36MFaS9PRm0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BL\nUuMMeklqnEEvSY3bzO/RT2xlZWUrunmKbdu2ce655255v5I0b7bk1yuf9azzptrHWr773W/yuc/9\nA5deeumW9y1Jm9Xnr1dOfESfZBvwj8CjVfW6JOcDdwC7GF5h6g1Vteah+7e/vfVH9AsLizz22GNb\n3q8kzZv1zNFfD3xh5PaNwL1V9WLgEHBTn4VJkvoxUdAn2Qm8BrhtpPlK4EC3fgC4qt/SJEl9mPSI\n/r3AO4HRCf3tVbUMUFVLwAU91yZJ6sHYOfokrwWWq+pIkj2n2fQ0n+reMrK+p1skSasGgwGDwWAq\n+x77rZskfwC8CXgSOBs4F/gY8JPAnqpaTrID+GRVXbzG4+u0rwFTsrCwyB133MDi4uKW9y1Jm7Wl\n14ytqpur6gVVdRHwRuBQVb0ZuBt4a7fZW4A7+yhIktSvzZwZ+x7giiQPA6/sbkuS5sy6zoytqk8B\nn+rWvw68ahpFSZL642/dSFLjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6\nSWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaNzbok5yV5L4kh5McTbKva9+X5NEkD3TL3umX\nK0lar7FXmKqqbyV5RVU9keQM4DNJ/ra7e39V7Z9uiZKkzZho6qaqnuhWz2L44lDd7V6uUC5Jmp6J\ngj7JtiSHgSXgnqq6v7vruiRHktyWZGFqVUqSNixVNX6r1Y2T84CPAe8AHgO+VlWV5PeB51XVr67x\nmIJ9Iy17umW6FhYW2bbtKI8//tWp97WW7dt3sbR0fCZ9S3r6GQwGDAaD/7/97ne/m6rqZdZkXUEP\nkOR3gf8dnZtPsgu4u6ouWWP7+t5Mz9ZZWFhkZeUgs+h7KKx3bCVpVZLegn6Sb908d3VaJsnZwBXA\nF5PsGNns9cBDfRQkSerX2G/dAM8DDiTZxvCF4Y6q+niS25PsBk4Ax4Frp1emJGmjJvl65VHg0jXa\nr5lKRZKkXnlmrCQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS\n1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcZNcSvCsJPclOZzkaJJ9Xfv5SQ4meTjJJ1YvNyhJmi9j\ng76qvgW8oqpeDuwGfjHJZcCNwL1V9WLgEHDTVCuVJG3IRFM3VfVEt3oWw8sPFnAlcKBrPwBc1Xt1\nkqRNmyjok2xLchhYAu6pqvuB7VW1DFBVS8AF0ytTkrRRYy8ODlBVJ4CXJzkP+FiSlzE8qn/KZqfe\nwy0j63u6RZK0ajAYMBgMprLvVJ0mn9d6QPK7wBPArwF7qmo5yQ7gk1V18Rrb12lfA6ZkYWGRlZWD\nzKLvobDesZWkVUmoqvSxr0m+dfPc1W/UJDkbuAI4BtwFvLXb7C3AnX0UJEnq1yRTN88DDiTZxvCF\n4Y6q+niSzwIfSfJ24EvAG6ZYpyRpg8YGfVUdBS5do/3rwKumUZQkqT+eGStJjTPoJalxBr0kNc6g\nl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjJrnC\n1M4kh5J8PsnRJO/o2vcleTTJA92yd/rlSpLWa5IrTD0J3FBVR5KcA/xTknu6+/ZX1f7plSdJ2qxJ\nrjC1BCx1699Icgx4fnd3LxeulSRNz7rm6JNcCOwG7uuarktyJMltqxcQlyTNl4mDvpu2+ShwfVV9\nA3gfcFFV7WZ4xO8UjiTNoUnm6ElyJsOQ/2BV3QlQVY+NbPJnwN2n3sMtI+t7ukWStGowGDAYDKay\n71TV+I2S24GvVdUNI207uvl7kvwW8FNVdfUajy0Y30ffFhYWWVk5yCz6HgqTjK0krSUJVdXL56Bj\nj+iTXA78CnA0yWGGyXkzcHWS3cAJ4DhwbR8FSZL6Ncm3bj4DnLHGXX/XfzmSpL55ZqwkNc6gl6TG\nGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxB\nL0mNM+glqXFjgz7JziSHknw+ydEkv9m1n5/kYJKHk3wiycL0y5UkrdckR/RPAjdU1cuAnwF+I8lL\ngBuBe6vqxcAh4KbplSlJ2qixQV9VS1V1pFv/BnAM2AlcCRzoNjsAXDWtIiVJG7euOfokFwK7gc8C\n26tqGYYvBsAFfRcnSdq8iYM+yTnAR4HruyP7OmmTk29LkubAmZNslORMhiH/waq6s2teTrK9qpaT\n7AD+69R7uGVkfU+3SJJWDQYDBoPBVPadqvEH4kluB75WVTeMtN0KfL2qbk3yLuD8qrpxjcfWLA72\nFxYWWVk5yOzeaIRJxlaS1pKEqkof+xp7RJ/kcuBXgKNJDjNMzpuBW4GPJHk78CXgDX0UJEnq19ig\nr6rPAGec4u5X9VuOJKlvnhkrSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGjfRmbHaiLNIejnXYV22\nb9/F0tLxLe9X0vwy6KfmW8zirNzl5a1/cZE035y6kaTGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z\n6CWpcQa9JDVubNAneX+S5SQPjrTtS/Jokge6Ze90y5QkbdQkR/QfABbXaN9fVZd2y9/1XJckqSdj\ng76qPg08vsZdnmsvSU8Dm5mjvy7JkSS3JVnorSJJUq82+qNm7wN+r6oqye8D+4FfPfXmt4ys7+kW\nSdKqwWDAYDCYyr5TNf4XFpPsAu6uqkvWc193f83iVxwXFhZZWTnILPoeyoz6DpM8p5LmWxKqqpcp\n8kmnbsLInHySHSP3vR54qI9iJEn9Gzt1k+RDDOdafjjJl4F9wCuS7AZOAMeBa6dYoyRpE8YGfVVd\nvUbzB6ZQiyRpCjwzVpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1Lj\nDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUuLFBn+T9SZaTPDjSdn6Sg0keTvKJJAvTLVOStFGT\nHNF/AFg8qe1G4N6qejFwCLip78IkSf0YG/RV9Wng8ZOarwQOdOsHgKt6rkuS1JONztFfUFXLAFW1\nBFzQX0mSpD6NvTj4hOr0d98ysr6nWyRJqwaDAYPBYCr7TtWYjAaS7ALurqpLutvHgD1VtZxkB/DJ\nqrr4FI+tsa8DU7CwsMjKykFm0fdQZtR3mOQ5lTTfklBV6WNfk07dpFtW3QW8tVt/C3BnH8VIkvo3\nydcrPwT8PfBjSb6c5G3Ae4ArkjwMvLK7LUmaQ2Pn6Kvq6lPc9aqea5EkTYFnxkpS4wx6SWqcQS9J\njTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4\ng16SGrepi4MnOQ6sACeA71TVZX0UJUnqz6aCnmHA76mqx/soRpLUv81O3aSHfUiSpmizIV3APUnu\nT/LrfRQkSerXZqduLq+qryb5EYaBf6yqPv39m90ysr6nWzQdZ5FkJj1v376LpaXjM+lberobDAYM\nBoOp7DtV1c+Okn3A/1TV/pPaa3jgv7UWFhZZWTnILPoeyoz6nlW/w777+vckPdMloap6OWrb8NRN\nkmcnOadbfw7wauChPoqSJPVnM1M324GPDY/YORP4y6o62E9ZkqS+bDjoq+rfgd091iJJmgK/GilJ\njTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4\ng16SGmfQS1LjNhX0SfYm+WKSf0nyrr6K0tPV8Hq1s1jOOOM5M+t7x44LZz3wzyg7dlzoc71Om7mU\n4DbgT4FF4GXALyd5SV+Fbb3BrAuY0GDWBZzGtxher7aAT46sT385ceKJDTyunxqXl7/Uz/CdwrQu\nGN23rapzON6bec42/rxP+7mels0c0V8GPFJVX6qq7wAfBq7sp6xZGMy6gAkNZl3AhAazLmACg1kX\nMBGDvm+DWRew5TYT9M8H/mPk9qNdmyRpjmzm4uATO++8X9qKbp7im998YMv7lKR5lKra2AOTnwZu\nqaq93e0bgaqqW0/abmMdSNIzXFWlj/1sJujPAB4GXgl8Ffgc8MtVdayPwiRJ/djw1E1VfTfJdcBB\nhnP97zfkJWn+bPiIXpL09DC1M2Pn7WSqJMeT/HOSw0k+17Wdn+RgkoeTfCLJwsj2NyV5JMmxJK+e\nYl3vT7Kc5MGRtnXXleTSJA924/3HW1TnviSPJnmgW/bOss4kO5McSvL5JEeT/GbXPlfjuUad7+ja\n5208z0pyX/d/5miSfV37vI3nqeqcq/Hs9r+tq+Wu7vbWjGVV9b4wfAH5V2AX8APAEeAl0+hrHTX9\nG3D+SW23Ar/drb8LeE+3/lLgMMOprQu7v0umVNfPAruBBzdTF3Af8FPd+seBxS2ocx9wwxrbXjyL\nOoEdwO5u/RyGnyG9ZN7G8zR1ztV4dvt8dvfnGcBnGZ4/M1fjeZo653E8fwv4C+Cu7vaWjOW0jujn\n8WSq8P3vYK4EDnTrB4CruvXXAR+uqier6jjwCMO/U++q6tPA45upK8kO4Nyqur/b7vaRx0yzThiO\n68munEWdVbVUVUe69W8Ax4CdzNl4nqLO1XNQ5mY8u/qe6FbPYhg6xZyN52nqhDkazyQ7gdcAt51U\ny9THclpBP48nUxVwT5L7k/xa17a9qpZh+J8PuKBrP7n+r7C19V+wzrqez3CMV23leF+X5EiS20be\nds68ziQXMnwH8lnW/zzPos77uqa5Gs9uquEwsATc0wXM3I3nKeqE+RrP9wLv5HsvQrBFY/lM+vXK\ny6vqUoavqL+R5Od46oCzxu15Ma91vQ+4qKp2M/wP9kczrgeAJOcAHwWu746Y5/J5XqPOuRvPqjpR\nVS9n+M7osiQvYw7Hc406X8ocjWeS1wLL3Tu50303fipjOa2g/wrwgpHbO7u2mamqr3Z/Pgb8DcOp\nmOUk2wG6t0T/1W3+FeBHRx6+1fWvt66Z1FtVj1U3UQj8Gd+b3ppZnUnOZBieH6yqO7vmuRvPteqc\nx/FcVVX/zfBHYvYyh+O5Vp1zNp6XA69L8m/AXwG/kOSDwNJWjOW0gv5+4EVJdiV5FvBG4K4p9TVW\nkmd3R08keQ7wauBoV9Nbu83eAqwGw13AG5M8K8kLgRcxPCFsaiXy1Ff5ddXVveVbSXJZkgDXjDxm\nanV2/zBXvR54aA7q/HPgC1X1JyNt8zie31fnvI1nkueuTnckORu4guHnCXM1nqeo84vzNJ5VdXNV\nvaCqLmKYh4eq6s3A3WzFWPb5ifJJny7vZfhtgkeAG6fVz4S1vJDhN38OMwz4G7v2HwLu7eo8CPzg\nyGNuYvhJ9zHg1VOs7UPAfzL8jd8vA28Dzl9vXcBPdH+3R4A/2aI6bwce7Mb2bxjON86sToZHTd8d\nea4f6P4drvt5nlGd8zaeP97VdqSr63c2+v9mRnXO1XiO9PHzfO9bN1sylp4wJUmNeyZ9GCtJz0gG\nvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9Jjfs/4GVOfh6ByxoAAAAASUVORK5CYII=\n",
172 |       "text/plain": [
173 |        "<matplotlib.figure.Figure at 0x107c0a2b0>"
174 |       ]
175 |      },
176 |      "metadata": {},
177 |      "output_type": "display_data"
178 |     }
179 |    ],
180 |    "source": [
181 |     "#Student - build and plot a histogram here\n",
182 |     "plt.hist(data.len_answer)"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "markdown",
187 |    "metadata": {},
188 |    "source": [
189 |     "It looks like we have at least one strong outlier and a thick distribution around 0. Let's also use the Pandas describe() method to get a stronger sense of the distribution."
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": 9,
195 |    "metadata": {
196 |     "collapsed": false
197 |    },
198 |    "outputs": [
199 |     {
200 |      "data": {
201 |       "text/plain": [
202 |        "count     102.000000\n",
203 |        "mean      547.725490\n",
204 |        "std       480.267152\n",
205 |        "min         0.000000\n",
206 |        "25%       262.500000\n",
207 |        "50%       460.500000\n",
208 |        "75%       745.750000\n",
209 |        "max      3683.000000\n",
210 |        "Name: len_answer, dtype: float64"
211 |       ]
212 |      },
213 |      "execution_count": 9,
214 |      "metadata": {},
215 |      "output_type": "execute_result"
216 |     }
217 |    ],
218 |    "source": [
219 |     "data.len_answer.describe()"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "markdown",
224 |    "metadata": {},
225 |    "source": [
226 |     "Let's consider cleaning up the data. We'll remove the max value as well as those with a length less than 20 (which we think is a generous minimum to communicate a reasonable answer.\n",
227 |     "\n",
228 |     "Create a new data_frame that removes these outliers."
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 10,
234 |    "metadata": {
235 |     "collapsed": false
236 |    },
237 |    "outputs": [
238 |     {
239 |      "data": {
240 |       "text/plain": [
241 |        "(93, 20)"
242 |       ]
243 |      },
244 |      "execution_count": 10,
245 |      "metadata": {},
246 |      "output_type": "execute_result"
247 |     }
248 |    ],
249 |    "source": [
250 |     "#Student create a filtered data frame here\n",
251 |     "outlier_filter = (data.len_answer > 20) & (data.len_answer < data.len_answer.max())\n",
252 |     "data_clean = data[outlier_filter]\n",
253 |     "data_clean.shape"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "markdown",
258 |    "metadata": {},
259 |    "source": [
260 |     "Now that we have cleaned our data, let's run a pairwise t-test on each experience level to see if their difference in len_answer is statistically significant. To run a t-test, we'll need the mean, standard-deviation and count for each group. We can achieve this with a pandas groupby operation."
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": 11,
266 |    "metadata": {
267 |     "collapsed": false
268 |    },
269 |    "outputs": [
270 |     {
271 |      "data": {
272 |       "text/html": [
273 |        "<div>\n",
274 |        "<table border=\"1\" class=\"dataframe\">\n",
275 |        "  <thead>\n",
276 |        "    <tr>\n",
277 |        "      <th></th>\n",
278 |        "      <th colspan=\"3\" halign=\"left\">len_answer</th>\n",
279 |        "    </tr>\n",
280 |        "    <tr>\n",
281 |        "      <th></th>\n",
282 |        "      <th>mean</th>\n",
283 |        "      <th>std</th>\n",
284 |        "      <th>count</th>\n",
285 |        "    </tr>\n",
286 |        "    <tr>\n",
287 |        "      <th>experience</th>\n",
288 |        "      <th></th>\n",
289 |        "      <th></th>\n",
290 |        "      <th></th>\n",
291 |        "    </tr>\n",
292 |        "  </thead>\n",
293 |        "  <tbody>\n",
294 |        "    <tr>\n",
295 |        "      <th>2-5 years, I'm getting good at what I do!</th>\n",
296 |        "      <td>732.222222</td>\n",
297 |        "      <td>398.570468</td>\n",
298 |        "      <td>18</td>\n",
299 |        "    </tr>\n",
300 |        "    <tr>\n",
301 |        "      <th>5+ years, I'm a veteran!</th>\n",
302 |        "      <td>717.333333</td>\n",
303 |        "      <td>269.793748</td>\n",
304 |        "      <td>6</td>\n",
305 |        "    </tr>\n",
306 |        "    <tr>\n",
307 |        "      <th>&lt; 2 years, I'm fresh!</th>\n",
308 |        "      <td>489.312500</td>\n",
309 |        "      <td>285.271501</td>\n",
310 |        "      <td>16</td>\n",
311 |        "    </tr>\n",
312 |        "    <tr>\n",
313 |        "      <th>None, I just finished my undergrad!</th>\n",
314 |        "      <td>507.000000</td>\n",
315 |        "      <td>335.536253</td>\n",
316 |        "      <td>53</td>\n",
317 |        "    </tr>\n",
318 |        "  </tbody>\n",
319 |        "</table>\n",
320 |        "</div>"
321 |       ],
322 |       "text/plain": [
323 |        "                                           len_answer                  \n",
324 |        "                                                 mean         std count\n",
325 |        "experience                                                             \n",
326 |        "2-5 years, I'm getting good at what I do!  732.222222  398.570468    18\n",
327 |        "5+ years, I'm a veteran!                   717.333333  269.793748     6\n",
328 |        "< 2 years, I'm fresh!                      489.312500  285.271501    16\n",
329 |        "None, I just finished my undergrad!        507.000000  335.536253    53"
330 |       ]
331 |      },
332 |      "execution_count": 11,
333 |      "metadata": {},
334 |      "output_type": "execute_result"
335 |     }
336 |    ],
337 |    "source": [
338 |     "#Student input code here\n",
339 |     "data_clean_grouped = data_clean[['len_answer', 'experience']].groupby(['experience']).agg(['mean', 'std', 'count'])\n",
340 |     "data_clean_grouped"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "markdown",
345 |    "metadata": {},
346 |    "source": [
347 |     "Visually, we can see a potential split between the [0, 2] year experience range and the [2+] experience range. Let's be more rigorous and run t-tests. Let's write a function that takes in the necessary statistics and returns a p-value.\n",
348 |     "\n",
349 |     "Remember, the t-stat for the difference between two means is:\n",
350 |     "\n",
351 |     "<center>$t = \\frac{\\hat{\\mu_1} - \\hat{\\mu_2}}{\\sqrt{\\frac{\\hat{\\sigma_1}^2}{n_1} + \\frac{\\hat{\\sigma_2}^2}{n_2}}}$</center>\n",
352 |     "\n",
353 |     "The p-value can be found using a t-distribution, but for simplicity, let's approximate this with the normal distribution. For the 2-tailed test, the p-value is: 2 * (1 - Norm.CDF(T))."
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": 31,
359 |    "metadata": {
360 |     "collapsed": false
361 |    },
362 |    "outputs": [],
363 |    "source": [
364 |     "#Student complete the function\n",
365 |     "from scipy.stats import norm\n",
366 |     "def pvalue_diffmeans_twotail(mu1, sig1, n1, mu2, sig2, n2):\n",
367 |     "    '''\n",
368 |     "    P-value calculator for the hypothesis test of mu1 != mu2.\n",
369 |     "    Takes in the approprate inputs to compute the t-statistic for the difference between means\n",
370 |     "    Outputs a p-value for a two-sample t-test.\n",
371 |     "    '''\n",
372 |     "    diff = mu1 - mu2\n",
373 |     "    stderror = np.sqrt(sig1**2 / n1 + sig2**2 / n2)\n",
374 |     "    t = diff / stderror\n",
375 |     "    \n",
376 |     "    p_value = 2 * (1- norm.cdf(t))\n",
377 |     "    \n",
378 |     "    return (t, p_value)"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "markdown",
383 |    "metadata": {},
384 |    "source": [
385 |     "Now loop through all possible pairs in data_clean_grouped and perform a t-test."
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "code",
390 |    "execution_count": 32,
391 |    "metadata": {
392 |     "collapsed": false
393 |    },
394 |    "outputs": [
395 |     {
396 |      "name": "stdout",
397 |      "output_type": "stream",
398 |      "text": [
399 |       "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and 5+ years, I'm a veteran!\n",
400 |       "Diff = 15.0 characters\n",
401 |       "The t-stat is 0.103 and p-value is 0.918\n",
402 |       "\n",
403 |       "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and < 2 years, I'm fresh!\n",
404 |       "Diff = 243.0 characters\n",
405 |       "The t-stat is 2.059 and p-value is 0.039\n",
406 |       "\n",
407 |       "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and None, I just finished my undergrad!\n",
408 |       "Diff = 225.0 characters\n",
409 |       "The t-stat is 2.152 and p-value is 0.031\n",
410 |       "\n",
411 |       "Two tailed T-Test between groups: 5+ years, I'm a veteran! and < 2 years, I'm fresh!\n",
412 |       "Diff = 228.0 characters\n",
413 |       "The t-stat is 1.738 and p-value is 0.082\n",
414 |       "\n",
415 |       "Two tailed T-Test between groups: 5+ years, I'm a veteran! and None, I just finished my undergrad!\n",
416 |       "Diff = 210.0 characters\n",
417 |       "The t-stat is 1.762 and p-value is 0.078\n",
418 |       "\n",
419 |       "Two tailed T-Test between groups: < 2 years, I'm fresh! and None, I just finished my undergrad!\n",
420 |       "Diff = -18.0 characters\n",
421 |       "The t-stat is -0.208 and p-value is 1.165\n",
422 |       "\n"
423 |      ]
424 |     }
425 |    ],
426 |    "source": [
427 |     "#Student put in code here:\n",
428 |     "\n",
429 |     "#get distinct values in the data frame for the experience variable\n",
430 |     "\n",
431 |     "#data_grouped = data[['len_answer', 'experience']].groupby(['experience']).agg(['mean', 'std', 'count'])\n",
432 |     "#ttest_data = data_grouped\n",
433 |     "\n",
434 |     "\n",
435 |     "ttest_data = data_clean_grouped\n",
436 |     "\n",
437 |     "\n",
438 |     "grps = ttest_data.index.values\n",
439 |     "\n",
440 |     "#Now loop through each pair\n",
441 |     "for i, grp1 in enumerate(grps):\n",
442 |     "    for grp2 in grps[i + 1:]:\n",
443 |     "    \n",
444 |     "        '''\n",
445 |     "        hint: since the grp name is the index, pull out the record corresponding to that index value. \n",
446 |     "        Also, the result of groupby uses a multi-index. So be sure to index on 'len_answer' as well.\n",
447 |     "        Then pull out the mean, std, and cnt from that result.   \n",
448 |     "        '''        \n",
449 |     "        row1 = ttest_data.ix[grp1].ix['len_answer']\n",
450 |     "        row2 = ttest_data.ix[grp2].ix['len_answer']\n",
451 |     "    \n",
452 |     "        tstat, p_value = pvalue_diffmeans_twotail(row1['mean'], row1['std'], row1['count'], row2['mean'], row2['std'], row2['count'])\n",
453 |     "    \n",
454 |     "        print('Two tailed T-Test between groups: {} and {}'.format(grp1, grp2))\n",
455 |     "        print('Diff = {} characters'.format(round(row1['mean'] - row2['mean'], 0)))\n",
456 |     "        print('The t-stat is {} and p-value is {}'.format(round(tstat, 3), round(p_value, 3)))\n",
457 |     "        print('')"
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "markdown",
462 |    "metadata": {},
463 |    "source": [
464 |     "What are some observations you might have about the above results? Are there any with large deviances that are not statistically significant at at least a 95% level?\n",
465 |     "\n",
466 |     "Also, how do the numbers change if you rerun it using the original data, and not the cleaned data. What is the effect of outliers on the results?"
467 |    ]
468 |   },
469 |   {
470 |    "cell_type": "code",
471 |    "execution_count": 33,
472 |    "metadata": {
473 |     "collapsed": false
474 |    },
475 |    "outputs": [
476 |     {
477 |      "name": "stdout",
478 |      "output_type": "stream",
479 |      "text": [
480 |       "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and 5+ years, I'm a veteran!\n",
481 |       "Diff = 79.0 characters\n",
482 |       "The t-stat is 0.466 and p-value is 0.641\n",
483 |       "\n",
484 |       "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and < 2 years, I'm fresh!\n",
485 |       "Diff = 54.0 characters\n",
486 |       "The t-stat is 0.252 and p-value is 0.801\n",
487 |       "\n",
488 |       "Two tailed T-Test between groups: 2-5 years, I'm getting good at what I do! and None, I just finished my undergrad!\n",
489 |       "Diff = 230.0 characters\n",
490 |       "The t-stat is 2.148 and p-value is 0.032\n",
491 |       "\n",
492 |       "Two tailed T-Test between groups: 5+ years, I'm a veteran! and < 2 years, I'm fresh!\n",
493 |       "Diff = -25.0 characters\n",
494 |       "The t-stat is -0.104 and p-value is 1.083\n",
495 |       "\n",
496 |       "Two tailed T-Test between groups: 5+ years, I'm a veteran! and None, I just finished my undergrad!\n",
497 |       "Diff = 152.0 characters\n",
498 |       "The t-stat is 1.04 and p-value is 0.298\n",
499 |       "\n",
500 |       "Two tailed T-Test between groups: < 2 years, I'm fresh! and None, I just finished my undergrad!\n",
501 |       "Diff = 176.0 characters\n",
502 |       "The t-stat is 0.894 and p-value is 0.372\n",
503 |       "\n"
504 |      ]
505 |     }
506 |    ],
507 |    "source": [
508 |     "#Rerun everything without cleaning outliers\n",
509 |     "data_grouped = data[['len_answer', 'experience']].groupby(['experience']).agg(['mean', 'std', 'count'])\n",
510 |     "ttest_data = data_grouped\n",
511 |     "\n",
512 |     "\n",
513 |     "grps = ttest_data.index.values\n",
514 |     "\n",
515 |     "#Now loop through each pair\n",
516 |     "for i, grp1 in enumerate(grps):\n",
517 |     "    for grp2 in grps[i + 1:]:\n",
518 |     "    \n",
519 |     "        '''\n",
520 |     "        hint: since the grp name is the index, pull out the record corresponding to that index value. \n",
521 |     "        Also, the result of groupby uses a multi-index. So be sure to index on 'len_answer' as well.\n",
522 |     "        Then pull out the mean, std, and cnt from that result.   \n",
523 |     "        '''        \n",
524 |     "        row1 = ttest_data.ix[grp1].ix['len_answer']\n",
525 |     "        row2 = ttest_data.ix[grp2].ix['len_answer']\n",
526 |     "    \n",
527 |     "        tstat, p_value = pvalue_diffmeans_twotail(row1['mean'], row1['std'], row1['count'], row2['mean'], row2['std'], row2['count'])\n",
528 |     "    \n",
529 |     "        print('Two tailed T-Test between groups: {} and {}'.format(grp1, grp2))\n",
530 |     "        print('Diff = {} characters'.format(round(row1['mean'] - row2['mean'], 0)))\n",
531 |     "        print('The t-stat is {} and p-value is {}'.format(round(tstat, 3), round(p_value, 3)))\n",
532 |     "        print('')"
533 |    ]
534 |   }
535 |  ],
536 |  "metadata": {
537 |   "anaconda-cloud": {},
538 |   "kernelspec": {
539 |    "display_name": "Python [py35]",
540 |    "language": "python",
541 |    "name": "Python [py35]"
542 |   },
543 |   "language_info": {
544 |    "codemirror_mode": {
545 |     "name": "ipython",
546 |     "version": 3
547 |    },
548 |    "file_extension": ".py",
549 |    "mimetype": "text/x-python",
550 |    "name": "python",
551 |    "nbconvert_exporter": "python",
552 |    "pygments_lexer": "ipython3",
553 |    "version": "3.5.2"
554 |   }
555 |  },
556 |  "nbformat": 4,
557 |  "nbformat_minor": 0
558 | }
559 | 


--------------------------------------------------------------------------------
/ipython/Labs_complete/lab_7_sklearn_complete.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "In this lab we'll demonstrate several common techniques and helpful tools used in a model building process:\n",
 10 |     "\n",
 11 |     "- Use Sklearn to generate polynomial features and rescale them\n",
 12 |     "- Create folds for cross-validation\n",
 13 |     "- Perform a grid search to optimize hyper-parameters using cross-validation\n",
 14 |     "- Create pipelines to perform grids search in less code\n",
 15 |     "- Improve upon a baseline model incrementally by adding in more complexity\n",
 16 |     "\n",
 17 |     "This lab will require using several Sklearn classes. It would be helpful to refer to appropriate documentation:\n",
 18 |     "- http://scikit-learn.org/stable/modules/preprocessing.html\n",
 19 |     "- http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler\n",
 20 |     "- http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html#sklearn.preprocessing.PolynomialFeatures\n",
 21 |     "- http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV\n",
 22 |     "- http://scikit-learn.org/stable/modules/pipeline.html#pipeline\n",
 23 |     "\n",
 24 |     "Also, here is a helpful tutorial that explains how to use much of the above:\n",
 25 |     "- https://civisanalytics.com/blog/data-science/2016/01/06/workflows-python-using-pipeline-gridsearchcv-for-compact-code/\n",
 26 |     "\n",
 27 |     "Like always, let's first load in the data.\n"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 99,
 33 |    "metadata": {
 34 |     "collapsed": false
 35 |    },
 36 |    "outputs": [
 37 |     {
 38 |      "data": {
 39 |       "text/plain": [
 40 |        "Index(['revenue', 'outcalls', 'incalls', 'months', 'eqpdays', 'webcap',\n",
 41 |        "       'marryyes', 'travel', 'pcown', 'creditcd', 'retcalls', 'churndep'],\n",
 42 |        "      dtype='object')"
 43 |       ]
 44 |      },
 45 |      "execution_count": 99,
 46 |      "metadata": {},
 47 |      "output_type": "execute_result"
 48 |     }
 49 |    ],
 50 |    "source": [
 51 |     "import os\n",
 52 |     "import pandas as pd\n",
 53 |     "from sklearn.linear_model import LogisticRegression\n",
 54 |     "from sklearn.grid_search import GridSearchCV\n",
 55 |     "from sklearn.cross_validation import KFold\n",
 56 |     "cwd = os.getcwd()\n",
 57 |     "datadir = '/'.join(cwd.split('/')[0:-1]) + '/data/'\n",
 58 |     "\n",
 59 |     "\n",
 60 |     "\n",
 61 |     "data = pd.read_csv(datadir + 'Cell2Cell_data.csv', header=0, sep=',')\n",
 62 |     "\n",
 63 |     "#Randomly sort the data:\n",
 64 |     "data = data.sample(frac = 1)\n",
 65 |     "data.columns"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "Next we're going to prep the data. From prior analysis (Churn Case Study) we learned that we can drop a few variables, as they are either highly redundant or don't carry a strong relationship with the outcome.\n",
 73 |     "\n",
 74 |     "After dropping, we're going to use the SkLearn KFold class to set up cross validation fold indexes."
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 64,
 80 |    "metadata": {
 81 |     "collapsed": false
 82 |    },
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "#Prior analysis (from Churn Case study) has shown that we can drop a few redundant variables\n",
 86 |     "#We want to drop a few to speed up later calculations\n",
 87 |     "dropvar_list = ['incalls', 'creditcd', 'marryyes', 'travel', 'pcown']\n",
 88 |     "data_subset = data.drop(dropvar_list, 1)\n",
 89 |     "\n",
 90 |     "#Set up X and Y\n",
 91 |     "X = data_subset.drop('churndep', 1)\n",
 92 |     "Y = data_subset['churndep']\n",
 93 |     "\n",
 94 |     "#Use Kfold to create 4 folds\n",
 95 |     "kfolds = KFold(data_subset.shape[0], n_folds = 4)\n"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "Next let's use cross-validation to build a baseline model. We're going to use LR with no feature pre-processing. We're going to look at both L1 and L2 regularization with different weights. We can do this very succinctly with SkLearns GridSearchCV package."
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 91,
108 |    "metadata": {
109 |     "collapsed": false
110 |    },
111 |    "outputs": [
112 |     {
113 |      "name": "stdout",
114 |      "output_type": "stream",
115 |      "text": [
116 |       "-0.682495178553\n"
117 |      ]
118 |     }
119 |    ],
120 |    "source": [
121 |     "#1st, set up a paramater grid\n",
122 |     "param_grid_lr = {'C':[10**i for i in range(-3, 3)], 'penalty':['l1', 'l2']}\n",
123 |     "\n",
124 |     "#2nd, call the GridSearchCV class, use LogisticRegression and 'log_loss' for scoring\n",
125 |     "lr_grid_search = GridSearchCV(LogisticRegression(), param_grid_lr, cv = kfolds, scoring = 'log_loss') \n",
126 |     "lr_grid_search.fit(X, Y)\n",
127 |     "\n",
128 |     "#3rd, get the score of the best model and print it\n",
129 |     "best_1 = lr_grid_search.best_score_\n",
130 |     "print(best_1)"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 39,
136 |    "metadata": {
137 |     "collapsed": false
138 |    },
139 |    "outputs": [
140 |     {
141 |      "data": {
142 |       "text/plain": [
143 |        "LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,\n",
144 |        "          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
145 |        "          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n",
146 |        "          verbose=0, warm_start=False)"
147 |       ]
148 |      },
149 |      "execution_count": 39,
150 |      "metadata": {},
151 |      "output_type": "execute_result"
152 |     }
153 |    ],
154 |    "source": [
155 |     "#Next let's look at the best-estimator chosen to see what the parameters were\n",
156 |     "lr_grid_search.best_estimator_"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "markdown",
161 |    "metadata": {},
162 |    "source": [
163 |     "Now let's see if we can beat this by standardizing the features. We'll approach this using the GridSearchCV class but also build a pipeline. Later we'll extend the pipeline to allow for feature engineering as well."
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 72,
169 |    "metadata": {
170 |     "collapsed": false
171 |    },
172 |    "outputs": [
173 |     {
174 |      "name": "stdout",
175 |      "output_type": "stream",
176 |      "text": [
177 |       "-0.682490465504\n"
178 |      ]
179 |     }
180 |    ],
181 |    "source": [
182 |     "from sklearn.pipeline import Pipeline\n",
183 |     "from sklearn.preprocessing import StandardScaler\n",
184 |     "\n",
185 |     "#Create a set of steps. All but the last step is a transformer (something that processes data). \n",
186 |     "#Build a list of steps, where the first is StandardScaler and the second is LogisticRegression\n",
187 |     "#The last step should be an estimator.\n",
188 |     "\n",
189 |     "steps = [('scaler', StandardScaler()),\n",
190 |     "         ('lr', LogisticRegression())]\n",
191 |     "\n",
192 |     "#Now set up the pipeline\n",
193 |     "pipeline = Pipeline(steps)\n",
194 |     "\n",
195 |     "#Now set up the parameter grid, paying close to the correct convention here\n",
196 |     "parameters_scaler = dict(lr__C = [10**i for i in range(-3, 3)],\n",
197 |     "                  lr__penalty = ['l1', 'l2'])\n",
198 |     "\n",
199 |     "#Now run another grid search\n",
200 |     "lr_grid_search_scaler = GridSearchCV(pipeline, param_grid = parameters_scaler, cv = kfolds, scoring = 'log_loss')\n",
201 |     "lr_grid_search_scaler.fit(X, Y)\n",
202 |     "\n",
203 |     "\n",
204 |     "#Again, print the score of the best model\n",
205 |     "best_2 = lr_grid_search_scaler.best_score_\n",
206 |     "print(best_2)"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": 73,
212 |    "metadata": {
213 |     "collapsed": false
214 |    },
215 |    "outputs": [
216 |     {
217 |      "data": {
218 |       "text/plain": [
219 |        "LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,\n",
220 |        "          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
221 |        "          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n",
222 |        "          verbose=0, warm_start=False)"
223 |       ]
224 |      },
225 |      "execution_count": 73,
226 |      "metadata": {},
227 |      "output_type": "execute_result"
228 |     }
229 |    ],
230 |    "source": [
231 |     "#Let's see the model after scaling. Did the optimal parameters change?\n",
232 |     "lr_grid_search_scaler.best_estimator_.steps[-1][1]"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "markdown",
237 |    "metadata": {},
238 |    "source": [
239 |     "Now that we've built a pipeline estimator that performs feature scaling and then logistic regression, let's add to it a feature engineering step. We'll then again use GridSearchCV to find an optimal parameter configuration and see if we can beat our best score above."
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": 85,
245 |    "metadata": {
246 |     "collapsed": false
247 |    },
248 |    "outputs": [
249 |     {
250 |      "name": "stdout",
251 |      "output_type": "stream",
252 |      "text": [
253 |       "-0.68035039448\n"
254 |      ]
255 |     }
256 |    ],
257 |    "source": [
258 |     "from sklearn.preprocessing import PolynomialFeatures\n",
259 |     "\n",
260 |     "#Create a set of steps. All but the last step is a transformer (something that processes data). \n",
261 |     "# Step 1 - PolynomialFeatures\n",
262 |     "# Step 2 - StandardScaler\n",
263 |     "# Step 3 - LogisticRegression\n",
264 |     "\n",
265 |     "steps_poly = [('polyfeat', PolynomialFeatures()),\n",
266 |     "         ('scaler', StandardScaler()),\n",
267 |     "         ('lr', LogisticRegression())]\n",
268 |     "\n",
269 |     "#Now set up the pipeline\n",
270 |     "pipeline_poly = Pipeline(steps_poly)\n",
271 |     "\n",
272 |     "#Now set up a new parameter grid, use the same paramaters used above for logistic regression, but add polynomial features up to degree 3. \n",
273 |     "parameters_poly = dict(polyfeat__degree = [1, 2],\n",
274 |     "                       polyfeat__interaction_only = [True, False],\n",
275 |     "                       lr__C = [10**i for i in range(-3, 3)],\n",
276 |     "                       lr__penalty = ['l1', 'l2'])\n",
277 |     "\n",
278 |     "#Now run another grid search\n",
279 |     "lr_grid_search_poly = GridSearchCV(pipeline_poly, param_grid = parameters_poly, cv = kfolds, scoring = 'log_loss')\n",
280 |     "\n",
281 |     "lr_grid_search_poly.fit(X, Y)\n",
282 |     "best_3 = lr_grid_search_poly.best_score_\n",
283 |     "print(best_3)"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": 100,
289 |    "metadata": {
290 |     "collapsed": false
291 |    },
292 |    "outputs": [
293 |     {
294 |      "data": {
295 |       "text/plain": [
296 |        "[('polyfeat',\n",
297 |        "  PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)),\n",
298 |        " ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)),\n",
299 |        " ('lr',\n",
300 |        "  LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,\n",
301 |        "            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
302 |        "            penalty='l1', random_state=None, solver='liblinear', tol=0.0001,\n",
303 |        "            verbose=0, warm_start=False))]"
304 |       ]
305 |      },
306 |      "execution_count": 100,
307 |      "metadata": {},
308 |      "output_type": "execute_result"
309 |     }
310 |    ],
311 |    "source": [
312 |     "#Let's look at the best estimator, stepwise\n",
313 |     "lr_grid_search_poly.best_estimator_.steps"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "markdown",
318 |    "metadata": {},
319 |    "source": [
320 |     "Now make a bar chart to plot results"
321 |    ]
322 |   },
323 |   {
324 |    "cell_type": "code",
325 |    "execution_count": 98,
326 |    "metadata": {
327 |     "collapsed": false
328 |    },
329 |    "outputs": [
330 |     {
331 |      "data": {
332 |       "text/plain": [
333 |        "[<matplotlib.lines.Line2D at 0x10c294da0>]"
334 |       ]
335 |      },
336 |      "execution_count": 98,
337 |      "metadata": {},
338 |      "output_type": "execute_result"
339 |     },
340 |     {
341 |      "data": {
342 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsYAAAF0CAYAAAAggv9WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X10VOWB+PHvJLwMkAjFQUkm4T3DDC9ZE40QQUxshZa2\nsi2gWFmVAoFSqYXWrShHw3a1ptb60mwPac8etbQQzto9RLs1iw2NuFRhC126rVWKLdCEY3cRxQUR\nJLm/P1rm12hgggTClO/nnJzDZJ4797nEx3xzuXcSCoIgQJIkSTrPZXT1BCRJkqRzgWEsSZIkYRhL\nkiRJgGEsSZIkAYaxJEmSBBjGkiRJEtDBMK6vrycejxOLxaiqqnrf89/4xjcoKiqiuLiYsWPH0q1b\nN958880ObStJkiSdC0Kp3se4tbWVWCxGQ0MDubm5lJSUUFtbSzweb3f8j370Ix5++GF+8pOfnPK2\nkiRJUldJecZ4y5YtFBQUMHjwYLp3786sWbOoq6s74fg1a9Zwww03fKBtJUmSpK6SMoybm5vJz89P\nPs7Ly6O5ubndsYcPH6a+vp7p06ef8raSJElSV+rUm++efvppJk6cSL9+/TrzZSVJkqQzrluqAdFo\nlD179iQfNzU1EY1G2x1bW1ubvIziVLcNhUIdnrQkSZL0QZ3oFruUN9+1tLQwcuRIGhoayMnJ4fLL\nL2fNmjUkEok24w4cOMCwYcNoamqiV69ep7Qt/CmMU0xF56jKykoqKyu7ehrSece1J3Ud11/6Ollz\npjxjnJmZSXV1NZMnT6a1tZW5c+eSSCSoqakhFApRUVEBwLp165gyZUoyik+2rSRJknSuSRnGAB/9\n6Ed55ZVX2nxuwYIFbR7ffPPN3HzzzR3aVpIkSTrX+JvvdNrKysq6egrSecm1J3Ud199fp5TXGJ8t\nXmMsSZKkM+1kzekZY0mSJAnDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwl\nSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYk\nSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIk\nSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIk\nCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIk\nwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIA\n6NbVE1D6GzJwILv/+MeunoY+oMEXX8yu117r6mnoA3DtpTfXnnTuCQVBEHT1JABCoRDnyFR0ikKh\nEH7l0lcIXHtpyrWX3lx7Utc4WXN6KYUkSZJEB8O4vr6eeDxOLBajqqqq3TGNjY0UFRUxZswYysvL\nk59/6KGHGDNmDIWFhdx4440cPXq0c2YuSZIkdaKUl1K0trYSi8VoaGggNzeXkpISamtricfjyTEH\nDhzgiiuuYP369USjUfbt20ckEmHv3r1MnDiRl19+mR49enD99dfz8Y9/nJtuuun9E/FSirTlP+em\nN/85N3259tKba0/qGqd1KcWWLVsoKChg8ODBdO/enVmzZlFXV9dmzOrVq5k+fTrRaBSASCSSfK6l\npYVDhw5x7Ngx3n77bXJzc0/nWCRJkqQzImUYNzc3k5+fn3ycl5dHc3NzmzE7duxg//79lJeXU1JS\nwqpVqwDIzc3lS1/6EoMGDSIajdKvXz8+8pGPdPIhSJIkSaevU96u7dixY2zbto0NGzZw6NAhSktL\nKS0tJRKJUFdXx+7du+nbty8zZsxg9erVfOYzn2n3dUKhzpiNzr4Av3TpzbWXrlx76c61J51bUoZx\nNBplz549ycdNTU3JSyaOy8vLIxKJEA6HCYfDTJo0ie3btxMEAcOGDaN///4AfPrTn+ZnP/vZCcP4\nnnsqk38uKyujrKzsAxySzjavc0xvXueYvlx76c21l95cf+mj8c8fx604ydiUYVxSUsLOnTvZvXs3\nOTk51NbWsmbNmjZjpk2bxuLFi2lpaeHIkSNs3ryZpUuXcvDgQV588UXeeecdevbsSUNDAyUlJSfc\nV2VlZarpSJIkSR1W9ueP404rjDMzM6murmby5Mm0trYyd+5cEokENTU1hEIhKioqiMfjTJkyhcLC\nQjIzM6moqGDUqFEAzJgxg6KiIrp3705RUREVFRWnc2ySJEnSGeFvvtNp85+T0pv/nJu+XHvpzbWX\n3lx/6etka8/ffCdJkiRhGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmS\nBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkS\nYBhLkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmA\nYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGG\nsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmSBBjG\nkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkSYBhL\nkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSUAHw7i+vp54PE4sFqOqqqrdMY2NjRQVFTFmzBjK\ny8uTnz9w4AAzZ84kkUgwevRoNm/e3DkzlyRJkjpRKAiC4GQDWltbicViNDQ0kJubS0lJCbW1tcTj\n8eSYAwcOcMUVV7B+/Xqi0Sj79u0jEokAcMstt3DVVVcxZ84cjh07xttvv80FF1zw/omEQqSYis5R\noVAIv3LpKwSuvTTl2ktvrr305vpLXydbeynPGG/ZsoWCggIGDx5M9+7dmTVrFnV1dW3GrF69munT\npxONRgGSUfzWW2/x/PPPM2fOHAC6devWbhRLkiRJXS1lGDc3N5Ofn598nJeXR3Nzc5sxO3bsYP/+\n/ZSXl1NSUsKqVasA+P3vf08kEmHOnDkUFxdTUVHB4cOHO/kQJEmSpNPXKTffHTt2jG3btvHMM89Q\nX1/PV7/6VXbu3Jn8/Oc//3m2bdtG7969uf/++ztjl5IkSVKn6pZqQDQaZc+ePcnHTU1NyUsmjsvL\nyyMSiRAOhwmHw0yaNInt27czceJE8vPzueyyywCYMWPGCW/eA6isrEz+uaysjLKyslM8HEmSJOn/\na/zzR0ekDOOSkhJ27tzJ7t27ycnJoba2ljVr1rQZM23aNBYvXkxLSwtHjhxh8+bNLF26lIsvvpj8\n/Hx27NiRvIFv1KhRJ9zXX4axJEmSdLrK/vxx3IqTjE0ZxpmZmVRXVzN58mRaW1uZO3cuiUSCmpoa\nQqEQFRUVxONxpkyZQmFhIZmZmVRUVCQD+NFHH+XGG2/k3XffZdiwYTz22GOnc2ySJEnSGZHy7drO\nFt+uLX35ljXpzbeMSl+uvfTm2ktvrr/0dVpv1yZJkiSdDwxjSZIkCcNYkiRJAgxjSZIkCTCMJUmS\nJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmS\nAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkC\nDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkw\njCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAw\nliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNY\nkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZKADoZxfX098Xic\nWCxGVVVVu2MaGxspKipizJgxlJeXt3mutbWV4uJirr322tOfsSRJknQGdEs1oLW1lVtvvZWGhgZy\nc3MpKSlh2rRpxOPx5JgDBw7w+c9/nvXr1xONRtm3b1+b13jkkUcYNWoUb731VucfgSRJktQJUp4x\n3rJlCwUFBQwePJju3bsza9Ys6urq2oxZvXo106dPJxqNAhCJRJLPNTU18eMf/5h58+Z18tQlSZKk\nzpMyjJubm8nPz08+zsvLo7m5uc2YHTt2sH//fsrLyykpKWHVqlXJ55YsWcIDDzxAKBTqxGlLkiRJ\nnSvlpRQdcezYMbZt28aGDRs4dOgQpaWllJaW8sorr3DxxRdzySWX0NjYSBAEnbE7SZIkqdOlDONo\nNMqePXuSj5uampKXTByXl5dHJBIhHA4TDoeZNGkS27dvZ+vWrTz11FP8+Mc/5vDhw/zf//0fN910\nE9/73vfa3VdlZWXyz2VlZZSVlX2wo5IkSZKAxj9/dEQoSHEat6WlhZEjR9LQ0EBOTg6XX345a9as\nIZFIJMe8/PLLLF68mPr6eo4cOcK4ceNYu3Yto0aNSo557rnnePDBB3nqqafan0go5BnlNBUKhfAr\nl75C4NpLU6699ObaS2+uv/R1srWX8oxxZmYm1dXVTJ48mdbWVubOnUsikaCmpoZQKERFRQXxeJwp\nU6ZQWFhIZmYmFRUVbaJYkiRJOtelPGN8tnjGOH35U3N686xV+nLtpTfXXnpz/aWvk609f/OdJEmS\nhGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQB\nhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQY\nxpIkSRJgGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAY\nS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEs\nSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIkSRJgGEuSJEmAYSxJkiQBhrEk\nSZIEGMaSJEkSYBhLkiRJgGEsSZIkAYaxJEmSBBjGkiRJEmAYS5IkSYBhLEmSJAGGsSRJkgQYxpIk\nSRJgGEuSJEmAYSxJkiQBHQzj+vp64vE4sViMqqqqdsc0NjZSVFTEmDFjKC8vB6CpqYmrr76a0aNH\nM3bsWB599NHOm7kkSZLUiUJBEAQnG9Da2kosFqOhoYHc3FxKSkqora0lHo8nxxw4cIArrriC9evX\nE41G2bdvH5FIhNdee43XXnuNSy65hIMHD3LppZdSV1fXZtvkREIhUkxF56hQKIRfufQVAtdemnLt\npTfXXnpz/aWvk629lGeMt2zZQkFBAYMHD6Z79+7MmjWLurq6NmNWr17N9OnTiUajAEQiEQAGDhzI\nJZdcAkBWVhaJRILm5ubTORZJkiTpjEgZxs3NzeTn5ycf5+XlvS9ud+zYwf79+ykvL6ekpIRVq1a9\n73V27drFf/3XfzFu3LhOmLYkSZLUubp1xoscO3aMbdu2sWHDBg4dOkRpaSmlpaWMGDECgIMHDzJj\nxgweeeQRsrKyOmOXkiRJUqdKGcbRaJQ9e/YkHzc1NSUvmTguLy+PSCRCOBwmHA4zadIktm/fzogR\nIzh27BgzZszg7/7u75g2bdpJ91VZWZn8c1lZGWVlZad2NJIkSdJfaPzzR0ekvPmupaWFkSNH0tDQ\nQE5ODpdffjlr1qwhkUgkx7z88sssXryY+vp6jhw5wrhx41i7di2jRo3ipptuIhKJ8M1vfvPkE/Hm\nu7TlDQjpzRuA0pdrL7259tKb6y99nWztpTxjnJmZSXV1NZMnT6a1tZW5c+eSSCSoqakhFApRUVFB\nPB5nypQpFBYWkpmZSUVFBaNGjWLTpk384Ac/YOzYsRQVFREKhbjvvvv46Ec/2tnHKEmSJJ2WlGeM\nzxbPGKcvf2pOb561Sl+uvfTm2ktvrr/0dVpv1yZJkiSdDwxjSZIkCcNYkiRJAgxjSZIkCTCMJUmS\nJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmS\nAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkC\nDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkw\njCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNYkiRJAgxjSZIkCTCMJUmSJMAw\nliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZIAw1iSJEkCDGNJkiQJMIwlSZIkwDCWJEmSAMNY\nkiRJAgxjSZIkCTCMJUmSJMAwliRJkgDDWJIkSQIMY0mSJAkwjCVJkiTAMJYkSZKADoZxfX098Xic\nWCxGVVVVu2MaGxspKipizJgxlJeXn9K2kiRJUlcLBUEQnGxAa2srsViMhoYGcnNzKSkpoba2lng8\nnhxz4MABrrjiCtavX080GmXfvn1EIpEObZucSChEiqnoHBUKhfArl75C4NpLU6699ObaS2+uv/R1\nsrWX8ozxli1bKCgoYPDgwXTv3p1Zs2ZRV1fXZszq1auZPn060WgUgEgk0uFtJUmSpHNByjBubm4m\nPz8/+TgvL4/m5uY2Y3bs2MH+/fspLy+npKSEVatWdXhbSZIk6VzQrTNe5NixY2zbto0NGzZw6NAh\nSktLKS0t7YyXliRJks6KlGEcjUbZs2dP8nFTU1Pykonj8vLyiEQihMNhwuEwkyZNYvv27R3a9i+F\nQqEPcgw6B/iVS2+uvfTlVy69ufbSm1+9vz4pw7ikpISdO3eye/ducnJyqK2tZc2aNW3GTJs2jcWL\nF9PS0sKRI0fYvHkzS5cuZeTIkSm3Pc4bECRJktSVUoZxZmYm1dXVTJ48mdbWVubOnUsikaCmpoZQ\nKERFRQXxeJwpU6ZQWFhIZmYmFRUVjBo1CqDdbSVJkqRzTcq3a5MkSZLOB/7mO0k6h61YsYKxY8d2\n9TSkc8qcOXO49tpru3oaKT3xxBNkZ2d39TR0CgxjdcjJ/ic0ZMgQMjIyyMjIoHfv3iQSCb7xjW+c\n5RlKXWPfvn0sWrSIoUOHEg6HGThwINdccw0NDQ2dto8zdYPWc889R0ZGBvv372/3+RUrVpCRkUFm\nZiaZmZlEo1Fmz55NU1PTGZmPzh9z5sxJ/rfVo0cPhg8fzu23387bb7991uZw+PBh7rzzTgoKCujV\nqxcDBgxg4sSJrF27tlP3c6bW7+7du8nIyGDbtm3tPv/EE08k/44zMjIYOHAg1157LS+99NIZmc9f\ni055uzad30KhEJWVlSxcuJB33nmHn/zkJyxcuJC+ffsyf/78rp6edEZ9+tOf5p133uGxxx5j+PDh\n/M///A/PPfccr7/+eldPLendd9+le/fu7T6X6pt2PB7nueeeo6WlhVdffZVFixZx/fXXs2nTpjMx\nVZ1HrrnmGr7//e9z9OhRnn/+eebOncvhw4eprq4+K/tfsGABL7zwAo8++iijR4/mjTfe4MUXXzzh\nD4pd5XTWb58+ffjd735Ha2srzc3N3H777XziE59gx44ddOtmArbHM8bqFFlZWVx00UUMGjSIz372\nsxQWFrJ+/fqunpZ0Rh04cID/+I//4P7776esrIz8/HwuvfRSli5dynXXXQf86ZvanXfeyZAhQwiH\nw4wYMSL5jb+1tZV58+YxbNgwevfuTSwW44EHHki538cee4zRo0fTq1cv4vE4Dz/8cJt39snIyODb\n3/4206dPJysri7vuuusDH2O3bt0YMGAAAwcOZMKECcyfP58XX3yRgwcPfuDXlAB69uzJgAEDiEaj\nzJo1i9mzZ7Nu3ToANm7cyPjx4+nVqxcDBw5k6dKlvPvuu+2+zqpVq4hEIu97/sYbb+Rv//ZvT7j/\np59+mmXLlvGxj32MQYMG8Td/8zcsWLCAz33uc23GPfjgg8RiMcLhMIMGDWqznpYtW0Y8Hqd3794M\nHTqUr3zlKxw9evSkx/30009z2WWX0atXL4YPH87y5cvbzH3o0KGsWLGCuXPn8qEPfYjZs2ef8LVS\n3SYWCoUYMGAAF198McXFxSxZsoTdu3fzyiuvnHS785k/LqjTNTY28pvf/IZYLNbVU5HOqKysLLKy\nsnjqqaeYMGECPXv2fN+Ym266iU2bNvHoo49yySWX0NzczK5du4A/hXFeXh5PPvkkkUiELVu2UFFR\nQSQSYc6cOe3u87vf/S6VlZVUV1dTXFzMr371K+bPn0+PHj1YtGhRctw//MM/cN999/Hggw922j/l\nvvbaa/zwhz9MXlohdaaePXty5MgR9u7dy9SpU7n55pt54oknePXVV5k7dy6ZmZnt/uA4c+ZMvvjF\nL1JXV8eMGTMAeOutt1i3bt1JL4sYOHAg9fX1zJgxgwsuuKDdMcuWLaOmpoaHHnqISZMm8frrr7N1\n69bk81lZWTz++OPk5uby0ksvsXDhQsLhMCtWrGj39f793/+d2bNn861vfYtJkyaxe/duFi5cyNGj\nR/n617+eHPfQQw+xfPlytm7d2mlvZ/vmm2/ygx/8AOCEZ6AFBFIH3HLLLcEnP/nJdp8bMmRIEA6H\ng6ysrKBHjx5BKBQKevfuHbz44otneZbS2fev//qvwYUXXhiEw+GgtLQ0+PKXvxxs3rw5CIIg+O1v\nfxuEQqFg/fr1HX69O+64I7jmmmuSjysrK4OxY8cmHw8aNCj4/ve/32abhx9+OBg1alTycSgUCm67\n7baU+2psbAwyMjKC119/vd3nKysrg8zMzCArKyvo3bt3EAqFgoyMjGDJkiUdPh6pPe/9nrJ58+bg\nwgsvDGbNmhXcddddQSwWazP+8ccfD8LhcHD48OF2t7/11luDj33sY8nH3/72t4OcnJygpaXlhHPY\nuHFjMGjQoKB79+5BcXFxcOuttwbPPvts8vmDBw8G4XA4+M53vtPh41q5cmVQUFDQZt7Z2dnJx5Mm\nTQr+8R//sc0269atC7KyspKPhwwZElx77bUp97Vr164gFAoFW7dubff5xx9/PAiFQkF2dnbQp0+f\nIBQKBaFQKPjUpz7V4eM5H3kphTrF0qVL2b59Oxs3buTqq6/mnnvuYdy4cV09LemM+9SnPsXevXv5\n0Y9+xNSpU3nhhRcYP348X/va1/jFL35BZmYmZWVlJ9x+5cqVlJSUcNFFF5Gdnc1DDz3U5jeG/qV9\n+/bxhz/8gQULFpCdnZ38uOOOO/j973/fZuyll17a5vGYMWOS4z/+8Y93+PhGjBjBL3/5S37+859z\n3333UVxczL333tvh7aUTeeaZZ8jOzqZXr15MmDCB8vJyvvWtb/Gb3/yG8ePHtxk7ceJEjh49ys6d\nO9t9rfnz5/Pss8+yd+9e4E+XG91yyy1kZGTwhz/8Ifnf/gUXXMD9998PwJVXXsnvfvc7fvrTn3L9\n9dfz29/+lsmTJycvpXjppZc4evQoV1999QmP4cknn+TKK68kJyeH7OxslixZcsL1C7B161buvffe\nNuv3M5/5DIcPH+aPf/xjctxll13WZrupU6cmx5/Ku9T06dOH7du3s23bNr7zne8Qi8VYuXJlh7c/\nH3kphTrFhRdeyLBhwxg2bBhPPvkkBQUFjBs3jquuuqqrpyadcT169ODDH/4wH/7wh1m+fDnz589n\nxYoVrFq16qTbrV27liVLlvDNb36T0tJSLrjgAqqrq5PXWb5Xa2srADU1NZSWlp70tfv06dPm8TPP\nPJO8jrFXr14dPTR69OjB0KFDAUgkEuzYsYNFixbx2GOPdfg1pPZcddVVfPe736Vbt27k5uamvDwn\nCIITXhZUWFhIUVERjz/+ONOmTePnP/958rKB3Nxctm/fnhzbv3//5J8zMzOZMGECEyZM4O///u+5\n9957ufvuu1m2bFnK+W/evJkbbriBFStWMGXKFPr160ddXR233377CbdpbW3lnnvuYebMme97bsCA\nAck/v3f9/vM//zOHDx8GTu0yiFAolFy/sViMvXv3MmvWLDZs2NDh1zjfGMbqdP369ePWW2/li1/8\nIr/4xS+6ejrSWZdIJDh27BiJRIKWlhZ++tOfMnny5PeN27RpE+PHj29zs8+JzogBXHTRReTm5rJz\n505uvPHGU5pTfn7+KY0/keXLlzNy5Ei+8IUvUFRU1CmvqfPT8RvW3iuRSPAv//IvbT73/PPP07Nn\nT4YPH37C15s/fz5f//rX+d///V8mTpxIQUEB8Kf4HTZsWIfmdPy38x48eJBEIkGPHj1oaGhod7+b\nNm0iLy+PO++8M/m54/cPnEhxcTEvv/xyh+dzXE5OzimNP5HjP4ivW7fupDcmns8MY3XYW2+91ean\nboC+ffu2O3bRokVUVVXx5JNPJm+GkP7a7N+/n5kzZybfiSU7O5v//M//5IEHHuAjH/kIY8aM4brr\nrmPevHk8/PDDFBcX09TUxK5du5g9ezaxWIwnnniC+vp6RowYwZo1a9i4cWObM1rvtWLFCr7whS/Q\nt29fpk6dyrvvvsu2bdtobm7mjjvuOOVjCIKA//7v/6Zfv35tPl9YWNju+GHDhjFt2jSWL1/Ov/3b\nv53y/qRUFi1axCOPPMLnPvc5brvtNl599VWWLVvG4sWLCYfDJ9zuhhtuYOnSpaxcuZKampqU+ykv\nL+eGG27qaEpeAAACmklEQVTgsssu48ILL+TXv/41d911F4lEgkQiQSgU4rbbbmPZsmX06NGjzc13\nCxcuJBaL0dzczOrVqyktLaW+vp7a2tqT7vPuu+/mk5/8JIMGDeK6666jW7du/OpXv2LLli1UVVWd\n8t8VwCuvvPK+s+3xeLzdsdnZ2cybN4+7777bMD6Rrr7IWenhlltuCTIyMt73MXPmzGDo0KHBgw8+\n+L5tKioqgtGjR3fBbKWz48iRI8Fdd90VXH755UH//v2DPn36BLFYLPjyl78cvPHGG0EQBMHRo0eD\nr3zlK0FeXl4QDoeDESNGBP/0T/+UfG7evHlB//79gw996EPBvHnzgq9+9avB0KFDk/t47813QRAE\ntbW1waWXXhr06tUr6N+/f3DllVcGa9euTT6fkZER/PCHP0w5/+M33/3lx/Eb7A4dOtTuvoMgCH72\ns58FGRkZwQsvvPCB/t6kk93QHQRB8Pzzzwfjx48PwuFwMHDgwOBLX/pScPTo0ZTbf/aznw369u0b\nvP322ynncP/99wdXXnllMGDAgKBXr17B0KFDgwULFgRNTU1txlVVVQXDhw8PevbsGQwaNChYvnx5\n8rk777wzuOiii4Ls7Oxg+vTpwcqVK4OMjIzk8++9+S4IguDZZ58NJk2aFPTp0yfo27dvUFJSkvx/\nQhAEJ/ye+l67du1q9/tyRkZG8Otf/7rdfQdBEOzZsyfo0aNHsGbNmpT7OB+FgqCT3gdEkiSpC02d\nOpX8/PwOnTGW2uOlFJIkKa29+eabbNy4kWeffZZf/vKXXT0dpTHDWJIkpbWioiLeeOMNvva1ryVv\noJM+CC+lkCRJkgB/wYckSZKEYSxJkiQBhrEkSZIEGMaSJEkSYBhLkiRJgGEsSZIkAfD/AHeNuVNj\ny6ctAAAAAElFTkSuQmCC\n",
343 |       "text/plain": [
344 |        "<matplotlib.figure.Figure at 0x10c2ac2b0>"
345 |       ]
346 |      },
347 |      "metadata": {},
348 |      "output_type": "display_data"
349 |     }
350 |    ],
351 |    "source": [
352 |     "import numpy as np\n",
353 |     "results = -1 * np.array([best_1, best_2, best_3])\n",
354 |     "labs = ['LR', 'Scaler-LR', 'Poly-Scaler-LR']\n",
355 |     "\n",
356 |     "fig = plt.figure(facecolor = 'w', figsize = (12, 6))\n",
357 |     "ax = plt.subplot(111)\n",
358 |     "\n",
359 |     "width = 0.5\n",
360 |     "ind = np.arange(3)\n",
361 |     "rec = ax.bar(ind + width, results, width, color='r')\n",
362 |     "\n",
363 |     "ax.set_xticks(ind + width)\n",
364 |     "ax.set_xticklabels(labs, size = 14)\n",
365 |     "ax.set_ylim([0.6, 0.7])\n",
366 |     "\n",
367 |     "plt.plot(np.arange(4), min(results) * np.ones(4))"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "code",
372 |    "execution_count": null,
373 |    "metadata": {
374 |     "collapsed": true
375 |    },
376 |    "outputs": [],
377 |    "source": []
378 |   }
379 |  ],
380 |  "metadata": {
381 |   "anaconda-cloud": {},
382 |   "kernelspec": {
383 |    "display_name": "Python [py35]",
384 |    "language": "python",
385 |    "name": "Python [py35]"
386 |   },
387 |   "language_info": {
388 |    "codemirror_mode": {
389 |     "name": "ipython",
390 |     "version": 3
391 |    },
392 |    "file_extension": ".py",
393 |    "mimetype": "text/x-python",
394 |    "name": "python",
395 |    "nbconvert_exporter": "python",
396 |    "pygments_lexer": "ipython3",
397 |    "version": "3.5.2"
398 |   }
399 |  },
400 |  "nbformat": 4,
401 |  "nbformat_minor": 0
402 | }
403 | 


--------------------------------------------------------------------------------
/ipython/README.md:
--------------------------------------------------------------------------------
1 | Intro to Data Science Recitation
2 | ===================
3 | 
4 | Materials for the Intro to Data Science course, Spring 2017
5 | 
6 | You will probably want to fork this repository and keep it up to date. This will ensure that you have the latest versions of all of my example code, and any necessary data or supplementary material that comes with it.
7 | 


--------------------------------------------------------------------------------
/ipython/data/Cell2Cell_info.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/data/Cell2Cell_info.pdf


--------------------------------------------------------------------------------
/ipython/data/survey_responses_2016.csv:
--------------------------------------------------------------------------------
  1 | id,cs_python,cs_java,cs_c,cs_perl,cs_javascript,cs_r,cs_sas,profile_1,profile_2,profile_3,profile_4,profile_5,profile_6,profile_7,fruit,len_answer,season,experience_coded,experience
  2 | 1,1,1,0,0,0,1,0,1,5,6,5,1,6,5,Oranges,136,Fall,2,"< 2 years, I'm fresh!"
  3 | 2,0,1,0,0,0,0,0,1,3,6,4,1,5,5,Apples,112,Summer,1,"None, I just finished my undergrad!"
  4 | 3,1,0,1,0,0,1,1,1,4,4,7,1,1,5,Oranges,97,Spring,1,"None, I just finished my undergrad!"
  5 | 4,0,0,1,0,0,1,0,2,1,8,7,2,5,5,Oranges,1879,Winter,1,"None, I just finished my undergrad!"
  6 | 5,1,0,0,0,1,1,0,5,4,6,6,6,1,7,Apples,699,Spring,2,"< 2 years, I'm fresh!"
  7 | 6,0,0,0,0,0,0,0,2,2,7,2,2,3,5,Apples,340,Fall,1,"None, I just finished my undergrad!"
  8 | 7,0,0,0,1,0,1,0,8,5,6,5,3,7,5,Oranges,874,Winter,4,"5+ years, I'm a veteran!"
  9 | 8,0,0,0,0,0,1,0,3,2,10,7,1,5,5,Apples,118,Fall,1,"None, I just finished my undergrad!"
 10 | 9,1,1,1,0,0,1,1,1,3,8,8,1,1,1,Oranges,0,Winter,1,"None, I just finished my undergrad!"
 11 | 10,1,1,1,0,0,0,0,0,6,8,6,0,0,3,Oranges,310,Spring,1,"None, I just finished my undergrad!"
 12 | 11,1,0,0,0,0,1,0,6,5,8,8,1,3,5,Oranges,947,Spring,1,"None, I just finished my undergrad!"
 13 | 12,0,0,1,0,0,1,1,2,2,2,3,1,1,3,Oranges,379,Fall,2,"< 2 years, I'm fresh!"
 14 | 13,0,1,0,0,0,0,0,3,7,10,9,1,1,5,Oranges,863,Winter,1,"None, I just finished my undergrad!"
 15 | 14,0,0,0,0,0,0,1,7,2,6,6,1,6,7,Oranges,1033,Summer,4,"5+ years, I'm a veteran!"
 16 | 15,1,1,0,0,0,0,0,1,8,5,5,6,6,8,Apples,144,Summer,1,"None, I just finished my undergrad!"
 17 | 16,1,0,0,0,0,1,0,1,2,7,5,1,1,3,Apples,331,Fall,1,"None, I just finished my undergrad!"
 18 | 17,0,1,1,0,0,1,0,5,5,5,5,1,1,3,Apples,261,Summer,1,"None, I just finished my undergrad!"
 19 | 18,1,1,1,0,0,0,0,2,6,6,4,3,7,7,Oranges,752,Spring,1,"None, I just finished my undergrad!"
 20 | 19,1,1,0,0,0,1,1,4,4,6,6,4,2,3,Oranges,829,Fall,1,"None, I just finished my undergrad!"
 21 | 20,0,0,0,0,0,1,1,4,5,6,8,3,3,8,Oranges,269,Fall,1,"None, I just finished my undergrad!"
 22 | 21,1,0,0,0,0,1,0,5,8,8,7,5,3,5,Oranges,271,Fall,3,"2-5 years, I'm getting good at what I do!"
 23 | 22,1,0,0,0,0,0,0,6,1,8,9,1,7,9,Apples,556,Fall,1,"None, I just finished my undergrad!"
 24 | 23,1,1,0,0,0,0,0,1,4,8,7,2,2,9,Apples,148,Summer,2,"< 2 years, I'm fresh!"
 25 | 24,1,1,1,0,0,1,0,3,7,5,2,2,3,7,Oranges,639,Summer,2,"< 2 years, I'm fresh!"
 26 | 25,1,1,1,0,0,1,0,1,4,5,6,2,6,4,Apples,486,Spring,1,"None, I just finished my undergrad!"
 27 | 26,1,0,0,0,0,1,0,8,5,7,8,6,6,6,Apples,399,Summer,1,"None, I just finished my undergrad!"
 28 | 27,0,1,0,0,0,1,0,1,2,4,3,1,2,2,Apples,492,Fall,1,"None, I just finished my undergrad!"
 29 | 28,0,1,0,0,0,1,1,2,3,8,8,3,4,5,Oranges,1172,Fall,1,"None, I just finished my undergrad!"
 30 | 29,1,1,1,0,0,1,0,4,4,3,3,1,7,8,Apples,340,Spring,4,"5+ years, I'm a veteran!"
 31 | 30,0,0,0,0,0,0,0,5,3,7,7,3,5,5,Apples,0,Winter,1,"None, I just finished my undergrad!"
 32 | 31,1,0,1,0,0,1,0,6,3,4,8,1,2,3,Oranges,548,Fall,1,"None, I just finished my undergrad!"
 33 | 32,1,0,0,0,0,0,0,3,5,8,8,3,9,9,Apples,3683,Spring,2,"< 2 years, I'm fresh!"
 34 | 33,1,0,0,0,0,0,1,1,1,3,6,1,4,5,Oranges,0,Fall,2,"< 2 years, I'm fresh!"
 35 | 34,0,0,1,0,0,1,0,1,3,8,7,5,4,6,Oranges,712,Winter,1,"None, I just finished my undergrad!"
 36 | 35,1,0,0,0,0,1,1,2,2,7,8,5,3,5,Apples,243,Summer,1,"None, I just finished my undergrad!"
 37 | 36,1,1,1,0,0,1,0,6,7,4,5,2,4,7,Apples,1141,Fall,1,"None, I just finished my undergrad!"
 38 | 37,1,1,0,0,0,1,0,3,5,9,9,1,5,7,Oranges,440,Summer,1,"None, I just finished my undergrad!"
 39 | 38,1,1,0,0,0,1,0,3,5,9,9,1,5,7,Oranges,0,Summer,1,"None, I just finished my undergrad!"
 40 | 39,1,0,1,0,0,1,1,1,3,7,7,1,3,5,Oranges,334,Fall,1,"None, I just finished my undergrad!"
 41 | 40,1,0,1,0,0,1,1,1,4,7,7,1,7,5,Oranges,0,Fall,1,"None, I just finished my undergrad!"
 42 | 41,1,0,0,0,0,1,1,8,3,9,9,8,3,6,Oranges,844,Fall,3,"2-5 years, I'm getting good at what I do!"
 43 | 42,1,1,0,0,1,0,0,6,6,8,7,3,5,5,Apples,267,Summer,2,"< 2 years, I'm fresh!"
 44 | 43,1,0,1,0,0,0,0,5,5,8,4,6,7,8,Apples,1155,Spring,3,"2-5 years, I'm getting good at what I do!"
 45 | 44,0,0,1,0,0,1,1,7,5,7,7,1,7,7,Apples,371,Winter,2,"< 2 years, I'm fresh!"
 46 | 45,0,1,1,0,0,0,0,4,7,5,5,4,3,4,Apples,618,Fall,1,"None, I just finished my undergrad!"
 47 | 46,1,1,1,0,0,1,0,1,8,9,7,5,1,5,Oranges,418,Winter,1,"None, I just finished my undergrad!"
 48 | 47,0,0,1,0,0,0,1,3,3,7,8,1,6,5,Apples,780,Fall,1,"None, I just finished my undergrad!"
 49 | 48,0,0,1,0,0,1,0,3,5,8,6,5,2,4,Apples,286,Fall,1,"None, I just finished my undergrad!"
 50 | 49,1,1,0,0,1,1,1,5,3,8,8,3,3,6,Apples,303,Fall,1,"None, I just finished my undergrad!"
 51 | 50,1,1,0,0,0,1,0,7,5,9,9,10,5,6,Apples,163,Summer,1,"None, I just finished my undergrad!"
 52 | 51,0,1,0,0,0,1,0,6,5,7,7,5,6,7,Oranges,529,Fall,1,"None, I just finished my undergrad!"
 53 | 52,0,1,0,0,0,1,0,1,2,9,9,1,4,5,Apples,613,Fall,1,"None, I just finished my undergrad!"
 54 | 53,1,0,0,0,0,1,0,7,4,7,7,3,7,9,Apples,248,Fall,2,"< 2 years, I'm fresh!"
 55 | 54,0,0,0,0,0,0,0,5,2,4,4,1,8,8,Apples,657,Summer,3,"2-5 years, I'm getting good at what I do!"
 56 | 55,1,0,0,0,0,1,0,5,5,7,7,1,7,7,Oranges,459,Spring,4,"5+ years, I'm a veteran!"
 57 | 56,0,1,0,0,0,1,0,2,8,6,5,5,4,8,Apples,1032,Fall,1,"None, I just finished my undergrad!"
 58 | 57,0,0,1,0,0,0,0,1,1,8,8,1,5,8,Apples,252,Summer,3,"2-5 years, I'm getting good at what I do!"
 59 | 58,1,1,1,0,0,1,1,6,5,7,8,6,4,4,Apples,185,Fall,1,"None, I just finished my undergrad!"
 60 | 59,1,0,0,0,0,1,1,6,3,7,8,5,3,3,Apples,396,Spring,1,"None, I just finished my undergrad!"
 61 | 60,1,0,1,0,0,1,1,7,5,9,9,3,7,7,Apples,720,Summer,1,"None, I just finished my undergrad!"
 62 | 61,1,0,1,0,0,0,0,1,5,6,1,1,1,2,Oranges,451,Winter,1,"None, I just finished my undergrad!"
 63 | 62,0,0,0,0,0,1,1,3,5,8,8,1,1,7,Oranges,1370,Spring,3,"2-5 years, I'm getting good at what I do!"
 64 | 63,1,1,0,0,0,1,0,3,6,7,6,3,2,2,Apples,418,Fall,2,"< 2 years, I'm fresh!"
 65 | 64,1,0,0,0,0,0,0,6,3,8,7,1,2,4,Apples,233,Fall,1,"None, I just finished my undergrad!"
 66 | 65,1,1,0,0,0,1,0,7,7,8,7,6,2,2,Oranges,531,Fall,2,"< 2 years, I'm fresh!"
 67 | 66,0,1,1,0,0,1,0,8,7,8,7,3,3,7,Oranges,835,Fall,3,"2-5 years, I'm getting good at what I do!"
 68 | 67,0,1,1,0,0,1,1,5,5,7,8,3,8,7,Oranges,765,Spring,3,"2-5 years, I'm getting good at what I do!"
 69 | 68,1,0,0,0,0,0,0,2,6,4,6,5,6,4,Oranges,727,0,3,"2-5 years, I'm getting good at what I do!"
 70 | 69,1,1,1,0,1,0,0,4,7,5,5,6,3,5,Oranges,513,Fall,3,"2-5 years, I'm getting good at what I do!"
 71 | 70,1,1,1,0,1,0,0,4,7,5,5,6,3,5,Oranges,909,Fall,3,"2-5 years, I'm getting good at what I do!"
 72 | 71,0,0,0,0,0,0,1,4,2,5,5,1,6,5,Oranges,907,Fall,3,"2-5 years, I'm getting good at what I do!"
 73 | 72,1,1,1,0,0,1,0,2,9,10,8,4,1,5,Oranges,0,Winter,1,"None, I just finished my undergrad!"
 74 | 73,1,1,1,0,0,1,0,4,7,5,5,5,6,4,Apples,470,Fall,1,"None, I just finished my undergrad!"
 75 | 74,0,0,0,0,0,1,0,2,2,6,6,3,8,6,Oranges,395,Spring,2,"< 2 years, I'm fresh!"
 76 | 75,0,0,0,0,1,0,0,1,6,10,9,1,1,5,Oranges,549,Winter,1,"None, I just finished my undergrad!"
 77 | 76,1,1,1,0,0,1,1,6,6,7,9,4,1,4,Oranges,354,Spring,1,"None, I just finished my undergrad!"
 78 | 77,1,0,0,0,0,1,0,1,4,8,8,3,5,5,Apples,1273,Fall,2,"< 2 years, I'm fresh!"
 79 | 78,1,0,0,0,0,0,0,1,5,8,8,1,8,5,Apples,982,Fall,3,"2-5 years, I'm getting good at what I do!"
 80 | 79,0,0,0,0,0,1,1,1,1,5,7,5,6,4,Oranges,942,Spring,1,"None, I just finished my undergrad!"
 81 | 80,1,1,1,0,0,1,0,3,8,8,6,6,6,7,Apples,308,Spring,3,"2-5 years, I'm getting good at what I do!"
 82 | 81,1,0,0,0,0,0,0,3,3,4,4,3,4,5,Apples,1604,Spring,3,"2-5 years, I'm getting good at what I do!"
 83 | 82,0,0,1,0,1,0,0,2,6,3,2,1,6,10,Apples,706,Fall,4,"5+ years, I'm a veteran!"
 84 | 83,1,1,1,0,0,0,0,4,4,7,7,2,5,6,Oranges,236,Fall,1,"None, I just finished my undergrad!"
 85 | 84,0,0,1,0,0,1,1,5,3,7,8,6,4,6,Oranges,462,Fall,1,"None, I just finished my undergrad!"
 86 | 85,1,1,0,0,1,1,0,4,7,4,6,5,6,9,Oranges,761,Winter,2,"< 2 years, I'm fresh!"
 87 | 86,1,0,0,0,0,0,0,2,2,5,4,1,1,7,Oranges,642,Fall,2,"< 2 years, I'm fresh!"
 88 | 87,1,0,1,0,0,1,0,3,5,7,7,5,1,5,Oranges,567,Summer,1,"None, I just finished my undergrad!"
 89 | 88,1,1,1,1,0,0,0,6,10,8,6,6,2,6,Oranges,633,Winter,1,"None, I just finished my undergrad!"
 90 | 89,1,0,1,0,0,1,0,3,5,7,8,5,3,3,Apples,320,Fall,1,"None, I just finished my undergrad!"
 91 | 90,0,0,0,0,0,1,0,3,2,4,7,2,6,6,Oranges,352,Spring,3,"2-5 years, I'm getting good at what I do!"
 92 | 91,1,0,0,0,0,1,1,2,2,6,8,3,5,4,Oranges,620,Fall,2,"< 2 years, I'm fresh!"
 93 | 92,1,0,0,0,0,1,0,5,5,5,7,7,3,5,Oranges,604,Winter,3,"2-5 years, I'm getting good at what I do!"
 94 | 93,1,1,0,0,0,1,0,5,7,5,7,4,3,5,Oranges,125,Winter,3,"2-5 years, I'm getting good at what I do!"
 95 | 94,1,0,1,0,0,1,0,5,3,7,3,3,4,7,Oranges,0,Summer,3,"2-5 years, I'm getting good at what I do!"
 96 | 95,0,0,0,0,0,1,0,4,1,6,8,1,5,7,Apples,892,Spring,4,"5+ years, I'm a veteran!"
 97 | 96,1,1,1,0,0,1,1,4,4,7,8,7,5,2,Apples,93,Fall,1,"None, I just finished my undergrad!"
 98 | 97,1,1,1,0,1,1,0,5,7,6,6,6,4,5,Apples,302,Fall,2,"< 2 years, I'm fresh!"
 99 | 98,0,0,0,0,0,1,0,2,2,7,5,1,3,3,Apples,832,Spring,1,"None, I just finished my undergrad!"
100 | 99,1,1,1,0,1,0,0,2,6,4,4,3,3,3,Apples,255,Spring,1,"None, I just finished my undergrad!"
101 | 100,1,0,1,0,0,1,1,7,8,6,6,8,4,7,Apples,130,Spring,1,"None, I just finished my undergrad!"
102 | 101,0,0,0,0,0,1,1,5,3,3,7,2,6,6,Apples,526,Spring,1,"None, I just finished my undergrad!"
103 | 102,1,0,0,0,0,0,1,1,1,1,1,1,1,1,Oranges,1,Summer,4,"5+ years, I'm a veteran!"


--------------------------------------------------------------------------------
/ipython/hw/hw_1/Homework1.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Introduction to Data Science\n",
  8 |     "## Homework 1: Due Midnight, March 4th. 1/3 of a Grade Deducted for each day late"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "metadata": {},
 14 |    "source": [
 15 |     "Student Name: \n",
 16 |     "\n",
 17 |     "Student Netid: \n",
 18 |     "***"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "### Part 1: Case study\n",
 26 |     "- Read [this article](http://www.nytimes.com/2012/02/19/magazine/shopping-habits.html) in the New York Times.\n",
 27 |     "- Use what we've learned in class and from the book to describe how one could set Target's problem up as a predictive modeling problem, such that they could have gotten the results that they did.  Formulate your solution as a proposed plan using our data science terminology.  Include all the aspects of the formulation that you see as relevant to solving the problem.  Be precise but concise."
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "Place your answer here!"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "### Part 2: Dealing with messy data\n",
 42 |     "Not all data you will deal with is going to be clean. In fact, much of it will be very messy! For example, we have the HTML page that lists the contributors to Facebook's [osquery](https://github.com/facebook/osquery) project that is hosted on [Github.com](https://github.com). In this case, all we are interested in are the contributors and how many commits each of them has. Given the HTML page in `\"data/osquery_contributors.html\"` you will sift through tons of irrelevant data so that you can build a useful data structure.\n",
 43 |     "\n",
 44 |     "Notice that the first six (out of 59 total) contributors are named \"theopolis\", \"marpaia\", \"javuto\", \"jedi22\", \"unixist\", and \"mofarrell\". They have 553, 477, 104, 49, 30, 25 commits respectively.\n",
 45 |     "\n",
 46 |     "![Screenshot](images/osquery_contributors.png)\n",
 47 |     "\n",
 48 |     "To get a better of understanding of how this data is stored in the file, try searching through the raw data file for these usernames to look for any patterns. Your final dictionary should have 59 elements!"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "1\\. Turn this data into a Python dictionary called `contributors` where the keys are the contributor names and the values are the number of commits that each contributor has."
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": null,
 61 |    "metadata": {
 62 |     "collapsed": true
 63 |    },
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "import re # you might find this package useful\n",
 67 |     "\n",
 68 |     "contributors = dict()\n",
 69 |     "\n",
 70 |     "# Place your code here\n",
 71 |     "            \n",
 72 |     "# This line will print your dictionary for grading purposed. Do not remove this line!!!\n",
 73 |     "print contributors"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "markdown",
 78 |    "metadata": {},
 79 |    "source": [
 80 |     "### Part 3: Dealing with data Pythonically"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "metadata": {
 87 |     "collapsed": true
 88 |    },
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "# You might find these packages useful. You may import any others you want!\n",
 92 |     "import pandas as pd\n",
 93 |     "import numpy as np"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "1\\. Load the data set `\"data/ads_dataset.tsv\"` into a Python Pandas data frame called `ads`."
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": null,
106 |    "metadata": {
107 |     "collapsed": true
108 |    },
109 |    "outputs": [],
110 |    "source": [
111 |     "# Place your code here"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "2\\. Write a Python function called `getDfSummary()` that does the following:\n",
119 |     "- Takes as input a data frame\n",
120 |     "- For each variable in the data frame calculates the following features:\n",
121 |     "  - `number_nan` to count the number of missing not-a-number values\n",
122 |     "  - Ignoring missing, NA, and Null values:\n",
123 |     "    - `number_distinct` to count the number of distinct values a variable can take on\n",
124 |     "    - `mean`, `max`, `min`, `std` (standard deviation), and `25%`, `50%`, `75%` to correspond to the appropriate percentiles\n",
125 |     "- All of these new features should be loaded in a new data frame. Each row of the data frame should be a variable from the input data frame, and the columns should be the new summary features.\n",
126 |     "- Returns this new data frame containing all of the summary information\n",
127 |     "\n",
128 |     "Hint: The pandas `describe()` [(manual page)](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.describe.html) method returns a useful series of values that can be used here."
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {
135 |     "collapsed": true
136 |    },
137 |    "outputs": [],
138 |    "source": [
139 |     "def getDfSummary(input_data):\n",
140 |     "    # Place your code here\n",
141 |     "    \n",
142 |     "    return output_data"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {},
148 |    "source": [
149 |     "3\\. How long does it take for your `getDfSummary()` function to work on your `ads` data frame? Show us the results below.\n",
150 |     "\n",
151 |     "Hint: `%timeit getDfSummary(ads)`"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {
158 |     "collapsed": true
159 |    },
160 |    "outputs": [],
161 |    "source": [
162 |     "# Place your code here"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "4\\. Using the results returned from `getDfSummary()`, which fields, if any, contain missing `NaN` values?"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": null,
175 |    "metadata": {
176 |     "collapsed": true
177 |    },
178 |    "outputs": [],
179 |    "source": [
180 |     "# Place your code here"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "markdown",
185 |    "metadata": {},
186 |    "source": [
187 |     "5\\. For the fields with missing values, does it look like the data is missing at random? Are there any other fields that correlate perfectly, or predict that the data is missing? If missing, what should the data value be?\n",
188 |     "\n",
189 |     "Hint: create another data frame that has just the records with a missing value. Get a summary of this data frame using `getDfSummary()` and compare the differences. Do some feature distributions change dramatically?"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": null,
195 |    "metadata": {
196 |     "collapsed": true
197 |    },
198 |    "outputs": [],
199 |    "source": [
200 |     "# Place your code here"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "markdown",
205 |    "metadata": {},
206 |    "source": [
207 |     "6\\. Which variables are binary?"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {
214 |     "collapsed": false
215 |    },
216 |    "outputs": [],
217 |    "source": [
218 |     "# Place your code here"
219 |    ]
220 |   }
221 |  ],
222 |  "metadata": {
223 |   "kernelspec": {
224 |    "display_name": "Python 3",
225 |    "language": "python",
226 |    "name": "python3"
227 |   },
228 |   "language_info": {
229 |    "codemirror_mode": {
230 |     "name": "ipython",
231 |     "version": 3
232 |    },
233 |    "file_extension": ".py",
234 |    "mimetype": "text/x-python",
235 |    "name": "python",
236 |    "nbconvert_exporter": "python",
237 |    "pygments_lexer": "ipython3",
238 |    "version": "3.5.2"
239 |   }
240 |  },
241 |  "nbformat": 4,
242 |  "nbformat_minor": 0
243 | }
244 | 


--------------------------------------------------------------------------------
/ipython/hw/hw_1/images/osquery_contributors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/hw/hw_1/images/osquery_contributors.png


--------------------------------------------------------------------------------
/ipython/hw/hw_2/hw_2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Introduction to Data Science\n",
  8 |     "## Homework 2"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "metadata": {},
 14 |    "source": [
 15 |     "Student Name: \n",
 16 |     "\n",
 17 |     "Student Netid: \n",
 18 |     "***"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "### Preparing a Training Set and Training a Decision Tree\n",
 26 |     "This is a hands-on task where we build a predictive model using Decision Trees discussed in class. For this part, we will be using the data in `cell2cell_data.csv`.\n",
 27 |     "\n",
 28 |     "These historical data consist of 39,859 customers: 19,901 customers that churned (i.e., left the company) and 19,958 that did not churn (see the `\"churndep\"` variable). Here are the data set's 11 possible predictor variables for churning behavior: \n",
 29 |     "\n",
 30 |     "```\n",
 31 |     "Pos.  Var. Name  Var. Description\n",
 32 |     "----- ---------- --------------------------------------------------------------\n",
 33 |     "1     revenue    Mean monthly revenue in dollars\n",
 34 |     "2     outcalls   Mean number of outbound voice calls\n",
 35 |     "3     incalls    Mean number of inbound voice calls\n",
 36 |     "4     months     Months in Service\n",
 37 |     "5     eqpdays    Number of days the customer has had his/her current equipment\n",
 38 |     "6     webcap     Handset is web capable\n",
 39 |     "7     marryyes   Married (1=Yes; 0=No)\n",
 40 |     "8     travel     Has traveled to non-US country (1=Yes; 0=No)\n",
 41 |     "9     pcown      Owns a personal computer (1=Yes; 0=No)\n",
 42 |     "10    creditcd   Possesses a credit card (1=Yes; 0=No)\n",
 43 |     "11    retcalls   Number of calls previously made to retention team\n",
 44 |     "```\n",
 45 |     "\n",
 46 |     "The 12th column, the dependent variable `\"churndep\"`, equals 1 if the customer churned, and 0 otherwise. "
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "markdown",
 51 |    "metadata": {},
 52 |    "source": [
 53 |     "1\\. Load the data and prepare it for modeling. Note that the features are already processed for you, so the only thing needed here is split the data into training and testing. Use pandas to create two data frames: train_df and test_df, where train_df has 80% of the data chosen uniformly at random without replacement (test_df should have the other 20%). Also, make sure to write your own code to do the splits. You may use any random() function numpy but DO NOT use the data splitting functions from Sklearn."
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "metadata": {
 60 |     "collapsed": true
 61 |    },
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "import pandas as pd\n",
 65 |     "\n",
 66 |     "# Code here"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "2\\. If we had to, how would we prove to ourselves or a colleague that our data was indeed randomly sampled on X? And by prove, I mean empirically, not just showing this person our code. Don't actually do the work, just describe in your own words a test you could here. Hint: think about this in terms of selection bias and use notes from our 2nd lecture."
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "markdown",
 78 |    "metadata": {},
 79 |    "source": []
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "3\\. Now build and train a decision tree classifier using `DecisionTreeClassifier()` [(manual page)](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html) on train_df to predict the `\"churndep\"` target variable. Make sure to use `criterion='entropy'` when instantiating an instance of `DecisionTreeClassifier()`. For all other settings you should use all of the default options."
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": null,
 91 |    "metadata": {
 92 |     "collapsed": true
 93 |    },
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "import sklearn\n",
 97 |     "\n",
 98 |     "# Code here"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "markdown",
103 |    "metadata": {},
104 |    "source": [
105 |     "4\\. Using the resulting model from 2.2, show a bar plot of feature names and their feature importance (hint: check the attributes of the `DecisionTreeClassifier()` object directly in IPython or check the manual!)."
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "metadata": {
112 |     "collapsed": true
113 |    },
114 |    "outputs": [],
115 |    "source": [
116 |     "import matplotlib.pyplot as plt\n",
117 |     "%matplotlib inline\n",
118 |     "\n",
119 |     "# Code here"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "markdown",
124 |    "metadata": {},
125 |    "source": [
126 |     "5\\. Is the relationship between the top 3 most important features (as measured here) negative or positive? If your marketing director asked you to explain the top 3 drivers of churn, how would you interpret the relationship between these 3 features and the churn outcome?  What \"real-life\" connection can you draw between each variable and churn?"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": null,
132 |    "metadata": {
133 |     "collapsed": true
134 |    },
135 |    "outputs": [],
136 |    "source": [
137 |     "# Code/answer here"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "6\\. Using the classifier built in 2.2, try predicting `\"churndep\"` on both the train_df and test_df data sets. What is the accuracy on each?"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": null,
150 |    "metadata": {
151 |     "collapsed": true
152 |    },
153 |    "outputs": [],
154 |    "source": [
155 |     "# Code here"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "markdown",
160 |    "metadata": {},
161 |    "source": [
162 |     "### Part 2 - Finding a Good Decision Tree\n",
163 |     "The default options for your decision tree may not be optimal. We need to analyze whether tuning the parameters can improve the accuracy of the classifier.  For the following options `min_samples_split` and `min_samples_leaf`:"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "markdown",
168 |    "metadata": {},
169 |    "source": [
170 |     "1\\. Generate a list of 10 values of each for the parameters mim_samples_split and min_samples_leaf. "
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": null,
176 |    "metadata": {
177 |     "collapsed": true
178 |    },
179 |    "outputs": [],
180 |    "source": [
181 |     "# Code here\n",
182 |     "\n",
183 |     "min_samples_split_values = None\n",
184 |     "min_samples_leaf_values = None"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "2\\. Explain in words your reasoning for choosing the above ranges."
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "markdown",
196 |    "metadata": {},
197 |    "source": []
198 |   },
199 |   {
200 |    "cell_type": "markdown",
201 |    "metadata": {},
202 |    "source": [
203 |     "3\\. For each combination of values in 3.1 (there should be 100), build a new classifier and check the classifier's accuracy on the test data. Plot the test set accuracy for these options. Use the values of `min_samples_split` as the x-axis and generate a new series (line) for each of `min_samples_leaf`."
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": null,
209 |    "metadata": {
210 |     "collapsed": true
211 |    },
212 |    "outputs": [],
213 |    "source": [
214 |     "import matplotlib.pyplot as plt\n",
215 |     "%matplotlib inline\n",
216 |     "\n",
217 |     "# Code here"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "markdown",
222 |    "metadata": {},
223 |    "source": [
224 |     "4\\. Which configuration returns the best accuracy? What is this accuracy? (Note, if you don't see much variation in the test set accuracy across values of min_samples_split or min_samples_leaf, try redoing the above steps with a different range of values)."
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": null,
230 |    "metadata": {
231 |     "collapsed": true
232 |    },
233 |    "outputs": [],
234 |    "source": [
235 |     "# Code here"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "markdown",
240 |    "metadata": {},
241 |    "source": [
242 |     "5\\. If you were working for a marketing department, how would you use your churn production model in a real business environment? Explain why churn prediction might be good for the business and how one might improve churn by using this model."
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "markdown",
247 |    "metadata": {},
248 |    "source": [
249 |     "Answer here!"
250 |    ]
251 |   }
252 |  ],
253 |  "metadata": {
254 |   "kernelspec": {
255 |    "display_name": "Python 3",
256 |    "language": "python",
257 |    "name": "python3"
258 |   },
259 |   "language_info": {
260 |    "codemirror_mode": {
261 |     "name": "ipython",
262 |     "version": 3
263 |    },
264 |    "file_extension": ".py",
265 |    "mimetype": "text/x-python",
266 |    "name": "python",
267 |    "nbconvert_exporter": "python",
268 |    "pygments_lexer": "ipython3",
269 |    "version": "3.6.0"
270 |   }
271 |  },
272 |  "nbformat": 4,
273 |  "nbformat_minor": 0
274 | }
275 | 


--------------------------------------------------------------------------------
/ipython/hw/hw_3/Homework_3.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "### Introduction to Data Science\n",
  8 |     "## Homework 3: Due 5pm to My Mailbox (2nd Floor, 19 W 4th St) Wednesday April 19th"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "metadata": {},
 14 |    "source": [
 15 |     "Student Name: \n",
 16 |     "\n",
 17 |     "Student Netid:\n",
 18 |     "***"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "In this assignment we will be looking at data generated by particle physicists to test whether machine learning can help classify whether certain particle decay experiments identify the presence of a Higgs Boson. One does not need to know anything about particle physics to do well here, but if you are curious, full feature and data descriptions can be found here:\n",
 26 |     "\n",
 27 |     "- https://www.kaggle.com/c/higgs-boson/data\n",
 28 |     "- http://higgsml.lal.in2p3.fr/files/2014/04/documentation_v1.8.pdf\n",
 29 |     "\n",
 30 |     "The goal of this assignment is to learn to use cross-validation for model selection. We’ll also use learning curve analysis to understand how well different algorithms make use of limited data. For more documentation on cross-validation with Python, you can consult the following:\n",
 31 |     "\n",
 32 |     "- http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation\n"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "markdown",
 37 |    "metadata": {},
 38 |    "source": [
 39 |     "### Part 1: Data preparation\n",
 40 |     "Create a data preparation and cleaning function that does the following:\n",
 41 |     "- Has a single input that is a file name string\n",
 42 |     "- Reads data (the data is comma separated, has a row header and the first column `EventID` is the index) into a pandas `dataframe`\n",
 43 |     "- Cleans the data\n",
 44 |     "  - Convert the feature `Label` to numeric (choose the minority class to be equal to 1)\n",
 45 |     "    - Create a feature `Y` with numeric label\n",
 46 |     "    - Drop the feature `Label`\n",
 47 |     "  - If a feature has missing values (i.e., `-999`): \n",
 48 |     "    - Create a dummy variable for the missing value\n",
 49 |     "      - Call the variable `orig_var_name` + `_mv` where `orig_var_name` is the name of the actual var with a missing value\n",
 50 |     "      - Give this new variable a 1 if the original variable is missing\n",
 51 |     "    - Replace the missing value with the average of the feature (make sure to compute the mean on records where the value isn't missing). You may find pandas' `.replace()` function useful.\n",
 52 |     "- After the above is done, rescales the data so that each feature has zero mean and unit variance (hint: look up sklearn.preprocessing)\n",
 53 |     "- Returns the cleaned and rescaled dataset\n",
 54 |     "\n",
 55 |     "Hint: as a guide, this function can easily be done in less than 15 lines."
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": null,
 61 |    "metadata": {
 62 |     "collapsed": true
 63 |    },
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "def cleanBosonData(infile_name):\n",
 67 |     "    # Code here\n",
 68 |     "    return data_clean"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "markdown",
 73 |    "metadata": {},
 74 |    "source": [
 75 |     "### Part 2: Basic evaluations\n",
 76 |     "In this part you will build an out-of-the box logistic regression (LR) model and support vector machine (SVM). You will then plot ROC for the LR and SVM model."
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "1\\. Clean the two data files included in this assignment (`data/boson_training_cut_2000.csv` and `data/boson_testing_cut.csv`) and use them as training and testing data sets."
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "metadata": {
 90 |     "collapsed": true
 91 |    },
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "data_train = cleanBosonData(\"data/boson_training_cut_2000.csv\")\n",
 95 |     "data_test = cleanBosonData(\"data/boson_testing_cut.csv\")"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "2\\. On the training set, build the following models:\n",
103 |     "\n",
104 |     "- A logistic regression using sklearn's `linear_model.LogisticRegression()`. For this model, use `C=1e30`.\n",
105 |     "- An SVM using sklearn's `svm.svc()`. For this model, specify that `kernel=\"linear\"`.\n",
106 |     "\n",
107 |     "For each model above, plot the ROC curve of both models on the same plot. Make sure to use the test set for computing and plotting. In the legend, also print out the Area Under the ROC (AUC) for reference."
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "metadata": {
114 |     "collapsed": true
115 |    },
116 |    "outputs": [],
117 |    "source": [
118 |     "import matplotlib\n",
119 |     "import matplotlib.pyplot as plt\n",
120 |     "%matplotlib inline\n",
121 |     "\n",
122 |     "# Code here"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "markdown",
127 |    "metadata": {},
128 |    "source": [
129 |     "3\\. Which of the two models is generally better at ranking the test set? Are there any classification thresholds where the model identified above as \"better\" would underperform the other in a classification metric (such as TPR)?"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "markdown",
134 |    "metadata": {},
135 |    "source": [
136 |     "Answer here!"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "markdown",
141 |    "metadata": {},
142 |    "source": [
143 |     "### Part 3: Model selection with cross-validation\n",
144 |     "We think we might be able to improve the performance of the SVM if we perform a grid search on the hyper-parameter $C$.  Because we only have 1000 instances, we will have to use cross-validation to find the optimal $C$."
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "markdown",
149 |    "metadata": {},
150 |    "source": [
151 |     "1\\. Write a cross-validation function that does the following:\n",
152 |     "- Takes as inputs a dataset, a label name, # of splits/folds (`k`), a sequence of values for $C$ (`cs`)\n",
153 |     "- Performs two loops\n",
154 |     "  - Outer Loop: `for each f in range(k)`:\n",
155 |     "    - Splits the data into `data_train` & `data_validate` according to cross-validation logic\n",
156 |     "  - Inner Loop: `for each c in cs`:\n",
157 |     "    - Trains an SVM on training split with `C=c, kernel=\"linear\"`\n",
158 |     "    - Computes AUC_c_k on validation data\n",
159 |     "    - Stores AUC_c_k in a  dictionary of values\n",
160 |     "- Returns a dictionary, where each key-value pair is: `c:[auc-c1,auc-c2,..auc-ck]`"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {
167 |     "collapsed": true
168 |    },
169 |    "outputs": [],
170 |    "source": [
171 |     "# Code here\n",
172 |     "def xValSVM(dataset, label_name, k, cs):\n",
173 |     "    \n",
174 |     "    return aucs"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "2\\. Using the function written above, do the following:\n",
182 |     "- Generate a sequence of 10 $C$ values in the interval `[10^(-8), ..., 10^1]` (i.e., do all powers of 10 from -8 to 1).\n",
183 |     "2.\tCall aucs = xValSVM(train, ‘Y’, 10, cs)\n",
184 |     "3.\tFor each c in cs, get mean(AUC) and StdErr(AUC) \n",
185 |     "4.\tCompute the value for max(meanAUC-StdErr(AUC)) across all values of c.\n",
186 |     "5.\tGenerate a plot with the following:\n",
187 |     "a.\tLog10(c) on the x-axis\n",
188 |     "b.\t1 series with mean(AUC) for each c\n",
189 |     "c.\t1 series with mean(AUC)-stderr(AUC) for each c (use ‘k+’ as color pattern)\n",
190 |     "d.\t1 series with mean(AUC)+stderr(AUC) for each c (use ‘k--‘ as color pattern)\n",
191 |     "e.\ta reference line for max(AUC-StdErr(AUC)) (use ‘r’ as color pattern)\n",
192 |     "\n",
193 |     "Then answer the question: Did the model parameters selected beat the out-of-the-box model for SVM? "
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": null,
199 |    "metadata": {
200 |     "collapsed": true
201 |    },
202 |    "outputs": [],
203 |    "source": [
204 |     "#Code here"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "markdown",
209 |    "metadata": {},
210 |    "source": [
211 |     "answer here:"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "markdown",
216 |    "metadata": {},
217 |    "source": [
218 |     "3\\. Which of the two algorithms are more suitable for smaller sample sizes, given the set of features? If it costs twice the investment to run enough experiments to double the data, do you think it is a worthy investment?\n"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "markdown",
223 |    "metadata": {},
224 |    "source": [
225 |     "answer here:"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "markdown",
230 |    "metadata": {},
231 |    "source": [
232 |     "4\\. Is there a reason why cross-validation might be biased? If so, in what direction is it biased? (Hint: refer to ESL figure 7.8)?\n",
233 |     "\n"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "markdown",
238 |    "metadata": {},
239 |    "source": [
240 |     "answer here:"
241 |    ]
242 |   }
243 |  ],
244 |  "metadata": {
245 |   "kernelspec": {
246 |    "display_name": "Python 3",
247 |    "language": "python",
248 |    "name": "python3"
249 |   },
250 |   "language_info": {
251 |    "codemirror_mode": {
252 |     "name": "ipython",
253 |     "version": 3
254 |    },
255 |    "file_extension": ".py",
256 |    "mimetype": "text/x-python",
257 |    "name": "python",
258 |    "nbconvert_exporter": "python",
259 |    "pygments_lexer": "ipython3",
260 |    "version": "3.5.2"
261 |   }
262 |  },
263 |  "nbformat": 4,
264 |  "nbformat_minor": 0
265 | }
266 | 


--------------------------------------------------------------------------------
/ipython/hw/hw_4/hw_4.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Introduction to Data Science\n",
  8 |     "## Homework 4: Due printed out in Kevin's Mailbox 5pm May 1st\n",
  9 |     "## This assignment is OPTIONAL!! If you do it, your HW grade will be the average of all 4"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "Student Name: \n",
 17 |     "\n",
 18 |     "Student Netid:\n",
 19 |     "***"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "markdown",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "### Part 1: Naive Bayes"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "1\\. From your reading you know that the naive Bayes classifier works by calculating the conditional probabilities of each feature, $e_i$, occuring with each class $c$ and treating them independently. This results in the probability of a certain class occuring given a set of features, or a piece of evidence, $E$, as\n",
 34 |     "\n",
 35 |     "$$P(c \\mid E) = \\frac{p(e_1 \\mid c) \\cdot p(e_2 \\mid c) \\cdot \\cdot \\cdot p(e_k \\mid c) \\cdot p(c)}{p(E)}.$$\n",
 36 |     "\n",
 37 |     "The conditional probability of each piece of evidence occuring with a given class is given by\n",
 38 |     "\n",
 39 |     "$$P(e_i \\mid c) = \\frac{\\text{count}(e_i, c)}{\\text{count}(c)}.$$\n",
 40 |     "\n",
 41 |     "In the above equation $\\text{count}(e_i, c)$ is the number of documents in a given class that contain feature $e_i$ and $\\text{count}(c)$ is the number of documents that belong to class $c$. \n",
 42 |     "\n",
 43 |     "A common variation of the above is to use Laplace (sometimes called +1) smoothing. Recall the use of Laplace smoothing introduced toward the end of Chapter 3 in the section Probability Estimation. This is done in sklearn by setting `alpha=1` in the `BernoulliNB()` function (this is also the default behavior). The result of Laplace smoothing will slightly change the conditional probabilities,\n",
 44 |     "\n",
 45 |     "$$P(e_i \\mid c) = \\frac{\\text{count}(e_i, c) + 1}{\\text{count}(c) + 2}.$$\n",
 46 |     "\n",
 47 |     "In no more than **one paragraph**, describe why this is useful. Try to think of a case when not using Laplace smoothing would result in \"bad\" models. Try to give an example. Be precise."
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "markdown",
 52 |    "metadata": {},
 53 |    "source": [
 54 |     "Answer here!"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "markdown",
 59 |    "metadata": {},
 60 |    "source": [
 61 |     "### Part 2: Text classification for sentiment analysis\n",
 62 |     "For this part of the assignment, we are going to use a data set of movie ratings from IMDB.com. The data consists of the text of a movie review and a target variable which tells us whether the reviewer had a positive feeling towards the movie (equivalent to rating the movie between 7 and 10) or a negative feeling (rating the movie between 1 and 4). Neutral reactions are not included in the data.\n",
 63 |     "\n",
 64 |     "The data are located in \"`data/imdb.csv`\". The first column is the review text; the second is the text label 'P' for positive or 'N' for negative."
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "markdown",
 69 |    "metadata": {},
 70 |    "source": [
 71 |     "1\\. Load the data into a pandas `DataFrame()`."
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "metadata": {
 78 |     "collapsed": true
 79 |    },
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "import pandas as pd\n",
 83 |     "data = None"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "markdown",
 88 |    "metadata": {},
 89 |    "source": [
 90 |     "2\\. Code the target variable to be numeric: use the value `1` to represent 'P' and `0` to represent 'N'."
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": null,
 96 |    "metadata": {
 97 |     "collapsed": false
 98 |    },
 99 |    "outputs": [],
100 |    "source": [
101 |     "# Code here"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "markdown",
106 |    "metadata": {},
107 |    "source": [
108 |     "3\\. Put all of the text into a data frame called `X` and the target variable in a data frame called `Y`. Make a train/test split where you give 75% of the data to training."
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {
115 |     "collapsed": false
116 |    },
117 |    "outputs": [],
118 |    "source": [
119 |     "from sklearn.cross_validation import train_test_split\n",
120 |     "\n",
121 |     "X = None\n",
122 |     "Y = None\n",
123 |     "\n",
124 |     "X_train, X_test, Y_train, Y_test = None"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "4\\. Create a binary `CountVectorizer()` and `TfidfVectorizer()`. Use the original single words as well as bigrams. Also, use an \"english\" stop word list. Fit these to the training data to extract a vocabulary and then transform both the train and test data."
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "metadata": {
138 |     "collapsed": true
139 |    },
140 |    "outputs": [],
141 |    "source": [
142 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
143 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
144 |     "\n",
145 |     "binary_vectorizer = None\n",
146 |     "X_train_binary = None\n",
147 |     "X_test_binary = None\n",
148 |     "\n",
149 |     "tfidf_vectorizer = None\n",
150 |     "X_train_tfidf = None\n",
151 |     "X_test_tfidf = None"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "markdown",
156 |    "metadata": {},
157 |    "source": [
158 |     "4\\. Create `LogisticRegression()` and `BernoulliNB()` models. For all settings, keep the default values. In a single plot, show the AUC curve for both classifiers and both the binary and tfidf feature sets. In the legend, include the area under the ROC curve (AUC). Do not forget to label your axes. Your final plot will be a single window with 4 curves.\n",
159 |     "\n",
160 |     "Which model do you think does a better job? Why? Explain in no more than a paragraph."
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {
167 |     "collapsed": true
168 |    },
169 |    "outputs": [],
170 |    "source": [
171 |     "# Run this so your plots show properly\n",
172 |     "import matplotlib.pyplot as plt\n",
173 |     "%matplotlib inline\n",
174 |     "plt.rcParams['figure.figsize'] = 12, 12"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": null,
180 |    "metadata": {
181 |     "collapsed": false
182 |    },
183 |    "outputs": [],
184 |    "source": [
185 |     "from sklearn.linear_model import LogisticRegression\n",
186 |     "from sklearn.naive_bayes import BernoulliNB\n",
187 |     "from sklearn import metrics\n",
188 |     "\n",
189 |     "# Code here"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "markdown",
194 |    "metadata": {},
195 |    "source": [
196 |     "Explanation here!"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "markdown",
201 |    "metadata": {
202 |     "collapsed": true
203 |    },
204 |    "source": [
205 |     "5\\. Use the model from question 4 that you think did the best job and predict the rating of the test data. Find 5 examples the should have been positive, but were incorrectly classified as negative. List the text below and include an explanation as to why you think it may have been incorrectly classified. You can pick any 5. They do not have to be at random."
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "metadata": {
212 |     "collapsed": true
213 |    },
214 |    "outputs": [],
215 |    "source": [
216 |     "# Code here to display 5 incorrect reviews."
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "markdown",
221 |    "metadata": {},
222 |    "source": [
223 |     "Explanation for the 5 reviews chosen here!"
224 |    ]
225 |   }
226 |  ],
227 |  "metadata": {
228 |   "kernelspec": {
229 |    "display_name": "Python 3",
230 |    "language": "python",
231 |    "name": "python3"
232 |   },
233 |   "language_info": {
234 |    "codemirror_mode": {
235 |     "name": "ipython",
236 |     "version": 3
237 |    },
238 |    "file_extension": ".py",
239 |    "mimetype": "text/x-python",
240 |    "name": "python",
241 |    "nbconvert_exporter": "python",
242 |    "pygments_lexer": "ipython3",
243 |    "version": "3.5.2"
244 |   }
245 |  },
246 |  "nbformat": 4,
247 |  "nbformat_minor": 0
248 | }
249 | 


--------------------------------------------------------------------------------
/ipython/references/Syllabus_2016.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/references/Syllabus_2016.pdf


--------------------------------------------------------------------------------
/ipython/references/churn_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/references/churn_architecture.png


--------------------------------------------------------------------------------
/ipython/references/churn_dataset_info.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/references/churn_dataset_info.pdf


--------------------------------------------------------------------------------
/ipython/references/churn_sampling_scheme.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/references/churn_sampling_scheme.png


--------------------------------------------------------------------------------
/ipython/utils/ClassifierBakeoff.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from sklearn.linear_model import LogisticRegression
  4 | from sklearn.metrics import roc_auc_score
  5 | from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier
  6 | from sklearn.tree import DecisionTreeClassifier
  7 | 
  8 | 
  9 | def liftTable(pred, truth, b):
 10 |     df = pd.DataFrame({'p':pred + np.random.rand(len(pred))*0.000001, 'y':truth})
 11 |     df['b'] = b - pd.qcut(df['p'], b, labels=False)
 12 |     df['n'] = np.ones(df.shape[0])
 13 |     df_grp = df.groupby(['b']).sum()
 14 |     base = np.sum(df_grp['y'])/float(df.shape[0])
 15 |     df_grp['n_cum'] = np.cumsum(df_grp['n'])/float(df.shape[0])
 16 |     df_grp['y_cum'] = np.cumsum(df_grp['y'])
 17 |     df_grp['p_y_b'] = df_grp['y']/df_grp['n']
 18 |     df_grp['lift_b'] = df_grp['p_y_b']/base
 19 |     df_grp['cum_lift_b'] = (df_grp['y_cum']/(float(df.shape[0])*df_grp['n_cum']))/base
 20 |     return df_grp
 21 | 
 22 | 
 23 | def getMetrics(preds, labels):
 24 |     '''
 25 |     Takes in non-binary predictions and labels and returns AUC, and several Lifts
 26 |     '''
 27 |     auc = roc_auc_score(labels, preds)
 28 |     ltab = liftTable(preds, labels, 100)
 29 | 
 30 |     lift1 = ltab.ix[1].cum_lift_b
 31 |     lift5 = ltab.ix[5].cum_lift_b
 32 |     lift10 = ltab.ix[10].cum_lift_b
 33 |     lift25 = ltab.ix[25].cum_lift_b
 34 | 
 35 |     return [auc, lift1, lift5, lift10, lift25]
 36 | 
 37 | 
 38 | def dToString(d, dm1, dm2):
 39 |     '''
 40 |     Takes key-values and makes a string, d1 seprates k:v, d2 separates pairs
 41 |     '''
 42 |     arg_str = ''
 43 |     for k in sorted(d.keys()):
 44 |         if len(arg_str) == 0:
 45 |             arg_str = '{}{}{}'.format(k, dm1, d[k])
 46 |         else:
 47 |             arg_str = arg_str + '{}{}{}{}'.format(dm2, k, dm1, d[k])
 48 |     return arg_str
 49 | 
 50 | def getArgCombos(arg_lists):
 51 |     '''
 52 |     Takes every combination and returns an iterable of dicts
 53 |     '''
 54 |     keys = sorted(arg_lists.keys())
 55 |     #Initialize the final iterable
 56 |     tot = 1
 57 |     for k in keys:
 58 |         tot = tot * len(arg_lists[k])
 59 |     iter = []
 60 |     #Fill it with empty dicts
 61 |     for i in range(tot):
 62 |         iter.append({})
 63 |     #Now fill each dictionary    
 64 |     kpass = 1
 65 |     for k in keys:
 66 |         klist = arg_lists[k]
 67 |         ktot = len(klist)
 68 |         for i in range(tot):
 69 |             iter[i][k] = klist[(i/kpass) % ktot]
 70 |         kpass = ktot * kpass
 71 |     return iter
 72 | 
 73 | 
 74 | class LRAdaptor(object):
 75 |     '''
 76 |     This adapts the LogisticRegression() Classifier so that LR can be used as an init for GBT
 77 |     This just overwrites the predict method to be predict_proba
 78 |     '''
 79 |     def __init__(self, est):
 80 |         self.est = est
 81 | 
 82 |     def predict(self, X):
 83 |         return self.est.predict_proba(X)[:,1][:, np.newaxis]
 84 | 
 85 |     def fit(self, X, y):
 86 |         self.est.fit(X, y)
 87 | 
 88 | class GenericClassifier(object):
 89 | 
 90 |     def __init__(self, modclass, dictargs):
 91 |         self.classifier = modclass(**dictargs)
 92 | 
 93 |     def fit(self, X, Y):
 94 |         self.classifier.fit(X,Y)
 95 | 
 96 |     def predict_proba(self, Xt):
 97 |         return self.classifier.predict_proba(Xt)
 98 | 
 99 | 
100 | class GenericClassifierOptimizer(object):
101 | 
102 |     def __init__(self, classtype, arg_lists):
103 |         self.name = classtype.__name__
104 |         self.classtype = classtype
105 |         self.arg_lists = arg_lists
106 |         self.results = self._initDict()
107 | 
108 |     def _initDict(self):
109 |         return {'alg':[], 'opt':[], 'auc':[], 'lift1':[], 'lift5':[], 'lift10':[], 'lift25':[]}
110 | 
111 |     def _updateResDict(self, opt, perf):
112 |         self.results['alg'].append(self.name)
113 |         self.results['opt'].append(opt)
114 |         self.results['auc'].append(perf[0])
115 |         self.results['lift1'].append(perf[1])
116 |         self.results['lift5'].append(perf[2])
117 |         self.results['lift10'].append(perf[3])
118 |         self.results['lift25'].append(perf[4])
119 |    
120 |     def runClassBake(self, X_train, Y_train, X_test, Y_test):
121 | 
122 |         arg_loop = getArgCombos(self.arg_lists)
123 | 
124 |         for d in arg_loop:
125 |            
126 |             mod = GenericClassifier(self.classtype, d)
127 |             mod.fit(X_train, Y_train)
128 |          
129 |             perf = getMetrics(mod.predict_proba(X_test)[:,1], Y_test)
130 |             self._updateResDict(dToString(d, ':', '|'), perf)
131 | 
132 | 
133 | 
134 | class ClassifierBakeoff(object): 
135 | 
136 |     def __init__(self, X_train, Y_train, X_test, Y_test, setup):
137 |         self.instructions = setup
138 |         self.X_train = X_train
139 |         self.Y_train = Y_train
140 |         self.X_test = X_test
141 |         self.Y_test = Y_test
142 |         self.results = self._initDict()
143 | 
144 |     def _initDict(self):
145 |         return {'alg':[], 'opt':[], 'auc':[], 'lift1':[], 'lift5':[], 'lift10':[], 'lift25':[]}
146 | 
147 |     def _updateResDict(self, clfr_results):
148 |         self.results['alg'] =  self.results['alg'] + clfr_results['alg']
149 |         self.results['opt'] =  self.results['opt'] + clfr_results['opt']
150 |         self.results['auc'] =  self.results['auc'] + clfr_results['auc']
151 |         self.results['lift1'] =  self.results['lift1'] + clfr_results['lift1']
152 |         self.results['lift5'] =  self.results['lift5'] + clfr_results['lift5']
153 |         self.results['lift10'] =  self.results['lift10'] + clfr_results['lift10']
154 |         self.results['lift25'] =  self.results['lift25'] + clfr_results['lift25']
155 | 
156 | 
157 |     def bake(self):
158 | 
159 |         for clfr in self.instructions:
160 | 
161 |            classifierBake = GenericClassifierOptimizer(clfr, self.instructions[clfr])
162 |            classifierBake.runClassBake(self.X_train, self.Y_train, self.X_test, self.Y_test)
163 |            self._updateResDict(classifierBake.results)
164 | 
165 | 
166 | 
167 | 
168 | 
169 | 
170 | 


--------------------------------------------------------------------------------
/ipython/utils/ClassifierBakeoff.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/utils/ClassifierBakeoff.pyc


--------------------------------------------------------------------------------
/ipython/utils/bias_variance.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import pandas as pd
  3 | import numpy as np
  4 | from matplotlib import pyplot as plt
  5 | import sklearn.metrics as skm
  6 | import warnings
  7 | warnings.filterwarnings('ignore')
  8 | from sklearn import linear_model
  9 | 
 10 | def simPolynomial(sigma = 0, betas = [0, 0], n = 100):
 11 | 
 12 |     x = np.random.uniform(0, 100, n)
 13 |     e = np.random.normal(0, sigma, n)
 14 | 
 15 |     d = pd.DataFrame(x, columns=['x'])    
 16 |     y = e
 17 |     for i, b in enumerate(betas):
 18 |         y = y + b*(x**i)
 19 |     d['y'] = y
 20 |     return d
 21 | 
 22 | 
 23 | def fitLinReg(d, mn, mx, inter):
 24 |     '''
 25 |     Runs a linear regression and fits it on a grid
 26 |     '''
 27 | 
 28 |     regr = linear_model.LinearRegression(fit_intercept = inter)
 29 |     regr.fit(d.drop('y', 1), d['y'])
 30 |     yhat = regr.predict(pd.DataFrame(np.arange(mn, mx, 1)))
 31 | 
 32 |     return yhat
 33 | 
 34 | def makePolyFeat(d, deg):
 35 |     '''
 36 |     Goal: Generate features up to X**deg
 37 |     1. a data frame with two features X and Y
 38 |     4. a degree 'deg' (from which we make polynomial features 
 39 |     
 40 |     '''
 41 |     #Generate Polynomial terms
 42 |     for i in range(2, deg+1):
 43 |         d['x'+str(i)] = d['x']**i
 44 |     return d
 45 | 
 46 | def fitFullReg(d, mn, mx, betas, inter):
 47 |     '''
 48 |     Runs a linear regression and fits it on a grid. Creates polynomial features using the dimension of betas
 49 |     '''
 50 | 
 51 |     regr = linear_model.LinearRegression(fit_intercept = inter)
 52 |     regr.fit(makePolyFeat(d.drop('y', 1), len(betas)), d['y'])
 53 |     dt = pd.DataFrame(np.arange(mn, mx, 1), columns = ['x'])
 54 |     yhat = regr.predict(makePolyFeat(dt, len(betas)))
 55 | 
 56 |     return yhat
 57 | 
 58 | 
 59 | 
 60 | def plotLinearBiasStage(sigma, betas, ns, fs):
 61 | 
 62 |     mn = 0
 63 |     mx = 101
 64 | 
 65 |     d = simPolynomial(sigma, betas, 10000)
 66 |     plt.figure(figsize = fs)
 67 |     plt.plot(d['x'], d['y'], 'b.', markersize = 0.75)
 68 | 
 69 | 
 70 |     x = np.arange(mn, mx, 1)
 71 |     y_real = np.zeros(len(x))
 72 |     for i, b in enumerate(betas):
 73 |         y_real += b*(x**i)
 74 | 
 75 |     #plt.plot(x, y_real + 2*sigma, 'k+')
 76 |     #plt.plot(x, y_real - 2*sigma, 'k--')
 77 |     plt.plot(x, y_real, 'k*')    
 78 | 
 79 |     for n in ns:
 80 |         dn = simPolynomial(sigma, betas, n)
 81 |         yhat = fitLinReg(dn, mn, mx, True)
 82 |         plt.plot(x, yhat, label = 'n={}'.format(n))
 83 | 
 84 | 
 85 |     plt.legend(loc = 4, ncol = 3)
 86 | 
 87 | 
 88 | 
 89 | def plotVariance(sigma, betas, ns, fs):
 90 | 
 91 |     mn = 0
 92 |     mx = 101
 93 |     nworlds = 100
 94 | 
 95 |     d = simPolynomial(sigma, betas, 10000)
 96 |     x = np.arange(mn, mx, 1)
 97 | 
 98 |     fig = plt.figure(figsize = fs)
 99 |     for pos, n in enumerate(ns):
100 |        
101 |         #First model each world
102 |         yhat_lin = []
103 |         yhat_non = []
104 |         for i in range(nworlds):
105 | 
106 |             dn = simPolynomial(sigma, betas, n)
107 | 
108 |             yhat_lin.append(fitLinReg(dn, mn, mx, True))
109 |             yhat_non.append(fitFullReg(dn, mn, mx, betas, True))
110 | 
111 |         #Now compute appropriate stats and plot
112 | 
113 |         lin_df = pd.DataFrame(yhat_lin)
114 |         non_df = pd.DataFrame(yhat_non)
115 | 
116 |         lin_sig = lin_df.apply(np.std, axis=0).values
117 |         non_sig = non_df.apply(np.std, axis=0).values
118 |         lin_mu = lin_df.apply(np.mean, axis=0).values
119 |         non_mu = non_df.apply(np.mean, axis=0).values
120 | 
121 |         #Need to continue from here
122 | 
123 |         for i in range(nworlds):
124 |     
125 |             ax1 = fig.add_subplot(2, 3, pos + 1)
126 |             plt.title('n={}'.format(n))
127 |             plt.plot(x, yhat_lin[i], '.', color = '0.75')
128 |    
129 |             if i == nworlds - 1:
130 |                 plt.plot(x, lin_mu, 'r-')
131 |                 plt.title('E[std|X] = {}'.format(round(lin_sig.mean(),1)))
132 | 
133 |             ax1.axes.get_xaxis().set_visible(False)
134 |             ax1.set_ylim((-40, 80))
135 | 
136 |             ax2 = fig.add_subplot(2, 3, pos + 4)
137 |             plt.plot(x, yhat_non[i], '--',  color = '0.75')
138 | 
139 |             if i == nworlds - 1:
140 |                 plt.plot(x, non_mu, 'r-')
141 |                 plt.title('E[std|X] = {}'.format(round(non_sig.mean(),1)))
142 | 
143 |             ax2.set_ylim((-40, 80)) 
144 | 
145 |             if pos != 0:
146 |                 ax1.axes.get_yaxis().set_visible(False)
147 |                 ax2.axes.get_yaxis().set_visible(False)
148 | 
149 |     plt.legend()
150 | 
151 | 
152 | 
153 | 
154 | 
155 | 
156 | 
157 | def getVarianceTrend(sigma, betas):
158 | 
159 |     mn = 50
160 |     mx = 51
161 |     nworlds = 100
162 |     ns = np.logspace(4, 16, num = 10, base = 2)
163 | 
164 |     res_dict = {'n':[], 'lin':[], 'quad':[], 'non':[]}
165 | 
166 |     for pos, n in enumerate(ns):
167 | 
168 |         yhat_lin = []; yhat_quad = []; yhat_non = []
169 | 
170 |         for i in range(nworlds):
171 | 
172 |             dn = simPolynomial(sigma, betas, n)
173 | 
174 |             #yhat_lin.append(fitLinReg(dn, mn, mx, True)[0])
175 |             yhat_lin.append(fitFullReg(dn, mn, mx, betas[0:1], True)[0])
176 |             yhat_quad.append(fitFullReg(dn, mn, mx, betas[0:2], True)[0])
177 |             yhat_non.append(fitFullReg(dn, mn, mx, betas, True)[0])
178 | 
179 |         res_dict['lin'].append(np.array(yhat_lin).std())
180 |         res_dict['quad'].append(np.array(yhat_quad).std())
181 |         res_dict['non'].append(np.array(yhat_non).std())
182 |         res_dict['n'].append(n)
183 | 
184 | 
185 |     return res_dict
186 | 
187 | def plotVarianceTrend(res_dict, fs):
188 | 
189 |     fig = plt.figure(figsize = fs)
190 | 
191 |     ax1 = fig.add_subplot(2, 1, 1)
192 |     x = np.log2(res_dict['n'])
193 |     plt.plot(x, np.power(res_dict['lin'], 2), 'b-', label = 'd = 1')
194 |     plt.plot(x, np.power(res_dict['quad'], 2), 'r-', label = 'd = 2')
195 |     plt.plot(x, np.power(res_dict['non'], 2), 'g-', label = 'd = 4')
196 | 
197 |     ax1.set_ylim((0, 100))
198 | 
199 |     plt.title('Model Variance by Polynomial Order (d) and Sample Size (n)')
200 |     plt.legend(loc = 1)
201 |     plt.ylabel('Var( E_d[Y|X = 50] )')
202 | 
203 |     ax2 = fig.add_subplot(2, 1, 2)
204 |     filt = (x > 0)
205 |     plt.plot(x[filt], 2*np.log2(res_dict['lin']), 'b-', label = 'd = 1')
206 |     plt.plot(x[filt], 2*np.log2(res_dict['quad']), 'r-', label = 'd = 2')
207 |     plt.plot(x[filt], 2*np.log2(res_dict['non']), 'g-', label = 'd = 4')
208 | 
209 |     ax2.set_xlim((x[filt].min(), x.max()))
210 |     plt.xlabel('Log2(Sample Size)')
211 |     plt.ylabel('Log [ Var( E_d[Y|X = 50] ) ]')  
212 |     plt.legend(loc = 1)    
213 | 


--------------------------------------------------------------------------------
/ipython/utils/bias_variance.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/utils/bias_variance.pyc


--------------------------------------------------------------------------------
/ipython/utils/churn_analysis.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | This script has a set of reference functions for performing analysis of the churn dataset
  3 | '''
  4 | import sys
  5 | import pandas as pd
  6 | import numpy as np
  7 | from matplotlib import pyplot as plt
  8 | import sklearn.metrics as skm
  9 | sys.path.append("./utils/")
 10 | from ClassifierBakeoff import *
 11 | 
 12 | import warnings
 13 | warnings.filterwarnings('ignore')
 14 | 
 15 | def getDfSummary(dat):
 16 |     '''
 17 |     Get descriptive stats
 18 |     '''
 19 |     #Get the names of the columns
 20 |     cols = dat.columns.values
 21 | 
 22 |     c_summ = []
 23 |     #Outer Loop for the cols
 24 |     for c in cols:
 25 |         #Count the NAs
 26 |         missing = sum(pd.isnull(dat[c]))
 27 |         #Use describe to get summary statistics, and also drop the 'count' row
 28 |         sumval = dat[c].describe().drop(['count'])
 29 |         #Now count distinct values...note that nunique removes missing values for you
 30 |         distinct = dat[c].nunique()
 31 |         #Append missing and distinct to sumval
 32 |         sumval = sumval.append(pd.Series([missing, distinct], index=['missing', 'distinct']))
 33 |         #Add each sumval to a list and then convert the entire thing to a DS
 34 |         c_summ.append(sumval)
 35 | 
 36 |     return pd.DataFrame(c_summ, index=cols)
 37 | 
 38 | 
 39 | 
 40 | 
 41 | 
 42 | def plotCorr(dat, lab, h, w):
 43 |     '''
 44 |     Do a heatmap to visualize the correlation matrix, dropping the label
 45 |     '''
 46 | 
 47 |     dat = dat.drop(lab, 1)
 48 |     #Get correlation and 0 out the diagonal (for plotting purposes)
 49 |     c_dat = dat.corr()
 50 |     for i in range(c_dat.shape[0]):
 51 |         c_dat.iloc[i,i] = 0
 52 | 
 53 |     c_mat = c_dat.as_matrix()
 54 |     #c_mat = c_mat[:-1, :-1]
 55 |     fig, ax = plt.subplots()
 56 |     heatmap = plt.pcolor(c_mat, cmap = plt.cm.RdBu)
 57 |  
 58 |     #Set the tick labels and center them
 59 |     ax.set_xticks(np.arange(c_dat.shape[0]) + 0.5, minor = False)
 60 |     ax.set_yticks(np.arange(c_dat.shape[1]) + 0.5, minor = False)
 61 |     ax.set_xticklabels(c_dat.index.values, minor = False, rotation = 45)
 62 |     ax.set_yticklabels(c_dat.index.values, minor = False)
 63 |     heatmap.axes.set_ylim(0, len(c_dat.index))  
 64 |     heatmap.axes.set_xlim(0, len(c_dat.index)) 
 65 |     plt.colorbar(heatmap, ax = ax)
 66 | 
 67 |     #plt.figure(figsize = (h, w))
 68 |     fig = plt.gcf()
 69 |     fig.set_size_inches(h, w)
 70 | 
 71 | 
 72 | def makeBar(df, h, lab,  width):
 73 |     '''
 74 |     Contains
 75 |     '''
 76 |     df_s = df.sort(columns = [h], ascending = False)
 77 | 
 78 |     #Get a barplot
 79 |     ind = np.arange(df_s.shape[0])
 80 |     labs = df_s[[lab]].values.ravel() 
 81 | 
 82 |     fig = plt.figure(facecolor = 'w', figsize = (12, 6))
 83 |     ax = plt.subplot(111)
 84 |     plt.subplots_adjust(bottom = 0.25)
 85 | 
 86 |     rec = ax.bar(ind + width, df_s[[h]].values, width, color='r')
 87 | 
 88 |     ax.set_xticks(ind + getTickAdj(labs, width))
 89 |     ax.set_xticklabels(labs, rotation = 45, size = 14)
 90 | 
 91 | 
 92 | def getTickAdj(labs, width):
 93 |     lens = map(len, labs)
 94 |     lens = -1 * width * (lens - np.mean(lens)) / np.max(lens)
 95 |     return lens
 96 | 
 97 | def plotMI(dat, lab, width = 0.35, signed = 0):
 98 |     '''
 99 |     Draw a bar chart of the normalized MI between each X and Y
100 |     '''
101 |     X = dat.drop(lab, 1)
102 |     Y = dat[[lab]].values
103 |     cols = X.columns.values
104 |     mis = []
105 | 
106 |     #Start by getting MI
107 |     for c in cols:
108 |         mis.append(skm.normalized_mutual_info_score(Y.ravel(), X[[c]].values.ravel()))
109 | 
110 |     #Get signs by correlation
111 |     corrs = dat.corr()[lab]
112 |     corrs[corrs.index != lab]
113 |     df = pd.DataFrame(zip(mis, cols), columns = ['MI', 'Lab'])
114 |     df = pd.merge(df, pd.DataFrame(corrs, columns = ['corr']), how = 'inner', left_on = 'Lab', right_index=True)
115 |  
116 |     if signed == 0:
117 |         makeBar(df, 'MI', 'Lab', width)
118 | 
119 |     else:
120 |         makeBarSigned(df, 'MI', 'Lab', width)
121 | 
122 | 
123 | def makeBarSigned(df, h, lab,  width):
124 |     '''
125 |     Contains
126 |     '''
127 |     df_s = df.sort(columns = [h], ascending = False)
128 | 
129 |     #Get a barplot
130 |     ind = np.arange(df_s.shape[0])
131 |     labs = df_s[[lab]].values.ravel()
132 |     h_pos = (df_s[['corr']].values.ravel() > 0) * df_s.MI
133 |     h_neg = (df_s[['corr']].values.ravel() < 0) * df_s.MI
134 | 
135 |     fig = plt.figure(facecolor = 'w', figsize = (12, 6))
136 |     ax = plt.subplot(111)
137 |     plt.subplots_adjust(bottom = 0.25)
138 | 
139 |     rec = ax.bar(ind + width, h_pos, width, color='r', label = 'Positive')
140 |     rec = ax.bar(ind + width, h_neg, width, color='b', label = 'Negative')
141 | 
142 |     ax.set_xticks(ind + getTickAdj(labs, width))
143 |     ax.set_xticklabels(labs, rotation = 45, size = 14)
144 | 
145 |     plt.legend()
146 | 
147 | 
148 | 
149 | def makeGS_Tup(ent, getmin = True):
150 | 
151 |     ostr = dToString(ent.parameters, ':', '|')
152 |     if len(ostr.split('|')) > 2:
153 |         sp = ostr.split('|')
154 |         if len(sp) == 3:
155 |             ostr = '{}|{}\n{}'.format(sp[0], sp[1], sp[2])
156 |         else:
157 |             ostr = '{}|{}\n{}|{}'.format(sp[0], sp[1], sp[2], sp[3])
158 |         
159 |     #ostr = dToString(ent.parameters, ':', '|')
160 |     mu = np.abs(ent.mean_validation_score) #Log-Loss comes in at negative value
161 |     sig = ent.cv_validation_scores.std()
162 |     stderr = sig/np.sqrt(len(ent.cv_validation_scores))
163 |                          
164 |     if getmin:
165 |         return (mu, ostr, mu + stderr, sig, stderr) #Note, this assumes minimization, thus adding stderr
166 |     else:
167 |         return (mu, ostr, mu - stderr, sig, stderr)
168 |         
169 |     
170 | def rankGS_Params(gs_obj_list, getmin = True):
171 |     '''
172 |     Takes in the .grid_scores_ attributes of a GridSearchCV object
173 |     '''
174 |     tup_list = []
175 |     
176 |     for k in gs_obj_list:
177 |         tup_list.append(makeGS_Tup(k, getmin))
178 |     
179 |     tup_list.sort()
180 | 
181 |     if not getmin:
182 |         tup_list.reverse()
183 | 
184 |     return tup_list
185 | 
186 | 
187 | 
188 | def processGsObjList(gs_obj_list, getmin = True):
189 | 
190 |     rank_list = rankGS_Params(gs_obj_list, getmin)
191 |     hts = []
192 |     desc = []
193 |     errs = []
194 |     std1 = rank_list[0][4]
195 | 
196 |     for tup in rank_list:
197 |         hts.append(tup[0])
198 |         desc.append(tup[1])
199 |         errs.append(2 * tup[4])
200 | 
201 |     return [hts, desc, errs, std1]
202 | 
203 | def plotGridSearchSingle(gs_obj_list, getmin = True):
204 | 
205 |     hts, desc, errs, std1 = processGsObjList(gs_obj_list, getmin = True)
206 | 
207 |     gridBarH(hts, desc, errs, std1)
208 | 
209 | 
210 | 
211 | def plotGridSearchMulti(tup_list, getmin = True):
212 |     '''
213 |     Loop through a list of gs_obj_lists. The Obj list is in the 1 slot of each value in the dict
214 |     '''
215 |     m_ht = []
216 |     m_desc = []
217 |     m_errs = []
218 | 
219 |     best_min = 1000 #This assumes we are minimizing
220 | 
221 |     for tup in tup_list:
222 |         lab = tup[0]
223 |         gs_dict = tup[1]
224 | 
225 |         for k in gs_dict:
226 |             clf = type(k).__name__.split('Classifier')[0]
227 | 
228 |             hts, desc, errs, std1 = processGsObjList(gs_dict[k][1], getmin = True)
229 |             for i, d in enumerate(desc):
230 |                 desc[i] = '{} {} {}'.format(clf, lab, d)
231 | 
232 |             if hts[0] < best_min:
233 |                 best_std1 = std1
234 |     
235 |             m_ht = m_ht + hts
236 |             m_desc = m_desc + desc
237 |             m_errs = m_errs + errs
238 | 
239 |     gridBarH(m_ht, m_desc, m_errs, best_std1, int(len(m_ht)), 12)
240 | 
241 | 
242 | 
243 | def gridBarH(hts, desc, errs, std1, h = 6, w = 12):
244 | 
245 |     fig = plt.figure(facecolor = 'w', figsize = (w, h))
246 |     ax = plt.subplot(111)
247 |     plt.subplots_adjust(bottom = 0.25)
248 | 
249 |     width = 0.5
250 |     
251 |     pos = np.arange(len(hts))
252 | 
253 |     rec = ax.barh(pos, np.array(hts), width, yerr = np.array(errs), color='r')
254 | 
255 |     ax.set_yticks(pos + width/2)
256 |     ax.set_yticklabels(desc, size = 14)
257 | 
258 |     tmp = list(hts)
259 |     tmp.sort()
260 | 
261 |     x_min = np.array(hts).min() - 2*np.array(hts).std()
262 |     x_max = tmp[-2] + 2*np.array(hts).std()
263 |     plt.xlim(x_min, x_max)
264 | 
265 | 
266 |     plt.plot(tmp[0] * np.ones(len(tmp)), pos)
267 |     plt.plot((tmp[0] + std1) * np.ones(len(tmp)), pos)
268 | 
269 | 
270 | 
271 | 
272 | 
273 | 
274 | 
275 | 
276 | 
277 | 
278 | 
279 | 
280 | 


--------------------------------------------------------------------------------
/ipython/utils/course_utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import matplotlib.pyplot as plt
  4 | import sklearn
  5 | import math
  6 | from sklearn.metrics import roc_curve, auc
  7 | import pickle
  8 | 
  9 | def evenSplit(dat,fld):
 10 |     '''
 11 |     Evenly splits the data on a given binary field, returns a shuffled dataframe
 12 |     '''    
 13 |     pos=dat[(dat[fld]==1)]
 14 |     neg=dat[(dat[fld]==0)]
 15 |     neg_shuf=neg.reindex(np.random.permutation(neg.index))
 16 |     fin_temp=pos.append(neg_shuf[:pos.shape[0]],ignore_index=True)
 17 |     fin_temp=fin_temp.reindex(np.random.permutation(fin_temp.index))
 18 |     return fin_temp
 19 | 
 20 | 
 21 | def trainTest(dat, pct):
 22 |     '''
 23 |     Randomly splits data into train and test
 24 |     '''
 25 |     dat_shuf = dat.reindex(np.random.permutation(dat.index))
 26 |     trn = dat_shuf[:int(np.floor(dat_shuf.shape[0]*pct))]
 27 |     tst = dat_shuf[int(np.floor(dat_shuf.shape[0]*pct)):]
 28 |     return [trn, tst]
 29 | 
 30 | def downSample(dat,fld,mult):
 31 |     '''
 32 |     Evenly splits the data on a given binary field, returns a shuffled dataframe
 33 |     '''
 34 |     pos=dat[(dat[fld]==1)]
 35 |     neg=dat[(dat[fld]==0)]
 36 |     neg_shuf=neg.reindex(np.random.permutation(neg.index))
 37 |     tot=min(pos.shape[0]*mult,neg.shape[0])
 38 |     fin_temp=pos.append(neg_shuf[:tot],ignore_index=True)
 39 |     fin_temp=fin_temp.reindex(np.random.permutation(fin_temp.index))
 40 |     return fin_temp
 41 | 
 42 | 
 43 | def scaleData(d):
 44 |     '''
 45 |     This function takes data and normalizes it to have the same scale (num-min)/(max-min)
 46 |     '''
 47 |     #Note, by creating df_scale like this we preserve the index
 48 |     df_scale=pd.DataFrame(d.iloc[:,1],columns=['temp'])
 49 |     for c in d.columns.values:
 50 |         df_scale[c]=(d[c]-d[c].min())/(d[c].max()-d[c].min())
 51 |     return df_scale.drop('temp',1)
 52 | 
 53 | 
 54 | def plot_dec_line(mn,mx,b0,b1,a,col,lab):
 55 |     '''
 56 |     This function plots a line in a 2 dim space
 57 |     '''
 58 |     x = np.random.uniform(mn,mx,100)
 59 |     dec_line = map(lambda x_i: -1*(x_i*b0/b1+a/b1),x)
 60 |     plt.plot(x,dec_line,col,label=lab)
 61 | 
 62 | 
 63 | 
 64 | def plotSVM(X, Y, my_svm):
 65 |     '''
 66 |     Plots the separating line along with SV's and margin lines
 67 |     Code here derived or taken from this example http://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane.html
 68 |     '''
 69 |     # get the separating hyperplane
 70 |     w = my_svm.coef_[0]
 71 |     a = -w[0] / w[1]
 72 |     xx = np.linspace(X.iloc[:,0].min(), X.iloc[:,1].max())
 73 |     yy = a * xx - (my_svm.intercept_[0]) / w[1]
 74 |     # plot the parallels to the separating hyperplane that pass through the
 75 |     # support vectors
 76 |     b = my_svm.support_vectors_[0]
 77 |     yy_down = a * xx + (b[1] - a * b[0])
 78 |     b = my_svm.support_vectors_[-1]
 79 |     yy_up = a * xx + (b[1] - a * b[0])
 80 |     # plot the line, the points, and the nearest vectors to the plane
 81 |     plt.plot(xx, yy, 'k-')
 82 |     plt.plot(xx, yy_down, 'k--')
 83 |     plt.plot(xx, yy_up, 'k--')
 84 |     plt.scatter(my_svm.support_vectors_[:, 0], my_svm.support_vectors_[:, 1], s=80, facecolors='none')
 85 |     plt.plot(X[(Y==-1)].iloc[:,0], X[(Y==-1)].iloc[:,1],'r.')
 86 |     plt.plot(X[(Y==1)].iloc[:,0], X[(Y==1)].iloc[:,1],'b+')
 87 |     #plt.axis('tight')
 88 |     #plt.show()
 89 | 
 90 | 
 91 | def getP(val):
 92 |     '''
 93 |     Get f(x) where f is the logistic function 
 94 |     '''
 95 |     return (1+math.exp(-1*val))**-1
 96 | 
 97 | def getY(val):
 98 |     '''
 99 |     Return a binary indicator based on a binomial draw with prob=f(val). f the logistic function.
100 |     '''
101 |     return (int(getP(val)>np.random.uniform(0,1,1)[0]))
102 | 
103 | def gen_logistic_dataframe(n,alpha,betas):
104 |     '''
105 |     Aa function that generates a random logistic dataset
106 |     n is the number of samples
107 |     alpha, betas are the logistic truth
108 |     '''
109 |     X = np.random.random([n,len(betas)])
110 |     Y = map(getY,X.dot(betas)+alpha)
111 |     d = pd.DataFrame(X,columns=['f'+str(j) for j in range(X.shape[1])])
112 |     d['Y'] = Y
113 |     return d
114 | 
115 | 
116 | def plotAUC(truth, pred, lab):
117 |     fpr, tpr, thresholds = roc_curve(truth, pred)
118 |     roc_auc = auc(fpr, tpr)
119 |     c = (np.random.rand(), np.random.rand(), np.random.rand())
120 |     plt.plot(fpr, tpr, color=c, label= lab+' (AUC = %0.2f)' % roc_auc)
121 |     plt.plot([0, 1], [0, 1], 'k--')
122 |     plt.xlim([0.0, 1.0])
123 |     plt.ylim([0.0, 1.0])
124 |     plt.xlabel('FPR')
125 |     plt.ylabel('TPR')
126 |     plt.title('ROC')
127 |     plt.legend(loc="lower right")
128 | 
129 | 
130 | 
131 | def LogLoss(dat, beta, alpha):
132 |     X = dat.drop('Y',1)
133 |     Y = dat['Y']
134 |     XB=X.dot(np.array(beta))+alpha*np.ones(len(Y))
135 |     P=(1+np.exp(-1*XB))**-1
136 |     return ((Y==1)*np.log(P)+(Y==0)*np.log(1-P)).mean()
137 | 
138 | 
139 | def LogLossP(Y, P):
140 |     return ((Y==1)*np.log(P)+(Y==0)*np.log(1-P)).mean()
141 | 
142 | 
143 | 
144 | 
145 | def plotSVD(sig):
146 |     norm = math.sqrt(sum(sig*sig))
147 |     energy_k = [math.sqrt(k)/norm for k in np.cumsum(sig*sig)]
148 | 
149 |     plt.figure()
150 |     ax1 = plt.subplot(211)
151 |     ax1.bar(range(len(sig+1)), [0]+sig, 0.35)
152 |     plt.title('Kth Singular Value')
153 |     plt.tick_params(axis='x',which='both',bottom='off',top='off',labelbottom='off')
154 | 
155 |     ax2 = plt.subplot(212)
156 |     plt.plot(range(len(sig)+1), [0]+energy_k)
157 |     plt.title('Normalized Sum-of-Squares of Kth Singular Value')
158 | 
159 |     ax2.set_xlabel('Kth Singular Value')
160 |     ax2.set_ylim([0, 1])
161 | 
162 | 
163 | def genY(x, err, betas):
164 |     '''
165 |     Goal: generate a Y variable as Y=XB+e 
166 |     Input
167 |     1. an np array x of length n
168 |     2. a random noise vector r of length n
169 |     3. a (d+1) x 1 vector of coefficients b - each represents ith degree of x
170 |     '''
171 |     d = pd.DataFrame(x, columns=['x'])    
172 |     y = err
173 |     for i,b in enumerate(betas):
174 |         y = y + b*x**i
175 |     d['y'] = y
176 |     return d
177 | 
178 | 
179 | def makePolyFeat(d, deg):
180 |     '''
181 |     Goal: Generate features up to X**deg
182 |     1. a data frame with two features X and Y
183 |     4. a degree 'deg' (from which we make polynomial features 
184 |     
185 |     '''
186 |     #Generate Polynomial terms
187 |     for i in range(2, deg+1):
188 |         d['x'+str(i)] = d['x']**i
189 |     return d
190 | 
191 | 
192 | def save_obj(obj, name ):
193 |     with open(name + '.pkl', 'wb') as f:
194 |         pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
195 | 
196 | def load_obj(name ):
197 |     with open(name + '.pkl', 'r') as f:
198 |         return pickle.load(f)
199 | 
200 | 
201 | 
202 | 
203 | def happyClass(sig, n):
204 |     '''
205 |     sig is the noise parameter and n is sample size
206 |     '''
207 |     eye1 = [(0.7, 0.75), 0.1]
208 |     eye2 = [(0.3, 0.75), 0.1]
209 | 
210 |     X1 = np.random.random(n)
211 |     X2 = np.random.random(n)
212 |     Y1 = 1*(((X1 - eye1[0][0])**2 + (X2 - eye1[0][1])**2 + np.random.randn(n)*sig) < eye1[1]**2)
213 |     Y2 = 1*(((X1 - eye2[0][0])**2 + (X2 - eye2[0][1])**2 + np.random.randn(n)*sig) < eye2[1]**2)
214 |     Y3 = 1*(abs(X2 - 0.1 - 4*(X1 - 0.5)**2) + np.random.randn(n)*5*sig < 0.05) * 1*(X2 < 0.5) 
215 |     
216 |     Y = 1*((Y1 + Y2 + Y3) > 0)
217 |     D = pd.DataFrame({'X1':X1, 'X2':X2})
218 |     D['Y'] = Y
219 | 
220 |     return D
221 | 
222 | 
223 | def plotZgen(clf, dat, pc, t, fig):
224 |     '''
225 |     This plots a 2d decision boundary given a trained classifier
226 |     Note the data must have two fields X1 and X2 to work
227 |     '''
228 |     plot_step = 0.02
229 |     x_min, x_max = dat['X1'].min(), dat['X1'].max()
230 |     y_min, y_max = dat['X2'].min(), dat['X2'].max()
231 |     xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),np.arange(y_min, y_max, plot_step))
232 |     Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
233 |     Z = Z.reshape(xx.shape)
234 |     ax = fig.add_subplot(pc[0], pc[1], pc[2])
235 |     cs = plt.contourf(xx, yy, Z, cmap=plt.cm.cool)
236 |     plt.plot(dat['X1'][(dat.Y==1)], dat['X2'][(noisy_test.Y==1)], 'r.', markersize = 2)
237 |     plt.title(t)
238 |     ax.axes.get_xaxis().set_visible(False)
239 |     ax.axes.get_yaxis().set_visible(False)
240 | 
241 | 


--------------------------------------------------------------------------------
/ipython/utils/course_utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/utils/course_utils.pyc


--------------------------------------------------------------------------------
/ipython/utils/eval_plots.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import matplotlib.pyplot as plt
 3 | import pandas as pd
 4 | 
 5 | def getMAE(pred, truth):
 6 |     return np.abs(truth - pred).mean()
 7 | 
 8 | def getLL(pred, truth):
 9 |     ll_sum = 0
10 |     for i in range(len(pred)):
11 |         if (pred[i] == 0):
12 |             p = 0.0001
13 |         elif (pred[i] == 1):
14 |             p = 0.9999
15 |         else:
16 |             p = pred[i]
17 |         ll_sum += truth[i]*np.log(p)+(1-truth[i])*np.log(1-p)
18 |     return (ll_sum)/len(pred)
19 | 
20 | 
21 | def plotCalib(truth, pred, bins = 100, f = 0, l = '', w = 8, h = 8, fig_i = 1, fig_j = 1, fig_k = 1):
22 |     mae = np.round(getMAE(pred, truth),3)
23 |     ll = np.round(getLL(pred, truth), 3)
24 | 
25 |     d = pd.DataFrame({'p':pred, 'y':truth})
26 |     d['p_bin'] = np.floor(d['p']*bins)/bins
27 |     d_bin = d.groupby(['p_bin']).agg([np.mean, len])
28 |     filt = (d_bin['p']['len']>f)
29 | 
30 | 
31 |     if fig_k == 1:
32 |         fig = plt.figure(facecolor = 'w', figsize = (w, h))
33 | 
34 |     x = d_bin['p']['mean'][filt]
35 |     y = d_bin['y']['mean'][filt]
36 |     n = d_bin['y']['len'][filt]
37 | 
38 |     stderr = np.sqrt(y * (1 - y)/n)
39 | 
40 |     ax = plt.subplot(fig_i, fig_j, fig_k)
41 |     #plt.plot(x, y, 'b.', markersize = 9)
42 |     plt.errorbar(x, y, yerr = 1.96 * stderr, fmt = 'o') 
43 |     plt.plot([0.0, 1.0], [0.0, 1.0], 'k-')
44 |     plt.title(l + ':' + ' MAE = {}, LL = {}'.format(mae, ll))
45 | 
46 |     plt.xlim([0.0, 1.0])
47 |     plt.ylim([0.0, 1.0])
48 |     plt.xlabel('prediction P(Y|X)')
49 |     plt.ylabel('actual P(Y|X)')
50 |     #plt.legend(loc=4)
51 |     
52 | 
53 | 
54 | def liftTable(pred, truth, b):
55 |     df = pd.DataFrame({'p':pred + np.random.rand(len(pred))*0.000001, 'y':truth})
56 |     df['b'] = b - pd.qcut(df['p'], b, labels=False)
57 |     df['n'] = np.ones(df.shape[0])
58 |     df_grp = df.groupby(['b']).sum()
59 |     tot_y = float(np.sum(df_grp['y']))
60 |     base = tot_y/float(df.shape[0])
61 |     df_grp['n_cum'] = np.cumsum(df_grp['n'])/float(df.shape[0])
62 |     df_grp['y_cum'] = np.cumsum(df_grp['y'])
63 |     df_grp['p_y_b'] = df_grp['y']/df_grp['n']
64 |     df_grp['lift_b'] = df_grp['p_y_b']/base
65 |     df_grp['cum_lift_b'] = (df_grp['y_cum']/(float(df.shape[0])*df_grp['n_cum']))/base
66 |     df_grp['recall'] = df_grp['y_cum']/tot_y
67 |     return df_grp
68 | 
69 | 
70 | def liftRecallCurve(pred, truth, b, h = 6, w = 12, title = ''):
71 | 
72 |     #Get the lift table
73 |     lt = liftTable(pred, truth, b)
74 | 
75 |     fig, ax1 = plt.subplots(figsize = (w, h))
76 | 
77 |     ax1.plot(lt['n_cum'], lt['cum_lift_b'], 'b-')
78 | 
79 |     ax1.set_xlabel('Quantile')
80 |     # Make the y-axis label and tick labels match the line color.
81 |     ax1.set_ylabel('Lift', color='b')
82 |     for tl in ax1.get_yticklabels():
83 |         tl.set_color('b')
84 | 
85 |     ax2 = ax1.twinx()
86 |     ax2.plot(lt['n_cum'], lt['recall'], 'r.')
87 |     ax2.set_ylabel('Recall', color='r')
88 |     for tl in ax2.get_yticklabels():
89 |         tl.set_color('r')
90 | 
91 |     plt.title(title)
92 | 
93 |     plt.show()
94 | 
95 | 


--------------------------------------------------------------------------------
/ipython/utils/eval_plots.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kmunger/DataScienceCourse/1296e7a011bdaed9bc30991bed2ed8670acaa6e6/ipython/utils/eval_plots.pyc


--------------------------------------------------------------------------------