├── labs ├── figures │ └── conditionalmean.png ├── Lab10_Notes.ipynb └── Lab4_Notes.ipynb ├── lectures ├── lecture11 │ ├── images │ │ ├── KNN.gif │ │ ├── KNN.png │ │ ├── knn_fred.png │ │ ├── tangent_space.png │ │ └── digits_rotated.png │ └── custom.css ├── lecture02 │ ├── images │ │ ├── ipython.png │ │ ├── cute_panda.jpg │ │ ├── grouplens.jpg │ │ ├── pandas_book.jpg │ │ └── robots_txt.jpg │ └── custom.css ├── 2014_10_02-lecture │ └── leaders.png └── 2014_09_23-lecture │ └── images │ ├── ipython.png │ ├── wiz_oz.png │ ├── grouplens.jpg │ ├── cute_panda.jpg │ ├── pandas_book.jpg │ └── robots_txt.jpg ├── README.md ├── .gitignore └── homework ├── HW4.ipynb ├── HW1.ipynb ├── HW5.ipynb ├── HW3.ipynb └── HW2.ipynb /labs/figures/conditionalmean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/labs/figures/conditionalmean.png -------------------------------------------------------------------------------- /lectures/lecture11/images/KNN.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/lecture11/images/KNN.gif -------------------------------------------------------------------------------- /lectures/lecture11/images/KNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/lecture11/images/KNN.png -------------------------------------------------------------------------------- /lectures/lecture02/images/ipython.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/lecture02/images/ipython.png -------------------------------------------------------------------------------- /lectures/lecture11/images/knn_fred.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/lecture11/images/knn_fred.png -------------------------------------------------------------------------------- /lectures/2014_10_02-lecture/leaders.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/2014_10_02-lecture/leaders.png -------------------------------------------------------------------------------- /lectures/lecture02/images/cute_panda.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/lecture02/images/cute_panda.jpg -------------------------------------------------------------------------------- /lectures/lecture02/images/grouplens.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/lecture02/images/grouplens.jpg -------------------------------------------------------------------------------- /lectures/lecture02/images/pandas_book.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/lecture02/images/pandas_book.jpg -------------------------------------------------------------------------------- /lectures/lecture02/images/robots_txt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/lecture02/images/robots_txt.jpg 
-------------------------------------------------------------------------------- /lectures/lecture11/images/tangent_space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/lecture11/images/tangent_space.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Welcome to CS109 Data Science 2 | 3 | Official course page here: [http://cs109.github.io/2014](http://cs109.github.io/2014) -------------------------------------------------------------------------------- /lectures/2014_09_23-lecture/images/ipython.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/2014_09_23-lecture/images/ipython.png -------------------------------------------------------------------------------- /lectures/2014_09_23-lecture/images/wiz_oz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/2014_09_23-lecture/images/wiz_oz.png -------------------------------------------------------------------------------- /lectures/lecture11/images/digits_rotated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/lecture11/images/digits_rotated.png -------------------------------------------------------------------------------- /lectures/2014_09_23-lecture/images/grouplens.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/2014_09_23-lecture/images/grouplens.jpg -------------------------------------------------------------------------------- /lectures/2014_09_23-lecture/images/cute_panda.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/2014_09_23-lecture/images/cute_panda.jpg -------------------------------------------------------------------------------- /lectures/2014_09_23-lecture/images/pandas_book.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/2014_09_23-lecture/images/pandas_book.jpg -------------------------------------------------------------------------------- /lectures/2014_09_23-lecture/images/robots_txt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/2014/HEAD/lectures/2014_09_23-lecture/images/robots_txt.jpg -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | homework/.ipynb_checkpoints/ 3 | homework-solutions/.ipynb_checkpoints/ 4 | labs/.ipynb_checkpoints/ 5 | lectures/2014_09_25-lecture/.ipynb_checkpoints/ 6 | lectures/2014_09_30-lecture/.ipynb_checkpoints/ 7 | lectures/2014_10_02-lecture/.ipynb_checkpoints/ 8 | 9 | -------------------------------------------------------------------------------- /lectures/lecture02/custom.css: -------------------------------------------------------------------------------- 1 | 65 | -------------------------------------------------------------------------------- /lectures/lecture11/custom.css: 
-------------------------------------------------------------------------------- 1 | 69 | -------------------------------------------------------------------------------- /homework/HW4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:6cf3ee9e9432c59a53fa4242f34c0275b9073842a6f526c20c0f9b078d7f251f" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# Homework 4: Can you predict the Midterm Elections?\n", 16 | "\n", 17 | "Due: Monday, November 3, 2014 11:59 PM\n", 18 | "\n", 19 | " Download this assignment\n", 20 | "\n", 21 | "#### Submission Instructions\n", 22 | "To submit your homework, create a folder named lastname_firstinitial_hw# and place your IPython notebooks, data files, and any other files in this folder. Your IPython Notebooks should be completely executed with the results visible in the notebook. We should not have to run any code. Compress the folder (please use .zip compression) and submit to the CS109 dropbox in the appropriate folder. If we cannot access your work because these directions are not followed correctly, we will not grade your work. For the competition (problem 4), we will post a link on Piazza to a Google Form for you to submit your predictions. \n", 23 | "\n", 24 | "\n", 25 | "---\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Introduction\n", 33 | "\n", 34 | "**Add Introduction**\n", 35 | "\n", 36 | "You will use the [HuffPost Pollster API](http://elections.huffingtonpost.com/pollster/api) to extract the polls for the current 2014 Senate Midterm Elections and provide a final prediction of the result in each state.\n", 37 | "\n", 38 | "#### Data\n", 39 | "\n", 40 | "We will use the polls from the [2014 Senate Midterm Elections](http://elections.huffingtonpost.com/pollster) from the [HuffPost Pollster API](http://elections.huffingtonpost.com/pollster/api). \n", 41 | "\n", 42 | "---" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Problem 1: Data Wrangling\n", 50 | "\n", 51 | "We will read in the polls from the [2014 Senate Midterm Elections](http://elections.huffingtonpost.com/pollster) from the [HuffPost Pollster API](http://elections.huffingtonpost.com/pollster/api) and create a dictionary of DataFrames as well as a master table of information for each race." 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "#### Problem 1(a)\n", 59 | "\n", 60 | "Read in [this JSON object](http://elections.huffingtonpost.com/pollster/api/charts/?topic=2014-senate) containing the polls for the 2014 Senate Elections using the HuffPost API. Call this JSON object `info`. This JSON object is imported as a list in Python where each element contains the information for one race. Use the function `type` to confirm that `info` is a list. " 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "collapsed": false, 66 | "input": [ 67 | "### Your code here ###" 68 | ], 69 | "language": "python", 70 | "metadata": {}, 71 | "outputs": [], 72 | "prompt_number": 1 73 | },
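{ "cell_type": "markdown", "metadata": {}, "source": [ "One possible way to read in the JSON object (a sketch using `requests`, assuming the endpoint above is still live; any equivalent approach works):" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import requests\n", "\n", "url = \"http://elections.huffingtonpost.com/pollster/api/charts/?topic=2014-senate\"\n", "info = requests.get(url).json()  # parse the JSON response body into Python objects\n", "print type(info)  # should print <type 'list'>" ], "language": "python", "metadata": {}, "outputs": [] },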
74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "#### Problem 1(b)\n", 79 | "\n", 80 | "For each element of the list in `info` extract the state. We should have one poll per state, but we do not. Why?\n", 81 | "\n", 82 | "**Hint**: Use the internet to find out information on the races in each state that has more than one entry. Eliminate entries of the list that represent races that are not happening." 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "collapsed": false, 88 | "input": [ 89 | "### Your code here ###" 90 | ], 91 | "language": "python", 92 | "metadata": {}, 93 | "outputs": [], 94 | "prompt_number": 40 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "** Your answer here: **" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "#### Problem 1(c)\n", 108 | "\n", 109 | "Create a dictionary of pandas DataFrames called `polls` keyed by the name of the election (a string). Each value in the dictionary should contain the polls for one of the races." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "collapsed": false, 115 | "input": [ 116 | "### Your code here ###" 117 | ], 118 | "language": "python", 119 | "metadata": {}, 120 | "outputs": [], 121 | "prompt_number": 41 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "#### Problem 1(d)\n", 128 | "\n", 129 | "Now create a master table containing information about each race. Create a pandas DataFrame called `candidates` with rows containing information about each race. The `candidates` DataFrame should have the following columns: \n", 130 | "\n", 131 | "1. `State` = the state where the race is being held\n", 132 | "2. `R` = name of the Republican candidate\n", 133 | "3. `D` = name of the non-Republican candidate (Democrat or Independent) \n", 134 | "4. `incumbent` = R, D or NA\n", 135 | "\n", 136 | "**Hint**: You will need a considerable amount of data wrangling for this." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "collapsed": false, 142 | "input": [ 143 | "### Your code here ###" 144 | ], 145 | "language": "python", 146 | "metadata": {}, 147 | "outputs": [], 148 | "prompt_number": 2 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "## Problem 2: Confidence Intervals\n", 155 | "\n", 156 | "Compute a 99% confidence interval for each state. " 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "#### Problem 2(a)\n", 164 | "\n", 165 | "Assume you have $M$ polls with sample sizes $n_1, \dots, n_M$. If the polls are independent, what is the average of the variances of each poll if the true proportion is $p$?" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "** Your answer here: **" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "#### Problem 2(b)\n", 180 | "\n", 181 | "Compute the square root of these values in Problem 2(a) for the Republican candidates in each race. Then, compute the standard deviations of the observed poll results for each race. " 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "collapsed": false, 187 | "input": [ 188 | "### Your code here ###" 189 | ], 190 | "language": "python", 191 | "metadata": {}, 192 | "outputs": [] 193 | },
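{ "cell_type": "markdown", "metadata": {}, "source": [ "A sketch of the 2(b) computation for a single race; `rep` (observed Republican percentages) and `n` (poll sample sizes) are hypothetical inputs standing in for whatever columns your wrangled DataFrames use:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "\n", "def race_sds(rep, n):\n", "    # rep: observed Republican poll percentages; n: sample sizes (hypothetical inputs)\n", "    p = np.mean(rep) / 100.0  # estimate of the true proportion\n", "    theory = np.sqrt(np.mean(p * (1 - p) / np.asarray(n, dtype=float)))  # sqrt of the 2(a) average variance\n", "    observed = np.std(np.asarray(rep, dtype=float) / 100.0, ddof=1)  # SD of the observed results\n", "    return theory, observed\n", "\n", "print race_sds([48.0, 46.0, 50.0], [600, 800, 1000])" ], "language": "python", "metadata": {}, "outputs": [] },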
194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "#### Problem 2(c) \n", 199 | "\n", 200 | "Plot observed versus theoretical (average of the theoretical SDs) with the area of each point proportional to the number of polls. How do these compare?" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "collapsed": false, 206 | "input": [ 207 | "### Your code here ###" 208 | ], 209 | "language": "python", 210 | "metadata": {}, 211 | "outputs": [] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "** Your answer here: **" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "#### Problem 2(d)\n", 225 | "\n", 226 | "Repeat Problem 2(c) but include only the most recent polls from the last two months. Do they match better, worse, or about the same? Can we just trust the theoretical values?" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "collapsed": false, 232 | "input": [ 233 | "### Your code here ###" 234 | ], 235 | "language": "python", 236 | "metadata": {}, 237 | "outputs": [] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "** Your answer here: **" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "#### Problem 2(e)\n", 251 | "\n", 252 | "Create a scatter plot with each point representing one state. Are there one or more races that are outliers, in that they have much larger variability than expected? Explore the original poll data and explain the discrepancy." 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "collapsed": false, 258 | "input": [ 259 | "### Your code here ###" 260 | ], 261 | "language": "python", 262 | "metadata": {}, 263 | "outputs": [] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "** Your answer here: **" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "#### Problem 2(f)\n", 277 | "\n", 278 | "Construct confidence intervals for the difference in each race. Use either theoretical or data-driven estimates of the standard error, depending on your answer to the previous question. Use the results from Problem 2(e) to justify your choice.\n" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "collapsed": false, 284 | "input": [ 285 | "### Your code here ###" 286 | ], 287 | "language": "python", 288 | "metadata": {}, 289 | "outputs": [] 290 | },
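{ "cell_type": "markdown", "metadata": {}, "source": [ "One hedged way to build the 2(f) interval for a single race; `rep` and `dem` are hypothetical lists of the two candidates' poll percentages, and you can swap the data-driven standard error for the theoretical one if your answer to 2(e) justifies it:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import numpy as np\n", "from scipy.stats import norm\n", "\n", "def diff_ci(rep, dem, level=0.99):\n", "    # rep, dem: poll percentages for the two candidates in one race (hypothetical inputs)\n", "    d = np.asarray(rep, dtype=float) - np.asarray(dem, dtype=float)\n", "    se = d.std(ddof=1) / np.sqrt(len(d))  # data-driven standard error of the mean difference\n", "    z = norm.ppf(0.5 + level / 2.0)  # two-sided critical value\n", "    return d.mean() - z * se, d.mean() + z * se\n", "\n", "print diff_ci([48, 46, 47], [44, 45, 43])" ], "language": "python", "metadata": {}, "outputs": [] },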
" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "**Your answer here:**" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "#### Problem 3(c)\n", 339 | "\n", 340 | "The prior represents what we think before hand. We do not know who is expected to win, so we assume $\\mu=0$. For this problem estimate $\\tau$ using the observed differences across states (Hint: $\\tau$ represents the standard deviation of a typical difference). Compute the posterior mean for each state and plot it against original average. Is there much change? Why or why not? " 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "collapsed": false, 346 | "input": [ 347 | "### Your code here ###" 348 | ], 349 | "language": "python", 350 | "metadata": {}, 351 | "outputs": [] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "**Your answer here:**" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "#### Problem 3(d)\n", 365 | "\n", 366 | "For each state, report a probabilty of Republicans winning. How does your answer here compare to the other aggregators?" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "collapsed": false, 372 | "input": [ 373 | "### Your code here ###" 374 | ], 375 | "language": "python", 376 | "metadata": {}, 377 | "outputs": [] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "**Your answer here:**" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "#### Problem 3(e)\n", 391 | "\n", 392 | "Use the posterior distributions in a Monte Carlo simulation to generate election results. In each simulation compute the total number of seats the Republican control. Show a histogram of these results." 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "collapsed": false, 398 | "input": [ 399 | "### Your code here ###" 400 | ], 401 | "language": "python", 402 | "metadata": {}, 403 | "outputs": [] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "## Problem 4\n", 410 | "\n", 411 | "Predict the results for the 2014 Midterm Elections. We will have a three competitions with the terms for scoring entries described above. For both questions below, **explain** or provide commentary on how you arrived at your predictions including code. \n", 412 | "\n", 413 | "**Hint**: Use election results from 2010, 2012 to build and test models." 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "#### Problem 4(a)\n", 421 | "\n", 422 | "Predict the number of Republican senators. You may provide an interval. Smallest interval that includes the election day result wins. \n", 423 | "\n", 424 | "**Note**: we want the total so add the numbers of those that are not up for election." 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "collapsed": false, 430 | "input": [ 431 | "### Your code here ###" 432 | ], 433 | "language": "python", 434 | "metadata": {}, 435 | "outputs": [] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "**Provide an explanation of methodology here**:" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "#### Problem 4(b)\n", 449 | "\n", 450 | "Predict the R-D difference in each state. 
444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "#### Problem 4(b)\n", 449 | "\n", 450 | "Predict the R-D difference in each state. The prediction that minimizes the residual sum of squares between predicted and observed differences wins." 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "collapsed": false, 456 | "input": [ 457 | "### Your code here ###" 458 | ], 459 | "language": "python", 460 | "metadata": {}, 461 | "outputs": [] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "**Provide an explanation of methodology here**:" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "#### Problem 4(c)\n", 475 | "\n", 476 | "Report a confidence interval for the R-D difference in each state. If the election-day result falls outside your confidence interval in more than two states, you are eliminated. For those surviving this cutoff, we will add up the lengths of all your confidence intervals. The smallest total length of confidence intervals wins. \n", 477 | "\n", 478 | "**Note**: you can use Bayesian credible intervals or whatever else you want. " 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "collapsed": false, 484 | "input": [ 485 | "### Your code here ###" 486 | ], 487 | "language": "python", 488 | "metadata": {}, 489 | "outputs": [] 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "**Provide an explanation of methodology here**:" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "# Submission Instructions\n", 503 | "\n", 504 | "To submit your homework, create a folder named **lastname_firstinitial_hw#** and place your IPython notebooks, data files, and any other files in this folder. Your IPython Notebooks should be completely executed with the results visible in the notebook. We should not have to run any code. Compress the folder (please use .zip compression) and submit to the CS109 dropbox in the appropriate folder. *If we cannot access your work because these directions are not followed correctly, we will not grade your work.*\n" 505 | ] 506 | } 507 | ], 508 | "metadata": {} 509 | } 510 | ] 511 | } -------------------------------------------------------------------------------- /homework/HW1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:d70152474cb236c2ba795571cbec8aea6182dc4faf74613b4705b605c2c87592" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# Homework 1. Exploratory Data Analysis\n", 16 | "\n", 17 | "Due: Thursday, September 18, 2014 11:59 PM\n", 18 | "\n", 19 | " Download this assignment\n", 20 | "\n", 21 | "---" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Introduction\n", 29 | "\n", 30 | "In this homework we ask you three questions that we expect you to answer using data. For each question we ask you to complete a series of tasks that should help guide you through the data analysis. Complete these tasks and then write a short (100 words or less) answer to the question.\n", 31 | "\n", 32 | "**Note**: We will briefly discuss this homework assignment on Thursday in class.\n", 33 | "\n", 34 | "#### Data\n", 35 | "For this assignment we will use two databases: \n", 36 | "\n", 37 | "1. 
The [Sean Lahman's Baseball Database](http://seanlahman.com/baseball-archive/statistics), which contains the \"complete batting and pitching statistics from 1871 to 2013, plus fielding statistics, standings, team stats, managerial records, post-season data, and more. For more details on the latest release, please [read the documentation](http://seanlahman.com/files/database/readme2012.txt).\"\n", 38 | "\n", 39 | "2. [Gapminder](http://www.gapminder.org) is a great resource that contains over [500 data sets](http://www.gapminder.org/data/) related to world indicators such as income, GDP and life expectancy. \n", 40 | "\n", 41 | "\n", 42 | "#### Purpose\n", 43 | "\n", 44 | "In this assignment, you will learn how to: \n", 45 | "\n", 46 | "a. Load in CSV files from the web. \n", 47 | "\n", 48 | "b. Create functions in Python. \n", 49 | "\n", 50 | "c. Create plots and summary statistics for exploratory data analysis such as histograms, boxplots and scatter plots. \n", 51 | "\n", 52 | "\n", 53 | "#### Useful libraries for this assignment \n", 54 | "\n", 55 | "* [numpy](http://docs.scipy.org/doc/numpy-dev/user/index.html), for arrays\n", 56 | "* [pandas](http://pandas.pydata.org/), for data frames\n", 57 | "* [matplotlib](http://matplotlib.org/), for plotting\n", 58 | "\n", 59 | "---" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "collapsed": false, 65 | "input": [ 66 | "# special IPython command to prepare the notebook for matplotlib\n", 67 | "%matplotlib inline \n", 68 | "\n", 69 | "import numpy as np\n", 70 | "import pandas as pd\n", 71 | "import matplotlib.pyplot as plt" 72 | ], 73 | "language": "python", 74 | "metadata": {}, 75 | "outputs": [], 76 | "prompt_number": 1 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## Problem 1\n", 83 | "\n", 84 | "In Lecture 1, we showed a plot that provided evidence that the 2002 and 2003 Oakland A's, a team that used data science, had a competitive advantage. Since then, other teams have started using data science as well. Use exploratory data analysis to determine if the competitive advantage has since disappeared. " 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "#### Problem 1(a) \n", 92 | "Load in [these CSV files](http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip) from the [Sean Lahman's Baseball Database](http://seanlahman.com/baseball-archive/statistics). For this assignment, we will use the 'Salaries.csv' and 'Teams.csv' tables. Read these tables into a pandas `DataFrame` and show the head of each table. \n", 93 | "\n", 94 | "**Hint**: Use the [requests](http://docs.python-requests.org/en/latest/), [StringIO](http://docs.python.org/2/library/stringio.html) and [zipfile](https://docs.python.org/2/library/zipfile.html) modules to get the data from the web. " 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "collapsed": false, 100 | "input": [ 101 | "#your code here" 102 | ], 103 | "language": "python", 104 | "metadata": {}, 105 | "outputs": [], 106 | "prompt_number": 2 107 | },
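{ "cell_type": "markdown", "metadata": {}, "source": [ "A sketch of the download step using the hinted modules (Python 2 idioms to match this notebook); the zip URL is the one linked above:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import requests\n", "import zipfile\n", "import StringIO\n", "\n", "url = \"http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip\"\n", "z = zipfile.ZipFile(StringIO.StringIO(requests.get(url).content))  # treat the downloaded bytes as a file\n", "salaries = pd.read_csv(z.open(\"Salaries.csv\"))\n", "teams = pd.read_csv(z.open(\"Teams.csv\"))\n", "print salaries.head()\n", "print teams.head()" ], "language": "python", "metadata": {}, "outputs": [] },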
" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "collapsed": false, 120 | "input": [ 121 | "#your code here" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [], 126 | "prompt_number": 3 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "#### Problem 1(c)\n", 133 | "\n", 134 | "Merge the new summarized Salaries DataFrame and Teams DataFrame together to create a new DataFrame\n", 135 | "showing wins and total salaries for each team for each year year. Show the head of the new merged DataFrame.\n", 136 | "\n", 137 | "**Hint**: Merge the DataFrames using `teamID` and `yearID`." 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "collapsed": false, 143 | "input": [ 144 | "#your code here" 145 | ], 146 | "language": "python", 147 | "metadata": {}, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "#### Problem 1(d)\n", 155 | "\n", 156 | "How would you graphically display the relationship between total wins and total salaries for a given year? What kind of plot would be best? Choose a plot to show this relationship and specifically annotate the Oakland baseball team on the on the plot. Show this plot across multiple years. In which years can you detect a competitive advantage from the Oakland baseball team of using data science? When did this end? \n", 157 | "\n", 158 | "**Hints**: Use a `for` loop to consider multiple years. Use the `teamID` (three letter representation of the team name) to save space on the plot. " 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "collapsed": false, 164 | "input": [ 165 | "#your code here" 166 | ], 167 | "language": "python", 168 | "metadata": {}, 169 | "outputs": [], 170 | "prompt_number": 4 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "#### Problem 1(e):\n", 177 | "\n", 178 | "**For AC209 Students**: Fit a linear regression to the data from each year and obtain the residuals. Plot the residuals against time to detect patterns that support your answer in 1(d). " 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "collapsed": false, 184 | "input": [ 185 | "#your code here" 186 | ], 187 | "language": "python", 188 | "metadata": {}, 189 | "outputs": [], 190 | "prompt_number": 5 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## Discussion for Problem 1\n", 197 | "\n", 198 | "*Write a brief discussion of your conclusions to the questions and tasks above in 100 words or less.*\n", 199 | "\n", 200 | "---\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "## Problem 2\n", 208 | "\n", 209 | "Several media reports have demonstrated the income inequality has increased in the US during this last decade. Here we will look at global data. Use exploratory data analysis to determine if the gap between Africa/Latin America/Asia and Europe/NorthAmerica has increased, decreased or stayed the same during the last two decades. " 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "#### Problem 2(a)\n", 217 | "\n", 218 | "Using the list of countries by continent from [World Atlas](http://www.worldatlas.com/cntycont.htm) data, load in the `countries.csv` file into a pandas DataFrame and name this data set as `countries`. 
150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "#### Problem 1(d)\n", 155 | "\n", 156 | "How would you graphically display the relationship between total wins and total salaries for a given year? What kind of plot would be best? Choose a plot to show this relationship and specifically annotate the Oakland baseball team on the plot. Show this plot across multiple years. In which years can you detect a competitive advantage from the Oakland baseball team's use of data science? When did this end? \n", 157 | "\n", 158 | "**Hints**: Use a `for` loop to consider multiple years. Use the `teamID` (three-letter representation of the team name) to save space on the plot. " 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "collapsed": false, 164 | "input": [ 165 | "#your code here" 166 | ], 167 | "language": "python", 168 | "metadata": {}, 169 | "outputs": [], 170 | "prompt_number": 4 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "#### Problem 1(e):\n", 177 | "\n", 178 | "**For AC209 Students**: Fit a linear regression to the data from each year and obtain the residuals. Plot the residuals against time to detect patterns that support your answer in 1(d). " 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "collapsed": false, 184 | "input": [ 185 | "#your code here" 186 | ], 187 | "language": "python", 188 | "metadata": {}, 189 | "outputs": [], 190 | "prompt_number": 5 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## Discussion for Problem 1\n", 197 | "\n", 198 | "*Write a brief discussion of your conclusions to the questions and tasks above in 100 words or less.*\n", 199 | "\n", 200 | "---\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "## Problem 2\n", 208 | "\n", 209 | "Several media reports have demonstrated that income inequality has increased in the US during the last decade. Here we will look at global data. Use exploratory data analysis to determine if the gap between Africa/Latin America/Asia and Europe/North America has increased, decreased or stayed the same during the last two decades. " 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "#### Problem 2(a)\n", 217 | "\n", 218 | "Using the list of countries by continent from [World Atlas](http://www.worldatlas.com/cntycont.htm) data, load the `countries.csv` file into a pandas DataFrame and name this data set `countries`. This data set can be found on Github in the 2014_data repository [here](https://github.com/cs109/2014_data/blob/master/countries.csv). " 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "collapsed": false, 224 | "input": [ 225 | "#your code here" 226 | ], 227 | "language": "python", 228 | "metadata": {}, 229 | "outputs": [], 230 | "prompt_number": 6 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "Using the [data available on Gapminder](http://www.gapminder.org/data/), load in the [Income per person (GDP/capita, PPP$ inflation-adjusted)](https://spreadsheets.google.com/pub?key=phAwcNAVuyj1jiMAkmq1iMg&gid=0) as a pandas DataFrame and name this data set `income`.\n", 237 | "\n", 238 | "**Hint**: Consider using the pandas function `pandas.read_excel()` to read in the .xlsx file directly." 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "collapsed": false, 244 | "input": [ 245 | "#your code here" 246 | ], 247 | "language": "python", 248 | "metadata": {}, 249 | "outputs": [], 250 | "prompt_number": 7 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "Transform the data set to have years as the rows and countries as the columns. Show the head of this data set when it is loaded. " 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "collapsed": false, 262 | "input": [ 263 | "#your code here" 264 | ], 265 | "language": "python", 266 | "metadata": {}, 267 | "outputs": [], 268 | "prompt_number": 8 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "#### Problem 2(b)\n", 275 | "\n", 276 | "Graphically display the distribution of income per person across all countries in the world for any given year (e.g. 2000). What kind of plot would be best? " 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "collapsed": false, 282 | "input": [ 283 | "#your code here" 284 | ], 285 | "language": "python", 286 | "metadata": {}, 287 | "outputs": [], 288 | "prompt_number": 9 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "#### Problem 2(c)\n", 295 | "\n", 296 | "Write a function to merge the `countries` and `income` data sets for any given year. " 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "collapsed": false, 302 | "input": [ 303 | "\"\"\"\n", 304 | "Function\n", 305 | "--------\n", 306 | "mergeByYear\n", 307 | "\n", 308 | "Return a merged DataFrame containing the income, \n", 309 | "country name and region for a given year. \n", 310 | "\n", 311 | "Parameters\n", 312 | "----------\n", 313 | "year : int\n", 314 | " The year of interest\n", 315 | "\n", 316 | "Returns\n", 317 | "-------\n", 318 | "a DataFrame\n", 319 | " A pandas DataFrame with three columns titled \n", 320 | " 'Country', 'Region', and 'Income'. \n", 321 | "\n", 322 | "Example\n", 323 | "-------\n", 324 | ">>> mergeByYear(2010)\n", 325 | "\"\"\"\n", 326 | "#your code here" 327 | ], 328 | "language": "python", 329 | "metadata": {}, 330 | "outputs": [] 331 | },
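{ "cell_type": "markdown", "metadata": {}, "source": [ "One hedged implementation matching the docstring above; it assumes `countries` (with `Country` and `Region` columns) and the reshaped `income` (years as rows, countries as columns) from the earlier cells:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def mergeByYear(year):\n", "    # pull one year's row of income, reshape it into Country/Income pairs, then merge\n", "    inc = income.ix[year].reset_index()\n", "    inc.columns = ['Country', 'Income']\n", "    return pd.merge(countries, inc, on='Country')\n", "\n", "# example: mergeByYear(2010).head()" ], "language": "python", "metadata": {}, "outputs": [] },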
" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "collapsed": false, 346 | "input": [ 347 | "#your code here" 348 | ], 349 | "language": "python", 350 | "metadata": {}, 351 | "outputs": [], 352 | "prompt_number": 11 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "## Discussion for Problem 2\n", 359 | "\n", 360 | "*Write a brief discussion of your conclusions to the questions and tasks above in 100 words or less.*\n", 361 | "\n", 362 | "---\n" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "## Problem 3\n", 370 | "\n", 371 | "In general, if group A has larger values than group B on average, does this mean the largest values are from group A? Discuss after completing each of the problems below. " 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "#### Problem 3(a)\n", 379 | "\n", 380 | "Assume you have two list of numbers, X and Y, with distribution approximately normal. X and Y have standard deviation equal to 1, but the average of X is different from the average of Y. If the difference in the average of X and the average of Y is larger than 0, how does the proportion of X > a compare to the proportion of Y > a? " 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "Write a function that analytically calculates the ratio of these two proportions: Pr(X > a)/Pr(Y > a) as function of the difference in the average of X and the average of Y. \n", 388 | "\n", 389 | "**Hint**: Use the `scipy.stats` module for useful functions related to a normal random variable such as the probability density function, cumulative distribution function and survival function. \n", 390 | "\n", 391 | "**Update**: Assume Y is normally distributed with mean equal to 0. " 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "Show the curve for different values of a (a = 2,3,4 and 5)." 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "collapsed": false, 404 | "input": [ 405 | "\"\"\"\n", 406 | "Function\n", 407 | "--------\n", 408 | "ratioNormals\n", 409 | "\n", 410 | "Return ratio of these two proportions: \n", 411 | " Pr(X > a)/Pr(Y > a) as function of \n", 412 | " the difference in the average of X \n", 413 | " and the average of Y. \n", 414 | "\n", 415 | "Parameters\n", 416 | "----------\n", 417 | "diff : difference in the average of X \n", 418 | " and the average of Y. \n", 419 | "a : cutoff value\n", 420 | "\n", 421 | "Returns\n", 422 | "-------\n", 423 | "Returns ratio of these two proportions: \n", 424 | " Pr(X > a)/Pr(Y > a)\n", 425 | " \n", 426 | "Example\n", 427 | "-------\n", 428 | ">>> ratioNormals(diff = 1, a = 2)\n", 429 | "\"\"\"\n", 430 | "#your code here" 431 | ], 432 | "language": "python", 433 | "metadata": {}, 434 | "outputs": [] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "collapsed": false, 439 | "input": [ 440 | "#your code here" 441 | ], 442 | "language": "python", 443 | "metadata": {}, 444 | "outputs": [], 445 | "prompt_number": 13 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "#### Problem 3(b)\n", 452 | "\n", 453 | "Now consider the distribution of income per person from two regions: Asia and South America. Estimate the average income per person across the countries in those two regions. 
447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "#### Problem 3(b)\n", 452 | "\n", 453 | "Now consider the distribution of income per person from two regions: Asia and South America. Estimate the average income per person across the countries in those two regions. Which region has the larger average income per person across the countries in that region? \n", 454 | "\n", 455 | "**Update**: Use the year 2012. " 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "collapsed": false, 461 | "input": [ 462 | "#your code here" 463 | ], 464 | "language": "python", 465 | "metadata": {}, 466 | "outputs": [], 467 | "prompt_number": 14 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "#### Problem 3(c) \n", 474 | "\n", 475 | "Calculate the proportion of countries with income per person that is greater than 10,000 dollars. Which region has a larger proportion of countries with income per person greater than 10,000 dollars? If the answer here is different from the answer in 3(b), explain why in light of your answer to 3(a).\n", 476 | "\n", 477 | "**Update**: Use the year 2012. " 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "collapsed": false, 483 | "input": [ 484 | "#your code here" 485 | ], 486 | "language": "python", 487 | "metadata": {}, 488 | "outputs": [], 489 | "prompt_number": 15 490 | },
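{ "cell_type": "markdown", "metadata": {}, "source": [ "A sketch for 3(b) and 3(c), reusing the hypothetical `mergeByYear` from Problem 2; the region spellings are assumptions about the World Atlas file:" ] }, { "cell_type": "code", "collapsed": false, "input": [ "df = mergeByYear(2012)\n", "sub = df[df.Region.isin(['ASIA', 'SOUTH AMERICA'])]  # assumed region labels\n", "print sub.groupby('Region').Income.mean()  # 3(b): average across countries\n", "print sub.groupby('Region').Income.apply(lambda x: (x > 10000).mean())  # 3(c): proportion above $10,000" ], "language": "python", "metadata": {}, "outputs": [] },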
491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "#### Problem 3(d)\n", 496 | "\n", 497 | "**For AC209 Students**: Re-run this analysis in Problem 3 but compute the average income per person for each region, instead of the average of the reported incomes per person across countries in the region. Why are these two different? Hint: use this [data set](https://spreadsheets.google.com/pub?key=phAwcNAVuyj0XOoBL_n5tAQ&gid=0). " 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "collapsed": false, 503 | "input": [ 504 | "#your code here" 505 | ], 506 | "language": "python", 507 | "metadata": {}, 508 | "outputs": [], 509 | "prompt_number": 16 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "## Discussion for Problem 3\n", 516 | "\n", 517 | "*Write a brief discussion of your conclusions to the questions and tasks above in 100 words or less.*\n", 518 | "\n", 519 | "---\n" 520 | ] 521 | } 522 | ], 523 | "metadata": {} 524 | } 525 | ] 526 | } -------------------------------------------------------------------------------- /labs/Lab10_Notes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "import numpy as np\n", 15 | "import scipy as sp\n", 16 | "import pandas as pd\n", 17 | "import sklearn\n", 18 | "import seaborn as sns\n", 19 | "from matplotlib import pyplot as plt\n", 20 | "%matplotlib inline\n", 21 | "import time" 22 | ], 23 | "language": "python", 24 | "metadata": {}, 25 | "outputs": [], 26 | "prompt_number": 2 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "# Lab 10: Distributed Computing and Machine Learning\n", 33 | "\n", 34 | "In this lab we are going to talk about two fascinating topics: 1) distributed computing and 2) machine learning. \n", 35 | "\n", 36 | "Distributed computing is becoming increasingly important because with the massive amount of data we have, it is becoming increasingly hard to get any knowledge using a single machine. However, using many computers we can distribute our work to many machines (nodes) and then we can reduce the result.\n", 37 | "\n", 38 | "\n", 39 | "\n", 40 | "## MRJob: Parallel Computing with Python\n", 41 | "### Simple word count example:" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "collapsed": false, 47 | "input": [ 48 | "# MRJob\n", 49 | "import os\n", 50 | "mr_count = \"\"\"\n", 51 | "from mrjob.job import MRJob\n", 52 | "import time\n", 53 | "\n", 54 | "class MRWordFrequencyCount(MRJob):\n", 55 | "\n", 56 | " def mapper(self, _, line):\n", 57 | " yield \"chars\", len(line)\n", 58 | " yield \"words\", len(line.split())\n", 59 | " yield \"lines\", 1\n", 60 | "\n", 61 | " def reducer(self, key, values):\n", 62 | " yield key, sum(values)\n", 63 | "\n", 64 | "\n", 65 | "if __name__ == '__main__':\n", 66 | " t1 = time.time()\n", 67 | " MRWordFrequencyCount.run()\n", 68 | " print \"CPU Time\", time.time() - t1\"\"\"\n", 69 | "\n", 70 | "fout = open(\"mr_count.py\",\"w\")\n", 71 | "fout.write(mr_count)\n", 72 | "fout.close()\n", 73 | "os.system(\"wget https://www.gutenberg.org/cache/epub/35/pg35.txt -O time_machine.txt\")" 74 | ], 75 | "language": "python", 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "metadata": {}, 80 | "output_type": "pyout", 81 | "prompt_number": 3, 82 | "text": [ 83 | "0" 84 | ] 85 | } 86 | ], 87 | "prompt_number": 3 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "The code above wrote a Python file to your computer and downloaded the novel The Time Machine from Project Gutenberg. Now go to your terminal and run the code with\n", 94 | "\n", 95 | "`python mr_count.py time_machine.txt`" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "collapsed": false, 101 | "input": [ 102 | "complex_mr = \"\"\"from mrjob.job import MRJob\n", 103 | "import re\n", 104 | "import time\n", 105 | "\n", 106 | "WORD_RE = re.compile(r\"[\w']+\")\n", 107 | "\n", 108 | "\n", 109 | "class MRMostUsedWord(MRJob):\n", 110 | "\n", 111 | " def mapper_get_words(self, _, line):\n", 112 | " # yield each word in the line\n", 113 | " for word in WORD_RE.findall(line):\n", 114 | " yield (word.lower(), 1)\n", 115 | "\n", 116 | " def combiner_count_words(self, word, counts):\n", 117 | " # sum the words we've seen so far\n", 118 | " yield (word, sum(counts))\n", 119 | "\n", 120 | " def reducer_count_words(self, word, counts):\n", 121 | " # send all (num_occurrences, word) pairs to the same reducer.\n", 122 | " # num_occurrences is so we can easily use Python's max() function.\n", 123 | " yield None, (sum(counts), word)\n", 124 | "\n", 125 | " # discard the key; it is just None\n", 126 | " def reducer_find_max_word(self, _, word_count_pairs):\n", 127 | " # each item of word_count_pairs is (count, word),\n", 128 | " # so yielding one results in key=counts, value=word\n", 129 | " yield max(word_count_pairs)\n", 130 | "\n", 131 | " def steps(self):\n", 132 | " return [\n", 133 | " self.mr(mapper=self.mapper_get_words,\n", 134 | " combiner=self.combiner_count_words,\n", 135 | " reducer=self.reducer_count_words),\n", 136 | " self.mr(reducer=self.reducer_find_max_word)\n", 137 | " ]\n", 138 | "\n", 139 | "\n", 140 | "if __name__ == '__main__':\n", 141 | " t1 = time.time()\n", 142 | " MRMostUsedWord.run()\n", 143 | " print \"CPU Time\", time.time() - t1\"\"\"\n", 144 | "\n", 145 | "fout = open(\"mr_max_count.py\",\"w\")\n", 146 | "fout.write(complex_mr)\n", 147 | "fout.close()" 148 | ], 149 | "language": "python", 150 | "metadata": {}, 151 | "outputs": [], 152 | 
"prompt_number": 4 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Execute the code with \n", 159 | "\n", 160 | "`python mr_max_count.py time_machine.txt`" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "### Should we always use map-reduce?!\n", 168 | "Nobody ever got fired for using hadoop on a cluster! \n", 169 | "http://research.microsoft.com/pubs/163083/hotcbp12%20final.pdf" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "collapsed": false, 175 | "input": [ 176 | "t1 = time.time()\n", 177 | "fin = open(\"time_machine.txt\",\"r\")\n", 178 | "lines = 0\n", 179 | "words = 0\n", 180 | "chars = 0\n", 181 | "\n", 182 | "for l in fin:\n", 183 | " lines += 1\n", 184 | " words += len(l.split())\n", 185 | " chars += len(l)\n", 186 | " \n", 187 | "print lines, words, chars\n", 188 | "print \"CPU Time\", time.time() - t1" 189 | ], 190 | "language": "python", 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "output_type": "stream", 195 | "stream": "stdout", 196 | "text": [ 197 | "3617 35319 201900\n", 198 | "CPU Time 0.0162739753723\n" 199 | ] 200 | } 201 | ], 202 | "prompt_number": 5 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "As you can see MrJob did not perform as good as the single threaded implementation. Why?\n", 209 | "\n", 210 | "The real benefit of using map-reduce becomes apparent when we have many computing nodes. For example, using amazon's server it is possible to perform this job on hundreds/thousands of nodes. \n", 211 | "\n", 212 | "Fun reading: http://open.blogs.nytimes.com/2007/11/01/self-service-prorated-super-computing-fun/\n", 213 | "\n", 214 | "If you are interested in general local parallel computing, another library to play with is Multiprocessing: https://docs.python.org/2/library/multiprocessing.html" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "## Machine Learning: Fun with handwritten digits" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "collapsed": false, 227 | "input": [ 228 | "# Clustering/ Parallelizing Clustering\n", 229 | "from sklearn.datasets import load_digits\n", 230 | "from sklearn.cross_validation import cross_val_score\n", 231 | "from sklearn.ensemble import RandomForestClassifier\n", 232 | "\n", 233 | "digits = load_digits()\n", 234 | "print digits.data.shape" 235 | ], 236 | "language": "python", 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "output_type": "stream", 241 | "stream": "stdout", 242 | "text": [ 243 | "(1797, 64)\n" 244 | ] 245 | } 246 | ], 247 | "prompt_number": 8 248 | }, 249 | { 250 | "cell_type": "code", 251 | "collapsed": false, 252 | "input": [ 253 | "X = digits.data\n", 254 | "y = digits.target" 255 | ], 256 | "language": "python", 257 | "metadata": {}, 258 | "outputs": [], 259 | "prompt_number": 9 260 | }, 261 | { 262 | "cell_type": "code", 263 | "collapsed": false, 264 | "input": [ 265 | "clf = RandomForestClassifier(n_estimators=50, random_state=0)" 266 | ], 267 | "language": "python", 268 | "metadata": {}, 269 | "outputs": [], 270 | "prompt_number": 10 271 | }, 272 | { 273 | "cell_type": "code", 274 | "collapsed": false, 275 | "input": [ 276 | "clf = clf.fit(X,y)" 277 | ], 278 | "language": "python", 279 | "metadata": {}, 280 | "outputs": [], 281 | "prompt_number": 11 282 | }, 283 | { 284 | "cell_type": "code", 285 | "collapsed": false, 286 | "input": [ 287 | "scores = cross_val_score(clf, 
X, y)" 288 | ], 289 | "language": "python", 290 | "metadata": {}, 291 | "outputs": [], 292 | "prompt_number": 12 293 | }, 294 | { 295 | "cell_type": "code", 296 | "collapsed": false, 297 | "input": [ 298 | "scores" 299 | ], 300 | "language": "python", 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "metadata": {}, 305 | "output_type": "pyout", 306 | "prompt_number": 13, 307 | "text": [ 308 | "array([ 0.97328881, 0.96994992, 0.97161937])" 309 | ] 310 | } 311 | ], 312 | "prompt_number": "*" 313 | }, 314 | { 315 | "cell_type": "code", 316 | "collapsed": false, 317 | "input": [ 318 | "score_arr = []\n", 319 | "for n in range(10,200,10):\n", 320 | " clf = RandomForestClassifier(n_estimators=n, random_state=0)\n", 321 | " clf = clf.fit(X,y)\n", 322 | " scores = cross_val_score(clf, X, y)\n", 323 | " score_arr.append(scores.mean() )\n", 324 | " " 325 | ], 326 | "language": "python", 327 | "metadata": {}, 328 | "outputs": [], 329 | "prompt_number": "*" 330 | }, 331 | { 332 | "cell_type": "code", 333 | "collapsed": false, 334 | "input": [ 335 | "plt.plot(range(10,200,10), score_arr)" 336 | ], 337 | "language": "python", 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "metadata": {}, 342 | "output_type": "pyout", 343 | "prompt_number": 29, 344 | "text": [ 345 | "[]" 346 | ] 347 | }, 348 | { 349 | "metadata": {}, 350 | "output_type": "display_data", 351 | "png": "iVBORw0KGgoAAAANSUhEUgAAAYQAAAEDCAYAAAA1CHOzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3WtUlFeC//tvgRRaZSkFVJUX1CTGS4J0xnSaDJDRyHBi\ne0w7/+QkhEhMZo7avdKdm0kmYiNOp5fQs+yVyRxjOiaZ7lmrl85gd9JkknXOX42dTHdGIGBuKEk0\nwY4QAxQFJUJxp57zAi3FKBctLKB+n1eUVfWw97Z4frUvz35MhmEYiIhI2IsIdQFERGR0UCCIiAig\nQBARkTMUCCIiAigQRETkDAWCiIgAMGGwFxQUFFBRUQFAbm4uSUlJgecOHDjAzp07MZvNrFy5kuzs\nbHw+Hxs3buT06dN0dXXxyCOPcNttt1FbW8szzzyD3+/H4XCwbds2zGbzyNVMRESGZcAeQllZGdXV\n1RQWFpKfn09+fn7gOb/fz9atW3n11VfZvXs377zzDvX19RQVFXHdddfx29/+lu3btwfes337dh54\n4AF2797NnDlzeP3110e2ZiIiMiwDBkJpaSkZGRkAzJ07l+bmZnw+HwBerxebzYbdbsdkMpGcnExx\ncTFxcXGcOnUKgObmZmJjY4G+cElPTwdg2bJllJSUjFilRERk+AYMBI/Hg91uDzyOjY2loaEh8LPP\n5+PEiRN0d3dTXl5OY2MjK1asoLa2ljvuuIM1a9aQk5MDQHt7O1FRUYH3ut3ukaqTiIhchkHnEM5n\nGAYmkwkAk8lEfn4+OTk5xMXF4XA48Pv9/Nd//RfTp0/n1Vdf5fPPPycvL4/f//733zqOiIiMLgMG\ngtPpxOPxBB673W4cDkfgcUpKCikpKQDk5eWRkJBAWVkZt912GwALFy6krq4Ov9+PxWKhq6sLs9lM\nfX09Tqdz0MKdH0AiIjKyBgyEtLQ0XnjhBe677z4qKytxuVxYLJbA8+vXr2fbtm1ERERQXFzMU089\nRX19PZ988gl33HEHJ0+exGKxEBERQWpqKnv37mXVqlXs37+fJUuWDFo4k8lEQ0PLlddScDhsassg\nUnsGl9ozeBwO22W/1zTYbqfPPfcc5eXlREZGsmXLFj799FNsNhsZGRm8/fbbvPTSS/T09PDDH/6Q\nO++8k7a2Nn7605/S2NhIT08PTzzxBLfeeisNDQ1s3LiRzs5OZs6cyS9+8QsiIyMHLaA+JMGhP7jg\nUnsGl9ozeEY0EEJNH5Lg0B9ccKk9g0vtGTxXEgi6UllERAAFgoiInKFAEBERQIEgIiJnKBBERARQ\nIIiIyBkKBBERAYa5l5GIXBm/YfD5CS+HjjYQHRXBLOdkZjttTIuzMCFS388ktBQIIldBbaOP4iN1\nlFTW0XS681vPT4g0MSPeGgiIWc7JzHJNxjoxKgSllXClQBAZIb6Obso+c1N8uJaqb04DMNEcyd98\nZzqpi6ZhMpmocbdS426hur6Vkx4f1fWtHKQucIy4KdHMOhMQs12TmeWcTHzMJCK06aOMAAWCSBD1\n+v0cOd7EwSN1fPyFh55ePyYTJF4bS9qiaSye7yA66tweXvNnxfR7b31TO9Xulr6gqG+lxt3Kx196\n+PjLc7sOTzRHkuCczBynjZsXOFgwO0YBIUGhvYzChPaKCa4L27PG3crBw7WUflrPaV8XANPjLKQl\nTSclcRp2W/Rl/65mXxc17pZAQFS7W6lrbMN/5k83bspEUhZNIy1pGi67ZZCjjU76fAaPNreTQekP\nLrgcDhtVXzVS+mk9xYdrqXa3AmCdOIFbb3SRljSda6bZRux+Hl3dvVR9c5qSI3WUH3XT2dULwPUz\np5KaNI3khU4sY2j+QZ/P4FEgyKD0Bxcc3T1+PvnSw6FjHj74vJ5ev0FkhImk6+JIS5rGd+bGEzXh\n6q4W6uzq5cNjDRw8UstnX3kxgAmREdw8P57URdNJvNZOZMToXsGkz2fwKBBkUPqDu3yGYfCX2hYO\nHqml7NN6fB09AMx2TSZt0XRuTXQxxWIOcSn7NJ3uoKSyjoOH66hragNgqtVMSuI0UpOmkeCYHOIS\nXpw+n8GjQJBB6Q9u+M6eXIuP1FHbeO7k+teJLu5ccj3WCaN3ItcwDI7Xnqb4
cB1ln50LsTkuG6lJ\n07j1xtETYqDPZzApEGRQ+oMbmqEOv4yl9jw7zHXwcC2HjzfhN84Nc916o4trp9tCspTVbxg0nGqn\n1tNG0gInkX7/Vf39wWIYBqdau6hxt9LW0X1Fx4qMjGDRtbFMir78BaAKBBnUWDqBXW1+w+CLmlMc\nPDz0Cdqx2p7Nvi7er6zj4JE6as5MhANEmyP7LoZzTma2czKzXTZmxlsxRw1+m9uh6Ozu5euGc0tp\na9yt1DS0Btoa+pbgpi2axi0LnVd0QhxJPb1+6hrbzqz26lseXF3fSmv7lQXB+e5dNpcVt8657Pcr\nEGRQY/UENpLc3jaKj/QNCXmaO4C+C8FSFk0nbdE0XLGXXsI5Htqzur6Fyr80XXQpK4DJBNNiLWcu\nirMFwmLq5EsvoT33bfncybLG3Up9Uxvnn2giTCamx1uY7ZyMK9bC8doWKs5ca2GeEMHNCxykLZrO\nDXPsRESEZmjO19F93lLfvvp84/HR09v/lOmImRi4eHCq1QxXUNzICBM3z3dc0RXqIxoIBQUFVFRU\nAJCbm0tSUlLguQMHDrBz507MZjMrV64kOzub3//+97z55puB1xw5coSPPvqInJwcKisriYnpuxBn\n3bp1LF26dNACjvU/utFiPJzArpRhGDSe7uDTr7wcPFzLF183AxAdFcktCxykJk0f8kVe47E9u7p7\nOenxnXdRXAs1Da20d/b2e90US9SZrTX6ToIYDPhteVL0BGaf6X3McvVtzTEj3tpvNZbDYeOzL92U\nHOnrvbi97QDYbdGkJPZdYzE9zjoi9fYbBp5T7f0CrMbdQuMFW4xETYhgZrz1zBXjtkCParT1ZkYs\nEMrKyvjNb37Dzp07qaqqIjc3l8LCQgD8fj/p6ekUFRURExPDunXrKCgowOVyBd5fXl7O3r17ycvL\nY9OmTXz/+98fUgicb7z90YXKeDyBDaS7x883Hl/fieq8YYq2zr7JVROwcI6d1EXT+O4CBxPNw/uj\nDpf2NAwDT3PHmRNlS6Adz/aoLnT22/L5ARA3ZeKg12Oc356GYfDlyea+IbzP6wOBdO30KaQlTSP5\nBheTJ13eN+izoVdd30K1+9zn4vyhK4ApVnO/Osxy2pgWO2nUL9+FKwuEAf8KSktLycjIAGDu3Lk0\nNzfj8/mwWq14vV5sNht2ux2A5ORkiouLueuuuwLvf/HFF3nuuecCj0f56JSMUafbuvp9qz07/NHr\nP2/4A3DGWki8NpZrpttIXugiburE0BV6jDCZTDhiJuGImcR3FzgC/97W0R04mZpMpqB+WzaZTMxL\niGFeQgyrM+bx0RceDh6ppfIvTfyl9jSFf/yCm66PJ23RdBZdF3vJXWKbWzsDJ/3q+r4wq2tq4/zT\nkMkE0+Os/eZPZg0yLDaeDfi/5/F4SExMDDyOjY2loaEBq9VKbGwsPp+PEydOMGPGDA4dOkRycnLg\ntRUVFUyfPp24uLjAv+3atYt///d/Jy4ujry8vECYiAxVfVMbJ+rPH59u4VRrV7/XREdFcs10W79v\nqgmOyUSbgzNBKmCZGMWC2XYWzB7Zv2FzVCS33uji1htdeFs6Kf20juLDdXxwtIEPjjZgs0Rx640u\nbp7n4NR5AVBT38Lptv5DVxPNkVw/c2rfbrJnNgoM5sT5eDCsODcMI9D1M5lM5Ofnk5OTQ1xcHPHx\n8f16AK+99hp333134PGqVauw2+0sXLiQV155hR07dpCXlxekash41uzrovTMxVZfN7T2e85ui+Y7\nc+MC47qznZNx2LUb6Hhkt0Wz4tY5fD95NifqWzh4uI73P63nwKGvOXDo636vjZ86kcXzpp755t8X\nAPFTJ+pzMYgBA8HpdOLxnNtl0e1243Cc6zampKSQkpICwObNm0lISAg8V1ZWxpYtW/q99qz09HSe\nffbZIRXwSsbDpL+x1JZd3b2UfVrHH8tr+PCoG7/fYEKkiVsTp7FobjzXzpjCtTOmMsUauourxlJ7\njgXDaU+ncwrfS5pJd4+fQ5/Vc6TKgyvOwnUzpnLNjKmXPccQ7gYMhLS0NF544QXuu+8+Kisrcblc\nWCznluKtX7+ebdu2ERERQUlJCU8//TQA9fX1WCwWJkw4d/jHHnuMn/zkJyxYsIDy8nLmz58/pAKG\nw8Td1RCMSdDze4gjwTAMjn9zmoNH6ij7tD4wAXzNNBtpSdNJvsGJ7byrazvbOmlo+/bNZq6GcJlU\nvlqupD2vnzaZ66ed25KjvbWD9taLT3qHgxGbVF68eDGJiYlkZWURGRnJli1bKCoqwmazkZGRQWZm\nJmvXrqWnp4cNGzYElpR6PB7i4+P7HSs7O5tNmzZhtVqxWq0UFBRcdqHl6mpp6+KlN45w/JvTzHRY\n+92wJcFx5ROJjc0dFJ/ZIqL+zP47MZPNLP2r2aQmTWdm/MgsNxSR/nRhWpi43G9gTac7eG7Px9Q2\nthE/dSLels5+q3cAnDGTzlue17fOPHZK9IC9ic6uXg4ddVN8pI7PT/RtERE1IYKb5ztIWzSNG6+J\nDdkFSUOhHkJwqT2DZ8R6CBLe6praeK7wIxpPd3LH92aRmX49fr9BbWNb4LaPZ5cefnCsgQ+ONQTe\na504oW91z3n3CJ4Rb+HLk6cpPlzLoaMNdHb3rf2enzCV1KTp3LLAiWWiPpIioaK/PrmoE3Ut/Mvv\nPqalrZu7l1zHypQ5mEwmIiLPrTlPXdT32rPbFZxd63126d/R6lN8Xn3qosePnzqR5YtmkbpoGs4x\nepcvkfFGgSDfcqzmFP/Pa5/Q0dnLmjvms+zmhAFfbzKZsNuisduiuen6c3NHHV09fN3gC6wLP+nx\n4Yq1kLZoGvNm6T7AIqONAkH6+eRLD7964wh+v8EPVyVy642uwd90CRPNE7h+5lSunzk1iCUUkZGi\nQJCAkso6fvP/fkZkhIlH/6/v8J25cYO/SUTGDQWCAPDHD75m99vHmBQ9gSfu/Q7zEmJCXSQRucoU\nCGHOMAzeOvgVb/zPX5hiNfNk5k3MdukKXJFwpEAIY37DoPDAFxz44Gvip07kqay/wqUVPyJhS4EQ\npnr9fv79//uc4iN1zIy38uR9f4XdFp5b/opIHwVCGOru6eWlNyr5+EsP182YwhP33qTNwEREgRBu\n2jt7eOH1Cj6vPsWN19h55O6kYd8tTETGJ50Jwsjpti6e/90nnKhr4bsLHPzwB4n97msrIuFNgRAm\nGrzt/POuD6lrauNvvjOdh76/cFRvHiciV58CIQycqGvhxTeO4DnVzopbZ3PP7XNH9L4GIjI2KRDG\nsabTHRT9+TjFR+owgHtun8v/+ddzQl0sERmlFAjjUHtnD//7/RPsL6uhq8dPgmMyP7o7iZn2SaEu\nmoiMYgqEcaTX7+e9T2p5473jnG7rJmaymewl15G2aDou1xTdgEREBqRAGAcMw+Dw8UZ+924V33h8\nREdF8r9uu5blybOJNkeGungiMkY
oEMa46voWfvful3z6lReTCZbcNIP/9TfXEjNZVx2LyPAMGggF\nBQVUVFQAkJubS1JSUuC5AwcOsHPnTsxmMytXriQ7O5vf//73vPnmm4HXHDlyhI8++oja2lqeeeYZ\n/H4/DoeDbdu2YTabR6BK4cHb0knRn49z8HAtBrDoulgyb7+eBOfkUBdNRMaoAQOhrKyM6upqCgsL\nqaqqIjc3l8LCQgD8fj9bt26lqKiImJgY1q1bR0ZGBvfeey/33nsvAOXl5ezduxeA7du388ADD7B8\n+XKef/55Xn/9de6///4Rrt7409HVw973q9lbVk1Xt58Eh5XM9OtZdK3uXSAiV2bAy1RLS0vJyMgA\nYO7cuTQ3N+Pz+QDwer3YbDbsdjsmk4nk5GSKi4v7vf/FF1/kxz/+MdAXLunp6QAsW7aMkpKSoFdm\nPPP7Df78yTdsermUNw9+xSTzBP5+xUJ+9g/JCgMRCYoBewgej4fExMTA49jYWBoaGrBarcTGxuLz\n+Thx4gQzZszg0KFDJCcnB15bUVHB9OnTiYvrO1m1t7cTFRUVOI7b7R6J+oxLR443sufdLznZ4MMc\nFcGqtGv4/q2ztQeRiATVsM4ohmEErnA1mUzk5+eTk5NDXFwc8fHxGIYReO1rr73G3XfffcnjDJXD\nEb43a/mq9jS/efMIHx1rwGSC/yN5NtnfX0jc1Mu7niCc23IkqD2DS+0ZegMGgtPpxOPxBB673W4c\nDkfgcUpKCikpKQBs3ryZhISEwHNlZWVs2bIl8NhisdDV1YXZbKa+vh6n0zmkAobj2nlvSydvvHec\n/zlci2FA4jV27l12PbNdNvxdPZfVJg6HLSzbcqSoPYNL7Rk8VxKsA84hpKWlsW/fPgAqKytxuVxY\nLOfuqLV+/Xq8Xi/Nzc2UlJSQmpoKQH19PRaLhQkTzuVNampqYIJ5//79LFmy5LILPV51dvXyX//z\nFza9UsJ7FbXMiLOyIfMmnrzvr3RbSxEZcQP2EBYvXkxiYiJZWVlERkayZcsWioqKsNlsZGRkkJmZ\nydq1a+np6WHDhg3ExPTdmN3j8RAfH9/vWI8++igbN25kz549zJw5k7vuumvkajXG+P0GBw/X8of3\njtPc2sUUq5n7//ZabvvOdCIjtD21iFwdJmM4A/ohMN67kZV/aWLPO1/ydUMr5gkRLE+ezfdvnc2k\n6OBOGKtLHlxqz+BSewbPlQwZaZlKiHzd0Mrv3v2SI8ebMAG3JU3nriXX6b7GIhIyCoSrrLm1k6L3\n/sJ7Fd9gGHDDHDv3pV+vOQIRCTkFwlXS2d3LvrJq/ndpNZ3dvcyIt5K5bC5J18XpZjUiMiooEEaY\n329QfKSOP/y5ilOtXUyxRHFf+vX8zU2aMBaR0UWBMIL8foNt//kRx2pOETUhgjtT57Di1jlBnzAW\nEQkGnZlG0NGaUxyrOcXC2TGsu/NGYqdMDHWRREQuSWMWI6j8s3oAfpB2rcJAREY9BcII6fX7OXS0\ngSmWKBbMigl1cUREBqVAGCGfV5+itb2b7y50EhGhVUQiMvopEEZI+Wd923snLxzaJn4iIqGmQBgB\nPb1+PjjqZupkM/MSNFwkImODAmEEfH7Ci6+jh1sWaLhIRMYOBcIIKPu8b7joexouEpExRIEQZD29\nfj461oDdFs31CVNDXRwRkSFTIATZp181nRsu0h5FIjKGKBCC7Ozqou/doOEiERlbFAhB1N3j58Mv\nPMROiea6GVNCXRwRkWFRIARR5VdNtHdquEhExiYFQhCd3bso+QZXiEsiIjJ8g+52WlBQQEVFBQC5\nubkkJSUFnjtw4AA7d+7EbDazcuVKsrOzAXjzzTf59a9/TWRkJI8//jhLly4lJyeHyspKYmL6LtRa\nt24dS5cuHYk6hUR3Ty8ffeEhbspErp2uu5+JyNgzYCCUlZVRXV1NYWEhVVVV5ObmUlhYCIDf72fr\n1q0UFRURExPDunXryMjIwGw28+KLL1JUVITP5+OFF15g6dKlmEwmnn766XEVAuc7cryJjq5ebl88\nU3dAE5ExacBAKC0tJSMjA4C5c+fS3NyMz+fDarXi9Xqx2WzY7XYAkpOTKS4uJjo6mtTUVCwWCxaL\nhZ///OeB4xmGMYJVCa1yXYwmImPcgHMIHo8ncMIHiI2NpaGhIfCzz+fjxIkTdHd3c+jQITweDydP\nnqSjo4OHH36Y7OxsSkpKAu/ftWsXDz30EE8++SRer3eEqnT1dXX38tGXHhwxE7lmmoaLRGRsGtYd\n0wzDCAyHmEwm8vPzycnJIS4ujvj4+EAPwOv18qtf/YqTJ0/y4IMP8u6777Jq1SrsdjsLFy7klVde\nYceOHeTl5Q36Ox2O0X+CLa74hs6uXpbedh1O5+hdbjoW2nIsUXsGl9oz9AYMBKfTicfjCTx2u904\nHI7A45SUFFJSUgDIy8sjISGBjo4Obr75ZiIiIpg1axZWq5WmpqbA6wDS09N59tlnh1TAhoaWYVUo\nFP5YdgKAxNkxo7a8Dodt1JZtLFJ7BpfaM3iuJFgHHDJKS0tj3759AFRWVuJyubBYLIHn169fj9fr\npbm5meLiYlJTU0lLS6O0tBTDMPB6vbS1tWG323nsscc4evQoAOXl5cyfP/+yCz2adHb38vGXHpz2\nScx2TQ51cURELtuAPYTFixeTmJhIVlYWkZGRbNmyhaKiImw2GxkZGWRmZrJ27Vp6enrYsGFDYEnp\n8uXLyczMBPp6DiaTiezsbDZt2oTVasVqtVJQUDDytbsKKqoa6er2k3yDU6uLRGRMMxmjfOnPaO9G\n/qroMIeONvDs/53MLOfo7SGoSx5cas/gUnsGz4gNGcnAOrp6qKhqZFqshQSHNdTFERG5IgqEK1BR\n1UhXj4aLRGR8UCBcgcBW17oYTUTGAQXCZWrv7KHieCMz4q3MdIzeuQMRkaFSIFymT7700N3jV+9A\nRMYNBcJl0t5FIjLeKBAuQ3tnD4ePN5LgsDIjXquLRGR8UCBcho++aKCn11DvQETGFQXCZQisLtKd\n0URkHFEgDFNbRzdH/tLEbOdkpsVaBn+DiMgYoUAYpo++8NDrN/jeDRouEpHxRYEwTGVnhotu0fyB\niIwzCoRhaG3v5tOvmpjjsuGya7hIRMYXBcIwfHSsgV6/QbKGi0RkHFIgDMPZi9E0XCQi45ECYYha\n2rr49Csv10634YiZFOriiIgEnQJhiD481oDfMPjeQl17ICLjkwJhiM4NFzlCXBIRkZGhQBiC021d\nfHbCy9wZU4ifquEiERmfBg2EgoICsrKyyMrK4vDhw/2eO3DgAPfccw+rV69m9+7dgX9/8803+bu/\n+zvuvvtu/vSnPwFQW1vLmjVryM7O5oknnqCrqyvIVRk5HxxtwDC0s6mIjG8DBkJZWRnV1dUUFhaS\nn59Pfn5+4Dm/38/WrVt59dVX2b17N++88w719fV4vV5efPFF/vM//5OXX36ZP/7xjwBs376d
Bx54\ngN27dzNnzhxef/31ka1ZEJV/Vg9odZGIjG8DBkJpaSkZGRkAzJ07l+bmZnw+HwBerxebzYbdbsdk\nMpGcnExxcTElJSWkpqZisVhwOBz8/Oc/B/rCJT09HYBly5ZRUlIykvUKmubWTo7WnOL6hKnETpkY\n6uKIiIyYAQPB4/Fgt9sDj2NjY2loaAj87PP5OHHiBN3d3Rw6dAiPx8PJkyfp6Ojg4YcfJjs7O3Di\nb29vJyoqKvBet9s9UnUKqg+OabhIRMLDhOG82DAMTCYTACaTifz8fHJycoiLiyM+Ph7DMIC+3sOv\nfvUrTp48yYMPPsi77777reMMlcNhG04Rg+6jLxsxmWB56rXEjfEJ5VC35Xij9gwutWfoDRgITqcT\nj8cTeOx2u3E4zi27TElJISUlBYC8vDwSEhLo6Ojg5ptvJiIiglmzZmG1WmlqasJisdDV1YXZbKa+\nvh6nc2jfuBsaWi6nXkHhbenk0+ONzEuYir+rJ6RluVIOh21Ml3+0UXsGl9ozeK4kWAccMkpLS2Pf\nvn0AVFZW4nK5sFjObeq2fv16vF4vzc3NFBcXk5qaSlpaGqWlpRiGgdfrpa2tDbvdTmpqKnv37gVg\n//79LFmy5LILfbV8cNSNgW6EIyLhYcAewuLFi0lMTCQrK4vIyEi2bNlCUVERNpuNjIwMMjMzWbt2\nLT09PWzYsIGYmBgAli9fTmZmJtDXczCZTDz66KNs3LiRPXv2MHPmTO66666Rr90V+vBYAybglgW6\nGE1Exj+TMZwB/RAIVTfSMAwe+dc/EzM5mvz1fx2SMgSTuuTBpfYMLrVn8IzYkFE48zR30N7Zy2yX\nJrpEJDwoEC6hur4VgFnOySEuiYjI1aFAuIQad1/3dbYCQUTChALhEmrc6iGISHhRIFxCjbuVKVYz\nUydHh7ooIiJXhQLhIto6uvE0d6h3ICJhRYFwEWeHizR/ICLhRIFwEdVn5w9cCgQRCR8KhIuoCSw5\n1TUIIhI+FAgXUeNuJWpCBNNix/bupiIiw6FAuEBPr5+TnlZmxluJjFDziEj40BnvAnWNbfT0GszW\n/IGIhBkFwgXOXZCm+QMRCS8KhAtUn9myQtcgiEi4USBcQFtWiEi4UiCcxzAMqutbccRMZFL0sG43\nLSIy5ikQznOqtYvW9m5ma/5ARMKQAuE8NZo/EJEwpkA4T+CmOFpyKiJhaNCB8oKCAioqKgDIzc0l\nKSkp8NyBAwfYuXMnZrOZlStXkp2dzfvvv8/jjz/OvHnzAFiwYAGbN28mJyeHyspKYmJiAFi3bh1L\nly4diTpdtnOb2mnISETCz4CBUFZWRnV1NYWFhVRVVZGbm0thYSEAfr+frVu3UlRURExMDOvWrSMj\nIwOA5ORktm/f3u9YJpOJp59+etSFwPmq3a1YoicQO0X3QBCR8DPgkFFpaWngJD937lyam5vx+XwA\neL1ebDYbdrsdk8lEcnIyxcXFmEymSx7PMIwgFj24Ort6cTe1Mds1ecA6iIiMVwMGgsfjwW63Bx7H\nxsbS0NAQ+Nnn83HixAm6u7s5dOgQHo8HgKqqKh5++GFWr15NcXFx4P27du3ioYce4sknn8Tr9Y5E\nfS7b1w2tGECCJpRFJEwNa7G9YRiBb88mk4n8/HxycnKIi4sjPj4ewzCYM2cOjzzyCCtWrKCmpoYH\nH3yQ/fv3s2rVKux2OwsXLuSVV15hx44d5OXlDfo7HY6rM55/6Iu+MEuc67hqv/NqG6/1ChW1Z3Cp\nPUNvwEBwOp2Bb/0Abrcbh8MReJySkkJKSgoAmzdvJiEhAZfLxYoVKwCYNWsW8fHxuN3uwOsA0tPT\nefbZZ4dUwIaGlqHX5gp8erwRALtlwlX7nVeTw2Ebl/UKFbVncKk9g+dKgnXAIaO0tDT27dsHQGVl\nJS6XC4vFEnh+/fr1eL1empubKSkpITU1lbfeeosdO3YA0NjYSGNjI06nk8cee4yjR48CUF5ezvz5\n8y+70CMz/A8mAAAPfklEQVShpr6FyAgT0+OsoS6KiEhIDNhDWLx4MYmJiWRlZREZGcmWLVsoKirC\nZrORkZFBZmYma9eupaenhw0bNhATE0N6ejpPPfUU999/P36/n5/97GdERUWRnZ3Npk2bsFqtWK1W\nCgoKrlYdB+X3G3zd4GN6nJWoCbo0Q0TCk8kYzUt/uDpDRnVNbfz0lVJSEqex/gc3jvjvCwV1yYNL\n7Rlcas/gGbEho3BRXd/3QdRNcUQknCkQ0JbXIiKgQAAUCCIioEAA+gLBbovGZjGHuigiIiET9oFw\nuq0Lb0unegciEvbCPhACO5xqQllEwpwC4ew9ELTltYiEOQXCmbukzdaQkYiEOQWCu5XoqEgc9kmh\nLoqISEiFdSB09/RS29hGgtNKhO6BICJhLqwD4RtPG71+Q7fMFBEhzAOh+sz8gZacioiEeSAEVhhp\nyamISJgHgrsVE5AQr0AQEQnbQDAMg2p3K65YC9HmyFAXR0Qk5MI2EBqbO2jv7NEVyiIiZ4RtIGiH\nUxGR/sI2EKrd2rJCROR8YRsI6iGIiPQ3YbAXFBQUUFFRAUBubi5JSUmB5w4cOMDOnTsxm82sXLmS\n7Oxs3n//fR5//HHmzZsHwIIFC9i8eTO1tbU888wz+P1+HA4H27Ztw2wO3f0HqutbsFmiiJmseyCI\niMAggVBWVkZ1dTWFhYVUVVWRm5tLYWEhAH6/n61bt1JUVERMTAzr1q0jIyMDgOTkZLZv397vWNu3\nb+eBBx5g+fLlPP/887z++uvcf//9I1StgbV19OBp7iDxGjsmbVkhIgIMMmRUWloaOMnPnTuX5uZm\nfD4fAF6vF5vNht3ed1JNTk6muLj4kifYsrIy0tPTAVi2bBklJSXBrMewfN2g+QMRkQsNGAgejwe7\n3R54HBsbS0NDQ+Bnn8/HiRMn6O7u5tChQ3g8HgCqqqp4+OGHWb16NcXFxQC0t7cTFRUVeK/b7R6R\nCg1Fdf2ZLSu05FREJGDQOYTzGYYR6AGYTCby8/PJyckhLi6O+Ph4DMNgzpw5PPLII6xYsYKamhoe\nfPBB9u/f/63jDJXDEfxv8Q2nOwG4aYFrRI4/WoVTXa8GtWdwqT1Db8BAcDqdgW/9AG63G4fDEXic\nkpJCSkoKAJs3byYhIQGXy8WKFSsAmDVrFvHx8dTX12OxWOjq6sJsNlNfX4/T6RxSARsaWoZdqcEc\nq/YyITICs8kYkeOPRg6HLWzqejWoPYNL7Rk8VxKsAw4ZpaWlsW/fPgAqKytxuVxYLJbA8+vXr8fr\n9dLc3ExJSQmpqam89dZb7NixA4DGxkYaGxtxuVykpqayd+9eAPbv38+SJUsuu9BXotfv52SDj5nx\nViZEhu2qWxGRbxmwh7B48WISExPJysoiMjKSLVu2UFRUhM1mIyMjg8zMTNauXUtPTw8bNmwgJiaG\n9PR0nnrqKe6//378fj8/+9nPiIqK4tFHH2Xjxo3
s2bOHmTNnctddd12tOvZT19hGT69f8wciIhcw\nGcMZ0A+BYHcjSyrrePWtT1mdMY+MW2YF9dijmbrkwaX2DC61Z/CM2JDReKQrlEVELi78AuHsklNd\ngyAi0k9YBcLZeyDET52IZeKwVtyKiIx7YRUIzb4uWtq6NVwkInIRYRUI1WfuoTzbpeEiEZELhVUg\n1LjPzh+ohyAicqEwC4QzPQQFgojIt4RVIFTXtzIpegJxUyeGuigiIqNO2ARCZ1cv9U1tzHJO1j0Q\nREQuImwC4WtPKwaaPxARuZSwCQTNH4iIDCx8AkFLTkVEBhQ2gVDtbiHCZGJGvGXwF4uIhKGwCAS/\nYfC128f0eAtREyJDXRwRkVEpLAKhwdtOZ3evJpRFRAYQFoFwbkJZ8wciIpcSFoFQfXbLCt0lTUTk\nksIjEOp1UxwRkcGERSDUuFuJmWxmisUc6qKIiIxagwZCQUEBWVlZZGVlcfjw4X7PHThwgHvuuYfV\nq1eze/fufs91dHSQkZHBG2+8AUBOTg4/+MEPWLNmDWvWrOFPf/pTEKtxaa3t3XhbOnWHNBGRQQx4\n27CysjKqq6spLCykqqqK3NxcCgsLAfD7/WzdupWioiJiYmJYt24dGRkZuFwuAF566SViYmICxzKZ\nTDz99NMsXbp0BKvzbWdvmTlb8wciIgMasIdQWlpKRkYGAHPnzqW5uRmfzweA1+vFZrNht9sxmUwk\nJydTXFwMQFVVFcePH+f222/HMIzA8c7/+Wqpdmv+QERkKAYMBI/Hg91uDzyOjY2loaEh8LPP5+PE\niRN0d3dTXl5OY2MjAL/85S/ZtGkTQL+dRXft2sVDDz3Ek08+idfrDXplLkYTyiIiQzOsO80bhhE4\nwZtMJvLz88nJySEuLg6Hw4Hf7+eNN97glltuYcaMGRiGEegVrFq1CrvdzsKFC3nllVfYsWMHeXl5\ng/5Oh+PKxv5rm9qINkeSON9FZER4b3t9pW0p/ak9g0vtGXoDBoLT6cTj8QQeu91uHA5H4HFKSgop\nKSkA5OXlkZCQwNtvv01NTQ1vv/02dXV1mM1mpk2bFngdQHp6Os8+++yQCtjQ0DKsCp2vu8dPTX0L\nc6bZaGpsvezjjAcOh+2K2lL6U3sGl9ozeK4kWAccMkpLS2Pfvn0AVFZW4nK5sFjObQ63fv16vF4v\nzc3NFBcXk5qayvPPP89rr73Gnj17uPfee/nJT35CSkoKjz32GEePHgWgvLyc+fPnX3ahh6q20Uev\n39CW1yIiQzBgD2Hx4sUkJiaSlZVFZGQkW7ZsoaioCJvNRkZGBpmZmaxdu5aenh42bNjQb1XRhbKz\ns9m0aRNWqxWr1UpBQUHQK3MhzR+IiAydyQjF0p9huJJu5H8cOMaBQ1/z0zXf5fqZU4NYqrFHXfLg\nUnsGl9ozeEZsyGis+9rdiglIcFhDXRQRkVFv3AaCYRhU17fitE9ionlYi6lERMLSuA2EptOdtHX2\nMEu3zBQRGZJxGwiBLa81oSwiMiTjNhDqm9oBtORURGSIxu3g+ncXOOjs7iXx2thQF0VEZEwYt4Hg\niJnE3912baiLISIyZozbISMRERkeBYKIiAAKBBEROUOBICIigAJBRETOUCCIiAigQBARkTMUCCIi\nAigQRETkDAWCiIgACgQRETlj0EAoKCggKyuLrKwsDh8+3O+5AwcOcM8997B69Wp2797d77mOjg4y\nMjIoKioCoLa2ljVr1pCdnc0TTzxBV1dXEKshIiJXasBAKCsro7q6msLCQvLz88nPzw885/f72bp1\nK6+++iq7d+/mnXfeob6+PvD8Sy+9RExMDCaTCYDt27fzwAMPsHv3bubMmcPrr78+QlUSEZHLMWAg\nlJaWkpGRAcDcuXNpbm7G5/MB4PV6sdls2O12TCYTycnJFBcXA1BVVcXx48e5/fbbMQwD6AuX9PR0\nAJYtW0ZJScmIVUpERIZvwEDweDzY7fbA49jYWBoaGgI/+3w+Tpw4QXd3N+Xl5TQ2NgLwy1/+kk2b\nNgEEegjt7e1ERUUF3ut2u4NfGxERuWzDuh+CYRiBE7zJZCI/P5+cnBzi4uJwOBz4/X7eeOMNbrnl\nFmbMmIFhGIEewoXHERGR0WXAQHA6nXg8nsBjt9uNw+EIPE5JSSElJQWAvLw8EhISePvtt6mpqeHt\nt9+mrq6O6Ohopk2bhsViobOzk+joaOrr63E6nUMqoMNhu5x6yUWoLYNL7Rlcas/QG3DIKC0tjX37\n9gFQWVmJy+XCYrEEnl+/fj1er5fm5maKi4tJTU3l+eef57XXXmPPnj3ce++9/PjHPyYlJYXU1NTA\nsfbv38+SJUtGsFoiIjJcA/YQFi9eTGJiIllZWURGRrJlyxaKioqw2WxkZGSQmZnJ2rVr6enpYcOG\nDcTExFzyWI8++igbN25kz549zJw5k7vuuivolRERkctnMjSgLyIi6EplERE5Q4EgIiKAAkFERM4Y\n1nUIV0tBQQEVFRUA5ObmkpSUFOISjS3vv/8+jz/+OPPmzQNgwYIFrFu3jn/8x3/E7/fjcDjYtm0b\nZrM5xCUd3T7//HMeeeQR/uEf/oHs7Gxqa2t55plnvtWGb775Jr/97W+JiIggMzOTe+65J9RFH5Uu\nbM+cnBwqKysDi1HWrVvH0qVL1Z5DsG3bNj788EN6enr40Y9+xKJFi4Lz2TRGmffff9/40Y9+ZBiG\nYXz55ZfGfffdF+ISjT2lpaXGY4891u/fcnJyjL179xqGYRj/8i//YvzHf/xHKIo2ZrS1tRl///d/\nb/zTP/2TsWvXLsMwLt6GPp/PWL58udHS0mJ0dHQYd955p3Hq1KlQFn1UulR7/vd//3e/16k9B1dS\nUmKsX7/eMAzD8Hq9xtKlS4P22Rx1Q0YD7Z8kQ2dcsHhMe0kNj9ls5uWXXyY+Pj7wbxdrw4qKCpKS\nkpg8eTLR0dEsXryYDz/8MFTFHrUu1p7w7c/pJ598ovYcxPe+9z3+9V//FQCbzUZ7ezvl5eVB+WyO\nukAYaP8kGRqTyURVVRUPP/wwq1ev5uDBg9pLapgiIyO/NaR2sTb0eDzExsYGXhMXF6fP60VcrD0B\ndu3axUMPPcSTTz6J1+tVew5BZGRk4ALh1157jaVLl9LW1haUz+aonEM4n3He/kkyNHPmzOGRRx5h\nxYoV1NTUsGbNGnp7ewPPX/itTIbvUm2oth26VatWYbfbWbhwIa+88go7duxg8eLF/V6j9ry0AwcO\n8Ic//IFf//rX3HHHHYF/v5LP5qjrIQy2f5IMzuVysWLFCgBmzZpFfHw8p0+fDtyUaDh7Sck5Fovl\nW2144ee1vr4el8sVqiKOKSkpKSxcuBCAv/3bv+XYsWNqzyF67733ePnll3n11VeZPHly0D6boy4Q\nBts/SQb31ltvsWPHDgAaGxtpamri7rvvZu/evYD2khqO879VpaamfqsNb7rpJg4fPkxLSws+n48P\nP/yQ73
73u6Eq7qh3fns+9thjHD16FOibn5k/f77acwhaWlrYtm0bL7/8MlOmTAGC99kclVtXPPfc\nc5SXlwf2T1qwYEGoizSm+Hw+nnrqKZqbm/H7/fzkJz/hhhtuYOPGjXR2djJz5kx+8YtfEBkZGeqi\njloff/wxeXl5NDY2EhkZSUxMDP/2b//Gpk2bvtWG+/bt49e//jUmk4k1a9Zw5513hrr4o87F2vPR\nRx9l586dWK1WrFYrBQUFxMbGqj0HsWfPHnbs2ME111wD9M0Z/vM//zObN2++4s/mqAwEERG5+kbd\nkJGIiISGAkFERAAFgoiInKFAEBERQIEgIiJnKBBERARQIIiIyBkKBBERAeD/B2OPA0j5t3OVAAAA\nAElFTkSuQmCC\n", 352 | "text": [ 353 | "" 354 | ] 355 | } 356 | ], 357 | "prompt_number": 29 358 | }, 359 | { 360 | "cell_type": "code", 361 | "collapsed": false, 362 | "input": [], 363 | "language": "python", 364 | "metadata": {}, 365 | "outputs": [] 366 | } 367 | ], 368 | "metadata": {} 369 | } 370 | ] 371 | } -------------------------------------------------------------------------------- /homework/HW5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:785d4758712e4f38191af22c6e5f6b930a6e1db8f423786a69efef8eb239e441" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "import numpy as np\n", 16 | "import scipy as sp\n", 17 | "import pandas as pd\n", 18 | "import sklearn\n", 19 | "import seaborn as sns\n", 20 | "from matplotlib import pyplot as plt\n", 21 | "%matplotlib inline\n", 22 | "\n", 23 | "import sklearn.cross_validation" 24 | ], 25 | "language": "python", 26 | "metadata": {}, 27 | "outputs": [], 28 | "prompt_number": 1 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "# Homework 5: In Vino Veritas\n", 35 | "\n", 36 | "Due: Thursday, November 13, 2014 11:59 PM\n", 37 | "\n", 38 | " Download this assignment\n", 39 | "\n", 40 | "#### Submission Instructions\n", 41 | "To submit your homework, create a folder named lastname_firstinitial_hw# and place your IPython notebooks, data files, and any other files in this folder. Your IPython Notebooks should be completely executed with the results visible in the notebook. We should not have to run any code. Compress the folder (please use .zip compression) and submit to the CS109 dropbox in the appropriate folder. If we cannot access your work because these directions are not followed correctly, we will not grade your work." 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Can a winemaker predict how a wine will be received based on the chemical properties of the wine? Are there chemical indicators that correlate more strongly with the perceived \"quality\" of a wine?\n", 56 | "\n", 57 | "In this problem we'll examine the wine quality dataset hosted on the UCI website. This data records 11 chemical properties (such as the concentrations of sugar, citric acid, alcohol, pH etc.) of thousands of red and white wines from northern Portugal, as well as the quality of the wines, recorded on a scale from 1 to 10. In this problem, we will only look at the data for *red* wine." 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "### Problem 1: Data Collection" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Import only the data for **red** wine from the dataset repository. **Build a pandas dataframe** from the csv file and **print the head**. 
You might have to change the default delimiter used by the read_csv function in Pandas." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "collapsed": false, 77 | "input": [ 78 | "## your code here" 79 | ], 80 | "language": "python", 81 | "metadata": {}, 82 | "outputs": [], 83 | "prompt_number": 2 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "As in any machine learning problem, we have the feature data, usually labeled as $X$, and the target data, labeled $Y$. Every row in the matrix $X$ is a datapoint (i.e. a wine) and every column in $X$ is a feature of the data (e.g. pH). For a classification problem, $Y$ is a column vector containing the class of every datapoint.\n", 90 | "\n", 91 | "We will use the *quality* column as our target variable. **Save the *quality* column as a separate numpy array** (labeled $Y$) and **remove the *quality* column** from the dataframe.\n", 92 | "\n", 93 | "Also, we will simplify the problem to a binary world in which wines are either \"bad\" ($\\text{score} < 7$) or \"good\" ($\\text{score} \\geq 7)$. **Change the $Y$ array** accordingly such that it only contains zeros (\"bad\" wines) and ones (\"good\" wines). For example, if originally $Y = [1,3,8,4,7]$, the new $Y$ should be $[0,0,1,0,1]$." 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "collapsed": false, 99 | "input": [ 100 | "## your code here" 101 | ], 102 | "language": "python", 103 | "metadata": {}, 104 | "outputs": [], 105 | "prompt_number": 3 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Use the as_matrix function in Pandas to **save the feature information in your data frame as a numpy array**. This is the $X$ matrix." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "collapsed": false, 117 | "input": [ 118 | "## your code here" 119 | ], 120 | "language": "python", 121 | "metadata": {}, 122 | "outputs": [], 123 | "prompt_number": 4 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "### Problem 2: Unbalanced Classification Evaluation\n", 130 | "\n", 131 | "In this section, we explore a number of different methods to predict the quality of a wine $Y$ based on the recorded features $X$. Formulated as a machine learning problem, we wish to predict the **target** $Y$ as a function of the **features** $X$.\n", 132 | "\n", 133 | "Because we have defined $Y$ as a binary variable (encoding *bad* as 0 and *good* as 1), this is a **classification** problem. In class, we have discussed several approaches to classification, including **decision trees**, **random forests**, and **Support Vector Machines (SVM)**. \n", 134 | "\n", 135 | "For this problem, we will focus on **random forests**, but later in the problem set we will invoke these other techniques. Recall from class that the random forest technique works by aggregating the results from a number of randomly perturbed decision trees constructed to explain the data." 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "**(a)** In class, we saw that for a fixed set of data, a decision tree algorithm will generate a single fixed tree to perform a classification task. Describe how a random forest is built from individual decision trees. What are the sources of randomness in the process that are used to build a diverse set of decision trees?"
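For readers working through Problem 1, here is one minimal sketch of the loading and preprocessing steps described above. It is not the official solution; the UCI file URL and its semicolon delimiter are assumptions about where and how the data is hosted.

```python
# Sketch only -- the URL and the sep=";" delimiter are assumptions about the UCI hosting.
import pandas as pd

url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
df = pd.read_csv(url, sep=";")   # the file is semicolon-delimited, not comma-delimited
print df.head()

Y = (df["quality"].values >= 7).astype(int)   # 1 = "good" wine, 0 = "bad" wine
del df["quality"]                             # drop the target column from the features
X = df.as_matrix()                            # feature matrix for the classifiers below
```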
143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "**YOUR ANSWER HERE.**" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "**(b)** There are many ways to construct a random forest -- these differences in the method of construction are encoded as *tuning parameters*. As is often the case when our goal is to construct a good prediction, we can set these tuning parameters to obtain the best projected performance in a prediction task. One of the most important tuning parameters in building a random forest is the number of trees to construct. \n", 157 | "\n", 158 | "Here, you should apply the random forest classifier to the wine data and use cross-validation to explore how the score of the classifier changes when varying the number of trees in the forest. Use the random forest classifier built into the scikit-learn library and the cross_val_score function (using the default scoring method) to **plot the scores of the random forests as a function of the number of trees** in the random forest, ranging from 1 (simple decision tree) to 40. You should use 10-fold cross-validation. Feel free to use the boxplot functionality of the seaborn library." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "collapsed": false, 164 | "input": [ 165 | "from sklearn.ensemble import RandomForestClassifier\n", 166 | "from sklearn.cross_validation import cross_val_score\n", 167 | "\n", 168 | "## your code here" 169 | ], 170 | "language": "python", 171 | "metadata": {}, 172 | "outputs": [], 173 | "prompt_number": 5 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "**(c)** Describe the relationship between cross validation accuracy and the number of trees. What tradeoffs should we consider when choosing the number of trees to use?" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "**YOUR ANSWER HERE.**" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "**(d)** These accuracy scores look very promising compared to, say, classifying the wine using a coin flip. However, in binary classification problems, accuracy can be misleading if one class (say, bad wine) is much more common than another (say, good wine), that is, when the classes are **unbalanced**.\n", 194 | "\n", 195 | "**Print** the percentage of wines that are labeled as \"bad\" in the dataset and **plot the same boxplot** as the last question (feel free to copy/paste), but this time draw a line across the plot denoting the **accuracy** of always guessing zero (\"bad wine\")." 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "collapsed": false, 201 | "input": [ 202 | "## your code here" 203 | ], 204 | "language": "python", 205 | "metadata": {}, 206 | "outputs": [], 207 | "prompt_number": 6 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "### Evaluation Metrics\n", 214 | "\n", 215 | "When there are unbalanced classes in a dataset, guessing the more common class will often yield very high accuracy. For this reason, we usually want to use different metrics that are less sensitive to imbalance when evaluating the predictive performance of classifiers. 
These metrics were originally developed for clinical trials, so to keep with the standard terminology, we define \"good\" wines (value of 1) as \"positive\" and the \"bad\" wines (value of 0) as the \"negatives\". We then define the following:\n", 216 | "\n", 217 | "$P$ - number of positives in the sample.\n", 218 | "\n", 219 | "$N$ - number of negatives in the sample.\n", 220 | "\n", 221 | "$TP$ - number of true positives: how many of the \"positive\" guesses of the classifier are true.\n", 222 | "\n", 223 | "$FP$ - number of false positives: how many of the \"positive\" guesses of the classifier are actually negatives.\n", 224 | "\n", 225 | "$TN$ - number of true negatives; similarly, this is how many of the \"negative\" guesses of the classifier are true.\n", 226 | "\n", 227 | "$FN$ - number of false negatives; how many of the \"negative\" guesses are actually positives.\n", 228 | "\n", 229 | "When calling the score functions in scikit-learn you obtained the default measure of efficiency, which is called **accuracy**. This is simply the ratio of successful guesses (both positives and negatives) across all samples:\n", 230 | "$$\\text{accuracy} = \\frac{TP + TN}{P+N}.$$\n", 231 | "In our case, when the two classes (good and bad wines) are very unbalanced in the sample, we should look for a better measure of efficiency. \n", 232 | "\n", 233 | "Usually, the goal is to identify the members of the positive class (the rare class) successfully -- this could be either the good wines or the patients presenting a rare disease. It is common practice to define the following ratios:\n", 234 | "\n", 235 | "The **recall** rate (also called the sensitivity or the true positive rate) is the ratio of true positive guesses among all positives:\n", 236 | "$$\\text{recall} = \\frac{TP}{P}=\\frac{TP}{TP+FN}.$$\n", 237 | "The **precision** is the ratio of the true positive guesses over all the positive guesses:\n", 238 | "$$\\text{precision} = \\frac{TP}{TP+FP}.$$" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "**(e)** Describe in words what the **difference** is between **precision** and **recall**. Describe an **application scenario** where precision would be more important than recall, and one scenario where recall would be more important than precision." 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "**YOUR ANSWER HERE.**" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "Because precision and recall both provide valuable information about the quality of a classifier, we often want to combine them into a single general-purpose score. The **F1** score is defined as the harmonic mean of recall and precision:\n", 260 | "$$F_1 = \\frac{2\\times\\text{recall}\\times\\text{precision}}{\\text{recall} + \\text{precision}}.$$\n", 261 | "\n", 262 | "The harmonic mean of two numbers is closer to the smaller of the two numbers than the standard arithmetic mean. The F1 score thus tends to favor classifiers that are strong in both precision and recall, rather than classifiers that emphasize one at the cost of the other." 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "**(f)** For this part, **repeat the cross-validation analysis in part (b) changing the `scoring` parameter** of the cross_val_score function such that the measure used is the **F1 score**. **Comment** briefly on these numbers. 
Hint: See the scikit-learn documentation for the options you can use for the *scoring* parameter." 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "collapsed": false, 275 | "input": [ 276 | "## your code here" 277 | ], 278 | "language": "python", 279 | "metadata": {}, 280 | "outputs": [], 281 | "prompt_number": 7 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "**YOUR DISCUSSION HERE.**" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "### Problem 3: Classifier Calibration\n", 295 | "\n", 296 | "Many classifiers, including random forest classifiers, can return **prediction probabilities**, which can be interpreted as the probability that a given prediction point falls into a given class (i.e., given the data $X$ and a candidate class $c$, the prediction probability states $P(Y = c | X)$). However, when the classes in the training data are **unbalanced**, as in this wine example, these prediction probabilities calculated by a classifier can be inaccurate. This is because many classifiers, again including random forests, do not have a way to internally adjust for this imbalance.\n", 297 | "\n", 298 | "Despite the inaccuracy caused by imbalance, the prediction probabilities returned by a classifier can still be used to construct good predictions if we can choose the right way to turn a prediction probability into a prediction about the class that the datapoint belongs to. We call this task **calibration**.\n", 299 | "\n", 300 | "If a classifier's prediction probabilities are accurate, the appropriate way to convert its probabilities into predictions is to simply choose the class with probability > 0.5. This is the default behavior of classifiers when we call their `predict` method. When the probabilities are inaccurate, this does not work well, but we can still get good predictions by choosing a more appropriate cutoff. In this question, we will choose a cutoff by cross validation." 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "**(a)** Fit a random forest classifier to the wine data **using 15 trees**. Compute the **predicted probabilities** that the classifier assigned to each of the training examples (Hint: Use the `predict_proba` method of the classifier after fitting.). As a **sanity test**, construct a prediction based on these predicted probabilities that labels all wines with a predicted probability of being in class 1 > 0.5 with a 1 and 0 otherwise. For example, if originally probabilities $= [0.1,0.4,0.5,0.6,0.7]$, the predictions should be $[0,0,0,1,1]$. **Compare** this to the output of the classifier's `predict` method, and **show that they are the same**. " 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "collapsed": false, 313 | "input": [ 314 | "## your code here" 315 | ], 316 | "language": "python", 317 | "metadata": {}, 318 | "outputs": [], 319 | "prompt_number": 8 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "**(b)** **Write a function** `cutoff_predict` that takes a **trained** classifier, a data matrix X, and a cutoff, and generates predictions based on the classifier's predicted **probability and the cutoff value**, as you did in the previous question." 
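As a concrete illustration of the sanity test described in part (a), here is a minimal sketch, assuming `X` and `Y` from Problem 1 are already defined. It is not the official solution; it only shows how `predict_proba` relates to `predict` at the default 0.5 cutoff.

```python
# Sketch only -- assumes X and Y from Problem 1 are in scope.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=15)
clf.fit(X, Y)

probs = clf.predict_proba(X)[:, 1]       # predicted probability of class 1 for each wine
manual = (probs > 0.5).astype(int)       # threshold at the default 0.5 cutoff
print np.all(manual == clf.predict(X))   # should print True: predict() uses the same rule
```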
326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "collapsed": false, 331 | "input": [ 332 | "\"\"\"\n", 333 | "cutoff_predict(clf, X, cutoff)\n", 334 | "\n", 335 | "Inputs:\n", 336 | "clf: a **trained** classifier object\n", 337 | "X: a 2D numpy array of features\n", 338 | "cutoff: a float giving the cutoff value used to convert\n", 339 | " predicted probabilities into a 0/1 prediction.\n", 340 | "\n", 341 | "Output:\n", 342 | "a numpy array of 0/1 predictions.\n", 343 | "\"\"\"\n", 344 | "## your code here" 345 | ], 346 | "language": "python", 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "metadata": {}, 351 | "output_type": "pyout", 352 | "prompt_number": 9, 353 | "text": [ 354 | "'\\ncutoff_predict(clf, X, cutoff)\\n\\nInputs:\\nclf: a **trained** classifier object\\nX: a 2D numpy array of features\\ncutoff: a float giving the cutoff value used to convert\\n predicted probabilities into a 0/1 prediction.\\n\\nOutput:\\na numpy array of 0/1 predictions.\\n'" 355 | ] 356 | } 357 | ], 358 | "prompt_number": 9 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "**(c)** Using **10-fold cross validation** find a cutoff in `np.arange(0.1,0.9,0.1)` that gives the best average **F1 score** when converting prediction probabilities from a **15-tree** random forest classifier into predictions.\n", 365 | "\n", 366 | "To help you with this task, we have provided you a function `custom_f1` that takes a cutoff value and returns a function suitable for using as the `scoring` argument to `cross_val_score`. **This function uses the `cutoff_predict` function that you defined in the previous question**.\n", 367 | "\n", 368 | "Using a **boxplot**, compare the **F1 scores** that correspond to each candidate **cutoff** value." 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "collapsed": false, 374 | "input": [ 375 | "def custom_f1(cutoff):\n", 376 | " def f1_cutoff(clf, X, y):\n", 377 | " ypred = cutoff_predict(clf, X, cutoff)\n", 378 | " return sklearn.metrics.f1_score(y, ypred)\n", 379 | " \n", 380 | " return f1_cutoff\n", 381 | "\n", 382 | "## your code here" 383 | ], 384 | "language": "python", 385 | "metadata": {}, 386 | "outputs": [], 387 | "prompt_number": 10 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "**(d)** According to this analysis, which cutoff value gives the **best predictive results**? **Explain** why this answer makes sense in light of the **unbalanced** classes in the training data." 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": {}, 399 | "source": [ 400 | "**YOUR ANSWER HERE.**" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "### Problem 4: Visualizing Classifiers Using Decision Surfaces\n", 408 | "\n", 409 | "One common visual summary of a classifier is its decision surface. Recall that a trained classifier takes in features $X$ and tries to predict a target $Y$. We can visualize how the classifier translates different inputs $X$ into a guess for $Y$ by plotting the classifier's **prediction probability** (that is, for a given class $c$, the assigned probability that $Y = c$) as a function of the features $X$. Most classifiers in scikit-learn have a method called `predict_proba` that computes this quantity for new examples after the classifier has been trained." 
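To make the cross-validated cutoff search in part (c) concrete, here is one possible sketch that reuses the provided `custom_f1` helper. It assumes `X`, `Y`, and your `cutoff_predict` from part (b) are defined; `cross_val_score` accepts any scorer with the signature `scorer(clf, X, y)`, which is exactly what `custom_f1(cutoff)` returns.

```python
# Sketch only -- assumes X, Y, cutoff_predict, and custom_f1 (provided above) are in scope.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score

scores = []
for cutoff in np.arange(0.1, 0.9, 0.1):
    clf = RandomForestClassifier(n_estimators=15)
    # custom_f1(cutoff) builds a scorer that thresholds predict_proba at this cutoff
    scores.append(cross_val_score(clf, X, Y, cv=10, scoring=custom_f1(cutoff)))
# each entry of `scores` holds ten F1 values, one per fold, ready for a boxplot
```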
410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "**(a)** Decision surface visualizations are really only meaningful if they are plotted against inputs $X$ that are one- or two-dimensional. So before we plot these surfaces, we will first find **two \"important\" dimensions** of $X$ to focus on. Recall that in the last homework we used SVD to perform a similar task. Here, we will use a different dimension reduction method based on random forests.\n", 417 | "\n", 418 | "Random forests allow us to compute a heuristic for determining how \"important\" a feature is in predicting a target. This heuristic measures the change in prediction accuracy if we take a given feature and permute (scramble) it across the datapoints in the training set. The more the accuracy drops when the feature is permuted, the more \"important\" we can conclude the feature is. Importance can be a useful way to select a small number of features for visualization.\n", 419 | "\n", 420 | "As you did in the last question, train a random forest classifier on the wine data using **15 trees**. Use the `feature_importances_` attribute of the classifier to obtain the relative importance of the features. These features are the columns of the dataframe. Show a simple **bar plot** showing the relative importance of the named features of the wines in the database." 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "collapsed": false, 426 | "input": [ 427 | "## your code here" 428 | ], 429 | "language": "python", 430 | "metadata": {}, 431 | "outputs": [], 432 | "prompt_number": 11 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "**(b)** Below, we have provided you with a function `plot_decision_surface` that plots a classifier's decision surface, taking as arguments a classifier object, a two-column feature matrix, and a target vector.\n", 439 | "\n", 440 | "Using this function and the results from the \"importance\" analysis above, **subset** the data matrix to include just the **two features of highest importance**. Then **plot** the decision surfaces of a decision tree classifier, a random forest classifier with the **number of trees set to 15**, and a support vector machine **with `C` set to 100 and `gamma` set to 1.0**. 
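Here is a minimal sketch of how parts (a) and (b) fit together, using the `plot_decision_surface` helper provided just below. It is not the official solution, and it assumes `X`, `Y`, and the feature dataframe `df` from Problem 1 are in scope.

```python
# Sketch only -- assumes X, Y, df from Problem 1 and plot_decision_surface (defined below).
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import sklearn.svm

forest = RandomForestClassifier(n_estimators=15)
forest.fit(X, Y)
top2 = np.argsort(forest.feature_importances_)[-2:]   # indices of the two most important features
print df.columns[top2]                                # their names, for the bar-plot labels

X_2d = X[:, top2]                                     # two-column subset for the surfaces
for clf in [DecisionTreeClassifier(),
            RandomForestClassifier(n_estimators=15),
            sklearn.svm.SVC(C=100.0, gamma=1.0)]:
    plot_decision_surface(clf, X_2d, Y)
```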
" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "collapsed": false, 446 | "input": [ 447 | "from sklearn.tree import DecisionTreeClassifier\n", 448 | "import sklearn.linear_model\n", 449 | "import sklearn.svm\n", 450 | "\n", 451 | "def plot_decision_surface(clf, X_train, Y_train):\n", 452 | " plot_step=0.1\n", 453 | " \n", 454 | " if X_train.shape[1] != 2:\n", 455 | " raise ValueError(\"X_train should have exactly 2 columnns!\")\n", 456 | " \n", 457 | " x_min, x_max = X_train[:, 0].min() - plot_step, X_train[:, 0].max() + plot_step\n", 458 | " y_min, y_max = X_train[:, 1].min() - plot_step, X_train[:, 1].max() + plot_step\n", 459 | " xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),\n", 460 | " np.arange(y_min, y_max, plot_step))\n", 461 | "\n", 462 | " clf.fit(X_train,Y_train)\n", 463 | " if hasattr(clf, 'predict_proba'):\n", 464 | " Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:,1]\n", 465 | " else:\n", 466 | " Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) \n", 467 | " Z = Z.reshape(xx.shape)\n", 468 | " cs = plt.contourf(xx, yy, Z, cmap=plt.cm.Reds)\n", 469 | " plt.scatter(X_train[:,0],X_train[:,1],c=Y_train,cmap=plt.cm.Paired)\n", 470 | " plt.show()\n", 471 | " \n", 472 | "## your code here" 473 | ], 474 | "language": "python", 475 | "metadata": {}, 476 | "outputs": [], 477 | "prompt_number": 12 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "**(c)** Recall from the lecture that there is a tradeoff between the bias and the variance of a classifier. We want to choose a model that generalizes well to unseen data. With a **high-variance** classifier we run the risk of **overfitting** to noisy or unrepresentative training data. In contrast, classifier with a **high bias** typically produce simpler models that tend to **underfit** the training data, failing to capture important regularities. \n", 484 | "\n", 485 | "Discuss the differences in the above decision surfaces in terms of their **complexity** and **sensitivity** to the training data. How do these properties relate to **bias** and **variance**?" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "metadata": {}, 491 | "source": [ 492 | "**YOUR ANSWER HERE.**" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "**(d)** The SVM implementation of sklearn has an **optional parameter** `class_weight`. This parameter is set to `None` per default, but it also provides an `auto` mode, which uses the values of the labels Y to **automatically adjust weights** inversely proportional to class frequencies. As done in sub-problem 4(b), **draw the decision boundaries** for two SVM classifiers. **Use `C=1.0`, and `gamma=1.0`** for **both** models, but for the first SVM set `class_weigth` to **`None`**, and for the second SVM set `class_weigth` to **`'auto'`**. (Hint: `None` is a keyword in Python, whereas the `'auto'` is a String and needs the quotation marks.) " 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "collapsed": false, 505 | "input": [ 506 | "## your Code here" 507 | ], 508 | "language": "python", 509 | "metadata": {}, 510 | "outputs": [], 511 | "prompt_number": 13 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": {}, 516 | "source": [ 517 | "**(e)** Discuss the difference in the decision boundary with respect to **precision**, **recall**, and **overall performance**. How could the performance be **improved**? 
" 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "metadata": {}, 523 | "source": [ 524 | "**YOUR ANSWER HERE**" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": {}, 530 | "source": [ 531 | "# Submission Instructions\n", 532 | "\n", 533 | "To submit your homework, create a folder named **lastname_firstinitial_hw#** and place your IPython notebooks, data files, and any other files in this folder. Your IPython Notebooks should be completely executed with the results visible in the notebook. We should not have to run any code. Compress the folder (please use .zip compression) and submit to the CS109 dropbox in the appropriate folder. *If we cannot access your work because these directions are not followed correctly, we will not grade your work.*\n" 534 | ] 535 | } 536 | ], 537 | "metadata": {} 538 | } 539 | ] 540 | } -------------------------------------------------------------------------------- /homework/HW3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:8d4371387e572dccb374d007f65ea7846d2fafc94b650e10f4e22e2a105c858f" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# Homework 3: Prediction and Classification\n", 16 | "\n", 17 | "Due: Thursday, October 16, 2014 11:59 PM\n", 18 | "\n", 19 | " Download this assignment\n", 20 | "\n", 21 | "#### Submission Instructions\n", 22 | "To submit your homework, create a folder named lastname_firstinitial_hw# and place your IPython notebooks, data files, and any other files in this folder. Your IPython Notebooks should be completely executed with the results visible in the notebook. We should not have to run any code. Compress the folder (please use .zip compression) and submit to the CS109 dropbox in the appropriate folder. If we cannot access your work because these directions are not followed correctly, we will not grade your work.\n", 23 | "\n", 24 | "---\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "# Introduction\n", 32 | "\n", 33 | "In this assignment you will be using regression and classification to explore different data sets. \n", 34 | "\n", 35 | "**First**: You will use data from before 2002 in the [Sean Lahman's Baseball Database](http://seanlahman.com/baseball-archive/statistics) to create a metric for picking baseball players using linear regression. This is same database we used in Homework 1. This database contains the \"complete batting and pitching statistics from 1871 to 2013, plus fielding statistics, standings, team stats, managerial records, post-season data, and more\". [Documentation provided here](http://seanlahman.com/files/database/readme2012.txt).\n", 36 | "\n", 37 | "![\"Sabermetrics Science\"](http://saberseminar.com/wp-content/uploads/2012/01/saber-web.jpg)\n", 38 | "http://saberseminar.com/wp-content/uploads/2012/01/saber-web.jpg\n", 39 | "\n", 40 | "**Second**: You will use the famous [iris](http://en.wikipedia.org/wiki/Iris_flower_data_set) data set to perform a $k$-neareast neighbor classification using cross validation. While it was introduced in 1936, it is still [one of the most popular](http://archive.ics.uci.edu/ml/) example data sets in the machine learning community. 
Wikipedia describes the data set as follows: \"The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor). Four features were measured from each sample: the length and the width of the sepals and petals, in centimetres.\" Here is an illustration of what the four features measure:\n", 41 | "\n", 42 | "![\"iris data features\"](http://sebastianraschka.com/Images/2014_python_lda/iris_petal_sepal.png)\n", 43 | "http://sebastianraschka.com/Images/2014_python_lda/iris_petal_sepal.png\n", 44 | "\n", 45 | "**Third**: You will investigate the influence of higher dimensional spaces on the classification using another standard data set in machine learning called the [digits data set](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html). This data set is similar to the MNIST data set discussed in the lecture. The main difference is that each digit is represented by an 8x8 pixel image patch, which is considerably smaller than the 28x28 pixels from MNIST. In addition, the gray values are restricted to 16 different values (4 bit), instead of 256 (8 bit) for MNIST. \n", 46 | "\n", 47 | "**Finally**: In preparation for Homework 4, we want you to read through the following articles related to predicting the 2014 Senate Midterm Elections. \n", 48 | "\n", 49 | "* [Nate Silver's Methodology while at NYT](http://fivethirtyeight.blogs.nytimes.com/methodology/)\n", 50 | "* [How The FiveThirtyEight Senate Forecast Model Works](http://fivethirtyeight.com/features/how-the-fivethirtyeight-senate-forecast-model-works/)\n", 51 | "* [Pollster Ratings v4.0: Methodology](http://fivethirtyeight.com/features/pollster-ratings-v40-methodology/)\n", 52 | "* [Pollster Ratings v4.0: Results](http://fivethirtyeight.com/features/pollster-ratings-v40-results/)\n", 53 | "* [Nate Silver versus Sam Wang](http://www.washingtonpost.com/blogs/plum-line/wp/2014/09/17/nate-silver-versus-sam-wang/)\n", 54 | "* [More Nate Silver versus Sam Wang](http://www.dailykos.com/story/2014/09/09/1328288/-Get-Ready-To-Rumbllllle-Battle-Of-The-Nerds-Nate-Silver-VS-Sam-Wang)\n", 55 | "* [Nate Silver explains criticisms of Sam Wang](http://politicalwire.com/archives/2014/10/02/nate_silver_rebuts_sam_wang.html)\n", 56 | "* [Background on the feud between Nate Silver and Sam Wang](http://talkingpointsmemo.com/dc/nate-silver-sam-wang-feud)\n", 57 | "* [Are there swing voters?]( http://www.stat.columbia.edu/~gelman/research/unpublished/swing_voters.pdf)\n", 58 | "\n", 59 | "\n", 60 | "\n", 61 | "---" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Load Python modules" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "collapsed": false, 74 | "input": [ 75 | "# special IPython command to prepare the notebook for matplotlib\n", 76 | "%matplotlib inline \n", 77 | "\n", 78 | "import requests \n", 79 | "import StringIO\n", 80 | "import zipfile\n", 81 | "import numpy as np\n", 82 | "import pandas as pd # pandas\n", 83 | "import matplotlib.pyplot as plt # module for plotting \n", 84 | "\n", 85 | "# If this module is not already installed, you may need to install it. 
\n", 86 | "# You can do this by typing 'pip install seaborn' in the command line\n", 87 | "import seaborn as sns \n", 88 | "\n", 89 | "import sklearn\n", 90 | "import sklearn.datasets\n", 91 | "import sklearn.cross_validation\n", 92 | "import sklearn.decomposition\n", 93 | "import sklearn.grid_search\n", 94 | "import sklearn.neighbors\n", 95 | "import sklearn.metrics" 96 | ], 97 | "language": "python", 98 | "metadata": {}, 99 | "outputs": [], 100 | "prompt_number": 1 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "# Problem 1: Sabermetrics\n", 107 | "\n", 108 | "Using data preceding the 2002 season pick 10 offensive players keeping the payroll under $20 million (assign each player the median salary). Predict how many games this team would win in a 162 game season. \n", 109 | "\n", 110 | "In this problem we will be returning to the [Sean Lahman's Baseball Database](http://seanlahman.com/baseball-archive/statistics) that we used in Homework 1. From this database, we will be extract five data sets containing information such as yearly stats and standing, batting statistics, fielding statistics, player names, player salaries and biographical information. You will explore the data in this database from before 2002 and create a metric for picking players. " 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "#### Problem 1(a) \n", 118 | "\n", 119 | "Load in [these CSV files](http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip) from the [Sean Lahman's Baseball Database](http://seanlahman.com/baseball-archive/statistics). For this assignment, we will use the 'Teams.csv', 'Batting.csv', 'Salaries.csv', 'Fielding.csv', 'Master.csv' tables. Read these tables into separate pandas DataFrames with the following names. \n", 120 | "\n", 121 | "CSV file name | Name of pandas DataFrame\n", 122 | ":---: | :---: \n", 123 | "Teams.csv | teams\n", 124 | "Batting.csv | players\n", 125 | "Salaries.csv | salaries\n", 126 | "Fielding.csv | fielding\n", 127 | "Master.csv | master" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "collapsed": false, 133 | "input": [ 134 | "### Your code here ###" 135 | ], 136 | "language": "python", 137 | "metadata": {}, 138 | "outputs": [], 139 | "prompt_number": 2 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "#### Problem 1(b)\n", 146 | "\n", 147 | "Calculate the median salary for each player and create a pandas DataFrame called `medianSalaries` with four columns: (1) the player ID, (2) the first name of the player, (3) the last name of the player and (4) the median salary of the player. Show the head of the `medianSalaries` DataFrame. " 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "collapsed": false, 153 | "input": [ 154 | "### Your code here ###" 155 | ], 156 | "language": "python", 157 | "metadata": {}, 158 | "outputs": [], 159 | "prompt_number": 3 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "#### Problem 1(c)\n", 166 | "\n", 167 | "Now, consider only team/season combinations in which the teams played 162 Games. Exclude all data from before 1947. Compute the per plate appearance rates for singles, doubles, triples, HR, and BB. Create a new pandas DataFrame called `stats` that has the teamID, yearID, wins and these rates.\n", 168 | "\n", 169 | "**Hint**: Singles are hits that are not doubles, triples, nor HR. 
Plate appearances are base on balls plus at bats." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "collapsed": false, 175 | "input": [ 176 | "### Your code here ###" 177 | ], 178 | "language": "python", 179 | "metadata": {}, 180 | "outputs": [], 181 | "prompt_number": 4 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "#### Problem 1(d)\n", 188 | "\n", 189 | "Is there a noticeable time trend in the rates computed in Problem 1(c)? " 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "collapsed": false, 195 | "input": [ 196 | "### Your code here ###" 197 | ], 198 | "language": "python", 199 | "metadata": {}, 200 | "outputs": [], 201 | "prompt_number": 5 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "#### Problem 1(e) \n", 208 | "\n", 209 | "Using the `stats` DataFrame from Problem 1(c), adjust the singles per PA rates so that the average across teams for each year is 0. Do the same for the doubles, triples, HR, and BB rates. " 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "collapsed": false, 215 | "input": [ 216 | "### Your code here ###" 217 | ], 218 | "language": "python", 219 | "metadata": {}, 220 | "outputs": [], 221 | "prompt_number": 6 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "#### Problem 1(f)\n", 228 | "\n", 229 | "Build a simple linear regression model to predict the number of wins from the average adjusted singles, doubles, triples, HR, and BB rates. To decide which of these terms to include, fit the model to data from before 2002 and compute the average squared residuals from predictions for years past 2002. Use the fitted model to define a new sabermetric summary: offensive predicted wins (OPW). Hint: the new summary should be a linear combination of one to five of the five rates.\n" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "collapsed": false, 235 | "input": [ 236 | "### Your code here ###" 237 | ], 238 | "language": "python", 239 | "metadata": {}, 240 | "outputs": [], 241 | "prompt_number": 7 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "** Your answer here: **" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "#### Problem 1(g)\n", 255 | "\n", 256 | "Now we will create a similar database for individual players. Consider only player/year combinations in which the player had at least 500 plate appearances. Consider only the years we considered for the calculations above (after 1947 and seasons with 162 games). For each player/year compute singles, doubles, triples, HR, BB per plate appearance rates. Create a new pandas DataFrame called `playerstats` that has the playerID, yearID and the rates of these stats. Remove the average for each year from these rates as done in Problem 1(e). " 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "collapsed": false, 262 | "input": [ 263 | "### Your code here ###" 264 | ], 265 | "language": "python", 266 | "metadata": {}, 267 | "outputs": [], 268 | "prompt_number": 8 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "Show the head of the `playerstats` DataFrame. 
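Since Problems 1(c), 1(e), and 1(g) all rely on the same rate computations, here is one hedged sketch of the arithmetic. The DataFrame name `sub` and its column names (`H`, `2B`, `3B`, `HR`, `BB`, `AB`, `yearID`, following the Lahman Batting/Teams conventions) are assumptions, and this is not the official solution.

```python
# Sketch only -- `sub` is a hypothetical DataFrame already filtered to the relevant rows.
import numpy as np

sub['PA'] = sub['BB'] + sub['AB']                          # plate appearances (the simplified hint)
sub['1B'] = sub['H'] - sub['2B'] - sub['3B'] - sub['HR']   # singles = hits minus extra-base hits
for col in ['1B', '2B', '3B', 'HR', 'BB']:
    sub[col + '_rate'] = sub[col] / sub['PA'].astype(float)
    # center each season: subtract the across-team (or across-player) mean for that year
    sub[col + '_rate'] -= sub.groupby('yearID')[col + '_rate'].transform(np.mean)
```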
" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "collapsed": false, 280 | "input": [ 281 | "### Your code here ###" 282 | ], 283 | "language": "python", 284 | "metadata": {}, 285 | "outputs": [], 286 | "prompt_number": 9 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "#### Problem 1(h)\n", 293 | "\n", 294 | "Using the `playerstats` DataFrame created in Problem 1(g), create a new DataFrame called `playerLS` containing the player's lifetime stats. This DataFrame should contain the playerID, the year the player's career started, the year the player's career ended and the player's lifetime average for each of the quantities (singles, doubles, triples, HR, BB). For simplicity we will simply compute the avaerage of the rates by year (a more correct way is to go back to the totals). " 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "collapsed": false, 300 | "input": [ 301 | "### Your code here ###" 302 | ], 303 | "language": "python", 304 | "metadata": {}, 305 | "outputs": [], 306 | "prompt_number": 10 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Show the head of the `playerLS` DataFrame. " 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "collapsed": false, 318 | "input": [ 319 | "### Your code here ###" 320 | ], 321 | "language": "python", 322 | "metadata": {}, 323 | "outputs": [], 324 | "prompt_number": 11 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "#### Problem 1(i)\n", 331 | "\n", 332 | "Compute the OPW for each player based on the average rates in the `playerLS` DataFrame. You can interpret this summary statistic as the predicted wins for a team with 9 batters exactly like the player in question. Add this column to the playerLS DataFrame. Call this colum OPW." 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "collapsed": false, 338 | "input": [ 339 | "### Your code here ###" 340 | ], 341 | "language": "python", 342 | "metadata": {}, 343 | "outputs": [], 344 | "prompt_number": 12 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "#### Problem 1(j)\n", 351 | "\n", 352 | "Add four columns to the `playerLS` DataFrame that contains the player's position (C, 1B, 2B, 3B, SS, LF, CF, RF, or OF), first name, last name and median salary. " 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "collapsed": false, 358 | "input": [ 359 | "### Your code here ###" 360 | ], 361 | "language": "python", 362 | "metadata": {}, 363 | "outputs": [], 364 | "prompt_number": 13 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "Show the head of the `playerLS` DataFrame. " 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "collapsed": false, 376 | "input": [ 377 | "### Your code here ###" 378 | ], 379 | "language": "python", 380 | "metadata": {}, 381 | "outputs": [], 382 | "prompt_number": 14 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "#### Problem 1(k)\n", 389 | "\n", 390 | "Subset the `playerLS` DataFrame for players active in 2002 and 2003 and played at least three years. Plot and describe the relationship bewteen the median salary (in millions) and the predicted number of wins. 
" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "collapsed": false, 396 | "input": [ 397 | "### Your code here ###" 398 | ], 399 | "language": "python", 400 | "metadata": {}, 401 | "outputs": [], 402 | "prompt_number": 15 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "#### Problem 1(l)\n", 409 | "Pick one players from one of each of these 10 position C, 1B, 2B, 3B, SS, LF, CF, RF, DH, or OF keeping the total median salary of all 10 players below 20 million. Report their averaged predicted wins and total salary." 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "collapsed": false, 415 | "input": [ 416 | "### Your code here ###" 417 | ], 418 | "language": "python", 419 | "metadata": {}, 420 | "outputs": [], 421 | "prompt_number": 16 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "#### Problem 1(m)\n", 428 | "What do these players outperform in? Singles, doubles, triples HR or BB?" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "collapsed": false, 434 | "input": [ 435 | "### Your code here ###" 436 | ], 437 | "language": "python", 438 | "metadata": {}, 439 | "outputs": [], 440 | "prompt_number": 17 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "** Your answer here: **" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "## Discussion for Problem 1\n", 454 | "\n", 455 | "*Write a brief discussion of your conclusions to the questions and tasks above in 100 words or less.*\n", 456 | "\n", 457 | "---" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "# Problem 2: $k$-Nearest Neighbors and Cross Validation \n", 465 | "\n", 466 | "What is the optimal $k$ for predicting species using $k$-nearest neighbor classification \n", 467 | "on the four features provided by the iris dataset.\n", 468 | "\n", 469 | "In this problem you will get to know the famous iris data set, and use cross validation to select the optimal $k$ for a $k$-nearest neighbor classification. This problem set makes heavy use of the [sklearn](http://scikit-learn.org/stable/) library. In addition to Pandas, it is one of the most useful libraries for data scientists! After completing this homework assignment you will know all the basics to get started with your own machine learning projects in sklearn. \n", 470 | "\n", 471 | "Future lectures will give further background information on different classifiers and their specific strengths and weaknesses, but when you have the basics for sklearn down, changing the classifier will boil down to exchanging one to two lines of code.\n", 472 | "\n", 473 | "The data set is so popular, that sklearn provides an extra function to load it:" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "collapsed": false, 479 | "input": [ 480 | "#load the iris data set\n", 481 | "iris = sklearn.datasets.load_iris()\n", 482 | "\n", 483 | "X = iris.data \n", 484 | "Y = iris.target\n", 485 | "\n", 486 | "print X.shape, Y.shape" 487 | ], 488 | "language": "python", 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "output_type": "stream", 493 | "stream": "stdout", 494 | "text": [ 495 | "(150, 4) (150,)\n" 496 | ] 497 | } 498 | ], 499 | "prompt_number": 18 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": {}, 504 | "source": [ 505 | "#### Problem 2(a) \n", 506 | "Split the data into a train and a test set. 
Use a random selection of 33% of the samples as test data. Sklearn provides the [`train_test_split`](http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html) function for this purpose. Print the dimensions of all the train and test data sets you have created. " 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "collapsed": false, 512 | "input": [ 513 | "### Your code here ###" 514 | ], 515 | "language": "python", 516 | "metadata": {}, 517 | "outputs": [], 518 | "prompt_number": 19 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "metadata": {}, 523 | "source": [ 524 | "#### Problem 2(b)\n", 525 | "\n", 526 | "Examine the data further by looking at the projections to the first two principal components of the data. Use the [`TruncatedSVD`](http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html) function for this purpose, and create a scatter plot. Use the colors on the scatter plot to represent the different classes in the target data. " 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "collapsed": false, 532 | "input": [ 533 | "### Your code here ###" 534 | ], 535 | "language": "python", 536 | "metadata": {}, 537 | "outputs": [], 538 | "prompt_number": 20 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "#### Problem 2(c) \n", 545 | "\n", 546 | "In the lecture we discussed how to use cross validation to estimate the optimal value for $k$ (the number of nearest neighbors to base the classification on). Use ***ten fold cross validation*** to estimate the optimal value for $k$ for the iris data set. \n", 547 | "\n", 548 | "**Note**: For your convenience, sklearn includes not only the [KNN classifier](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html), but also a [grid search function](http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV). The function is called grid search, because if you have to optimize more than one parameter, it is common practice to define a range of possible values for each parameter. An exhaustive search then runs over the complete grid defined by all the possible parameter combinations. This can get very computationally heavy, but luckily our KNN classifier only requires tuning of a single parameter for this problem set. " 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "collapsed": false, 554 | "input": [ 555 | "### Your code here ###" 556 | ], 557 | "language": "python", 558 | "metadata": {}, 559 | "outputs": [], 560 | "prompt_number": 21 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "#### Problem 2(d)\n", 567 | "\n", 568 | "Visualize the result by plotting the score results versus values for $k$. " 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "collapsed": false, 574 | "input": [ 575 | "### Your code here ###" 576 | ], 577 | "language": "python", 578 | "metadata": {}, 579 | "outputs": [], 580 | "prompt_number": 22 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "Verify that the grid search has indeed chosen the right parameter value for $k$."
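To make the grid-search workflow in 2(a) and 2(c) concrete, here is a minimal sketch using the era-appropriate `sklearn.cross_validation` and `sklearn.grid_search` modules. It is not the official solution; the candidate range for $k$ (1 through 30) is an arbitrary assumption.

```python
# Sketch only -- the range of k values tried is an assumption.
import sklearn.cross_validation
import sklearn.grid_search
import sklearn.neighbors

X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
    X, Y, test_size=0.33)                      # 33% of the samples held out for testing

parameters = {'n_neighbors': range(1, 31)}     # candidate values of k
knn = sklearn.neighbors.KNeighborsClassifier()
grid = sklearn.grid_search.GridSearchCV(knn, parameters, cv=10)   # 10-fold CV per k
grid.fit(X_train, y_train)
print grid.best_params_, grid.best_score_
```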
587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "collapsed": false, 592 | "input": [ 593 | "### Your code here ###" 594 | ], 595 | "language": "python", 596 | "metadata": {}, 597 | "outputs": [], 598 | "prompt_number": 23 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": {}, 603 | "source": [ 604 | "#### Problem 2(e)\n", 605 | "\n", 606 | "Test the performance of our tuned KNN classifier on the test set." 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "collapsed": false, 612 | "input": [ 613 | "### Your code here ###" 614 | ], 615 | "language": "python", 616 | "metadata": {}, 617 | "outputs": [], 618 | "prompt_number": 24 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "## Discussion for Problem 2\n", 625 | "\n", 626 | "*Write a brief discussion of your conclusions to the questions and tasks above in 100 words or less.*\n", 627 | "\n", 628 | "---" 629 | ] 630 | }, 631 | { 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "# Problem 3: The Curse and Blessing of Higher Dimensions\n", 636 | "\n", 637 | "In this problem we will investigate the influence of higher dimensional spaces on the classification. The data set is again one of the standard data sets from sklearn. The [digits data set](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html) is similar to the MNIST data set discussed in the lecture. The main difference is that each digit is represented by an 8x8 pixel image patch, which is considerably smaller than the 28x28 pixels from MNIST. In addition, the gray values are restricted to 16 different values (4 bit), instead of 256 (8 bit) for MNIST. \n", 638 | "\n", 639 | "First, we again load our data set." 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "collapsed": false, 645 | "input": [ 646 | "digits = sklearn.datasets.load_digits()\n", 647 | "\n", 648 | "X = digits.data \n", 649 | "Y = digits.target\n", 650 | "\n", 651 | "print X.shape, Y.shape" 652 | ], 653 | "language": "python", 654 | "metadata": {}, 655 | "outputs": [ 656 | { 657 | "output_type": "stream", 658 | "stream": "stdout", 659 | "text": [ 660 | "(1797, 64) (1797,)\n" 661 | ] 662 | } 663 | ], 664 | "prompt_number": 25 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": {}, 669 | "source": [ 670 | "#### Problem 3(a) \n", 671 | "\n", 672 | "Start with the same steps as in Problem 2. Split the data into a train and a test set. Use 33% of the samples as test data. Print the dimensions of all the train and test data sets you created. " 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "collapsed": false, 678 | "input": [ 679 | "### Your code here ###" 680 | ], 681 | "language": "python", 682 | "metadata": {}, 683 | "outputs": [], 684 | "prompt_number": 26 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "metadata": {}, 689 | "source": [ 690 | "#### Problem 3(b) \n", 691 | "\n", 692 | "Similar to Problem 2(b), create a scatter plot of the projections to the first two PCs. Use the colors on the scatter plot to represent the different classes in the target data. How well can we separate the classes?\n", 693 | "\n", 694 | "**Hint**: Use a `Colormap` in matplotlib to represent the different classes in the target data. 
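As a concrete illustration of the scatter plots asked for in 2(b) and 3(b), one possible sketch follows. Mean-centering before `TruncatedSVD` makes the two components behave like principal components; the colormap and variable names are assumptions, and `X_train`/`Y_train` come from the preceding split.

```python
# Illustrative sketch of a two-component projection colored by class.
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=2)
X_2d = svd.fit_transform(X_train - X_train.mean(axis=0))  # center, then project

plt.scatter(X_2d[:, 0], X_2d[:, 1], c=Y_train, cmap=plt.cm.jet)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.colorbar()  # maps the colors back to class labels
```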
" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "collapsed": false, 700 | "input": [ 701 | "### Your code here ###" 702 | ], 703 | "language": "python", 704 | "metadata": {}, 705 | "outputs": [], 706 | "prompt_number": 27 707 | }, 708 | { 709 | "cell_type": "markdown", 710 | "metadata": {}, 711 | "source": [ 712 | "Create individual scatter plots using only two classes at a time to explore which classes are most difficult to distinguish in terms of class separability. You do not need to create scatter plots for all pairwise comparisons, but at least show one. " 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "collapsed": false, 718 | "input": [ 719 | "### Your code here ###" 720 | ], 721 | "language": "python", 722 | "metadata": {}, 723 | "outputs": [], 724 | "prompt_number": 28 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "metadata": {}, 729 | "source": [ 730 | "Give a brief interpretation of the scatter plot. Which classes look like hard to distinguish? Do both feature dimensions contribute to the class separability? " 731 | ] 732 | }, 733 | { 734 | "cell_type": "markdown", 735 | "metadata": {}, 736 | "source": [ 737 | "** Your answer here: **" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "#### Problem 3(c) \n", 745 | "\n", 746 | "Write a **ten-fold cross validation** to estimate the optimal value for $k$ for the digits data set. *However*, this time we are interested in the influence of the number of dimensions we project the data down as well. \n", 747 | "\n", 748 | "Extend the cross validation as done for the iris data set, to optimize $k$ for different dimensional projections of the data. Create a boxplot showing test scores for the optimal $k$ for each $d$-dimensional subspace with $d$ ranging from one to ten. The plot should have the scores on the y-axis and the different dimensions $d$ on the x-axis. You can use your favorite plot function for the boxplots. [Seaborn](http://web.stanford.edu/~mwaskom/software/seaborn/index.html) is worth having a look at though. It is a great library for statistical visualization and of course also comes with a [`boxplot`](http://web.stanford.edu/~mwaskom/software/seaborn/generated/seaborn.boxplot.html) function that has simple means for changing the labels on the x-axis." 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "collapsed": false, 754 | "input": [ 755 | "### Your cross validation and evaluation code here ###" 756 | ], 757 | "language": "python", 758 | "metadata": {}, 759 | "outputs": [], 760 | "prompt_number": 29 761 | }, 762 | { 763 | "cell_type": "code", 764 | "collapsed": false, 765 | "input": [ 766 | "### Your boxplot code here ### " 767 | ], 768 | "language": "python", 769 | "metadata": {}, 770 | "outputs": [], 771 | "prompt_number": 30 772 | }, 773 | { 774 | "cell_type": "markdown", 775 | "metadata": {}, 776 | "source": [ 777 | "Write a short interpretation of the generated plot, answering the following questions:\n", 778 | "\n", 779 | "* What trend do you see in the plot for increasing dimensions?\n", 780 | "\n", 781 | "* Why do you think this is happening?" 
782 | ] 783 | }, 784 | { 785 | "cell_type": "markdown", 786 | "metadata": {}, 787 | "source": [ 788 | "** Your answer here: **" 789 | ] 790 | }, 791 | { 792 | "cell_type": "markdown", 793 | "metadata": {}, 794 | "source": [ 795 | "#### Problem 3(d) \n", 796 | "\n", 797 | "**For AC209 Students**: Change the boxplot we generated above to also show the optimal value for $k$ chosen by the cross validation grid search. " 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "collapsed": false, 803 | "input": [ 804 | "### Your code here ### " 805 | ], 806 | "language": "python", 807 | "metadata": {}, 808 | "outputs": [], 809 | "prompt_number": 31 810 | }, 811 | { 812 | "cell_type": "markdown", 813 | "metadata": {}, 814 | "source": [ 815 | "Write a short interpretation answering the following questions:\n", 816 | "\n", 817 | "* Which trend do you observe for the optimal value of $k$?\n", 818 | "\n", 819 | "* Why do you think this is happening?" 820 | ] 821 | }, 822 | { 823 | "cell_type": "markdown", 824 | "metadata": {}, 825 | "source": [ 826 | "** Your answer here: **" 827 | ] 828 | }, 829 | { 830 | "cell_type": "markdown", 831 | "metadata": {}, 832 | "source": [ 833 | "## Discussion for Problem 3\n", 834 | "\n", 835 | "*Write a brief discussion of your conclusions to the questions and tasks above in 100 words or less.*\n", 836 | "\n", 837 | "---" 838 | ] 839 | }, 840 | { 841 | "cell_type": "markdown", 842 | "metadata": {}, 843 | "source": [ 844 | "# Submission Instructions\n", 845 | "\n", 846 | "To submit your homework, create a folder named **lastname_firstinitial_hw#** and place your IPython notebooks, data files, and any other files in this folder. Your IPython Notebooks should be completely executed with the results visible in the notebook. We should not have to run any code. Compress the folder (please use .zip compression) and submit to the CS109 dropbox in the appropriate folder. 
*If we cannot access your work because these directions are not followed correctly, we will not grade your work.*\n" 847 | ] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "collapsed": false, 852 | "input": [], 853 | "language": "python", 854 | "metadata": {}, 855 | "outputs": [], 856 | "prompt_number": 31 857 | } 858 | ], 859 | "metadata": {} 860 | } 861 | ] 862 | } -------------------------------------------------------------------------------- /labs/Lab4_Notes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:d871c68cdef575e9c7d3ed4ccebc5c3bd613ebcec6f83e9101a9658a5cb36edb" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "# special IPython command to prepare the notebook for matplotlib\n", 16 | "%matplotlib inline \n", 17 | "\n", 18 | "import urllib2 # module to read in HTML\n", 19 | "import bs4 # BeautifulSoup: module to parse HTML and XML\n", 20 | "import json # module to encode and decode JSON\n", 21 | "import datetime as dt # module for manipulating dates and times\n", 22 | "import pandas as pd\n", 23 | "import numpy as np" 24 | ], 25 | "language": "python", 26 | "metadata": {}, 27 | "outputs": [] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Recall from lab last week 09/19/2014\n", 34 | "\n", 35 | "Previously discussed: \n", 36 | "\n", 37 | "* More pandas, matplotlib for exploratory data analysis\n", 38 | "* Brief introduction to numpy and scipy\n", 39 | "* Working on the command line\n", 40 | "* Overview of git and Github" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Today, we will discuss the following:\n", 48 | "\n", 49 | "* urllib2 - reads in HTML\n", 50 | "* BeautifulSoup - used to parse HTML and XML code\n", 51 | " * Reddit\n", 52 | "* JSON examples\n", 53 | " * World Cup\n", 54 | "\n", 55 | " Download this notebook from Github " 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# urllib2\n", 63 | "\n", 64 | "[urllib2](https://docs.python.org/2/library/urllib2.html) is a useful module for getting information about, and retrieving data from, the web. The function `urlopen()` opens a URL (similar to opening a file). The returned file-like object has some of the same methods as a file object. For example, to read the entire HTML of the webpage into a single string, use the method `read()`, while `readlines()` reads in the text line by line. Finally, `close()` closes the URL connection. 
\n" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "collapsed": false, 70 | "input": [ 71 | "x = urllib2.urlopen(\"http://www.google.com\")\n", 72 | "htmlSource = x.read()\n", 73 | "x.close()" 74 | ], 75 | "language": "python", 76 | "metadata": {}, 77 | "outputs": [] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "collapsed": false, 82 | "input": [ 83 | "type(htmlSource)" 84 | ], 85 | "language": "python", 86 | "metadata": {}, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "collapsed": false, 92 | "input": [ 93 | "print htmlSource" 94 | ], 95 | "language": "python", 96 | "metadata": {}, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "# BeautifulSoup\n", 104 | "\n", 105 | "Once you have the HTML source code, you have to parse it and clean it up.\n", 106 | "\n", 107 | "[BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/bs4/doc/) is a really useful python module for parsing HTML and XML files. Let's try a few examples. \n", 108 | "\n", 109 | "For this section, we will be working with the HTML code from [Reddit](http://www.reddit.com). " 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "collapsed": false, 115 | "input": [ 116 | "x = urllib2.urlopen(\"http://www.reddit.com\") # Opens URLS\n", 117 | "htmlSource = x.read()\n", 118 | "x.close()\n", 119 | "print htmlSource" 120 | ], 121 | "language": "python", 122 | "metadata": {}, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "collapsed": false, 128 | "input": [ 129 | "### prettify()\n", 130 | "\n", 131 | "Beautiful Soup gives us a `BeautifulSoup` object, which represents the document as a nested data structure. We can use the `prettify()` function to show the different levels of the HTML code. " 132 | ], 133 | "language": "python", 134 | "metadata": {}, 135 | "outputs": [] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "collapsed": false, 140 | "input": [ 141 | "soup = bs4.BeautifulSoup(htmlSource)\n", 142 | "print soup.prettify()" 143 | ], 144 | "language": "python", 145 | "metadata": {}, 146 | "outputs": [] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### Navigating the tree using tags\n", 153 | "\n", 154 | "The simplest way to navigate the parse tree is to say the name of the tag you want. If you want the `` tag, just say `soup.head`:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "collapsed": false, 160 | "input": [ 161 | "print soup.head.prettify()" 162 | ], 163 | "language": "python", 164 | "metadata": {}, 165 | "outputs": [] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### .contents and .children\n", 172 | "\n", 173 | "A tag\u2019s children are available in a list called `.contents` which returns a list. 
" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "collapsed": false, 179 | "input": [ 180 | "soup.head.contents" 181 | ], 182 | "language": "python", 183 | "metadata": {}, 184 | "outputs": [] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "collapsed": false, 189 | "input": [ 190 | "len(soup.head.contents)" 191 | ], 192 | "language": "python", 193 | "metadata": {}, 194 | "outputs": [] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "collapsed": false, 199 | "input": [ 200 | "# Extract first three elements from the list of contents\n", 201 | "soup.head.contents[0:3]" 202 | ], 203 | "language": "python", 204 | "metadata": {}, 205 | "outputs": [] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "Instead of getting them as a list, you can iterate over a tag\u2019s children using the .children generator:" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "collapsed": false, 217 | "input": [ 218 | "soup.head.children" 219 | ], 220 | "language": "python", 221 | "metadata": {}, 222 | "outputs": [] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "collapsed": false, 227 | "input": [ 228 | "for child in soup.head.children:\n", 229 | " print(child)" 230 | ], 231 | "language": "python", 232 | "metadata": {}, 233 | "outputs": [] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "collapsed": false, 238 | "input": [ 239 | "# print the title of reddit\n", 240 | "soup.head.title" 241 | ], 242 | "language": "python", 243 | "metadata": {}, 244 | "outputs": [] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "collapsed": false, 249 | "input": [ 250 | "# print the string in the title\n", 251 | "soup.head.title.string" 252 | ], 253 | "language": "python", 254 | "metadata": {}, 255 | "outputs": [] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "### .descendants\n", 262 | "\n", 263 | "Attribute lets you iterate over all of a tag\u2019s children, recursively: its direct children, the children of its direct children, and so on:" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "collapsed": false, 269 | "input": [ 270 | "for child in soup.head.descendants:\n", 271 | " print child" 272 | ], 273 | "language": "python", 274 | "metadata": {}, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "### .strings\n", 282 | "\n", 283 | "If there\u2019s more than one thing inside a tag, you can still look at just the strings. Use the .strings generator" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "collapsed": false, 289 | "input": [ 290 | "for string in soup.strings:\n", 291 | " print(repr(string))" 292 | ], 293 | "language": "python", 294 | "metadata": {}, 295 | "outputs": [] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "### .stripped_strings\n", 302 | "\n", 303 | "These strings tend to have a lot of extra whitespace, which you can remove by using the .stripped_strings generator instead" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "collapsed": false, 309 | "input": [ 310 | "for string in soup.stripped_strings:\n", 311 | " print(repr(string))" 312 | ], 313 | "language": "python", 314 | "metadata": {}, 315 | "outputs": [] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "### .parent\n", 322 | "\n", 323 | "You can access an element\u2019s parent with the `.parent` attribute. 
In the example \u201cthree sisters\u201d document from the BeautifulSoup documentation, the `<head>` tag is the parent of the `<title>` tag:" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "collapsed": false, 329 | "input": [ 330 | "soup.title" 331 | ], 332 | "language": "python", 333 | "metadata": {}, 334 | "outputs": [] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "collapsed": false, 339 | "input": [ 340 | "soup.title.string" 341 | ], 342 | "language": "python", 343 | "metadata": {}, 344 | "outputs": [] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "collapsed": false, 349 | "input": [ 350 | "soup.title.string.parent" 351 | ], 352 | "language": "python", 353 | "metadata": {}, 354 | "outputs": [] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "# Searching the Tree\n", 361 | "\n", 362 | "Now, let's consider examples of different filters you can use to search this nested tree of HTML. These filters show up again and again throughout the search API. You can use them to filter based on a tag\u2019s name, on its attributes, on the text of a string, or on some combination of these.\n", 363 | "\n", 364 | "#### Use `find_all()` to find all tags" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "One common task is extracting all the URLs found within a page's `<a>` tags:" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "collapsed": false, 377 | "input": [ 378 | "# search for all <a> tags; returns a list\n", 379 | "soup.find_all('a')" 380 | ], 381 | "language": "python", 382 | "metadata": {}, 383 | "outputs": [] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "collapsed": false, 388 | "input": [ 389 | "# your turn\n", 390 | "# search for all the paragraph tags" 391 | ], 392 | "language": "python", 393 | "metadata": {}, 394 | "outputs": [] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "collapsed": false, 399 | "input": [ 400 | "# your turn\n", 401 | "# search for all the table tags" 402 | ], 403 | "language": "python", 404 | "metadata": {}, 405 | "outputs": [] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "Other arguments to the `.find_all()` function include `limit` and `text`. What do those do? " 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "collapsed": false, 417 | "input": [ 418 | "# your turn \n", 419 | "# search for all the <a> tags and use the limit argument " 420 | ], 421 | "language": "python", 422 | "metadata": {}, 423 | "outputs": [] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "collapsed": false, 428 | "input": [ 429 | "# your turn \n", 430 | "# What does using the text argument do? 
" 431 | ], 432 | "language": "python", 433 | "metadata": {}, 434 | "outputs": [] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "#### Use `.get()` to extract an attribute" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "collapsed": false, 446 | "input": [ 447 | "soup.find_all('a')[1].get('href')" 448 | ], 449 | "language": "python", 450 | "metadata": {}, 451 | "outputs": [] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": {}, 456 | "source": [ 457 | "#### Looping through tags" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "collapsed": false, 463 | "input": [ 464 | "# your turn\n", 465 | "# write a for loop printing all the links from reddit" 466 | ], 467 | "language": "python", 468 | "metadata": {}, 469 | "outputs": [] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "collapsed": false, 474 | "input": [ 475 | "# your turn\n", 476 | "# write a for loop, but use a list comprehension this time\n", 477 | "# show the first 5 elements" 478 | ], 479 | "language": "python", 480 | "metadata": {}, 481 | "outputs": [] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "collapsed": false, 486 | "input": [ 487 | "# your turn\n", 488 | "# split the first url by \"/\"" 489 | ], 490 | "language": "python", 491 | "metadata": {}, 492 | "outputs": [] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": {}, 497 | "source": [ 498 | "Another common task is extracting all the text from a page:" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "collapsed": false, 504 | "input": [ 505 | "print(soup.get_text())" 506 | ], 507 | "language": "python", 508 | "metadata": {}, 509 | "outputs": [] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "# JSON\n", 516 | "\n", 517 | "#### Working with Web APIs\n", 518 | "Web APIs are a more convenient way for programs to interact with websites. Many webistes now have a nice API that gives access to it's data in JSON format.\n" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "collapsed": false, 524 | "input": [ 525 | "a = {'a': 1, 'b':2}\n", 526 | "s = json.dumps(a)\n", 527 | "a2 = json.loads(s)" 528 | ], 529 | "language": "python", 530 | "metadata": {}, 531 | "outputs": [] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "collapsed": false, 536 | "input": [ 537 | "a # a dictionary" 538 | ], 539 | "language": "python", 540 | "metadata": {}, 541 | "outputs": [] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "collapsed": false, 546 | "input": [ 547 | "s # s is a string containing a in JSON encoding" 548 | ], 549 | "language": "python", 550 | "metadata": {}, 551 | "outputs": [] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "collapsed": false, 556 | "input": [ 557 | "a2 # reading back the keys are now in unicode" 558 | ], 559 | "language": "python", 560 | "metadata": {}, 561 | "outputs": [] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "## World Cup in JSON!\n", 568 | "\n", 569 | "The [2014 FIFA World Cup](http://en.wikipedia.org/wiki/2014_FIFA_World_Cup) was held this summer in Brazil at several different venues. There was an [API created for the World Cup](http://worldcup.sfg.io) that scraped current match results and output match data as JSON. Possible output includes events such as goals, substitutions, and cards. The [actual matches are listed here](http://worldcup.sfg.io/matches) in JSON. 
\n", 570 | "\n", 571 | "* Example from [Fernando Masanori](https://gist.github.com/fmasanori/1288160dad16cc473a53)" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "collapsed": false, 577 | "input": [ 578 | "url = \"http://worldcup.sfg.io/matches\"\n", 579 | "data = urllib2.urlopen(url).read()\n", 580 | "wc = json.loads(data.decode('utf-8'))" 581 | ], 582 | "language": "python", 583 | "metadata": {}, 584 | "outputs": [], 585 | "prompt_number": 35 586 | }, 587 | { 588 | "cell_type": "code", 589 | "collapsed": false, 590 | "input": [ 591 | "\"Number of matches in 2014 World Cup: %i\" % len(wc)" 592 | ], 593 | "language": "python", 594 | "metadata": {}, 595 | "outputs": [ 596 | { 597 | "metadata": {}, 598 | "output_type": "pyout", 599 | "prompt_number": 36, 600 | "text": [ 601 | "'Number of matches in 2014 World Cup: 64'" 602 | ] 603 | } 604 | ], 605 | "prompt_number": 36 606 | }, 607 | { 608 | "cell_type": "code", 609 | "collapsed": false, 610 | "input": [ 611 | "# Print keys in first match\n", 612 | "gameIndex = 60\n", 613 | "wc[gameIndex].keys()" 614 | ], 615 | "language": "python", 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "metadata": {}, 620 | "output_type": "pyout", 621 | "prompt_number": 37, 622 | "text": [ 623 | "[u'status',\n", 624 | " u'match_number',\n", 625 | " u'home_team',\n", 626 | " u'away_team',\n", 627 | " u'winner_code',\n", 628 | " u'winner',\n", 629 | " u'away_team_events',\n", 630 | " u'datetime',\n", 631 | " u'location',\n", 632 | " u'home_team_events']" 633 | ] 634 | } 635 | ], 636 | "prompt_number": 37 637 | }, 638 | { 639 | "cell_type": "code", 640 | "collapsed": false, 641 | "input": [ 642 | "wc[gameIndex]['status']" 643 | ], 644 | "language": "python", 645 | "metadata": {}, 646 | "outputs": [ 647 | { 648 | "metadata": {}, 649 | "output_type": "pyout", 650 | "prompt_number": 38, 651 | "text": [ 652 | "u'completed'" 653 | ] 654 | } 655 | ], 656 | "prompt_number": 38 657 | }, 658 | { 659 | "cell_type": "code", 660 | "collapsed": false, 661 | "input": [ 662 | "wc[gameIndex]['match_number']" 663 | ], 664 | "language": "python", 665 | "metadata": {}, 666 | "outputs": [ 667 | { 668 | "metadata": {}, 669 | "output_type": "pyout", 670 | "prompt_number": 39, 671 | "text": [ 672 | "61" 673 | ] 674 | } 675 | ], 676 | "prompt_number": 39 677 | }, 678 | { 679 | "cell_type": "code", 680 | "collapsed": false, 681 | "input": [ 682 | "wc[gameIndex]['away_team']" 683 | ], 684 | "language": "python", 685 | "metadata": {}, 686 | "outputs": [ 687 | { 688 | "metadata": {}, 689 | "output_type": "pyout", 690 | "prompt_number": 40, 691 | "text": [ 692 | "{u'code': u'GER', u'country': u'Germany', u'goals': 7}" 693 | ] 694 | } 695 | ], 696 | "prompt_number": 40 697 | }, 698 | { 699 | "cell_type": "code", 700 | "collapsed": false, 701 | "input": [ 702 | "wc[gameIndex]['away_team_events']" 703 | ], 704 | "language": "python", 705 | "metadata": {}, 706 | "outputs": [ 707 | { 708 | "metadata": {}, 709 | "output_type": "pyout", 710 | "prompt_number": 41, 711 | "text": [ 712 | "[{u'id': 1354,\n", 713 | " u'player': u'M\\xdcller',\n", 714 | " u'time': u'11',\n", 715 | " u'type_of_event': u'goal'},\n", 716 | " {u'id': 1355, u'player': u'Klose', u'time': u'23', u'type_of_event': u'goal'},\n", 717 | " {u'id': 1356, u'player': u'Kroos', u'time': u'24', u'type_of_event': u'goal'},\n", 718 | " {u'id': 1357, u'player': u'Kroos', u'time': u'26', u'type_of_event': u'goal'},\n", 719 | " {u'id': 1358,\n", 720 | " u'player': u'Khedira',\n", 721 | " u'time': u'29',\n", 722 | " 
u'type_of_event': u'goal'},\n", 723 | " {u'id': 1363,\n", 724 | " u'player': u'Hummels',\n", 725 | " u'time': u'46',\n", 726 | " u'type_of_event': u'substitution-out halftime'},\n", 727 | " {u'id': 1364,\n", 728 | " u'player': u'Mertesacker',\n", 729 | " u'time': u'46',\n", 730 | " u'type_of_event': u'substitution-in halftime'},\n", 731 | " {u'id': 1365,\n", 732 | " u'player': u'Klose',\n", 733 | " u'time': u'58',\n", 734 | " u'type_of_event': u'substitution-out'},\n", 735 | " {u'id': 1366,\n", 736 | " u'player': u'Sch\\xdcrrle',\n", 737 | " u'time': u'58',\n", 738 | " u'type_of_event': u'substitution-in'},\n", 739 | " {u'id': 1370,\n", 740 | " u'player': u'Sch\\xdcrrle',\n", 741 | " u'time': u'69',\n", 742 | " u'type_of_event': u'goal'},\n", 743 | " {u'id': 1372,\n", 744 | " u'player': u'Draxler',\n", 745 | " u'time': u'76',\n", 746 | " u'type_of_event': u'substitution-in'},\n", 747 | " {u'id': 1371,\n", 748 | " u'player': u'Khedira',\n", 749 | " u'time': u'76',\n", 750 | " u'type_of_event': u'substitution-out'},\n", 751 | " {u'id': 1373,\n", 752 | " u'player': u'Sch\\xdcrrle',\n", 753 | " u'time': u'79',\n", 754 | " u'type_of_event': u'goal'}]" 755 | ] 756 | } 757 | ], 758 | "prompt_number": 41 759 | }, 760 | { 761 | "cell_type": "code", 762 | "collapsed": false, 763 | "input": [ 764 | "wc[gameIndex]['home_team']" 765 | ], 766 | "language": "python", 767 | "metadata": {}, 768 | "outputs": [ 769 | { 770 | "metadata": {}, 771 | "output_type": "pyout", 772 | "prompt_number": 42, 773 | "text": [ 774 | "{u'code': u'BRA', u'country': u'Brazil', u'goals': 1}" 775 | ] 776 | } 777 | ], 778 | "prompt_number": 42 779 | }, 780 | { 781 | "cell_type": "markdown", 782 | "metadata": {}, 783 | "source": [ 784 | "In the [Brazil v Germany (2014 FIFA World Cup)](http://en.wikipedia.org/wiki/Brazil_v_Germany_(2014_FIFA_World_Cup)) match on July 8, 2014, Germany scored the most goals in World Cup tournament history. Germany led 5\u20130 at half time, with 4 goals scored in a span of 6 minutes, and subsequently brought the score up to 7\u20130 in the second half. Brazil scored a goal in the last minute, ending the match 7\u20131. 
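As a small illustration of how these event lists can be digested, the snippet below tallies Germany's goal scorers for this match; it assumes only the `player` and `type_of_event` fields printed above.

```python
# Count goal events per player in Germany's event list (sketch).
from collections import Counter

goals = Counter(event['player']
                for event in wc[gameIndex]['away_team_events']
                if event['type_of_event'] == 'goal')
for player, n in goals.most_common():
    print player, n
```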
" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": {}, 790 | "source": [ 791 | "Print the team names and goals scored for each match" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "collapsed": false, 797 | "input": [ 798 | "for elem in wc:\n", 799 | " print elem['home_team']['country'], elem['home_team']['goals'], elem['away_team']['country'], elem['away_team']['goals']" 800 | ], 801 | "language": "python", 802 | "metadata": {}, 803 | "outputs": [ 804 | { 805 | "output_type": "stream", 806 | "stream": "stdout", 807 | "text": [ 808 | "Brazil 3 Croatia 1\n", 809 | "Mexico 1 Cameroon 0\n", 810 | "Spain 1 Netherlands 5\n", 811 | "Chile 3 Australia 1\n", 812 | "Colombia 3 Greece 0\n", 813 | "Ivory Coast 2 Japan 1\n", 814 | "Uruguay 1 Costa Rica 3\n", 815 | "England 1 Italy 2\n", 816 | "Switzerland 2 Ecuador 1\n", 817 | "France 3 Honduras 0\n", 818 | "Argentina 2 Bosnia and Herzegovina 1\n", 819 | "Iran 0 Nigeria 0\n", 820 | "Germany 4 Portugal 0\n", 821 | "Ghana 1 USA 2\n", 822 | "Belgium 2 Algeria 1\n", 823 | "Russia 1 Korea Republic 1\n", 824 | "Brazil 0 Mexico 0\n", 825 | "Cameroon 0 Croatia 4\n", 826 | "Spain 0 Chile 2\n", 827 | "Australia 2 Netherlands 3\n", 828 | "Colombia 2 Ivory Coast 1\n", 829 | "Japan 0 Greece 0\n", 830 | "Uruguay 2 England 1\n", 831 | "Italy 0 Costa Rica 1\n", 832 | "Switzerland 2 France 5\n", 833 | "Honduras 1 Ecuador 2\n", 834 | "Argentina 1 Iran 0\n", 835 | "Nigeria 1 Bosnia and Herzegovina 0\n", 836 | "Germany 2 Ghana 2\n", 837 | "USA 2 Portugal 2\n", 838 | "Belgium 1 Russia 0\n", 839 | "Korea Republic 2 Algeria 4\n", 840 | "Cameroon 1 Brazil 4\n", 841 | "Croatia 1 Mexico 3\n", 842 | "Australia 0 Spain 3\n", 843 | "Netherlands 2 Chile 0\n", 844 | "Japan 1 Colombia 4\n", 845 | "Greece 2 Ivory Coast 1\n", 846 | "Italy 0 Uruguay 1\n", 847 | "Costa Rica 0 England 0\n", 848 | "Honduras 0 Switzerland 3\n", 849 | "Ecuador 0 France 0\n", 850 | "Nigeria 2 Argentina 3\n", 851 | "Bosnia and Herzegovina 3 Iran 1\n", 852 | "USA 0 Germany 1\n", 853 | "Portugal 2 Ghana 1\n", 854 | "Korea Republic 0 Belgium 1\n", 855 | "Algeria 1 Russia 1\n", 856 | "Brazil 1 Chile 1\n", 857 | "Colombia 2 Uruguay 0\n", 858 | "Netherlands 2 Mexico 1\n", 859 | "Costa Rica 1 Greece 1\n", 860 | "France 2 Nigeria 0\n", 861 | "Germany 2 Algeria 1\n", 862 | "Argentina 1 Switzerland 0\n", 863 | "Belgium 2 USA 1\n", 864 | "Brazil 2 Colombia 1\n", 865 | "France 0 Germany 1\n", 866 | "Netherlands 0 Costa Rica 0\n", 867 | "Argentina 1 Belgium 0\n", 868 | "Brazil 1 Germany 7\n", 869 | "Netherlands 0 Argentina 0\n", 870 | "Brazil 0 Netherlands 3\n", 871 | "Germany 1 Argentina 0\n" 872 | ] 873 | } 874 | ], 875 | "prompt_number": 43 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "metadata": {}, 880 | "source": [ 881 | "### Create a pandas DataFrame from JSON" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "collapsed": false, 887 | "input": [ 888 | "data = pd.DataFrame(wc, columns = ['match_number', 'location', 'datetime', 'home_team', 'away_team', 'winner', 'home_team_events', 'away_team_events'])\n", 889 | "data.head()" 890 | ], 891 | "language": "python", 892 | "metadata": {}, 893 | "outputs": [ 894 | { 895 | "html": [ 896 | "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", 897 | "<table border=\"1\" class=\"dataframe\">\n", 898 | " <thead>\n", 899 | " <tr style=\"text-align: right;\">\n", 900 | " <th></th>\n", 901 | " <th>match_number</th>\n", 902 | " <th>location</th>\n", 903 | " <th>datetime</th>\n", 904 | " <th>home_team</th>\n", 
905 | " <th>away_team</th>\n", 906 | " <th>winner</th>\n", 907 | " <th>home_team_events</th>\n", 908 | " <th>away_team_events</th>\n", 909 | " </tr>\n", 910 | " </thead>\n", 911 | " <tbody>\n", 912 | " <tr>\n", 913 | " <th>0</th>\n", 914 | " <td> 1</td>\n", 915 | " <td> Arena de Sao Paulo</td>\n", 916 | " <td> 2014-06-12T17:00:00.000-03:00</td>\n", 917 | " <td> {u'country': u'Brazil', u'code': u'BRA', u'goa...</td>\n", 918 | " <td> {u'country': u'Croatia', u'code': u'CRO', u'go...</td>\n", 919 | " <td> Brazil</td>\n", 920 | " <td> [{u'type_of_event': u'goal-own', u'player': u'...</td>\n", 921 | " <td> [{u'type_of_event': u'substitution-in', u'play...</td>\n", 922 | " </tr>\n", 923 | " <tr>\n", 924 | " <th>1</th>\n", 925 | " <td> 2</td>\n", 926 | " <td> Estadio das Dunas</td>\n", 927 | " <td> 2014-06-13T13:00:00.000-03:00</td>\n", 928 | " <td> {u'country': u'Mexico', u'code': u'MEX', u'goa...</td>\n", 929 | " <td> {u'country': u'Cameroon', u'code': u'CMR', u'g...</td>\n", 930 | " <td> Mexico</td>\n", 931 | " <td> [{u'type_of_event': u'yellow-card', u'player':...</td>\n", 932 | " <td> [{u'type_of_event': u'substitution-in halftime...</td>\n", 933 | " </tr>\n", 934 | " <tr>\n", 935 | " <th>2</th>\n", 936 | " <td> 3</td>\n", 937 | " <td> Arena Fonte Nova</td>\n", 938 | " <td> 2014-06-13T16:00:00.000-03:00</td>\n", 939 | " <td> {u'country': u'Spain', u'code': u'ESP', u'goal...</td>\n", 940 | " <td> {u'country': u'Netherlands', u'code': u'NED', ...</td>\n", 941 | " <td> Netherlands</td>\n", 942 | " <td> [{u'type_of_event': u'goal-penalty', u'player'...</td>\n", 943 | " <td> [{u'type_of_event': u'yellow-card', u'player':...</td>\n", 944 | " </tr>\n", 945 | " <tr>\n", 946 | " <th>3</th>\n", 947 | " <td> 4</td>\n", 948 | " <td> Arena Pantanal</td>\n", 949 | " <td> 2014-06-13T19:00:00.000-03:00</td>\n", 950 | " <td> {u'country': u'Chile', u'code': u'CHI', u'goal...</td>\n", 951 | " <td> {u'country': u'Australia', u'code': u'AUS', u'...</td>\n", 952 | " <td> Chile</td>\n", 953 | " <td> [{u'type_of_event': u'goal', u'player': u'Alex...</td>\n", 954 | " <td> [{u'type_of_event': u'goal', u'player': u'Cahi...</td>\n", 955 | " </tr>\n", 956 | " <tr>\n", 957 | " <th>4</th>\n", 958 | " <td> 5</td>\n", 959 | " <td> Estadio Mineirao</td>\n", 960 | " <td> 2014-06-14T13:00:00.000-03:00</td>\n", 961 | " <td> {u'country': u'Colombia', u'code': u'COL', u'g...</td>\n", 962 | " <td> {u'country': u'Greece', u'code': u'GRE', u'goa...</td>\n", 963 | " <td> Colombia</td>\n", 964 | " <td> [{u'type_of_event': u'goal', u'player': u'P. A...</td>\n", 965 | " <td> [{u'type_of_event': u'yellow-card', u'player':...</td>\n", 966 | " </tr>\n", 967 | " </tbody>\n", 968 | "</table>\n", 969 | "</div>" 970 | ], 971 | "metadata": {}, 972 | "output_type": "pyout", 973 | "prompt_number": 44, 974 | "text": [ 975 | " match_number location datetime \\\n", 976 | "0 1 Arena de Sao Paulo 2014-06-12T17:00:00.000-03:00 \n", 977 | "1 2 Estadio das Dunas 2014-06-13T13:00:00.000-03:00 \n", 978 | "2 3 Arena Fonte Nova 2014-06-13T16:00:00.000-03:00 \n", 979 | "3 4 Arena Pantanal 2014-06-13T19:00:00.000-03:00 \n", 980 | "4 5 Estadio Mineirao 2014-06-14T13:00:00.000-03:00 \n", 981 | "\n", 982 | " home_team \\\n", 983 | "0 {u'country': u'Brazil', u'code': u'BRA', u'goa... \n", 984 | "1 {u'country': u'Mexico', u'code': u'MEX', u'goa... \n", 985 | "2 {u'country': u'Spain', u'code': u'ESP', u'goal... \n", 986 | "3 {u'country': u'Chile', u'code': u'CHI', u'goal... \n", 987 | "4 {u'country': u'Colombia', u'code': u'COL', u'g... 
\n", 988 | "\n", 989 | " away_team winner \\\n", 990 | "0 {u'country': u'Croatia', u'code': u'CRO', u'go... Brazil \n", 991 | "1 {u'country': u'Cameroon', u'code': u'CMR', u'g... Mexico \n", 992 | "2 {u'country': u'Netherlands', u'code': u'NED', ... Netherlands \n", 993 | "3 {u'country': u'Australia', u'code': u'AUS', u'... Chile \n", 994 | "4 {u'country': u'Greece', u'code': u'GRE', u'goa... Colombia \n", 995 | "\n", 996 | " home_team_events \\\n", 997 | "0 [{u'type_of_event': u'goal-own', u'player': u'... \n", 998 | "1 [{u'type_of_event': u'yellow-card', u'player':... \n", 999 | "2 [{u'type_of_event': u'goal-penalty', u'player'... \n", 1000 | "3 [{u'type_of_event': u'goal', u'player': u'Alex... \n", 1001 | "4 [{u'type_of_event': u'goal', u'player': u'P. A... \n", 1002 | "\n", 1003 | " away_team_events \n", 1004 | "0 [{u'type_of_event': u'substitution-in', u'play... \n", 1005 | "1 [{u'type_of_event': u'substitution-in halftime... \n", 1006 | "2 [{u'type_of_event': u'yellow-card', u'player':... \n", 1007 | "3 [{u'type_of_event': u'goal', u'player': u'Cahi... \n", 1008 | "4 [{u'type_of_event': u'yellow-card', u'player':... " 1009 | ] 1010 | } 1011 | ], 1012 | "prompt_number": 44 1013 | }, 1014 | { 1015 | "cell_type": "markdown", 1016 | "metadata": {}, 1017 | "source": [ 1018 | "#### Convert format of a column\n", 1019 | "\n", 1020 | "Here we pandas [DatetimeIndex](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DatetimeIndex.html) to convert the `datetime` column to two seperate columns: a date and a time for each match." 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "code", 1025 | "collapsed": false, 1026 | "input": [ 1027 | "data['gameDate'] = pd.DatetimeIndex(data.datetime).date\n", 1028 | "data['gameTime'] = pd.DatetimeIndex(data.datetime).time" 1029 | ], 1030 | "language": "python", 1031 | "metadata": {}, 1032 | "outputs": [], 1033 | "prompt_number": 45 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "collapsed": false, 1038 | "input": [ 1039 | "data.head()" 1040 | ], 1041 | "language": "python", 1042 | "metadata": {}, 1043 | "outputs": [ 1044 | { 1045 | "html": [ 1046 | "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", 1047 | "<table border=\"1\" class=\"dataframe\">\n", 1048 | " <thead>\n", 1049 | " <tr style=\"text-align: right;\">\n", 1050 | " <th></th>\n", 1051 | " <th>match_number</th>\n", 1052 | " <th>location</th>\n", 1053 | " <th>datetime</th>\n", 1054 | " <th>home_team</th>\n", 1055 | " <th>away_team</th>\n", 1056 | " <th>winner</th>\n", 1057 | " <th>home_team_events</th>\n", 1058 | " <th>away_team_events</th>\n", 1059 | " <th>gameDate</th>\n", 1060 | " <th>gameTime</th>\n", 1061 | " </tr>\n", 1062 | " </thead>\n", 1063 | " <tbody>\n", 1064 | " <tr>\n", 1065 | " <th>0</th>\n", 1066 | " <td> 1</td>\n", 1067 | " <td> Arena de Sao Paulo</td>\n", 1068 | " <td> 2014-06-12T17:00:00.000-03:00</td>\n", 1069 | " <td> {u'country': u'Brazil', u'code': u'BRA', u'goa...</td>\n", 1070 | " <td> {u'country': u'Croatia', u'code': u'CRO', u'go...</td>\n", 1071 | " <td> Brazil</td>\n", 1072 | " <td> [{u'type_of_event': u'goal-own', u'player': u'...</td>\n", 1073 | " <td> [{u'type_of_event': u'substitution-in', u'play...</td>\n", 1074 | " <td> 2014-06-12</td>\n", 1075 | " <td> 20:00:00</td>\n", 1076 | " </tr>\n", 1077 | " <tr>\n", 1078 | " <th>1</th>\n", 1079 | " <td> 2</td>\n", 1080 | " <td> Estadio das Dunas</td>\n", 1081 | " <td> 2014-06-13T13:00:00.000-03:00</td>\n", 1082 | " <td> {u'country': u'Mexico', u'code': u'MEX', u'goa...</td>\n", 1083 | " 
<td> {u'country': u'Cameroon', u'code': u'CMR', u'g...</td>\n", 1084 | " <td> Mexico</td>\n", 1085 | " <td> [{u'type_of_event': u'yellow-card', u'player':...</td>\n", 1086 | " <td> [{u'type_of_event': u'substitution-in halftime...</td>\n", 1087 | " <td> 2014-06-13</td>\n", 1088 | " <td> 16:00:00</td>\n", 1089 | " </tr>\n", 1090 | " <tr>\n", 1091 | " <th>2</th>\n", 1092 | " <td> 3</td>\n", 1093 | " <td> Arena Fonte Nova</td>\n", 1094 | " <td> 2014-06-13T16:00:00.000-03:00</td>\n", 1095 | " <td> {u'country': u'Spain', u'code': u'ESP', u'goal...</td>\n", 1096 | " <td> {u'country': u'Netherlands', u'code': u'NED', ...</td>\n", 1097 | " <td> Netherlands</td>\n", 1098 | " <td> [{u'type_of_event': u'goal-penalty', u'player'...</td>\n", 1099 | " <td> [{u'type_of_event': u'yellow-card', u'player':...</td>\n", 1100 | " <td> 2014-06-13</td>\n", 1101 | " <td> 19:00:00</td>\n", 1102 | " </tr>\n", 1103 | " <tr>\n", 1104 | " <th>3</th>\n", 1105 | " <td> 4</td>\n", 1106 | " <td> Arena Pantanal</td>\n", 1107 | " <td> 2014-06-13T19:00:00.000-03:00</td>\n", 1108 | " <td> {u'country': u'Chile', u'code': u'CHI', u'goal...</td>\n", 1109 | " <td> {u'country': u'Australia', u'code': u'AUS', u'...</td>\n", 1110 | " <td> Chile</td>\n", 1111 | " <td> [{u'type_of_event': u'goal', u'player': u'Alex...</td>\n", 1112 | " <td> [{u'type_of_event': u'goal', u'player': u'Cahi...</td>\n", 1113 | " <td> 2014-06-13</td>\n", 1114 | " <td> 22:00:00</td>\n", 1115 | " </tr>\n", 1116 | " <tr>\n", 1117 | " <th>4</th>\n", 1118 | " <td> 5</td>\n", 1119 | " <td> Estadio Mineirao</td>\n", 1120 | " <td> 2014-06-14T13:00:00.000-03:00</td>\n", 1121 | " <td> {u'country': u'Colombia', u'code': u'COL', u'g...</td>\n", 1122 | " <td> {u'country': u'Greece', u'code': u'GRE', u'goa...</td>\n", 1123 | " <td> Colombia</td>\n", 1124 | " <td> [{u'type_of_event': u'goal', u'player': u'P. A...</td>\n", 1125 | " <td> [{u'type_of_event': u'yellow-card', u'player':...</td>\n", 1126 | " <td> 2014-06-14</td>\n", 1127 | " <td> 16:00:00</td>\n", 1128 | " </tr>\n", 1129 | " </tbody>\n", 1130 | "</table>\n", 1131 | "</div>" 1132 | ], 1133 | "metadata": {}, 1134 | "output_type": "pyout", 1135 | "prompt_number": 46, 1136 | "text": [ 1137 | " match_number location datetime \\\n", 1138 | "0 1 Arena de Sao Paulo 2014-06-12T17:00:00.000-03:00 \n", 1139 | "1 2 Estadio das Dunas 2014-06-13T13:00:00.000-03:00 \n", 1140 | "2 3 Arena Fonte Nova 2014-06-13T16:00:00.000-03:00 \n", 1141 | "3 4 Arena Pantanal 2014-06-13T19:00:00.000-03:00 \n", 1142 | "4 5 Estadio Mineirao 2014-06-14T13:00:00.000-03:00 \n", 1143 | "\n", 1144 | " home_team \\\n", 1145 | "0 {u'country': u'Brazil', u'code': u'BRA', u'goa... \n", 1146 | "1 {u'country': u'Mexico', u'code': u'MEX', u'goa... \n", 1147 | "2 {u'country': u'Spain', u'code': u'ESP', u'goal... \n", 1148 | "3 {u'country': u'Chile', u'code': u'CHI', u'goal... \n", 1149 | "4 {u'country': u'Colombia', u'code': u'COL', u'g... \n", 1150 | "\n", 1151 | " away_team winner \\\n", 1152 | "0 {u'country': u'Croatia', u'code': u'CRO', u'go... Brazil \n", 1153 | "1 {u'country': u'Cameroon', u'code': u'CMR', u'g... Mexico \n", 1154 | "2 {u'country': u'Netherlands', u'code': u'NED', ... Netherlands \n", 1155 | "3 {u'country': u'Australia', u'code': u'AUS', u'... Chile \n", 1156 | "4 {u'country': u'Greece', u'code': u'GRE', u'goa... Colombia \n", 1157 | "\n", 1158 | " home_team_events \\\n", 1159 | "0 [{u'type_of_event': u'goal-own', u'player': u'... \n", 1160 | "1 [{u'type_of_event': u'yellow-card', u'player':... 
\n", 1161 | "2 [{u'type_of_event': u'goal-penalty', u'player'... \n", 1162 | "3 [{u'type_of_event': u'goal', u'player': u'Alex... \n", 1163 | "4 [{u'type_of_event': u'goal', u'player': u'P. A... \n", 1164 | "\n", 1165 | " away_team_events gameDate gameTime \n", 1166 | "0 [{u'type_of_event': u'substitution-in', u'play... 2014-06-12 20:00:00 \n", 1167 | "1 [{u'type_of_event': u'substitution-in halftime... 2014-06-13 16:00:00 \n", 1168 | "2 [{u'type_of_event': u'yellow-card', u'player':... 2014-06-13 19:00:00 \n", 1169 | "3 [{u'type_of_event': u'goal', u'player': u'Cahi... 2014-06-13 22:00:00 \n", 1170 | "4 [{u'type_of_event': u'yellow-card', u'player':... 2014-06-14 16:00:00 " 1171 | ] 1172 | } 1173 | ], 1174 | "prompt_number": 46 1175 | }, 1176 | { 1177 | "cell_type": "code", 1178 | "collapsed": false, 1179 | "input": [], 1180 | "language": "python", 1181 | "metadata": {}, 1182 | "outputs": [], 1183 | "prompt_number": 46 1184 | } 1185 | ], 1186 | "metadata": {} 1187 | } 1188 | ] 1189 | } -------------------------------------------------------------------------------- /homework/HW2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:151b229e81776daa0c324f9570f662a65d79d63f75a582b79118008267af5725" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "# Homework 2: More Exploratory Data Analysis\n", 16 | "## Gene Expression Data and Election Polls \n", 17 | "\n", 18 | "Due: Thursday, October 2, 2014 11:59 PM\n", 19 | "\n", 20 | "<a href=https://raw.githubusercontent.com/cs109/2014/master/homework/HW2.ipynb download=HW2.ipynb> Download this assignment</a>\n", 21 | "\n", 22 | "#### Submission Instructions\n", 23 | "To submit your homework, create a folder named lastname_firstinitial_hw# and place your IPython notebooks, data files, and any other files in this folder. Your IPython Notebooks should be completely executed with the results visible in the notebook. We should not have to run any code. Compress the folder (please use .zip compression) and submit to the CS109 dropbox in the appropriate folder. If we cannot access your work because these directions are not followed correctly, we will not grade your work.\n", 24 | "\n", 25 | "\n", 26 | "---" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Introduction\n", 34 | "\n", 35 | "John Tukey wrote in [Exploratory Data Analysis, 1977](http://www.amazon.com/Exploratory-Data-Analysis-Wilder-Tukey/dp/0201076160/ref=pd_bbs_sr_2/103-4466654-5303007?ie=UTF8&s=books&qid=1189739816&sr=8-2): \"The greatest value of a picture is when it forces us to notice what we never expected to see.\" In this assignment we will continue using our exploratory data analysis tools, but apply it to new sets of data: [gene expression](http://en.wikipedia.org/wiki/Gene_expression) and polls from the [2012 Presidental Election](http://en.wikipedia.org/wiki/United_States_presidential_election,_2012) and from the [2014 Senate Midterm Elections](http://en.wikipedia.org/wiki/United_States_Senate_elections,_2014). 
\n", 36 | "\n", 37 | "**First**: You will use exploratory data analysis and apply the [singular value decomposition](http://en.wikipedia.org/wiki/Singular_value_decomposition) (SVD) to a gene expression data matrix to determine if the the date that the gene expression samples are processed has large effect on the variability seen in the data. \n", 38 | "\n", 39 | "**Second**: You will use the polls from the 2012 Presidential Elections to determine (1) Is there a pollster bias in presidential election polls? and (2) Is the average of polls better than just one poll?\n", 40 | "\n", 41 | "**Finally**: You will use the [HuffPost Pollster API](http://elections.huffingtonpost.com/pollster/api) to extract the polls for the current 2014 Senate Midterm Elections and provide a preliminary prediction of the result of each state.\n", 42 | "\n", 43 | "#### Data\n", 44 | "\n", 45 | "We will use the following data sets: \n", 46 | "\n", 47 | "1. A gene expression data set called `exprs_GSE5859.csv` and sample annotation table called `sampleinfo_GSE5859.csv` which are both available on Github in the 2014_data repository: [expression data set](https://github.com/cs109/2014_data/blob/master/exprs_GSE5859.csv) and [sample annotation table](https://github.com/cs109/2014_data/blob/master/sampleinfo_GSE5859.csv). \n", 48 | "\n", 49 | "2. Polls from the [2012 Presidential Election: Barack Obama vs Mitt Romney](http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama). The polls we will use are from the [Huffington Post Pollster](http://elections.huffingtonpost.com/pollster). \n", 50 | "\n", 51 | "3. Polls from the [2014 Senate Midterm Elections](http://elections.huffingtonpost.com/pollster) from the [HuffPost Pollster API](http://elections.huffingtonpost.com/pollster/api). \n", 52 | "\n", 53 | "---" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Load Python modules" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "collapsed": false, 66 | "input": [ 67 | "# special IPython command to prepare the notebook for matplotlib\n", 68 | "%matplotlib inline \n", 69 | "\n", 70 | "import requests \n", 71 | "from StringIO import StringIO\n", 72 | "import numpy as np\n", 73 | "import pandas as pd # pandas\n", 74 | "import matplotlib.pyplot as plt # module for plotting \n", 75 | "import datetime as dt # module for manipulating dates and times\n", 76 | "import numpy.linalg as lin # module for performing linear algebra operations" 77 | ], 78 | "language": "python", 79 | "metadata": {}, 80 | "outputs": [], 81 | "prompt_number": 1 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Problem 1\n", 88 | "\n", 89 | "In this problem we will be using a [gene expression](http://en.wikipedia.org/wiki/Gene_expression) data set obtained from a [microarray](http://en.wikipedia.org/wiki/DNA_microarray) experiement [Read more about the specific experiment here](http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE5859). There are two data sets we will use: \n", 90 | "\n", 91 | "1. The gene expression intensities where the rows represent the features on the microarray (e.g. genes) and the columsns represent the different microarray samples. \n", 92 | "\n", 93 | "2. A table that contains the information about each of the samples (columns in the gene expression data set) such as the sex, the age, the treatment status, the date the samples were processed. Each row represents one sample. 
" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "#### Problem 1(a) \n", 101 | "Read in the two files from Github: [exprs_GSE5859.csv](https://github.com/cs109/2014_data/blob/master/exprs_GSE5859.csv) and [sampleinfo_GSE5859.csv](https://github.com/cs109/2014_data/blob/master/sampleinfo_GSE5859.csv) as pandas DataFrames called `exprs` and `sampleinfo`. Use the gene names as the index of the `exprs` DataFrame." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "collapsed": false, 107 | "input": [ 108 | "#your code here" 109 | ], 110 | "language": "python", 111 | "metadata": {}, 112 | "outputs": [], 113 | "prompt_number": 2 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "Make sure the order of the columns in the gene expression DataFrame match the order of file names in the sample annotation DataFrame. If the order of the columns the `exprs` DataFrame do not match the order of the file names in the `sampleinfo` DataFrame, reorder the columns in the `exprs` DataFrame. \n", 120 | "\n", 121 | "**Note**: The column names of the gene expression DataFrame are the filenames of the orignal files from which these data were obtained. \n", 122 | "\n", 123 | "**Hint**: The method `list.index(x)` [[read here](https://docs.python.org/2/tutorial/datastructures.html)] can be used to return the index in the list of the first item whose value is x. It is an error if there is no such item. To check if the order of the columns in `exprs` matches the order of the rows in `sampleinfo`, you can check using the method `.all()` on a Boolean or list of Booleans: \n", 124 | "\n", 125 | "Example code: `(exprs.columns == sampleinfo.filename).all()`" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "collapsed": false, 131 | "input": [ 132 | "#your code here" 133 | ], 134 | "language": "python", 135 | "metadata": {}, 136 | "outputs": [], 137 | "prompt_number": 3 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "Show the head of the two tables: `exprs` and `sampleinfo`. " 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "collapsed": false, 149 | "input": [ 150 | "#your code here" 151 | ], 152 | "language": "python", 153 | "metadata": {}, 154 | "outputs": [] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "#### Problem 1(b)\n", 161 | "\n", 162 | "Extract the year and month as integers from the `sampleinfo` table. \n", 163 | "\n", 164 | "**Hint**: To convert a Series or a column of a pandas DataFrame that contains a date-like object, you can use the `to_datetime` function [[read here](http://pandas.pydata.org/pandas-docs/stable/timeseries.html)]. This will create a `DatetimeIndex` which can be used to extract the month and year for each row in the DataFrame. " 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "collapsed": false, 170 | "input": [ 171 | "#your code here" 172 | ], 173 | "language": "python", 174 | "metadata": {}, 175 | "outputs": [], 176 | "prompt_number": 4 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "#### Problem 1(c)\n", 183 | "\n", 184 | "Convert the dates in the `date` column from the `sampleinfo` table into days since October 31, 2002. Add a column to the `sampleinfo` DataFrame titled `elapsedInDays` containing the days since October 31, 2002. Show the head of the `sampleinfo` DataFrame which includes the new column. 
\n", 185 | "\n", 186 | "**Hint**: Use the `datetime` module to create a new `datetime` object for the specific date October 31, 2002. Then, subtract the October 31, 2002 date from each date from the `date` column in the `sampleinfo` DataFrame. " 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "collapsed": false, 192 | "input": [ 193 | "#your code here" 194 | ], 195 | "language": "python", 196 | "metadata": {}, 197 | "outputs": [], 198 | "prompt_number": 5 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "#### Problem 1(d)\n", 205 | "\n", 206 | "Use exploratory analysis and the singular value decomposition (SVD) of the gene expression data matrix to determine if the date the samples were processed has large effect on the variability seen in the data or if it is just ethnicity (which is confounded with date). \n", 207 | "\n", 208 | "**Hint**: See the end of the [lecture from 9/23/2014 for help with SVD](http://nbviewer.ipython.org/github/cs109/2014/blob/master/lectures/lecture07/data_scraping_transcript.ipynb). \n", 209 | "\n", 210 | "First subset the the `sampleinfo` DataFrame to include only the CEU ethnicity. Call this new subsetted DataFrame `sampleinfoCEU`. Show the head of `sampleinfoCEU` DataFrame. " 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "collapsed": false, 216 | "input": [ 217 | "#your code here" 218 | ], 219 | "language": "python", 220 | "metadata": {}, 221 | "outputs": [], 222 | "prompt_number": 6 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Next, subset the `exprs` DataFrame to only include the samples with the CEU ethnicity. Name this new subsetted DataFrame `exprsCEU`. Show the head of the `exprsCEU` DataFrame. " 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "collapsed": false, 234 | "input": [ 235 | "#your code here" 236 | ], 237 | "language": "python", 238 | "metadata": {}, 239 | "outputs": [], 240 | "prompt_number": 7 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": {}, 245 | "source": [ 246 | "Check to make sure the order of the columns in the `exprsCEU` DataFrame matches the rows in the `sampleinfoCEU` DataFrame. " 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "collapsed": false, 252 | "input": [ 253 | "#your code here" 254 | ], 255 | "language": "python", 256 | "metadata": {}, 257 | "outputs": [], 258 | "prompt_number": 8 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Compute the average gene expression intensity in the `exprsCEU` DataFrame across all the samples. For each sample in the `exprsCEU` DataFrame, subtract the average gene expression intensity from each of the samples. Show the head of the mean normalized gene expression data. " 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "collapsed": false, 270 | "input": [ 271 | "#your code here" 272 | ], 273 | "language": "python", 274 | "metadata": {}, 275 | "outputs": [], 276 | "prompt_number": 9 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "Using this mean normalized gene expression data, compute the projection to the first Principal Component (PC1). \n", 283 | "\n", 284 | "**Hint**: Use the `numpy.linalg.svd()` function in the `numpy.linalg` module (or the `scipy.linalg.svd()` function in the `scipy.linalg` module) to apply an [singular value decomposition](http://en.wikipedia.org/wiki/Singular_value_decomposition) to a matrix. 
" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "collapsed": false, 290 | "input": [ 291 | "#your code here" 292 | ], 293 | "language": "python", 294 | "metadata": {}, 295 | "outputs": [], 296 | "prompt_number": 10 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "Create a histogram using the values from PC1. Use a bin size of 25. " 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "collapsed": false, 308 | "input": [ 309 | "#your code here" 310 | ], 311 | "language": "python", 312 | "metadata": {}, 313 | "outputs": [], 314 | "prompt_number": 11 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "Create a scatter plot with the days since October 31, 2002 on the x-axis and PC1 on the y-axis." 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "collapsed": false, 326 | "input": [ 327 | "#your code here" 328 | ], 329 | "language": "python", 330 | "metadata": {}, 331 | "outputs": [], 332 | "prompt_number": 12 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "Around what day do you notice a difference in the way the samples were processed?" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "collapsed": false, 344 | "input": [ 345 | "#your code here" 346 | ], 347 | "language": "python", 348 | "metadata": {}, 349 | "outputs": [], 350 | "prompt_number": 13 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "Answer:" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "## Discussion for Problem 1\n", 364 | "\n", 365 | "*Write a brief discussion of your conclusions to the questions and tasks above in 100 words or less.*\n", 366 | "\n", 367 | "---\n" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "## Problem 2: Is there a pollster bias in presidential election polls?" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "#### Problem 2(a)\n", 382 | "\n", 383 | "The [HuffPost Pollster](http://elections.huffingtonpost.com/pollster) contains many political polls. You can access these polls from individual races as a CSV but you can also access polls through the [HuffPost Pollster API](http://elections.huffingtonpost.com/pollster/api) to access the data. \n", 384 | "\n", 385 | "Read in the polls from the [2012 Presidential Election: Barack Obama vs Mitt Romney](http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama) into a pandas DataFrame called `election`. For this problem, you may read in the polls for this race directly using [the CSV file](http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv) available from the HuffPost Pollster page." 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "collapsed": false, 391 | "input": [ 392 | "#your code here" 393 | ], 394 | "language": "python", 395 | "metadata": {}, 396 | "outputs": [], 397 | "prompt_number": 14 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "Show the head of the `election` DataFrame. 
" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "collapsed": false, 409 | "input": [ 410 | "#your code here" 411 | ], 412 | "language": "python", 413 | "metadata": {}, 414 | "outputs": [], 415 | "prompt_number": 15 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "How many polls were conducted in November? Define this number as M. \n", 422 | "\n", 423 | "**Hint**: Subset the `election` DataFrame for only dates in the `Start Date` column that are in November 2012. " 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "collapsed": false, 429 | "input": [ 430 | "#your code here" 431 | ], 432 | "language": "python", 433 | "metadata": {}, 434 | "outputs": [], 435 | "prompt_number": 16 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "Answer:" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "What was the median of the number of observations in the November polls? Define this quantity as N. " 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "collapsed": false, 454 | "input": [ 455 | "#your code here" 456 | ], 457 | "language": "python", 458 | "metadata": {}, 459 | "outputs": [], 460 | "prompt_number": 17 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": {}, 465 | "source": [ 466 | "Answer: " 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "#### Problem 2(b)\n", 474 | "\n", 475 | "Using the median sample size $N$ from Problem 1(a), simulate the results from a single poll: simulate the number of votes for Obama out of a sample size $N$ where $p$ = 0.53 is the percent of voters who are voting for Obama. \n", 476 | "\n", 477 | "**Hint**: Use the binomial distribution with parameters $N$ and $p$ = 0.53. " 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "collapsed": false, 483 | "input": [ 484 | "#your code here" 485 | ], 486 | "language": "python", 487 | "metadata": {}, 488 | "outputs": [], 489 | "prompt_number": 18 490 | }, 491 | { 492 | "cell_type": "markdown", 493 | "metadata": {}, 494 | "source": [ 495 | "Now, perform a Monte Carlo simulation to obtain the estimated percentage of Obama votes with a sample size $N$ where $N$ is the median sample size calculated in Problem 2(a). Let $p$=0.53 be the percent of voters are voting for Obama. \n", 496 | "\n", 497 | "**Hint**: You will repeat the simulation above 1,000 times and plot the distribution of the estimated *percent* of Obama votes from a single poll. The results from the single poll you simulate is random variable and will be different every time you sample. " 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "collapsed": false, 503 | "input": [ 504 | "#your code here" 505 | ], 506 | "language": "python", 507 | "metadata": {}, 508 | "outputs": [], 509 | "prompt_number": 19 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "Plot the distribution of the estimated percentage of Obama votes from your single poll. What is the distribution of the estimated percentage of Obama votes? 
" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "collapsed": false, 521 | "input": [ 522 | "#your code here" 523 | ], 524 | "language": "python", 525 | "metadata": {}, 526 | "outputs": [], 527 | "prompt_number": 20 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "Answer: " 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "What is the standard error (SE) of the estimated percentage from the poll. \n", 541 | "\n", 542 | "**Hint**: Remember the SE is the standard deviation (SD) of the distribution of a random variable. " 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "collapsed": false, 548 | "input": [ 549 | "#your code here" 550 | ], 551 | "language": "python", 552 | "metadata": {}, 553 | "outputs": [], 554 | "prompt_number": 21 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "#### Problem 2(c)\n", 561 | "\n", 562 | "Now suppose we run M polls where M is the number of polls that happened in November (calculated in Problem 2(a)). Run 1,000 simulations and compute the mean of the M polls for each simulation. " 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "collapsed": false, 568 | "input": [ 569 | "#your code here" 570 | ], 571 | "language": "python", 572 | "metadata": {}, 573 | "outputs": [], 574 | "prompt_number": 22 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": {}, 579 | "source": [ 580 | "What is the distribution of the average of polls?\n", 581 | "\n", 582 | "**Hint**: Show a plot. " 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "collapsed": false, 588 | "input": [ 589 | "#your code here" 590 | ], 591 | "language": "python", 592 | "metadata": {}, 593 | "outputs": [], 594 | "prompt_number": 23 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "Answer: " 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "What is the standard error (SE) of the average of polls? " 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "collapsed": false, 613 | "input": [ 614 | "#your code here" 615 | ], 616 | "language": "python", 617 | "metadata": {}, 618 | "outputs": [], 619 | "prompt_number": 24 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": {}, 624 | "source": [ 625 | "Answer: " 626 | ] 627 | }, 628 | { 629 | "cell_type": "markdown", 630 | "metadata": {}, 631 | "source": [ 632 | "Is the SE of the average of polls larger, the same, or smaller than that the SD of a single poll (calculated in Problem 2(b))? By how much?\n", 633 | "\n", 634 | "**Hint**: Compute a ratio of the two quantities. " 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "collapsed": false, 640 | "input": [ 641 | "#your code here" 642 | ], 643 | "language": "python", 644 | "metadata": {}, 645 | "outputs": [], 646 | "prompt_number": 25 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "metadata": {}, 651 | "source": [ 652 | "Answer: " 653 | ] 654 | }, 655 | { 656 | "cell_type": "markdown", 657 | "metadata": {}, 658 | "source": [ 659 | "#### Problem 2(d) \n", 660 | "\n", 661 | "Repeat Problem 2(c) but now record the *across poll* standard deviation in each simulation. 
" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "collapsed": false, 667 | "input": [ 668 | "#your code here" 669 | ], 670 | "language": "python", 671 | "metadata": {}, 672 | "outputs": [], 673 | "prompt_number": 26 674 | }, 675 | { 676 | "cell_type": "markdown", 677 | "metadata": {}, 678 | "source": [ 679 | "What is the distribution of the *across M polls* standard deviation?\n", 680 | "\n", 681 | "**Hint**: Show a plot. " 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "collapsed": false, 687 | "input": [ 688 | "#your code here" 689 | ], 690 | "language": "python", 691 | "metadata": {}, 692 | "outputs": [], 693 | "prompt_number": 27 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "metadata": {}, 698 | "source": [ 699 | "Answer: " 700 | ] 701 | }, 702 | { 703 | "cell_type": "markdown", 704 | "metadata": {}, 705 | "source": [ 706 | "#### Problem 2(e) \n", 707 | "\n", 708 | "What is the standard deviation of M polls in our real (not simulated) 2012 presidential election data ? " 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "collapsed": false, 714 | "input": [ 715 | "#your code here" 716 | ], 717 | "language": "python", 718 | "metadata": {}, 719 | "outputs": [], 720 | "prompt_number": 28 721 | }, 722 | { 723 | "cell_type": "markdown", 724 | "metadata": {}, 725 | "source": [ 726 | "Is this larger, the same, or smaller than what we expeced if polls were not biased." 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "collapsed": false, 732 | "input": [ 733 | "#your code here" 734 | ], 735 | "language": "python", 736 | "metadata": {}, 737 | "outputs": [], 738 | "prompt_number": 29 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "Answer: " 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": {}, 750 | "source": [ 751 | "#### Problem 2(f)\n", 752 | "\n", 753 | "**For AC209 Students**: Learn about the normal approximation for the binomial distribution and derive the results of Problem 2(b) and 2(c) analytically (using this approximation). Compare the results obtained analytically to those obtained from simulations." 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "collapsed": false, 759 | "input": [ 760 | "#your code here" 761 | ], 762 | "language": "python", 763 | "metadata": {}, 764 | "outputs": [], 765 | "prompt_number": 30 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "Answer: " 772 | ] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "metadata": {}, 777 | "source": [ 778 | "## Discussion for Problem 2\n", 779 | "\n", 780 | "*Write a brief discussion of your conclusions to the questions and tasks above in 100 words or less.*\n", 781 | "\n", 782 | "---\n" 783 | ] 784 | }, 785 | { 786 | "cell_type": "markdown", 787 | "metadata": {}, 788 | "source": [ 789 | "## Problem 3: Is the average of polls better than just one poll?" 790 | ] 791 | }, 792 | { 793 | "cell_type": "markdown", 794 | "metadata": {}, 795 | "source": [ 796 | "#### Problem 3(a)\n", 797 | "\n", 798 | "Most undecided voters vote for one of the two candidates at the election. Therefore, the reported percentages underestimate the final value of both candidates. However, if we assume the undecided will split evenly, then the observed difference should be an unbiased estimate of the final difference. \n", 799 | "\n", 800 | "Add a new column to the `election` DataFrame containg the difference between Obama and Romeny called `Diff`. 
" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "collapsed": false, 806 | "input": [ 807 | "#your code here" 808 | ], 809 | "language": "python", 810 | "metadata": {}, 811 | "outputs": [], 812 | "prompt_number": 31 813 | }, 814 | { 815 | "cell_type": "markdown", 816 | "metadata": {}, 817 | "source": [ 818 | "#### Problem 3(b)\n", 819 | "\n", 820 | "Make a plot of the differences for the week before the election (e.g. 5 days) where the days are on the x-axis and the differences are on the y-axis. Add a horizontal line showing 3.9%: the difference between Obama and Romney on election day." 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "collapsed": false, 826 | "input": [ 827 | "#your code here" 828 | ], 829 | "language": "python", 830 | "metadata": {}, 831 | "outputs": [], 832 | "prompt_number": 32 833 | }, 834 | { 835 | "cell_type": "markdown", 836 | "metadata": {}, 837 | "source": [ 838 | "#### Problem 3(c) \n", 839 | "\n", 840 | "Make a plot showing the differences by pollster where the pollsters are on the x-axis and the differences on the y-axis. " 841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "collapsed": false, 846 | "input": [ 847 | "#your code here" 848 | ], 849 | "language": "python", 850 | "metadata": {}, 851 | "outputs": [], 852 | "prompt_number": 33 853 | }, 854 | { 855 | "cell_type": "markdown", 856 | "metadata": {}, 857 | "source": [ 858 | "Is the *across poll* difference larger than the *between pollster* difference? " 859 | ] 860 | }, 861 | { 862 | "cell_type": "markdown", 863 | "metadata": {}, 864 | "source": [ 865 | "Answer: " 866 | ] 867 | }, 868 | { 869 | "cell_type": "markdown", 870 | "metadata": {}, 871 | "source": [ 872 | "#### Problem 3(d)\n", 873 | "\n", 874 | "Take the average for each pollster and then compute the average of that. Given this difference how confident would you have been of an Obama victory?\n", 875 | "\n", 876 | "**Hint**: Compute an estimate of the SE of this average based exclusively on the observed data. " 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "collapsed": false, 882 | "input": [ 883 | "#your code here" 884 | ], 885 | "language": "python", 886 | "metadata": {}, 887 | "outputs": [], 888 | "prompt_number": 34 889 | }, 890 | { 891 | "cell_type": "markdown", 892 | "metadata": {}, 893 | "source": [ 894 | "Answer: " 895 | ] 896 | }, 897 | { 898 | "cell_type": "markdown", 899 | "metadata": {}, 900 | "source": [ 901 | "#### Problem 3(e)\n", 902 | "\n", 903 | "**For AC209 Students**: Show the difference against time and see if you can detect a trend towards the end. Use this trend to see if it improves the final estimate." 
904 | ] 905 | }, 906 | { 907 | "cell_type": "code", 908 | "collapsed": false, 909 | "input": [ 910 | "#your code here" 911 | ], 912 | "language": "python", 913 | "metadata": {}, 914 | "outputs": [], 915 | "prompt_number": 35 916 | }, 917 | { 918 | "cell_type": "markdown", 919 | "metadata": {}, 920 | "source": [ 921 | "Answer: " 922 | ] 923 | }, 924 | { 925 | "cell_type": "markdown", 926 | "metadata": {}, 927 | "source": [ 928 | "## Discussion for Problem 3\n", 929 | "\n", 930 | "*Write a brief discussion of your conclusions to the questions and tasks above in 100 words or less.*\n", 931 | "\n", 932 | "---\n" 933 | ] 934 | }, 935 | { 936 | "cell_type": "markdown", 937 | "metadata": {}, 938 | "source": [ 939 | "## Problem 4\n", 940 | "\n", 941 | "In this last problem, we will use the polls from the [2014 Senate Midterm Elections](http://elections.huffingtonpost.com/pollster) from the [HuffPost Pollster API](http://elections.huffingtonpost.com/pollster/api) to create a preliminary prediction of the result of each state. \n", 942 | "\n", 943 | "The HuffPost Pollster API allows you to access the data as a CSV or a JSON response by tacking \".csv\" or \".json\" at the end of the URLs. For example the 2012 Presidential Election could be accessed as a [.json](http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.json) instead of a [.csv](http://elections.huffingtonpost.com/pollster/2012-general-election-romney-vs-obama.csv)" 944 | ] 945 | }, 946 | { 947 | "cell_type": "markdown", 948 | "metadata": {}, 949 | "source": [ 950 | "#### Problem 4(a)\n", 951 | "\n", 952 | "Read in the polls for **all** of the 2014 Senate Elections using the HuffPost API. For example, we can consider the [2014 Senate race in Kentucky between Mitch McConnell and Alison Grimes](http://elections.huffingtonpost.com/pollster/2014-kentucky-senate-mcconnell-vs-grimes). \n", 953 | "\n", 954 | "To search for the 2014 Senate races, use the `topics` parameter in the API [[read more about topics here](http://elections.huffingtonpost.com/pollster/api)]. 
" 955 | ] 956 | }, 957 | { 958 | "cell_type": "code", 959 | "collapsed": false, 960 | "input": [ 961 | "url_str = \"http://elections.huffingtonpost.com/pollster/api/charts/?topic=2014-senate\"" 962 | ], 963 | "language": "python", 964 | "metadata": {}, 965 | "outputs": [], 966 | "prompt_number": 36 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "metadata": {}, 971 | "source": [ 972 | "To list all the URLs related to the 2014 Senate races using the pollster API, we can use a list comprehension:" 973 | ] 974 | }, 975 | { 976 | "cell_type": "code", 977 | "collapsed": false, 978 | "input": [ 979 | "election_urls = [election['url'] + '.csv' for election in requests.get(url_str).json()]\n", 980 | "election_urls" 981 | ], 982 | "language": "python", 983 | "metadata": {}, 984 | "outputs": [ 985 | { 986 | "metadata": {}, 987 | "output_type": "pyout", 988 | "prompt_number": 37, 989 | "text": [ 990 | "[u'http://elections.huffingtonpost.com/pollster/2014-kentucky-senate-mcconnell-vs-grimes.csv',\n", 991 | " u'http://elections.huffingtonpost.com/pollster/2014-arkansas-senate-cotton-vs-pryor.csv',\n", 992 | " u'http://elections.huffingtonpost.com/pollster/2014-michigan-senate-land-vs-peters.csv',\n", 993 | " u'http://elections.huffingtonpost.com/pollster/2014-louisiana-senate-cassidy-vs-landrieu.csv',\n", 994 | " u'http://elections.huffingtonpost.com/pollster/2014-new-hampshire-senate-brown-vs-shaheen.csv',\n", 995 | " u'http://elections.huffingtonpost.com/pollster/2014-west-virginia-senate-capito-vs-tennant.csv',\n", 996 | " u'http://elections.huffingtonpost.com/pollster/2014-new-hampshire-senate-bass-vs-shaheen.csv',\n", 997 | " u'http://elections.huffingtonpost.com/pollster/2014-north-carolina-senate-tillis-vs-hagan.csv',\n", 998 | " u'http://elections.huffingtonpost.com/pollster/2014-virginia-senate-gillespie-vs-warner.csv',\n", 999 | " u'http://elections.huffingtonpost.com/pollster/2014-colorado-senate-gardner-vs-udall.csv',\n", 1000 | " u'http://elections.huffingtonpost.com/pollster/2014-illinois-senate-oberweis-vs-durbin.csv',\n", 1001 | " u'http://elections.huffingtonpost.com/pollster/2014-alaska-senate-sullivan-vs-begich.csv',\n", 1002 | " u'http://elections.huffingtonpost.com/pollster/2014-iowa-senate-ernst-vs-braley.csv',\n", 1003 | " u'http://elections.huffingtonpost.com/pollster/2014-mississippi-senate-cochran-vs-childers.csv',\n", 1004 | " u'http://elections.huffingtonpost.com/pollster/2014-oregon-senate-wehby-vs-merkley.csv',\n", 1005 | " u'http://elections.huffingtonpost.com/pollster/2014-georgia-senate-perdue-vs-nunn.csv',\n", 1006 | " u'http://elections.huffingtonpost.com/pollster/2014-louisiana-senate-sasse-vs-domina.csv',\n", 1007 | " u'http://elections.huffingtonpost.com/pollster/2014-south-dakota-senate-rounds-vs-weiland.csv',\n", 1008 | " u'http://elections.huffingtonpost.com/pollster/2014-maine-senate-collins-vs-bellows.csv',\n", 1009 | " u'http://elections.huffingtonpost.com/pollster/2014-minnesota-senate-mcfadden-vs-franken.csv',\n", 1010 | " u'http://elections.huffingtonpost.com/pollster/2014-texas-senate-cornyn-vs-alameel.csv',\n", 1011 | " u'http://elections.huffingtonpost.com/pollster/2014-south-carolina-senate-graham-vs-hutto.csv',\n", 1012 | " u'http://elections.huffingtonpost.com/pollster/2014-south-carolina-senate-scott-vs-dickerson.csv',\n", 1013 | " u'http://elections.huffingtonpost.com/pollster/2014-oklahoma-senate-inhofe-vs-silverstein.csv',\n", 1014 | " u'http://elections.huffingtonpost.com/pollster/2014-new-mexico-senate-weh-vs-udall.csv',\n", 1015 | " 
u'http://elections.huffingtonpost.com/pollster/2014-new-jersey-senate-bell-vs-booker.csv',\n", 1016 | " u'http://elections.huffingtonpost.com/pollster/2014-idaho-senate-risch-vs-mitchell.csv',\n", 1017 | " u'http://elections.huffingtonpost.com/pollster/2014-tennessee-senate-alexander-vs-ball.csv',\n", 1018 | " u'http://elections.huffingtonpost.com/pollster/2014-wyoming-senate.csv',\n", 1019 | " u'http://elections.huffingtonpost.com/pollster/2014-kansas-senate-roberts-vs-orman-vs-taylor.csv',\n", 1020 | " u'http://elections.huffingtonpost.com/pollster/2014-hawaii-senate-cavasso-vs-schatz.csv',\n", 1021 | " u'http://elections.huffingtonpost.com/pollster/2014-oklahoma-senate-lankford-vs-johnson.csv',\n", 1022 | " u'http://elections.huffingtonpost.com/pollster/2014-montana-senate-daines-vs-curtis.csv',\n", 1023 | " u'http://elections.huffingtonpost.com/pollster/2014-rhode-island-senate-zaccaria-vs-reed.csv',\n", 1024 | " u'http://elections.huffingtonpost.com/pollster/2014-massachusetts-senate-herr-vs-markey.csv',\n", 1025 | " u'http://elections.huffingtonpost.com/pollster/2014-delaware-senate-wade-vs-coons.csv']" 1026 | ] 1027 | } 1028 | ], 1029 | "prompt_number": 37 1030 | }, 1031 | { 1032 | "cell_type": "markdown", 1033 | "metadata": {}, 1034 | "source": [ 1035 | "Because there are so many Senate races, we can create a dictionary of pandas DataFrames that will be keyed by the name of the election (a string). " 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "collapsed": false, 1041 | "input": [ 1042 | "def build_frame(url):\n", 1043 | "    \"\"\"\n", 1044 | "    Returns a pandas DataFrame object containing\n", 1045 | "    the data returned from the given url\n", 1046 | "    \"\"\"\n", 1047 | "    source = requests.get(url).text\n", 1048 | "    \n", 1049 | "    # Use StringIO because pd.DataFrame.from_csv requires .read() method\n", 1050 | "    s = StringIO(source)\n", 1051 | "    \n", 1052 | "    return pd.DataFrame.from_csv(s, index_col=None).convert_objects(\n", 1053 | "            convert_dates=\"coerce\", convert_numeric=True)" 1054 | ], 1055 | "language": "python", 1056 | "metadata": {}, 1057 | "outputs": [], 1058 | "prompt_number": 38 1059 | }, 1060 | { 1061 | "cell_type": "code", 1062 | "collapsed": false, 1063 | "input": [ 1064 | "# Makes a dictionary of pandas DataFrames keyed on election string.\n", 1065 | "dfs = dict((election.split(\"/\")[-1][:-4], build_frame(election)) for election in election_urls)" 1066 | ], 1067 | "language": "python", 1068 | "metadata": {}, 1069 | "outputs": [], 1070 | "prompt_number": 39 1071 | }, 1072 | { 1073 | "cell_type": "markdown", 1074 | "metadata": {}, 1075 | "source": [ 1076 | "Show the head of the DataFrame containing the polls for the 2014 Senate race in Kentucky between McConnell and Grimes." 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "code", 1081 | "collapsed": false, 1082 | "input": [ 1083 | "#your code here" 1084 | ], 1085 | "language": "python", 1086 | "metadata": {}, 1087 | "outputs": [], 1088 | "prompt_number": 40 1089 | }, 1090 | { 1091 | "cell_type": "markdown", 1092 | "metadata": {}, 1093 | "source": [ 1094 | "#### Problem 4(b)\n", 1095 | "\n", 1096 | "For each 2014 Senate race, create a preliminary prediction of the result for that state."
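, "\n", "One simple way to sketch such a prediction (a hypothetical helper; this assumes each DataFrame has one column per candidate and a parsed `Start Date` column, as in the Kentucky example) is to average the most recent polls and report the leader:\n", "\n", "    # Hypothetical sketch: average the last five polls for two named candidates\n", "    def predict_race(df, cand1, cand2, n_polls=5):\n", "        recent = df.sort('Start Date', ascending=False).head(n_polls)\n", "        margin = recent[cand1].mean() - recent[cand2].mean()\n", "        return (cand1 if margin > 0 else cand2), margin\n", "\n", "    predict_race(dfs['2014-kentucky-senate-mcconnell-vs-grimes'], 'McConnell', 'Grimes')"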
1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "code", 1101 | "collapsed": false, 1102 | "input": [ 1103 | "#your code here" 1104 | ], 1105 | "language": "python", 1106 | "metadata": {}, 1107 | "outputs": [], 1108 | "prompt_number": 42 1109 | }, 1110 | { 1111 | "cell_type": "markdown", 1112 | "metadata": {}, 1113 | "source": [ 1114 | "# Submission Instructions\n", 1115 | "\n", 1116 | "To submit your homework, create a folder named **lastname_firstinitial_hw#** and place your IPython notebooks, data files, and any other files in this folder. Your IPython Notebooks should be completely executed with the results visible in the notebook. We should not have to run any code. Compress the folder (please use .zip compression) and submit to the CS109 dropbox in the appropriate folder. *If we cannot access your work because these directions are not followed correctly, we will not grade your work.*\n" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "collapsed": false, 1122 | "input": [], 1123 | "language": "python", 1124 | "metadata": {}, 1125 | "outputs": [] 1126 | } 1127 | ], 1128 | "metadata": {} 1129 | } 1130 | ] 1131 | } --------------------------------------------------------------------------------