├── .gitignore ├── HW0.ipynb ├── HW0_solutions.ipynb ├── HW1.ipynb ├── HW1_solutions.ipynb ├── HW2.ipynb ├── HW2_solutions.ipynb ├── HW3.ipynb ├── HW3_solutions.ipynb ├── HW4.ipynb ├── HW4_solutions.ipynb ├── HW5.ipynb ├── HW5_solutions.ipynb ├── InstructionsForAmazonEMR.ipynb ├── LICENSE ├── README.md ├── computesim.py ├── computesim2.py ├── imdb_top_10000.txt ├── labs ├── lab10 │ ├── Lab_10.ipynb │ ├── final_lab_images.pdf │ ├── svm.csv │ ├── test.csv │ ├── train.csv │ └── yeastall_public.txt ├── lab2 │ ├── Lab_2_A_Johanna.ipynb │ ├── Lab_2_A_Live.ipynb │ ├── Lab_2_A_Live_Ray_Final.ipynb │ ├── Lab_2_B.ipynb │ ├── Lab_2_B_Live.ipynb │ ├── README.md │ ├── cs109style.py │ └── custom.css ├── lab3 │ ├── Italy.png │ ├── data │ │ └── olive.csv │ ├── lab3.ipynb │ └── lab3full.ipynb ├── lab4 │ ├── Lab4.ipynb │ ├── Lab4full.ipynb │ └── data │ │ ├── US_Unemployment_Oct2012.csv │ │ ├── census_demographics.csv │ │ ├── chall-damage.png │ │ ├── chall-table.png │ │ ├── chall.txt │ │ ├── images │ │ ├── images │ │ │ ├── checks │ │ │ │ ├── th (1).jpeg │ │ │ │ ├── th (10).jpeg │ │ │ │ ├── th (11).jpeg │ │ │ │ ├── th (12).jpeg │ │ │ │ ├── th (13).jpeg │ │ │ │ ├── th (14).jpeg │ │ │ │ ├── th (15).jpeg │ │ │ │ ├── th (16).jpeg │ │ │ │ ├── th (17).jpeg │ │ │ │ ├── th (18).jpeg │ │ │ │ ├── th (19).jpeg │ │ │ │ ├── th (2).jpeg │ │ │ │ ├── th (20).jpeg │ │ │ │ ├── th (21).jpeg │ │ │ │ ├── th (22).jpeg │ │ │ │ ├── th (23).jpeg │ │ │ │ ├── th (24).jpeg │ │ │ │ ├── th (25).jpeg │ │ │ │ ├── th (26).jpeg │ │ │ │ ├── th (27).jpeg │ │ │ │ ├── th (28).jpeg │ │ │ │ ├── th (29).jpeg │ │ │ │ ├── th (3).jpeg │ │ │ │ ├── th (30).jpeg │ │ │ │ ├── th (31).jpeg │ │ │ │ ├── th (32).jpeg │ │ │ │ ├── th (33).jpeg │ │ │ │ ├── th (34).jpeg │ │ │ │ ├── th (35).jpeg │ │ │ │ ├── th (4).jpeg │ │ │ │ ├── th (5).jpeg │ │ │ │ ├── th (6).jpeg │ │ │ │ ├── th (7).jpeg │ │ │ │ ├── th (8).jpeg │ │ │ │ ├── th (9).jpeg │ │ │ │ └── th.jpeg │ │ │ └── dollars │ │ │ │ ├── th (1).jpeg │ │ │ │ ├── th (10).jpeg │ │ │ │ ├── th (11).jpeg │ │ │ │ ├── th (12).jpeg │ │ │ │ ├── th (13).jpeg │ │ │ │ ├── th (14).jpeg │ │ │ │ ├── th (15).jpeg │ │ │ │ ├── th (16).jpeg │ │ │ │ ├── th (17).jpeg │ │ │ │ ├── th (18).jpeg │ │ │ │ ├── th (19).jpeg │ │ │ │ ├── th (2).jpeg │ │ │ │ ├── th (20).jpeg │ │ │ │ ├── th (21).jpeg │ │ │ │ ├── th (22).jpeg │ │ │ │ ├── th (23).jpeg │ │ │ │ ├── th (24).jpeg │ │ │ │ ├── th (25).jpeg │ │ │ │ ├── th (26).jpeg │ │ │ │ ├── th (27).jpeg │ │ │ │ ├── th (28).jpeg │ │ │ │ ├── th (29).jpeg │ │ │ │ ├── th (3).jpeg │ │ │ │ ├── th (30).jpeg │ │ │ │ ├── th (31).jpeg │ │ │ │ ├── th (32).jpeg │ │ │ │ ├── th (33).jpeg │ │ │ │ ├── th (34).jpeg │ │ │ │ ├── th (35).jpeg │ │ │ │ ├── th (36).jpeg │ │ │ │ ├── th (37).jpeg │ │ │ │ ├── th (38).jpeg │ │ │ │ ├── th (39).jpeg │ │ │ │ ├── th (4).jpeg │ │ │ │ ├── th (40).jpeg │ │ │ │ ├── th (41).jpeg │ │ │ │ ├── th (42).jpeg │ │ │ │ ├── th (43).jpeg │ │ │ │ ├── th (44).jpeg │ │ │ │ ├── th (45).jpeg │ │ │ │ ├── th (46).jpeg │ │ │ │ ├── th (47).jpeg │ │ │ │ ├── th (48).jpeg │ │ │ │ ├── th (49).jpeg │ │ │ │ ├── th (5).jpeg │ │ │ │ ├── th (50).jpeg │ │ │ │ ├── th (6).jpeg │ │ │ │ ├── th (7).jpeg │ │ │ │ ├── th (8).jpeg │ │ │ │ ├── th (9).jpeg │ │ │ │ └── th.jpeg │ │ └── query_bing_images.py │ │ ├── myclusters.csv │ │ ├── partisan_voting.csv │ │ ├── pcavsfit.png │ │ └── shuttle.png ├── lab5 │ ├── Lab5.ipynb │ └── data │ │ ├── bias-variance-error.png │ │ ├── lc-hb.png │ │ ├── lc-hv.png │ │ ├── olive.csv │ │ └── reg-bias-variance.png ├── lab6 │ ├── BayesLinear.ipynb │ └── _multivariate.py ├── lab7 │ └── 
GibbsSampler.ipynb ├── lab8 │ ├── anagrams.py │ ├── baseball_friends.csv │ ├── friend_affiliations.py │ ├── generate_friends.py │ ├── lab8_mapreduce.ipynb │ ├── most_used_word.py │ ├── names.txt │ ├── word_count.py │ └── word_list.txt └── lab9 │ ├── lab_9.ipynb │ ├── lab_9_with_answers.ipynb │ └── linkedin_alexander_lex.csv ├── lec_03_statistical_graphs.ipynb ├── lec_03_statistical_graphs_mpl_default.ipynb ├── lec_04_scraping.ipynb ├── lec_04_wrangling.ipynb ├── lec_10_cross_val.ipynb ├── matplotlib_examples ├── geographic_plots.ipynb ├── imdb.tsv ├── scatter_plots.ipynb └── torn.csv └── skeleton.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | 37 | #OSX 38 | __MACOSX/ 39 | .DS_Store 40 | 41 | #Ipython 42 | .ipynb_checkpoints/ 43 | -------------------------------------------------------------------------------- /HW0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "source": [ 12 | "# Homework 0\n", 13 | "\n", 14 | "### Due Tuesday, September 10 (but no submission is required)\n", 15 | "\n", 16 | "---\n", 17 | "\n", 18 | "Welcome to CS109 / STAT121 / AC209 / E-109 (http://cs109.org/). In this class, we will be using a variety of tools that will require some initial configuration. To ensure everything goes smoothly moving forward, we will set up the majority of those tools in this homework. While some of this will likely be dull, doing it now will enable us to do more exciting work in the weeks that follow without getting bogged down in further software configuration. This homework will not be graded; however, it is essential that you complete it promptly, since it will enable us to set up your accounts. You do not have to hand anything in, with the exception of filling out the online survey. \n", 19 | "\n", 20 | "## Class Survey, Piazza, and Introduction\n", 21 | "\n", 22 | "**Class Survey**\n", 23 | "\n", 24 | "Please complete the mandatory course survey located [here](https://docs.google.com/spreadsheet/viewform?formkey=dFg1ZFJwLWJ6ZWhWR1JJb0tES3lGMEE6MA#gid=0). It should only take a few moments of your time. Once you fill in the survey we will sign you up to the course forum on Piazza and the dropbox system that you will use to hand in the homework. It is imperative that you fill out the survey on time as we use the provided information to sign you up for these services. \n", 25 | "\n", 26 | "**Piazza**\n", 27 | "\n", 28 | "Go to [Piazza](https://piazza.com/harvard/fall2013/cs109/home) and sign up for the class using your Harvard e-mail address. \n", 29 | "\n", 30 | "You will use Piazza as a forum for discussion, to find team members, to arrange appointments, and to ask questions. Piazza should be your primary form of communication with the staff. Use the staff e-mail (staff@cs109.org) only for individual requests, e.g., to excuse yourself from a mandatory guest lecture. 
All readings, homeworks, and project descriptions will be announced on Piazza first. \n", 31 | "\n", 32 | "**Introduction**\n", 33 | "\n", 34 | "Once you are signed up to the Piazza course forum, introduce yourself to your classmates and course staff with a follow-up post in the introduction thread. Include your name/nickname, your affiliation, why you are taking this course, and tell us something interesting about yourself (e.g., an industry job, an unusual hobby, past travels, or a cool project you did, etc.). Also tell us whether you have experience with data science. \n", 35 | "\n", 36 | "## Programming expectations\n", 37 | "\n", 38 | "All the assignments and labs for this class will use Python and, for the most part, the browser-based IPython notebook format you are currently viewing. Knowledge of Python is not a prerequisite for this course, **provided you are comfortable learning on your own as needed**. While we have strived to make the programming component of this course straightforward, we will not devote much time to teaching programming or Python syntax. Basically, you should feel comfortable with:\n", 39 | "\n", 40 | "* How to look up Python syntax on Google and StackOverflow.\n", 41 | "* Basic programming concepts like functions, loops, arrays, dictionaries, strings, and if statements.\n", 42 | "* How to learn new libraries by reading documentation.\n", 43 | "* Asking questions on StackOverflow or Piazza.\n", 44 | "\n", 45 | "There are many online tutorials to introduce you to scientific Python programming. [Here is one](https://github.com/jrjohansson/scientific-python-lectures) that is very nice. Lectures 1-4 are most relevant to this class.\n", 46 | "\n", 47 | "## Getting Python\n", 48 | "\n", 49 | "You will be using Python throughout the course, including many popular 3rd party Python libraries for scientific computing. [Anaconda](http://continuum.io/downloads) is an easy-to-install bundle of Python and most of these libraries. We recommend that you use Anaconda for this course.\n", 50 | "\n", 51 | "Please visit [this page](https://github.com/cs109/content/wiki/Installing-Python) and follow the instructions to set up Python.\n", 52 | "\n", 53 | "\n", 54 | "\n", 55 | "## Hello, Python\n", 56 | "\n", 57 | "The IPython notebook is an application to build interactive computational notebooks. You'll be using them to complete labs and homework. Once you've set up Python, please download this page, and open it with IPython by typing\n", 58 | "\n", 59 | "```\n", 60 | "ipython notebook \n", 61 | "```\n", 62 | "\n", 63 | "For the rest of the assignment, use your local copy of this page, running on IPython.\n", 64 | "\n", 65 | "Notebooks are composed of many \"cells\", which can contain text (like this one), or code (like the one below). Double click on the cell below, and evaluate it by clicking the \"play\" button above, or by hitting shift + enter" 66 | ], 67 | "cell_type": "markdown", 68 | "metadata": {} 69 | }, 70 | { 71 | "cell_type": "code", 72 | "language": "python", 73 | "outputs": [], 74 | "collapsed": false, 75 | "prompt_number": 1, 76 | "input": [ 77 | "x = [10, 20, 30, 40, 50]\n", 78 | "for item in x:\n", 79 | " print \"Item is \", item" 80 | ], 81 | "metadata": {} 82 | }, 83 | { 84 | "source": [ 85 | "## Python Libraries\n", 86 | "\n", 87 | "We will be using several different libraries throughout this course. 
If you've successfully completed the [installation instructions](https://github.com/cs109/content/wiki/Installing-Python), all of the following statements should run." 88 | ], 89 | "cell_type": "markdown", 90 | "metadata": {} 91 | }, 92 | { 93 | "cell_type": "code", 94 | "language": "python", 95 | "outputs": [], 96 | "collapsed": false, 97 | "prompt_number": 2, 98 | "input": [ 99 | "#IPython is what you are using now to run the notebook\n", 100 | "import IPython\n", 101 | "print \"IPython version: %6.6s (need at least 1.0)\" % IPython.__version__\n", 102 | "\n", 103 | "# Numpy is a library for working with Arrays\n", 104 | "import numpy as np\n", 105 | "print \"Numpy version: %6.6s (need at least 1.7.1)\" % np.__version__\n", 106 | "\n", 107 | "# SciPy implements many different numerical algorithms\n", 108 | "import scipy as sp\n", 109 | "print \"SciPy version: %6.6s (need at least 0.12.0)\" % sp.__version__\n", 110 | "\n", 111 | "# Pandas makes working with data tables easier\n", 112 | "import pandas as pd\n", 113 | "print \"Pandas version: %6.6s (need at least 0.11.0)\" % pd.__version__\n", 114 | "\n", 115 | "# Module for plotting\n", 116 | "import matplotlib\n", 117 | "print \"Matplotlib version: %6.6s (need at least 1.2.1)\" % matplotlib.__version__\n", 118 | "\n", 119 | "# SciKit Learn implements several Machine Learning algorithms\n", 120 | "import sklearn\n", 121 | "print \"Scikit-Learn version: %6.6s (need at least 0.13.1)\" % sklearn.__version__\n", 122 | "\n", 123 | "# Requests is a library for getting data from the Web\n", 124 | "import requests\n", 125 | "print \"requests version: %6.6s (need at least 1.2.3)\" % requests.__version__\n", 126 | "\n", 127 | "# Networkx is a library for working with networks\n", 128 | "import networkx as nx\n", 129 | "print \"NetworkX version: %6.6s (need at least 1.7)\" % nx.__version__\n", 130 | "\n", 131 | "#BeautifulSoup is a library to parse HTML and XML documents\n", 132 | "import BeautifulSoup\n", 133 | "print \"BeautifulSoup version: %6.6s (need at least 3.2)\" % BeautifulSoup.__version__\n", 134 | "\n", 135 | "#MrJob is a library to run map reduce jobs on Amazon's computers\n", 136 | "import mrjob\n", 137 | "print \"Mr Job version: %6.6s (need at least 0.4)\" % mrjob.__version__\n", 138 | "\n", 139 | "#Pattern has lots of tools for working with data from the internet\n", 140 | "import pattern\n", 141 | "print \"Pattern version: %6.6s (need at least 2.6)\" % pattern.__version__" 142 | ], 143 | "metadata": {} 144 | }, 145 | { 146 | "source": [ 147 | "If any of these libraries are missing or out of date, you will need to [install them](https://github.com/cs109/content/wiki/Installing-Python#installing-additional-libraries) and restart IPython." 148 | ], 149 | "cell_type": "markdown", 150 | "metadata": {} 151 | }, 152 | { 153 | "source": [ 154 | "## Hello matplotlib" 155 | ], 156 | "cell_type": "markdown", 157 | "metadata": {} 158 | }, 159 | { 160 | "source": [ 161 | "The notebook integrates nicely with Matplotlib, the primary plotting package for Python. 
This should embed a figure of a sine wave:" 162 | ], 163 | "cell_type": "markdown", 164 | "metadata": {} 165 | }, 166 | { 167 | "cell_type": "code", 168 | "language": "python", 169 | "outputs": [], 170 | "collapsed": false, 171 | "prompt_number": 3, 172 | "input": [ 173 | "#this line prepares IPython for working with matplotlib\n", 174 | "%matplotlib inline \n", 175 | "\n", 176 | "# this actually imports matplotlib\n", 177 | "import matplotlib.pyplot as plt \n", 178 | "\n", 179 | "x = np.linspace(0, 10, 30) #array of 30 points from 0 to 10\n", 180 | "y = np.sin(x)\n", 181 | "z = y + np.random.normal(size=30) * .2\n", 182 | "plt.plot(x, y, 'ro-', label='A sine wave')\n", 183 | "plt.plot(x, z, 'b-', label='Noisy sine')\n", 184 | "plt.legend(loc = 'lower right')\n", 185 | "plt.xlabel(\"X axis\")\n", 186 | "plt.ylabel(\"Y axis\") " 187 | ], 188 | "metadata": {} 189 | }, 190 | { 191 | "source": [ 192 | "If that last cell complained about the `%matplotlib` line, you need to update IPython to v1.0, and restart the notebook. See the [installation page](https://github.com/cs109/content/wiki/Installing-Python)" 193 | ], 194 | "cell_type": "markdown", 195 | "metadata": {} 196 | }, 197 | { 198 | "source": [ 199 | "## Hello Numpy\n", 200 | "\n", 201 | "The Numpy array processing library is the basis of nearly all numerical computing in Python. Here's a 30-second crash course. For more details, consult Chapter 4 of Python for Data Analysis, or the [Numpy User's Guide](http://docs.scipy.org/doc/numpy-dev/user/index.html)" 202 | ], 203 | "cell_type": "markdown", 204 | "metadata": {} 205 | }, 206 | { 207 | "cell_type": "code", 208 | "language": "python", 209 | "outputs": [], 210 | "collapsed": false, 211 | "prompt_number": 4, 212 | "input": [ 213 | "print \"Make a 3 row x 4 column array of random numbers\"\n", 214 | "x = np.random.random((3, 4))\n", 215 | "print x\n", 216 | "print\n", 217 | "\n", 218 | "print \"Add 1 to every element\"\n", 219 | "x = x + 1\n", 220 | "print x\n", 221 | "print\n", 222 | "\n", 223 | "print \"Get the element at row 1, column 2\"\n", 224 | "print x[1, 2]\n", 225 | "print\n", 226 | "\n", 227 | "# The colon syntax is called \"slicing\" the array. \n", 228 | "print \"Get the first row\"\n", 229 | "print x[0, :]\n", 230 | "print\n", 231 | "\n", 232 | "print \"Get every 2nd column of the first row\"\n", 233 | "print x[0, ::2]\n", 234 | "print" 235 | ], 236 | "metadata": {} 237 | }, 238 | { 239 | "source": [ 240 | "Print the maximum, minimum, and mean of the array. This does **not** require writing a loop. In the code cell below, type `x.m` and hit the TAB key to find built-in operations for common array statistics like this" 241 | ], 242 | "cell_type": "markdown", 243 | "metadata": {} 244 | }, 245 | { 246 | "cell_type": "code", 247 | "language": "python", 248 | "outputs": [], 249 | "collapsed": false, 250 | "prompt_number": 5, 251 | "input": [ 252 | "#your code here\n" 253 | ], 254 | "metadata": {} 255 | }, 256 | { 257 | "source": [ 258 | "Call the `x.max` function again, but use the `axis` keyword to print the maximum of each row in x.\n",
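"\n",
"(A quick reminder of how the `axis` keyword works -- shown here with `sum`, so the `max` exercise is still yours to solve: `axis=0` aggregates down the columns of a 2-d array, and `axis=1` aggregates across its rows.)\n",
"\n",
"```\n",
"x = np.arange(12).reshape(3, 4)\n",
"x.sum(axis=0)   # column sums -> array([12, 15, 18, 21])\n",
"x.sum(axis=1)   # row sums    -> array([ 6, 22, 38])\n",
"```"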
259 | ], 260 | "cell_type": "markdown", 261 | "metadata": {} 262 | }, 263 | { 264 | "cell_type": "code", 265 | "language": "python", 266 | "outputs": [], 267 | "collapsed": false, 268 | "prompt_number": 6, 269 | "input": [ 270 | "#your code here\n" 271 | ], 272 | "metadata": {} 273 | }, 274 | { 275 | "source": [ 276 | "Here's a way to quickly simulate 500 \"fair\" coin tosses (where the probability of getting Heads is 50%, or 0.5)" 277 | ], 278 | "cell_type": "markdown", 279 | "metadata": {} 280 | }, 281 | { 282 | "cell_type": "code", 283 | "language": "python", 284 | "outputs": [], 285 | "collapsed": false, 286 | "prompt_number": 7, 287 | "input": [ 288 | "x = np.random.binomial(500, .5)\n", 289 | "print \"number of heads:\", x" 290 | ], 291 | "metadata": {} 292 | }, 293 | { 294 | "source": [ 295 | "Repeat this simulation 500 times, and use the [plt.hist() function](http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.hist) to plot a histogram of the number of Heads (1s) in each simulation" 296 | ], 297 | "cell_type": "markdown", 298 | "metadata": {} 299 | }, 300 | { 301 | "cell_type": "code", 302 | "language": "python", 303 | "outputs": [], 304 | "collapsed": false, 305 | "prompt_number": 8, 306 | "input": [ 307 | "#your code here\n" 308 | ], 309 | "metadata": {} 310 | }, 311 | { 312 | "source": [ 313 | "## The Monty Hall Problem\n", 314 | "\n", 315 | "\n", 316 | "Here's a fun and perhaps surprising statistical riddle, and a good way to get some practice writing Python functions\n", 317 | "\n", 318 | "In a game show, contestants try to guess which of 3 closed doors contains a cash prize (goats are behind the other two doors). Of course, the odds of choosing the correct door are 1 in 3. As a twist, the host of the show occasionally opens a door after a contestant makes his or her choice. This door is always one of the two the contestant did not pick, and is also always one of the goat doors (note that it is always possible to do this, since there are two goat doors). At this point, the contestant has the option of keeping his or her original choice, or switching to the other unopened door. The question is: is there any benefit to switching doors? The answer surprises many people who haven't heard the question before.\n", 319 | "\n", 320 | "We can answer the problem by running simulations in Python. We'll do it in several parts.\n", 321 | "\n",
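"One handy NumPy building block for the functions below (a hint, not a requirement -- any correct approach is fine): `np.random.randint(0, 3, size=5)` draws five random integers from {0, 1, 2}, e.g. `array([0, 2, 1, 1, 0])`.\n",
"\n",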
322 | "First, write a function called `simulate_prizedoor`. This function will simulate the location of the prize in many games -- see the detailed specification below:" 323 | ], 324 | "cell_type": "markdown", 325 | "metadata": {} 326 | }, 327 | { 328 | "cell_type": "code", 329 | "language": "python", 330 | "outputs": [], 331 | "collapsed": false, 332 | "prompt_number": 9, 333 | "input": [ 334 | "\"\"\"\n", 335 | "Function\n", 336 | "--------\n", 337 | "simulate_prizedoor\n", 338 | "\n", 339 | "Generate a random array of 0s, 1s, and 2s, representing\n", 340 | "hiding a prize behind door 0, door 1, or door 2\n", 341 | "\n", 342 | "Parameters\n", 343 | "----------\n", 344 | "nsim : int\n", 345 | " The number of simulations to run\n", 346 | "\n", 347 | "Returns\n", 348 | "-------\n", 349 | "sims : array\n", 350 | " Random array of 0s, 1s, and 2s\n", 351 | "\n", 352 | "Example\n", 353 | "-------\n", 354 | ">>> print simulate_prizedoor(3)\n", 355 | "array([0, 0, 2])\n", 356 | "\"\"\"\n", 357 | "def simulate_prizedoor(nsim):\n", 358 | " #compute here\n", 359 | " return answer\n", 360 | "#your code here\n" 361 | ], 362 | "metadata": {} 363 | }, 364 | { 365 | "source": [ 366 | "Next, write a function that simulates the contestant's guesses for `nsim` simulations. Call this function `simulate_guess`. The specs:" 367 | ], 368 | "cell_type": "markdown", 369 | "metadata": {} 370 | }, 371 | { 372 | "cell_type": "code", 373 | "language": "python", 374 | "outputs": [], 375 | "collapsed": false, 376 | "prompt_number": 10, 377 | "input": [ 378 | "\"\"\"\n", 379 | "Function\n", 380 | "--------\n", 381 | "simulate_guess\n", 382 | "\n", 383 | "Return any strategy for guessing which door a prize is behind. This\n", 384 | "could be a random strategy, one that always guesses 2, whatever.\n", 385 | "\n", 386 | "Parameters\n", 387 | "----------\n", 388 | "nsim : int\n", 389 | " The number of simulations to generate guesses for\n", 390 | "\n", 391 | "Returns\n", 392 | "-------\n", 393 | "guesses : array\n", 394 | " An array of guesses. Each guess is a 0, 1, or 2\n", 395 | "\n", 396 | "Example\n", 397 | "-------\n", 398 | ">>> print simulate_guess(5)\n", 399 | "array([0, 0, 0, 0, 0])\n", 400 | "\"\"\"\n", 401 | "#your code here\n" 402 | ], 403 | "metadata": {} 404 | }, 405 | { 406 | "source": [ 407 | "Next, write a function, `goat_door`, to simulate randomly revealing one of the goat doors that a contestant didn't pick." 408 | ], 409 | "cell_type": "markdown", 410 | "metadata": {} 411 | }, 412 | { 413 | "cell_type": "code", 414 | "language": "python", 415 | "outputs": [], 416 | "collapsed": false, 417 | "prompt_number": 11, 418 | "input": [ 419 | "\"\"\"\n", 420 | "Function\n", 421 | "--------\n", 422 | "goat_door\n", 423 | "\n", 424 | "Simulate the opening of a \"goat door\" that doesn't contain the prize,\n", 425 | "and is different from the contestant's guess\n", 426 | "\n", 427 | "Parameters\n", 428 | "----------\n", 429 | "prizedoors : array\n", 430 | " The door that the prize is behind in each simulation\n", 431 | "guesses : array\n", 432 | " The door that the contestant guessed in each simulation\n", 433 | "\n", 434 | "Returns\n", 435 | "-------\n", 436 | "goats : array\n", 437 | " The goat door that is opened for each simulation. 
Each item is 0, 1, or 2, and is different\n", 438 | " from both prizedoors and guesses\n", 439 | "\n", 440 | "Examples\n", 441 | "--------\n", 442 | ">>> print goat_door(np.array([0, 1, 2]), np.array([1, 1, 1]))\n", 443 | ">>> array([2, 2, 0])\n", 444 | "\"\"\"\n", 445 | "#your code here\n" 446 | ], 447 | "metadata": {} 448 | }, 449 | { 450 | "source": [ 451 | "Write a function, `switch_guess`, that represents the strategy of always switching a guess after the goat door is opened." 452 | ], 453 | "cell_type": "markdown", 454 | "metadata": {} 455 | }, 456 | { 457 | "cell_type": "code", 458 | "language": "python", 459 | "outputs": [], 460 | "collapsed": false, 461 | "prompt_number": 12, 462 | "input": [ 463 | "\"\"\"\n", 464 | "Function\n", 465 | "--------\n", 466 | "switch_guess\n", 467 | "\n", 468 | "The strategy that always switches a guess after the goat door is opened\n", 469 | "\n", 470 | "Parameters\n", 471 | "----------\n", 472 | "guesses : array\n", 473 | " Array of original guesses, for each simulation\n", 474 | "goatdoors : array\n", 475 | " Array of revealed goat doors for each simulation\n", 476 | "\n", 477 | "Returns\n", 478 | "-------\n", 479 | "The new door after switching. Should be different from both guesses and goatdoors\n", 480 | "\n", 481 | "Examples\n", 482 | "--------\n", 483 | ">>> print switch_guess(np.array([0, 1, 2]), np.array([1, 2, 1]))\n", 484 | ">>> array([2, 0, 0])\n", 485 | "\"\"\"\n", 486 | "#your code here\n" 487 | ], 488 | "metadata": {} 489 | }, 490 | { 491 | "source": [ 492 | "Last function: write a `win_percentage` function that takes an array of `guesses` and `prizedoors`, and returns the percent of correct guesses" 493 | ], 494 | "cell_type": "markdown", 495 | "metadata": {} 496 | }, 497 | { 498 | "cell_type": "code", 499 | "language": "python", 500 | "outputs": [], 501 | "collapsed": false, 502 | "prompt_number": 13, 503 | "input": [ 504 | "\"\"\"\n", 505 | "Function\n", 506 | "--------\n", 507 | "win_percentage\n", 508 | "\n", 509 | "Calculate the percent of times that a simulation of guesses is correct\n", 510 | "\n", 511 | "Parameters\n", 512 | "-----------\n", 513 | "guesses : array\n", 514 | " Guesses for each simulation\n", 515 | "prizedoors : array\n", 516 | " Location of prize for each simulation\n", 517 | "\n", 518 | "Returns\n", 519 | "--------\n", 520 | "percentage : number between 0 and 100\n", 521 | " The win percentage\n", 522 | "\n", 523 | "Examples\n", 524 | "---------\n", 525 | ">>> print win_percentage(np.array([0, 1, 2]), np.array([0, 0, 0]))\n", 526 | "33.333\n", 527 | "\"\"\"\n", 528 | "#your code here\n" 529 | ], 530 | "metadata": {} 531 | }, 532 | { 533 | "source": [ 534 | "Now, put it together. Simulate 10000 games where the contestant keeps his or her original guess, and 10000 games where the contestant switches his or her door after a goat door is revealed. Compute the percentage of the time the contestant wins under either strategy. Is one strategy better than the other?" 535 | ], 536 | "cell_type": "markdown", 537 | "metadata": {} 538 | }, 539 | { 540 | "cell_type": "code", 541 | "language": "python", 542 | "outputs": [], 543 | "collapsed": false, 544 | "prompt_number": 14, 545 | "input": [ 546 | "#your code here\n" 547 | ], 548 | "metadata": {} 549 | }, 550 | { 551 | "source": [ 552 | "Many people find this answer counter-intuitive (famously, PhD mathematicians have incorrectly claimed the result must be wrong. Clearly, none of them knew Python). 
\n", 553 | "\n", 554 | "One of the best ways to build intuition about why opening a goat door affects the odds is to re-run the experiment with 100 doors and one prize. If the game show host opens 98 goat doors after you make your initial selection, would you want to keep your first pick or switch? Can you generalize your simulation code to handle the case of `n` doors?" 555 | ], 556 | "cell_type": "markdown", 557 | "metadata": {} 558 | } 559 | ], 560 | "metadata": {} 561 | } 562 | ] 563 | } -------------------------------------------------------------------------------- /InstructionsForAmazonEMR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#Instructions for Amazon Setup" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Getting an Amazon account and your credits:\n", 22 | "\n", 23 | "For the class, Amazon will be providing each of you with $100 of free AWS credits. \n", 24 | "\n", 25 | "First, you must register for an AWS account, during which you will be required to enter your own personal credit card information. Once you have registered, we will provide you with a $100 credit code. It is important that you understand that once the provided credit code is used up, your credit card will be charged for any additional AWS usage, so be sure to keep track of your usage.\n", 26 | "\n", 27 | "The following steps will guide you through the registration process:\n", 28 | "\n", 29 | "1. [Sign up for AWS](https://aws-portal.amazon.com/gp/aws/developer/registration/index.html) using your personal Amazon account or by creating a new AWS account.\n", 30 | "2. After signing up for AWS, [sign up for EC2](https://aws-portal.amazon.com/gp/aws/developer/subscription/index.html?productCode=AmazonEC2), which will include registration for Elastic MapReduce and other similar services. Some of these other services may carry a cost if you decide to use them for your own personal use.\n", 31 | "3. Wait for an email from us with your AWS credit code.\n", 32 | "4. Log in to your AWS Account page. Click Payment Method. At the bottom of the page, click Redeem/View AWS Credits. Then, enter your code and click redeem.\n", 33 | "5. As mentioned in class, you may want to set up a [billing alert](https://portal.aws.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=billing-alerts&) using this link.\n", 34 | "\n", 35 | "You can manage your account via the [AWS Console](https://aws.amazon.com/console).\n", 36 | "\n", 37 | "####Get set up to run mrjob on EMR\n", 38 | "\n", 39 | "You can find out more about MRJob [here](http://packages.python.org/mrjob/index.html).\n", 40 | "\n", 41 | "To set yourself up to use `mrjob` on Amazon, after getting your Amazon credits and setting up an AWS account, read the following [QuickStart](http://pythonhosted.org/mrjob/guides/emr-quickstart.html). If you follow the instructions in there, you should have set up your access key, optionally set up ssh tunnel access, and written your access keys to the ~/.mrjob.conf file. You could also set\n", 42 | "\n", 43 | " MRJOB_CONF=/home/you/yourpath/fileName.txt\n", 44 | " \n", 45 | "with the appropriate syntax in bash/csh/zsh/command.exe.\n", 46 | "\n", 47 | "Use Region `us-east-1` when prompted for choosing a region. 
This might sometimes show up as Virginia. It's OK to use another one, but beware that if you use different regions at different times, you might forget to make sure your services are shut down, and you will then incur a cost.\n", 48 | "\n", 49 | "**Note**: Just a reminder, with these keys ANYONE can send a job to Amazon under your guise (and you will be charged). It should be fairly obvious that you therefore do not want to distribute these keys. If at any time your keys are compromised, you can log into [your account](http://aws.amazon.com/account), click on \"Security Credentials\", create a new pair, and deactivate the current pair.\n", 50 | "\n", 51 | "If you decide to use AWS for your final project, a configuration file is preferable, to avoid repeated reconfiguration. However, you can also use the command line to configure MRJob. \n", 52 | "\n", 53 | "Type the following two commands in your terminal:\n", 54 | "\n", 55 | "* export AWS_ACCESS_KEY_ID=xxxxxx\n", 56 | "* export AWS_SECRET_ACCESS_KEY=yyyyyy\n", 57 | "\n", 58 | "where xxxxxx and yyyyyy are your Access Key ID and Secret Access Key, respectively (or use the Windows or csh equivalents).\n", 59 | "\n", 60 | "By default, a single \u201csmall standard on-demand\u201d instance will be used for computation. However, these settings can be modified via any of the previously mentioned configuration methods using the \u201cec2_instance_type\u201d and \u201cnum_ec2_instances\u201d flags. See [here](http://packages.python.org/mrjob/configs-runners.html#on-emr) for more details on these flags as well as others. \n", 61 | "\n", 62 | "### Testing:\n", 63 | "\n", 64 | "\n", 65 | "At this point it is a good idea to try running the scripts at [the mrjob quickstart](http://pythonhosted.org/mrjob/guides/quickstart.html). Note that EMR is billed by the hour, so run as many tests as you can (or as much of your code as you can) in batches of 1 hour, so you can have more credits left over for your own future use.\n", 66 | "\n", 67 | "\n", 68 | "**Important**: Please always make sure that your code is bug-free before actually submitting it to Amazon. Try to run the job locally first and see if it produces the desired result. Then, if this worked, you are ready to proceed to the cloud. The homework problems are small and your free credit should provide you with a lot of room for running and testing on Amazon. However, it is your responsibility to make sure the jobs terminate properly and do not cause excessive costs. You can always monitor your currently running jobs using [this overview at region US-EAST-1](https://console.aws.amazon.com/elasticmapreduce/home?region=us-east-1) of your MapReduce job flows.\n",
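"\n",
"As a concrete illustration of the configuration file discussed above, here is a minimal sketch of what `~/.mrjob.conf` might contain (YAML; xxxxxx and yyyyyy are the same placeholders as before, to be replaced with your own keys):\n",
"\n",
"```\n",
"runners:\n",
"  emr:\n",
"    aws_access_key_id: xxxxxx\n",
"    aws_secret_access_key: yyyyyy\n",
"```"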
69 | ] 70 | } 71 | ], 72 | "metadata": {} 73 | } 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Harvard CS 109: Data Science 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Welcome to CS109: Data Science 2 | ======= 3 | 4 | ## Assignments 5 | 6 | * [Homework 0](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW0.ipynb): Hello, world ([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW0_solutions.ipynb)) 7 | * [Homework 1](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW1.ipynb): Which of two things is larger? ([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW1_solutions.ipynb)) 8 | * [Homework 2](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW2.ipynb): Desperately Seeking Silver ([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW2_solutions.ipynb)) 9 | * [Homework 3](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW3.ipynb): Bayesian Tomatoes ([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW3_solutions.ipynb)) 10 | * [Homework 4](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW4.ipynb): Do We Really Need Chocolate Recommendations? 
([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW4_solutions.ipynb)) 11 | * [Homework 5](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW5.ipynb): Networks and Congress ([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW5_solutions.ipynb)) 12 | 13 | ## Lecture Supplements 14 | 15 | * [A gallery of statistical graphs with matplotlib](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/lec_03_statistical_graphs.ipynb) (see also the version with [default matplotlib styles](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/lec_03_statistical_graphs_mpl_default.ipynb)) 16 | * [A rubric for data wrangling and exploratory data analysis](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/lec_04_wrangling.ipynb) 17 | * [Web Scraping and Parsing Demo](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/lec_04_scraping.ipynb) 18 | * [Cross Validation: The Right and Wrong Way](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/lec_10_cross_val.ipynb) 19 | 20 | ## Labs 21 | 22 | * [Lab 2: Web Scraping](https://github.com/cs109/content/tree/master/labs/lab2) 23 | * [Lab 3: EDA, Pandas, Matplotlib](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab3/lab3full.ipynb) 24 | * [Lab 4: Scikit-Learn, Regression, PCA](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab4/Lab4full.ipynb) 25 | * [Lab 5: Bias, Variance, Cross-Validation](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab5/Lab5.ipynb) 26 | * [Lab 6: Bayes, Linear Regression, and Metropolis Sampling](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab6/BayesLinear.ipynb) 27 | * [Lab 7: Gibbs Sampling](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab7/GibbsSampler.ipynb) 28 | * [Lab 8: MapReduce](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab8/lab8_mapreduce.ipynb) 29 | * [Lab 9: Networks](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab9/lab_9.ipynb) 30 | * [Lab 10: Support Vector Machines](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab10/Lab_10.ipynb) 31 | 32 | 33 | ## Other Resources 34 | 35 | * [Setting up Python](https://github.com/cs109/content/wiki/Installing-Python) 36 | -------------------------------------------------------------------------------- /computesim.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mrjob.job import MRJob 4 | from itertools import combinations, permutations 5 | from math import sqrt 6 | 7 | from scipy.stats.stats import pearsonr 8 | 9 | class RestaurantSimilarities(MRJob): 10 | 11 | def steps(self): 12 | thesteps = [ 13 | self.mr(mapper=self.line_mapper, reducer=self.users_items_collector), 14 | self.mr(mapper=self.pair_items_mapper, reducer=self.calc_sim_collector) 15 | ] 16 | return thesteps 17 | 18 | def line_mapper(self,_,line): 19 | user_id,business_id,stars,business_avg,user_avg=line.split(',') 20 | yield user_id, (business_id,stars,business_avg,user_avg) 21 | 22 | def users_items_collector(self, user_id, values): 23 | ratings=[] 24 | for business_id,stars,business_avg,user_avg in values: 25 | ratings.append((business_id,(stars, user_avg))) 26 | yield user_id, ratings 27 | 28 | def pair_items_mapper(self, user_id, values): 29 | ratings = 
values 30 | for biz1tuple, biz2tuple in combinations(ratings, 2): 31 | biz1, biz1r=biz1tuple 32 | biz2, biz2r=biz2tuple 33 | if biz1 <= biz2 : 34 | yield (biz1, biz2), (biz1r, biz2r) 35 | else: 36 | yield (biz2, biz1), (biz2r, biz1r) 37 | 38 | def calc_sim_collector(self, key, values): 39 | (rest1, rest2), common_ratings = key, values 40 | diff1=[] 41 | diff2=[] 42 | n_common=0 43 | 44 | 45 | for rt1, rt2 in common_ratings: 46 | diff1.append(float(rt1[0])-float(rt1[1])) 47 | diff2.append(float(rt2[0])-float(rt2[1])) 48 | n_common=n_common+1 49 | if n_common==0: 50 | rho=0. 51 | else: 52 | rho=pearsonr(diff1, diff2)[0] 53 | if np.isnan(rho): 54 | rho=0. 55 | yield (rest1, rest2), (rho, n_common) 56 | 57 | 58 | #Below MUST be there for things to work! 59 | if __name__ == '__main__': 60 | RestaurantSimilarities.run() 61 | -------------------------------------------------------------------------------- /computesim2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mrjob.job import MRJob 4 | from itertools import combinations, permutations 5 | from math import sqrt 6 | import mrjob 7 | 8 | from scipy.stats.stats import pearsonr 9 | 10 | class RestaurantSimilarities(MRJob): 11 | 12 | def steps(self): 13 | thesteps = [ 14 | self.mr(mapper=self.line_mapper, reducer=self.users_items_collector), 15 | self.mr(mapper=self.pair_items_mapper, reducer=self.calc_sim_collector), 16 | self.mr(mapper=self.ranking_mapper, reducer=self.top_similar_collector) 17 | ] 18 | return thesteps 19 | 20 | def line_mapper(self,_,line): 21 | user_id,business_id,stars,business_avg,user_avg=line.split(',') 22 | yield user_id, (business_id,stars,business_avg,user_avg) 23 | 24 | def users_items_collector(self, user_id, values): 25 | ratings=[] 26 | for business_id,stars,business_avg,user_avg in values: 27 | ratings.append((business_id,(stars, user_avg))) 28 | yield user_id, ratings 29 | 30 | def pair_items_mapper(self, user_id, values): 31 | ratings = values 32 | for biz1tuple, biz2tuple in combinations(ratings, 2): 33 | biz1, biz1r=biz1tuple 34 | biz2, biz2r=biz2tuple 35 | if biz1 <= biz2 : 36 | yield (biz1, biz2), (biz1r, biz2r) 37 | else: 38 | yield (biz2, biz1), (biz2r, biz1r) 39 | 40 | def calc_sim_collector(self, key, values): 41 | (rest1, rest2), common_ratings = key, values 42 | diff1=[] 43 | diff2=[] 44 | n_common=0 45 | 46 | 47 | for rt1, rt2 in common_ratings: 48 | diff1.append(float(rt1[0])-float(rt1[1])) 49 | diff2.append(float(rt2[0])-float(rt2[1])) 50 | n_common=n_common+1 51 | if n_common==0: 52 | rho=0. 53 | else: 54 | rho=pearsonr(diff1, diff2)[0] 55 | if np.isnan(rho): 56 | rho=0. 57 | yield (rest1, rest2), (rho, n_common) 58 | 59 | def ranking_mapper(self, restaurants, values): 60 | sim, n_common = values 61 | rest1, rest2 = restaurants 62 | if int(n_common) > 0: 63 | yield (rest1), (sim, rest2, n_common) 64 | 65 | def top_similar_collector(self, key, values): 66 | rest1 = key 67 | for sim, rest2, n_common in sorted(values, reverse=True): 68 | yield None, (rest1, rest2, sim, n_common) 69 | 70 | #Below MUST be there for things to work! 
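# Usage sketch (the input/output file names below are hypothetical):
#   test locally first:  python computesim2.py ratings_subset.csv > similarities.txt
#   then run on EMR:     python computesim2.py -r emr ratings_subset.csv > similarities.txt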
71 | if __name__ == '__main__': 72 | RestaurantSimilarities.run() 73 | -------------------------------------------------------------------------------- /labs/lab10/final_lab_images.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab10/final_lab_images.pdf -------------------------------------------------------------------------------- /labs/lab10/svm.csv: -------------------------------------------------------------------------------- 1 | -1.214,-1.178,0 2 | -4.231,-5.235,0 3 | -2.318,-7.908,0 4 | -8.435,-1.324,0 5 | -9.992,-3.456,0 6 | 1.123,8.435,1 7 | 4.389,3.123,1 8 | 1.113,1.888,1 9 | 5.312,9.123,1 10 | 4.124,8.432,1 -------------------------------------------------------------------------------- /labs/lab10/test.csv: -------------------------------------------------------------------------------- 1 | -1.06E-01,-8.15E-02,-1.00E+00 1.78E-01,-3.46E-01,-1.00E+00 1.02E-01,7.18E-01,1.00E+00 6.94E-01,6.23E-01,-1.00E+00 2.35E-02,7.27E-01,1.00E+00 -3.20E-01,-8.34E-01,-1.00E+00 -1.87E-01,5.39E-01,1.00E+00 -6.37E-01,1.53E-01,1.00E+00 -4.74E-01,8.54E-01,1.00E+00 -3.56E-02,-2.72E-01,-1.00E+00 -1.49E-01,1.62E-01,-1.00E+00 -1.81E-01,-1.29E-01,-1.00E+00 -6.02E-01,9.26E-01,1.00E+00 6.98E-01,7.95E-01,-1.00E+00 8.82E-01,-2.01E-01,1.00E+00 -9.24E-01,3.87E-01,1.00E+00 -7.66E-01,-1.13E-02,1.00E+00 1.36E-01,3.17E-02,-1.00E+00 -1.55E-01,-3.31E-01,-1.00E+00 4.85E-01,2.99E-01,-1.00E+00 -6.03E-01,3.33E-01,1.00E+00 -5.73E-01,8.28E-01,1.00E+00 -6.35E-01,-4.75E-01,-1.00E+00 9.09E-01,-7.85E-01,1.00E+00 2.52E-01,-8.94E-01,1.00E+00 -5.18E-01,9.60E-01,1.00E+00 -3.86E-01,-3.18E-01,-1.00E+00 8.23E-01,-1.28E-01,1.00E+00 8.22E-01,-8.77E-01,1.00E+00 -5.04E-01,9.80E-01,1.00E+00 5.34E-01,8.21E-01,-1.00E+00 -8.95E-01,-2.40E-01,1.00E+00 3.43E-01,4.75E-01,-1.00E+00 7.09E-01,5.62E-01,-1.00E+00 -1.00E+00,6.05E-02,1.00E+00 5.24E-01,7.35E-01,-1.00E+00 -5.60E-01,7.56E-01,1.00E+00 6.98E-01,-6.72E-01,1.00E+00 4.90E-01,7.85E-01,-1.00E+00 -3.27E-01,3.43E-01,1.00E+00 -2.93E-03,-4.15E-01,-1.00E+00 -6.31E-01,3.53E-01,1.00E+00 9.14E-01,5.93E-01,-1.00E+00 2.18E-01,3.97E-02,-1.00E+00 -6.16E-01,-8.87E-01,-1.00E+00 -5.29E-01,2.87E-02,1.00E+00 -4.07E-01,1.05E+00,1.00E+00 -2.30E-01,7.14E-02,-1.00E+00 -5.02E-01,8.34E-01,1.00E+00 -5.08E-01,7.93E-01,1.00E+00 -7.91E-01,1.88E-01,1.00E+00 -3.83E-01,8.25E-01,1.00E+00 8.22E-01,4.01E-01,-1.00E+00 9.86E-01,-3.29E-01,1.00E+00 -1.40E-02,-1.52E-01,-1.00E+00 -5.42E-02,9.14E-01,1.00E+00 -1.07E+00,-7.20E-01,-1.00E+00 -2.43E-01,-1.04E+00,1.00E+00 -3.24E-01,-2.83E-01,-1.00E+00 2.48E-01,-2.56E-01,-1.00E+00 -1.72E-01,-8.49E-01,1.00E+00 -4.17E-01,-3.93E-01,-1.00E+00 -3.48E-01,-5.74E-01,-1.00E+00 -8.52E-01,-7.23E-01,-1.00E+00 -7.25E-01,-3.74E-01,-1.00E+00 3.45E-01,-2.23E-02,-1.00E+00 7.42E-01,7.41E-01,-1.00E+00 -1.37E-01,-3.47E-01,-1.00E+00 1.06E-01,6.34E-01,1.00E+00 3.32E-01,-5.66E-01,1.00E+00 -4.18E-01,9.49E-01,1.00E+00 -4.05E-01,-6.13E-01,-1.00E+00 -7.97E-01,9.07E-01,1.00E+00 8.76E-01,3.60E-01,-1.00E+00 5.44E-01,-1.81E-01,1.00E+00 7.54E-02,-5.11E-01,-1.00E+00 5.64E-01,7.72E-01,-1.00E+00 8.17E-01,5.26E-01,-1.00E+00 -3.77E-01,1.06E-01,1.00E+00 4.36E-01,1.50E-01,-1.00E+00 3.97E-01,-5.49E-01,1.00E+00 -2.74E-01,6.02E-01,1.00E+00 -9.89E-01,1.58E-01,1.00E+00 -5.16E-01,-8.25E-01,-1.00E+00 9.81E-01,5.46E-01,-1.00E+00 7.78E-01,-8.93E-01,1.00E+00 -2.59E-01,-6.44E-01,-1.00E+00 -2.38E-01,-9.07E-01,-1.00E+00 -6.04E-01,8.82E-02,1.00E+00 -2.80E-01,-1.51E-02,-1.00E+00 -2.04E-01,7.98E-01,1.00E+00 
-1.64E-01,4.36E-01,1.00E+00 7.44E-01,4.11E-01,-1.00E+00 -3.32E-01,-4.59E-01,-1.00E+00 -2.77E-02,-3.61E-01,-1.00E+00 7.07E-01,7.54E-01,-1.00E+00 -8.23E-01,-3.03E-01,1.00E+00 -9.85E-01,-3.84E-01,1.00E+00 -4.91E-01,7.03E-01,1.00E+00 -5.22E-01,3.00E-01,1.00E+00 -5.70E-01,1.04E-01,1.00E+00 -3.24E-01,7.22E-01,1.00E+00 9.20E-01,-3.26E-01,1.00E+00 8.18E-01,3.49E-01,-1.00E+00 -7.13E-01,-4.91E-01,-1.00E+00 5.37E-01,1.05E+00,-1.00E+00 4.89E-02,1.25E-01,-1.00E+00 4.00E-01,-1.34E-01,-1.00E+00 7.30E-01,-3.66E-01,1.00E+00 -8.46E-01,8.70E-01,1.00E+00 8.31E-01,5.85E-01,-1.00E+00 4.46E-01,2.83E-01,-1.00E+00 6.35E-01,8.53E-01,-1.00E+00 1.35E-01,8.40E-01,1.00E+00 -5.81E-01,-1.47E-02,1.00E+00 -4.27E-01,4.92E-01,1.00E+00 -3.09E-02,1.08E+00,1.00E+00 4.87E-01,-7.50E-01,1.00E+00 -5.06E-01,-9.10E-01,-1.00E+00 2.49E-01,3.83E-01,-1.00E+00 -4.43E-01,-7.64E-01,-1.00E+00 -1.05E-01,-8.83E-01,1.00E+00 -2.84E-01,-5.72E-01,-1.00E+00 1.01E+00,3.72E-01,1.00E+00 6.18E-02,-6.08E-01,1.00E+00 8.71E-02,3.61E-01,-1.00E+00 8.72E-01,4.14E-01,-1.00E+00 -4.22E-01,5.21E-01,1.00E+00 -5.30E-01,6.99E-01,1.00E+00 5.90E-01,4.47E-01,-1.00E+00 8.40E-01,-8.50E-01,1.00E+00 9.19E-02,-2.32E-01,-1.00E+00 -8.22E-02,-4.02E-01,-1.00E+00 9.73E-01,6.42E-01,-1.00E+00 4.36E-01,2.76E-01,-1.00E+00 5.25E-01,-5.44E-01,1.00E+00 5.55E-01,3.83E-01,-1.00E+00 9.56E-01,-8.02E-01,1.00E+00 -7.71E-01,4.33E-01,1.00E+00 8.90E-01,-5.31E-01,1.00E+00 -2.62E-01,6.66E-02,1.00E+00 6.42E-01,-1.82E-01,1.00E+00 -4.15E-01,7.78E-01,1.00E+00 4.67E-01,8.18E-01,-1.00E+00 4.78E-01,-2.49E-01,1.00E+00 -8.26E-01,9.44E-01,1.00E+00 -9.46E-01,-4.28E-01,1.00E+00 5.07E-01,-8.09E-01,1.00E+00 -5.36E-01,2.72E-02,1.00E+00 4.10E-01,4.30E-01,-1.00E+00 -2.62E-01,-5.80E-01,-1.00E+00 3.19E-01,2.16E-01,-1.00E+00 1.48E-01,-8.15E-01,1.00E+00 6.30E-01,7.00E-01,-1.00E+00 -9.54E-01,9.26E-01,1.00E+00 -2.45E-01,1.82E-02,-1.00E+00 5.81E-01,3.16E-01,-1.00E+00 4.10E-02,-4.57E-01,-1.00E+00 5.84E-01,6.83E-01,-1.00E+00 -4.87E-01,8.65E-01,1.00E+00 8.26E-01,-4.98E-02,1.00E+00 5.22E-01,-8.89E-01,1.00E+00 8.39E-01,-8.26E-01,1.00E+00 -6.30E-01,-1.49E-01,-1.00E+00 8.52E-01,-1.06E+00,1.00E+00 3.83E-01,6.79E-01,-1.00E+00 -5.70E-02,6.06E-01,1.00E+00 3.04E-01,-1.07E+00,1.00E+00 -8.61E-01,-2.26E-01,1.00E+00 -8.63E-01,2.03E-01,1.00E+00 5.59E-01,1.54E-01,-1.00E+00 -4.18E-01,-3.27E-01,-1.00E+00 1.22E-01,-2.49E-01,-1.00E+00 7.47E-01,5.55E-01,-1.00E+00 2.06E-01,5.74E-01,1.00E+00 -8.91E-01,4.99E-01,1.00E+00 -6.86E-01,-4.69E-01,-1.00E+00 -1.98E-01,-1.36E-01,-1.00E+00 6.17E-01,1.29E-01,-1.00E+00 -7.93E-01,3.61E-01,1.00E+00 -1.01E+00,7.85E-02,1.00E+00 -8.62E-01,-5.79E-01,-1.00E+00 -1.97E-01,2.75E-01,-1.00E+00 5.76E-01,9.37E-01,1.00E+00 -6.14E-01,-9.41E-01,-1.00E+00 -5.52E-01,-2.68E-01,-1.00E+00 1.60E-01,-3.42E-01,-1.00E+00 -3.97E-01,6.57E-01,1.00E+00 -5.79E-01,-8.73E-01,-1.00E+00 8.32E-01,-8.53E-02,1.00E+00 -2.99E-01,-4.29E-01,-1.00E+00 -1.49E-01,6.54E-01,1.00E+00 -7.45E-01,-7.19E-01,-1.00E+00 1.56E-01,9.21E-01,1.00E+00 5.29E-01,9.17E-01,-1.00E+00 8.28E-02,-6.28E-01,1.00E+00 -9.40E-01,-6.63E-01,-1.00E+00 7.14E-01,-2.60E-01,1.00E+00 -1.11E-02,-8.43E-01,1.00E+00 5.43E-01,1.18E-01,-1.00E+00 7.34E-01,-8.91E-01,1.00E+00 3.79E-01,-1.16E-01,-1.00E+00 -1.67E-01,-4.11E-01,-1.00E+00 -7.82E-01,3.77E-01,1.00E+00 2.72E-01,8.11E-01,1.00E+00 -8.68E-01,-6.76E-01,-1.00E+00 -1.81E-01,6.81E-01,1.00E+00 -4.45E-02,4.41E-04,-1.00E+00 4.29E-01,8.29E-01,-1.00E+00 -8.38E-01,-7.70E-02,1.00E+00 7.00E-01,-2.08E-01,1.00E+00 7.74E-01,5.12E-01,-1.00E+00 -6.88E-01,7.93E-01,1.00E+00 -4.25E-01,-8.50E-01,-1.00E+00 4.44E-01,-2.42E-01,1.00E+00 -2.19E-03,6.97E-01,1.00E+00 
3.25E-01,-1.86E-01,-1.00E+00 2.71E-01,-8.52E-01,1.00E+00 2.08E-01,-8.29E-01,1.00E+00 -3.38E-01,-8.94E-01,-1.00E+00 -2.43E-02,-5.51E-01,-1.00E+00 2.55E-01,-2.88E-01,-1.00E+00 -7.17E-01,4.20E-04,1.00E+00 1.32E-01,-4.60E-01,-1.00E+00 3.45E-01,-1.29E-01,-1.00E+00 8.24E-02,-9.73E-01,1.00E+00 5.33E-01,2.95E-01,-1.00E+00 -3.39E-01,9.20E-01,1.00E+00 5.51E-01,-8.46E-01,1.00E+00 -4.11E-01,5.12E-01,1.00E+00 4.63E-01,-7.36E-01,1.00E+00 5.76E-01,-5.90E-01,1.00E+00 -6.32E-01,-9.80E-01,-1.00E+00 -1.68E-01,-5.29E-01,-1.00E+00 7.20E-01,-1.04E+00,1.00E+00 7.50E-01,-5.38E-01,1.00E+00 2.52E-01,-9.61E-01,1.00E+00 -7.25E-01,7.44E-02,1.00E+00 -7.20E-01,-5.57E-01,-1.00E+00 -9.54E-01,4.77E-01,1.00E+00 7.11E-01,-9.90E-01,1.00E+00 2.91E-01,-4.43E-01,1.00E+00 3.20E-01,-4.01E-01,1.00E+00 2.34E-01,6.37E-01,1.00E+00 -1.96E-01,-9.90E-01,1.00E+00 -4.38E-01,1.17E-02,1.00E+00 -3.55E-01,8.20E-01,1.00E+00 3.47E-01,-5.45E-01,1.00E+00 8.36E-01,3.44E-01,-1.00E+00 -7.14E-01,-6.41E-01,-1.00E+00 -------------------------------------------------------------------------------- /labs/lab10/train.csv: -------------------------------------------------------------------------------- 1 | -7.7947021e-01,8.3822138e-01,1.0000000e+00 2 | 1.5563491e-01,8.9537743e-01,1.0000000e+00 3 | -5.9907703e-02,-7.1777995e-01,1.0000000e+00 4 | 2.0759636e-01,7.5893338e-01,1.0000000e+00 5 | -1.9598312e-01,-3.7548716e-01,-1.0000000e+00 6 | 5.8848947e-01,-8.4255381e-01,1.0000000e+00 7 | 7.1985874e-03,-5.4831650e-01,-1.0000000e+00 8 | 7.3883852e-01,-6.0339369e-01,1.0000000e+00 9 | 7.0464808e-01,-2.0420052e-02,1.0000000e+00 10 | 9.6992666e-01,6.4137120e-01,-1.0000000e+00 11 | 4.3543099e-01,7.4477254e-01,-1.0000000e+00 12 | -8.4425822e-01,7.4235423e-01,1.0000000e+00 13 | 5.9142471e-01,-5.4602118e-01,1.0000000e+00 14 | -6.9093124e-02,3.7659995e-02,-1.0000000e+00 15 | -9.5154865e-01,-7.3305502e-01,-1.0000000e+00 16 | -1.2988138e-01,7.5676096e-01,1.0000000e+00 17 | -4.9534647e-01,-5.6627908e-01,-1.0000000e+00 18 | -9.0399413e-01,5.0922150e-01,1.0000000e+00 19 | 2.9235128e-01,1.6089015e-01,-1.0000000e+00 20 | 6.4798552e-01,-7.7933769e-01,1.0000000e+00 21 | 3.7595574e-01,7.8203087e-02,-1.0000000e+00 22 | 2.4588993e-01,4.5146739e-03,-1.0000000e+00 23 | -4.5719155e-01,4.2390461e-01,1.0000000e+00 24 | -4.4127876e-01,7.0571892e-01,1.0000000e+00 25 | 5.0744669e-01,7.5872586e-01,-1.0000000e+00 -------------------------------------------------------------------------------- /labs/lab2/Lab_2_A_Live.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "import cs109style\n", 15 | "cs109style.customize_mpl()\n", 16 | "cs109style.customize_css()\n", 17 | "\n", 18 | "# special IPython command to prepare the notebook for matplotlib\n", 19 | "%matplotlib inline \n", 20 | "\n", 21 | "from collections import defaultdict\n", 22 | "\n", 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import requests\n", 26 | "from pattern import web\n", 27 | "\n" 28 | ], 29 | "language": "python", 30 | "metadata": {}, 31 | "outputs": [] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Fetching population data from Wikipedia\n", 38 | "\n", 39 | "In this example we will fetch data about countries and their population from Wikipedia.\n", 40 | "\n", 41 | 
"http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population has several tables for individual countries, subcontinents as well as different years. We will combine the data for all countries and all years in a single panda dataframe and visualize the change in population for different countries.\n", 42 | "\n", 43 | "###We will go through the following steps:\n", 44 | "* fetching html with embedded data\n", 45 | "* parsing html to extract the data\n", 46 | "* collecting the data in a panda dataframe\n", 47 | "* displaying the data\n", 48 | "\n", 49 | "To give you some starting points for your homework, we will also show the different sub-steps that can be taken to reach the presented solution." 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Fetching the Wikipedia site" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "collapsed": false, 62 | "input": [ 63 | "url = 'http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population'\n", 64 | "website_html = requests.get(url).text\n", 65 | "#print website_html" 66 | ], 67 | "language": "python", 68 | "metadata": {}, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Parsing html data" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "collapsed": false, 81 | "input": [ 82 | "def get_population_html_tables(html):\n", 83 | " \"\"\"Parse html and return html tables of wikipedia population data.\"\"\"\n", 84 | "\n", 85 | " dom = web.Element(html)\n", 86 | "\n", 87 | " ### 0. step: look at html source!\n", 88 | " \n", 89 | " #### 1. step: get all tables\n", 90 | "\n", 91 | " #### 2. step: get all tables we care about\n", 92 | "\n", 93 | " return tbls\n", 94 | "\n", 95 | "tables = get_population_html_tables(website_html)\n", 96 | "print \"table length: %d\" %len(tables)\n", 97 | "for t in tables:\n", 98 | " print t.attributes\n" 99 | ], 100 | "language": "python", 101 | "metadata": {}, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "collapsed": false, 107 | "input": [ 108 | "def table_type(tbl):\n", 109 | " ### Extract the table type\n", 110 | "\n", 111 | "# group the tables by type\n", 112 | "tables_by_type = defaultdict(list) # defaultdicts have a default value that is inserted when a new key is accessed\n", 113 | "for tbl in tables:\n", 114 | " tables_by_type[table_type(tbl)].append(tbl)\n", 115 | "\n", 116 | "print tables_by_type" 117 | ], 118 | "language": "python", 119 | "metadata": {}, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "## Extracting data and filling it into a dictionary" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "collapsed": false, 132 | "input": [ 133 | "def get_countries_population(tables):\n", 134 | " \"\"\"Extract population data for countries from all tables and store it in dictionary.\"\"\"\n", 135 | " \n", 136 | " result = defaultdict(dict)\n", 137 | "\n", 138 | " # 1. step: try to extract data for a single table\n", 139 | "\n", 140 | " # 2. 
140 | " # 2. step: iterate over all tables, extract headings and actual data and combine data into single dict\n", 141 | " \n", 142 | " return result\n", 143 | "\n", 144 | "\n", 145 | "result = get_countries_population(tables_by_type['Country or territory'])\n", 146 | "print result" 147 | ], 148 | "language": "python", 149 | "metadata": {}, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## Creating a dataframe from a dictionary" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "collapsed": false, 162 | "input": [ 163 | "# create dataframe\n", 164 | "\n", 165 | "df = pd.DataFrame.from_dict(result, orient='index')\n", 166 | "# sort based on year\n", 167 | "df.sort(axis=1,inplace=True)\n", 168 | "print df\n" 169 | ], 170 | "language": "python", 171 | "metadata": {}, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## Some data accessing functions for a pandas dataframe" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "collapsed": false, 184 | "input": [ 185 | "subtable = df.iloc[0:2, 0:2]\n", 186 | "print \"subtable\"\n", 187 | "print subtable\n", 188 | "print \"\"\n", 189 | "\n", 190 | "column = df[1955]\n", 191 | "print \"column\"\n", 192 | "print column\n", 193 | "print \"\"\n", 194 | "\n", 195 | "row = df.ix[0] #row 0\n", 196 | "print \"row\"\n", 197 | "print row\n", 198 | "print \"\"\n", 199 | "\n", 200 | "rows = df.ix[:2] #rows 0,1\n", 201 | "print \"rows\"\n", 202 | "print rows\n", 203 | "print \"\"\n", 204 | "\n", 205 | "element = df.ix[0,1955] #element\n", 206 | "print \"element\"\n", 207 | "print element\n", 208 | "print \"\"\n", 209 | "\n", 210 | "# max along column\n", 211 | "print \"max\"\n", 212 | "print df[1950].max()\n", 213 | "print \"\"\n", 214 | "\n", 215 | "# axes\n", 216 | "print \"axes\"\n", 217 | "print df.axes\n", 218 | "print \"\"\n", 219 | "\n", 220 | "row = df.ix[0]\n", 221 | "print \"row info\"\n", 222 | "print row.name\n", 223 | "print row.index\n", 224 | "print \"\"\n", 225 | "\n", 226 | "countries = df.index\n", 227 | "print \"countries\"\n", 228 | "print countries\n", 229 | "print \"\"\n", 230 | "\n", 231 | "print \"Austria\"\n", 232 | "print df.ix['Austria']" 233 | ], 234 | "language": "python", 235 | "metadata": {}, 236 | "outputs": [] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "## Plotting population of 4 countries" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "collapsed": false, 248 | "input": [ 249 | "plotCountries = ['Austria', 'Germany', 'United States', 'France']\n", 250 | " \n", 251 | "for country in plotCountries:\n", 252 | " row = df.ix[country]\n", 253 | " plt.plot(row.index, row, label=row.name ) \n", 254 | " \n", 255 | "plt.ylim(ymin=0) # start y axis at 0\n", 256 | "\n", 257 | "plt.xticks(rotation=70)\n", 258 | "plt.legend(loc='best')\n", 259 | "plt.xlabel(\"Year\")\n", 260 | "plt.ylabel(\"# people (million)\")\n", 261 | "plt.title(\"Population of countries\")" 262 | ], 263 | "language": "python", 264 | "metadata": {}, 265 | "outputs": [] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Plot 5 most populous countries from 2010 and 2050" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "collapsed": false, 277 | "input": [ 278 | "def plot_populous(df, year):\n", 279 | " # sort table depending on data value in year column\n", 280 | " df_by_year = df.sort(year, ascending=False)\n", 281 | " \n",
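" # plot the time series of the five most populous countries for that year\n",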
282 | " plt.figure()\n", 283 | " for i in range(5): \n", 284 | " row = df_by_year.ix[i]\n", 285 | " plt.plot(row.index, row, label=row.name ) \n", 286 | " \n", 287 | " plt.ylim(ymin=0)\n", 288 | " \n", 289 | " plt.xticks(rotation=70)\n", 290 | " plt.legend(loc='best')\n", 291 | " plt.xlabel(\"Year\")\n", 292 | " plt.ylabel(\"# people (million)\")\n", 293 | " plt.title(\"Most populous countries in %d\" % year)\n", 294 | "\n", 295 | "plot_populous(df, 2010)\n", 296 | "plot_populous(df, 2050)" 297 | ], 298 | "language": "python", 299 | "metadata": {}, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "collapsed": false, 305 | "input": [], 306 | "language": "python", 307 | "metadata": {}, 308 | "outputs": [] 309 | } 310 | ], 311 | "metadata": {} 312 | } 313 | ] 314 | } -------------------------------------------------------------------------------- /labs/lab2/Lab_2_B_Live.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "# Setup\n", 15 | "import pattern.web as web\n", 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "\n", 20 | "from cs109style import customize_mpl, customize_css\n", 21 | "customize_mpl()\n", 22 | "customize_css()\n", 23 | "%pylab inline" 24 | ], 25 | "language": "python", 26 | "metadata": {}, 27 | "outputs": [] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Example 2: extracting reddit titles, upvotes, downvotes, and submission time\n", 34 | "\n", 35 | "### We'll operate in two phases:\n", 36 | "* first, find all the URLs to comment pages on the first few front pages of reddit.\n", 37 | "* second, extract information from each comments page" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "collapsed": false, 43 | "input": [ 44 | "def get_links_from_front_pages(n):\n", 45 | " 'find URLs of comments pages, linked from the n first few pages of reddit'\n", 46 | " url = web.URL('http://www.reddit.com/')\n", 47 | " comment_pages = []\n", 48 | " for page_idx in range(n):\n", 49 | " dom = web.DOM(url.download(cached=False))\n", 50 | " \n", 51 | " ### Extract comments pages\n", 52 | " \n", 53 | " ### find the next page link - reddit has 25 links per page\n", 54 | "\n", 55 | " # use set() to remove repeated URLs\n", 56 | " return list(set(comment_pages))\n", 57 | "\n", 58 | " \n", 59 | "print len(get_links_from_front_pages(6))" 60 | ], 61 | "language": "python", 62 | "metadata": {}, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "collapsed": false, 68 | "input": [ 69 | "def info_from_comments_pages(links):\n", 70 | " 'fetch title, upvotes, downvotes, time of submission from a sequence of links'\n", 71 | " results = []\n", 72 | " for urltext in links:\n", 73 | " url = web.URL(urltext)\n", 74 | " print \"fetching info for\", url\n", 75 | " try:\n", 76 | " dom = web.DOM(url.download(cached=False))\n", 77 | " \n", 78 | " ### Extract title, upvotes, downvotes, submission time\n", 79 | " \n", 80 | " results.append((title, upvotes, downvotes, pd.to_datetime(time)))\n", 81 | " except KeyboardInterrupt:\n", 82 | " # allow us to interrupt the kernel and still continue\n", 83 | " break\n", 84 | " except:\n", 85 | " pass # some things that look like comment pages don't have the information above\n", 86 | " 
return results" 87 | ], 88 | "language": "python", 89 | "metadata": {}, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "collapsed": false, 95 | "input": [ 96 | "comments_pages = get_links_from_front_pages(5)\n", 97 | "print \"Fetching info for\", len(comments_pages), \"pages\"\n", 98 | "pages = info_from_comments_pages(comments_pages)\n", 99 | "titles, upvotes, downvotes, dates = zip(*pages) # zip(*seq) transposes a sequence of sequences.\n", 100 | "df = pd.DataFrame({'title' : titles, 'upvotes' : upvotes, 'downvotes' : downvotes, 'date' : dates}, index=dates)\n", 101 | "print df" 102 | ], 103 | "language": "python", 104 | "metadata": {}, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "collapsed": false, 110 | "input": [ 111 | "df.sort('date', inplace=True)\n", 112 | "df['upvotes'].plot(c='g')\n", 113 | "df['downvotes'].plot(c='r')\n", 114 | "(df['upvotes'] - df['downvotes']).plot(c='k')\n" 115 | ], 116 | "language": "python", 117 | "metadata": {}, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "collapsed": false, 123 | "input": [], 124 | "language": "python", 125 | "metadata": {}, 126 | "outputs": [] 127 | } 128 | ], 129 | "metadata": {} 130 | } 131 | ] 132 | } -------------------------------------------------------------------------------- /labs/lab2/README.md: -------------------------------------------------------------------------------- 1 | # Files for Lab 2 of CS109 2 | 3 | ## Part A 4 | 5 | * [Lab_2_A_Live.ipynb](http://nbviewer.ipython.org/github/cs109/content/blob/master/labs/lab2/Lab_2_A_Live.ipynb) - starting point for lab 6 | * [Lab_2_A_Johanna.ipynb](http://nbviewer.ipython.org/github/cs109/content/blob/master/labs/lab2/Lab_2_A_Johanna.ipynb) - Johanna's original writeup 7 | * [Lab_2_A_Live_Ray_Final.ipynb](http://nbviewer.ipython.org/github/cs109/content/blob/master/labs/lab2/Lab_2_A_Live_Ray_Final.ipynb) - Ray's in-class reconstruction 8 | 9 | ## Part B 10 | 11 | * [Lab_2_B.ipynb](http://nbviewer.ipython.org/github/cs109/content/blob/master/labs/lab2/Lab_2_B.ipynb) - Reddit example 12 | -------------------------------------------------------------------------------- /labs/lab2/cs109style.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from IPython.core.display import HTML 4 | from matplotlib import rcParams 5 | 6 | #colorbrewer2 Dark2 qualitative color table 7 | dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), 8 | (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), 9 | (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), 10 | (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), 11 | (0.4, 0.6509803921568628, 0.11764705882352941), 12 | (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), 13 | (0.6509803921568628, 0.4627450980392157, 0.11372549019607843), 14 | (0.4, 0.4, 0.4)] 15 | 16 | def customize_mpl(): 17 | """Tweak matplotlib visual style""" 18 | print("Setting custom matplotlib visual style") 19 | 20 | rcParams['figure.figsize'] = (10, 6) 21 | rcParams['figure.dpi'] = 150 22 | rcParams['axes.color_cycle'] = dark2_colors 23 | rcParams['lines.linewidth'] = 2 24 | rcParams['axes.grid'] = True 25 | rcParams['axes.facecolor'] = '#eeeeee' 26 | rcParams['font.size'] = 14 27 | rcParams['patch.edgecolor'] = 'none' 28 | 29 | 30 | def customize_css(): 31 | print("Setting custom CSS for the IPython Notebook") 32 | styles = open('custom.css', 'r').read() 33 | 
return HTML(styles) 34 | -------------------------------------------------------------------------------- /labs/lab2/custom.css: -------------------------------------------------------------------------------- 1 | 61 | 76 | -------------------------------------------------------------------------------- /labs/lab3/Italy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab3/Italy.png -------------------------------------------------------------------------------- /labs/lab4/data/US_Unemployment_Oct2012.csv: -------------------------------------------------------------------------------- 1 | State,Unemployment 2 | AL,7.1 3 | AK,6.8 4 | AZ,8.1 5 | AR,7.2 6 | CA,10.1 7 | CO,7.7 8 | CT,8.4 9 | DE,7.1 10 | FL,8.2 11 | GA,8.8 12 | HI,5.4 13 | ID,6.6 14 | IL,8.8 15 | IN,8.4 16 | IA,5.1 17 | KS,5.6 18 | KY,8.1 19 | LA,5.9 20 | ME,7.2 21 | MD,6.8 22 | MA,6.7 23 | MI,9.1 24 | MN,5.6 25 | MS,9.1 26 | MO,6.7 27 | MT,5.8 28 | NE,3.9 29 | NV,10.3 30 | NH,5.7 31 | NJ,9.6 32 | NM,6.8 33 | NY,8.4 34 | NC,9.4 35 | ND,3.2 36 | OH,6.9 37 | OK,5.2 38 | OR,8.5 39 | PA,8 40 | RI,10.1 41 | SC,8.8 42 | SD,4.4 43 | TN,7.8 44 | TX,6.4 45 | UT,5.5 46 | VT,5 47 | VA,5.8 48 | WA,7.8 49 | WV,7.5 50 | WI,6.8 51 | WY,5.1 -------------------------------------------------------------------------------- /labs/lab4/data/census_demographics.csv: -------------------------------------------------------------------------------- 1 | state,per_black,per_hisp,per_white,educ_hs,educ_coll,average_income,median_income,pop_density,vote_pop,older_pop,per_older,per_vote 2 | ALABAMA,26.5,4.0,66.8,81.4,21.7,22984,42081,94.4,3001712.5,672383.6,0.14,0.625 3 | ALASKA,3.6,5.8,63.7,90.7,27.0,30726,66521,1.2,475548.444,58540.158,0.081,0.658 4 | ARIZONA,4.5,30.1,57.4,85.0,26.3,25680,50448,56.3,3934880.535,920515.71,0.142,0.607 5 | ARKANSAS,15.6,6.6,74.2,81.9,19.1,21274,39267,56.0,1798043.148,428944.934,0.146,0.612 6 | CALIFORNIA,6.6,38.1,39.7,80.7,30.1,29188,60883,239.1,24009747.944,4409953.704,0.117,0.637 7 | COLORADO,4.3,20.9,69.7,89.3,35.9,30151,56456,48.5,3310567.012,578197.948,0.113,0.647 8 | CONNECTICUT,11.1,13.8,70.9,88.4,35.2,36775,67740,738.1,2263008.088,515622.096,0.144,0.632 9 | DELAWARE,21.9,8.4,65.1,87.0,27.7,29007,57599,460.8,568773.645,133348.845,0.147,0.627 10 | DISTRICT OF COLUMBIA,50.7,9.5,35.3,86.5,49.2,42078,58526,9856.5,442485.136,70451.544,0.114,0.716 11 | FLORIDA,16.5,22.9,57.5,85.3,25.9,26551,47661,350.6,11701330.788,3354127.392,0.176,0.614 12 | GEORGIA,31.0,9.1,55.5,83.5,27.2,25134,49347,168.4,6242473.56,1079673.1,0.11,0.636 13 | HAWAII,2.0,9.2,22.9,89.8,29.4,28882,66420,211.8,867505.11,202097.07,0.147,0.631 14 | IDAHO,0.8,11.5,83.6,88.2,24.3,22518,46423,19.0,954160.97,202878.08,0.128,0.602 15 | ILLINOIS,14.8,16.2,63.3,86.2,30.3,28782,55735,231.1,8133370.424,1634395.639,0.127,0.632 16 | INDIANA,9.4,6.2,81.3,86.2,22.4,24058,47697,181.0,4060042.406,860233.704,0.132,0.623 17 | IOWA,3.1,5.2,88.4,89.9,24.5,25335,48872,54.5,1880257.726,456284.041,0.149,0.614 18 | KANSAS,6.1,10.8,77.8,89.2,29.3,25907,49424,34.9,1765811.37,381874.654,0.133,0.615 19 | KENTUCKY,8.0,3.2,86.1,81.0,20.3,22515,41576,109.9,2757063.636,589863.06,0.135,0.631 20 | LOUISIANA,32.4,4.4,60.1,81.0,20.9,23094,43445,104.9,2886721.516,571854.5,0.125,0.631 21 | MAINE,1.3,1.4,94.3,89.8,26.5,25385,46933,43.1,842071.192,216494.644,0.163,0.634 22 | MARYLAND,30.0,8.4,54.4,87.8,35.7,34849,70647,594.8,3753418.116,728536.125,0.125,0.644 23 | 
MASSACHUSETTS,7.8,9.9,76.4,88.7,38.3,33966,64509,839.4,4262135.792,922255.04,0.14,0.647 24 | MICHIGAN,14.3,4.5,76.4,88.0,25.0,25135,48432,174.8,6192369.249,1392542.367,0.141,0.627 25 | MINNESOTA,5.4,4.9,82.8,91.3,31.4,29582,57243,66.6,3367262.43,700176.791,0.131,0.63 26 | MISSISSIPPI,37.3,2.9,57.7,79.6,19.5,19977,37881,63.2,1840720.416,387206.56,0.13,0.618 27 | MISSOURI,11.7,3.7,80.8,86.2,25.0,24724,46262,87.1,3744658.624,853517.696,0.142,0.623 28 | MONTANA,0.5,3.1,87.5,91.0,27.9,23836,43872,6.8,623874.375,151726.248,0.152,0.625 29 | NEBRASKA,4.7,9.5,81.8,90.0,27.7,25229,49342,23.8,1131381.574,250599.176,0.136,0.614 30 | NEVADA,8.6,27.1,53.6,84.3,21.8,27589,55726,24.6,1718416.182,340415.25,0.125,0.631 31 | NEW HAMPSHIRE,1.3,2.9,92.2,90.9,32.9,31422,63277,147.0,854189.712,184547.16,0.14,0.648 32 | NEW JERSEY,14.6,18.1,58.9,87.3,34.6,34858,69811,1195.5,5566148.805,1208498.235,0.137,0.631 33 | NEW MEXICO,2.5,46.7,40.2,82.7,25.5,22966,43820,17.0,1280567.76,283182.464,0.136,0.615 34 | NEW YORK,17.5,18.0,58.0,84.4,32.1,30948,55603,411.2,12516121.671,2666731.989,0.137,0.643 35 | NORTH CAROLINA,22.0,8.6,65.0,83.6,26.1,24745,45570,196.1,6093189.031,1274644.932,0.132,0.631 36 | NORTH DAKOTA,1.3,2.2,88.6,89.4,26.3,25803,46781,9.7,434296.82,98486.208,0.144,0.635 37 | OHIO,12.4,3.2,81.0,87.4,24.1,25113,47358,282.3,7204049.424,1650927.993,0.143,0.624 38 | OKLAHOMA,7.7,9.2,68.2,85.4,22.6,23094,42979,54.7,2335568.928,519436.596,0.137,0.616 39 | OREGON,2.0,12.0,78.1,88.6,28.6,26171,49260,39.9,2454758.606,553675.837,0.143,0.634 40 | PENNSYLVANIA,11.3,5.9,79.2,87.4,26.4,27049,50398,283.9,7989789.522,1987890.216,0.156,0.627 41 | RHODE ISLAND,7.2,12.8,76.5,83.7,30.3,28707,54902,1018.1,677038.488,154541.394,0.147,0.644 42 | SOUTH CAROLINA,28.1,5.3,64.0,83.0,24.0,23443,43939,153.9,2938556.44,659771.43,0.141,0.628 43 | SOUTH DAKOTA,1.4,2.9,84.4,89.3,25.3,24110,46369,10.7,501865.938,118667.808,0.144,0.609 44 | TENNESSEE,16.9,4.7,75.4,82.5,22.7,23722,43314,153.9,4034112.39,877259.361,0.137,0.63 45 | TEXAS,12.2,38.1,44.8,80.0,25.8,24870,49646,96.3,16021000.944,2695841.505,0.105,0.624 46 | UTAH,1.3,13.2,80.1,90.6,29.4,23139,56330,33.6,1679064.312,259184.424,0.092,0.596 47 | VERMONT,1.1,1.6,94.2,90.6,33.3,27478,51841,67.9,406553.719,93964.65,0.15,0.649 48 | VIRGINIA,19.8,8.2,64.5,86.1,33.8,32145,61406,202.6,5230406.184,1012075.5,0.125,0.646 49 | WASHINGTON,3.8,11.6,72.1,89.6,31.0,29733,57244,101.2,4378054.358,867414.826,0.127,0.641 50 | WEST VIRGINIA,3.5,1.3,93.0,81.9,17.3,21232,38380,77.1,1170734.684,300568.968,0.162,0.631 51 | WISCONSIN,6.5,6.1,83.1,89.4,25.8,26624,51598,105.0,3592701.443,793935.613,0.139,0.629 52 | WYOMING,1.1,9.1,85.5,91.3,23.6,27860,53802,5.8,361348.488,72156.066,0.127,0.636 53 | -------------------------------------------------------------------------------- /labs/lab4/data/chall-damage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/chall-damage.png -------------------------------------------------------------------------------- /labs/lab4/data/chall-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/chall-table.png -------------------------------------------------------------------------------- /labs/lab4/data/chall.txt: -------------------------------------------------------------------------------- 1 | 66 0 2 
| 70 1 3 | 69 0 4 | 68 0 5 | 67 0 6 | 72 0 7 | 73 0 8 | 70 0 9 | 57 1 10 | 63 1 11 | 70 1 12 | 78 0 13 | 67 0 14 | 53 1 15 | 67 0 16 | 75 0 17 | 70 0 18 | 81 0 19 | 76 0 20 | 79 0 21 | 75 1 22 | 76 0 23 | 58 1 24 | -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (1).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (1).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (10).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (10).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (11).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (11).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (12).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (12).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (13).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (13).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (14).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (14).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (15).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (15).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (16).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (16).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (17).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (17).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (18).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (18).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (19).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (19).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (2).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (2).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (20).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (20).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (21).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (21).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (22).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (22).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (23).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (23).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (24).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (24).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (25).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (25).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (26).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (26).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (27).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (27).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (28).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (28).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (29).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (29).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (3).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (3).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (30).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (30).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (31).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (31).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (32).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (32).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (33).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (33).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (34).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (34).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (35).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (35).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (4).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (4).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (5).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (5).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (6).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (6).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (7).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (7).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (8).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (8).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (9).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (9).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th.jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (1).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (1).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (10).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (10).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (11).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (11).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (12).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (12).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (13).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (13).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (14).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (14).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (15).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (15).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (16).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (16).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (17).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (17).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (18).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (18).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (19).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (19).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (2).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (2).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (20).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (20).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (21).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (21).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (22).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (22).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (23).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (23).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (24).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (24).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (25).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (25).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (26).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (26).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (27).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (27).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (28).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (28).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (29).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (29).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (3).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (3).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (30).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (30).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (31).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (31).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (32).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (32).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (33).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (33).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (34).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (34).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (35).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (35).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (36).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (36).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (37).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (37).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (38).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (38).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (39).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (39).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (4).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (4).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (40).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (40).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (41).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (41).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (42).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (42).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (43).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (43).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (44).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (44).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (45).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (45).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (46).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (46).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (47).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (47).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (48).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (48).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (49).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (49).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (5).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (5).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (50).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (50).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (6).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (6).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (7).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (7).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (8).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (8).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (9).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (9).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th.jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/query_bing_images.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import re 4 | import urllib2 5 | import os 6 | import sys 7 | 8 | 9 | def get_soup(url): 10 | return BeautifulSoup(requests.get(url).text) 11 | 12 | query = sys.argv[1] 13 | image_type = '_'.join(query.split()) 14 | print query, image_type 15 | url = "http://www.bing.com/images/search?q=" + query + "&qft=+filterui:color2-bw+filterui:imagesize-large&FORM=R5IR3" 16 | 17 | soup = get_soup(url) 18 | images = [a['src'] for a in soup.find_all("img", {"src": re.compile("mm.bing.net")})] 19 | 20 | for img in images: 21 | raw_img = urllib2.urlopen(img).read() 22 | cntr = len([i for i in os.listdir("images") if 
image_type in i]) + 1 23 | f = open("images/" + image_type + "_"+ str(cntr) + ".jpg", 'wb') 24 | f.write(raw_img) 25 | f.close() 26 | -------------------------------------------------------------------------------- /labs/lab4/data/myclusters.csv: -------------------------------------------------------------------------------- 1 | State,Cluster 2 | AL,1 3 | AK,6 4 | AZ,4 5 | AR,1 6 | CA,5 7 | CO,4 8 | CT,7 9 | DE,7 10 | FL,8 11 | GA,1 12 | HI,5 13 | ID,6 14 | IL,7 15 | IN,8 16 | IA,8 17 | KS,6 18 | KY,1 19 | LA,1 20 | ME,8 21 | MD,7 22 | MA,7 23 | MI,8 24 | MN,8 25 | MS,1 26 | MO,1 27 | MT,4 28 | NE,6 29 | NV,4 30 | NH,8 31 | NJ,7 32 | NM,4 33 | NY,7 34 | NC,1 35 | ND,6 36 | OH,8 37 | OK,1 38 | OR,5 39 | PA,8 40 | RI,7 41 | SC,1 42 | SD,6 43 | TN,1 44 | TX,1 45 | UT,6 46 | VT,5 47 | VA,8 48 | WA,5 49 | WV,1 50 | WI,8 51 | WY,6 -------------------------------------------------------------------------------- /labs/lab4/data/partisan_voting.csv: -------------------------------------------------------------------------------- 1 | State,PVI 2 | Alabama,R+13 3 | Alaska,R+13 4 | Arizona,R+6 5 | Arkansas,R+9 6 | California,D+7 7 | Colorado,EVEN 8 | Connecticut,D+7 9 | Delaware,D+7 10 | District of Columbia,D+39 11 | Florida,R+2 12 | Georgia,R+7 13 | Hawaii,D+12 14 | Idaho,R+17 15 | Illinois,D+8 16 | Indiana,R+6 17 | Iowa,D+1 18 | Kansas,R+12 19 | Kentucky,R+10 20 | Louisiana,R+10 21 | Maine,D+5 22 | Maryland,D+9 23 | Massachusetts,D+12 24 | Michigan,D+4 25 | Minnesota,D+2 26 | Mississippi,R+10 27 | Missouri,R+3 28 | Montana,R+7 29 | Nebraska,R+13 30 | Nevada,D+1 31 | New Hampshire,D+2 32 | New Jersey,D+4 33 | New Mexico,D+2 34 | New York,D+10 35 | North Carolina,R+4 36 | North Dakota,R+10 37 | Ohio,R+1 38 | Oklahoma,R+17 39 | Oregon,D+4 40 | Pennsylvania,D+2 41 | Rhode Island,D+11 42 | South Carolina,R+8 43 | South Dakota,R+9 44 | Tennessee,R+9 45 | Texas,R+10 46 | Utah,R+20 47 | Vermont,D+13 48 | Virginia,R+2 49 | Washington,D+5 50 | West Virginia,R+8 51 | Wisconsin,D+2 52 | Wyoming,R+20 53 | -------------------------------------------------------------------------------- /labs/lab4/data/pcavsfit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/pcavsfit.png -------------------------------------------------------------------------------- /labs/lab4/data/shuttle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/shuttle.png -------------------------------------------------------------------------------- /labs/lab5/data/bias-variance-error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab5/data/bias-variance-error.png -------------------------------------------------------------------------------- /labs/lab5/data/lc-hb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab5/data/lc-hb.png -------------------------------------------------------------------------------- /labs/lab5/data/lc-hv.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab5/data/lc-hv.png -------------------------------------------------------------------------------- /labs/lab5/data/reg-bias-variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab5/data/reg-bias-variance.png -------------------------------------------------------------------------------- /labs/lab6/_multivariate.py: -------------------------------------------------------------------------------- 1 | # 2 | # Author: Joris Vankerschaver 2013 3 | # 4 | from __future__ import division, print_function, absolute_import 5 | 6 | from scipy.misc import doccer 7 | from functools import wraps 8 | import numpy as np 9 | import scipy.linalg 10 | 11 | __all__ = ['multivariate_normal'] 12 | 13 | 14 | _LOG_2PI = np.log(2 * np.pi) 15 | 16 | 17 | def _process_parameters(dim, mean, cov): 18 | """ 19 | Infer dimensionality from mean or covariance matrix, ensure that 20 | mean and covariance are full vector resp. matrix. 21 | 22 | """ 23 | 24 | # Try to infer dimensionality 25 | if dim is None: 26 | if mean is None: 27 | if cov is None: 28 | dim = 1 29 | else: 30 | cov = np.asarray(cov, dtype=float) 31 | if cov.ndim < 2: 32 | dim = 1 33 | else: 34 | dim = cov.shape[0] 35 | else: 36 | mean = np.asarray(mean, dtype=float) 37 | dim = mean.size 38 | else: 39 | if not np.isscalar(dim): 40 | raise ValueError("Dimension of random variable must be a scalar.") 41 | 42 | # Check input sizes and return full arrays for mean and cov if necessary 43 | if mean is None: 44 | mean = np.zeros(dim) 45 | mean = np.asarray(mean, dtype=float) 46 | 47 | if cov is None: 48 | cov = 1.0 49 | cov = np.asarray(cov, dtype=float) 50 | 51 | if dim == 1: 52 | mean.shape = (1,) 53 | cov.shape = (1, 1) 54 | 55 | if mean.ndim != 1 or mean.shape[0] != dim: 56 | raise ValueError("Array 'mean' must be vector of length %d." % dim) 57 | if cov.ndim == 0: 58 | cov = cov * np.eye(dim) 59 | elif cov.ndim == 1: 60 | cov = np.diag(cov) 61 | else: 62 | if cov.shape != (dim, dim): 63 | raise ValueError("Array 'cov' must be at most two-dimensional," 64 | " but cov.ndim = %d" % cov.ndim) 65 | 66 | return dim, mean, cov 67 | 68 | 69 | def _process_quantiles(x, dim): 70 | """ 71 | Adjust quantiles array so that last axis labels the components of 72 | each data point. 73 | 74 | """ 75 | x = np.asarray(x, dtype=float) 76 | 77 | if x.ndim == 0: 78 | x = x[np.newaxis] 79 | elif x.ndim == 1: 80 | if dim == 1: 81 | x = x[:, np.newaxis] 82 | else: 83 | x = x[np.newaxis, :] 84 | 85 | return x 86 | 87 | 88 | def _squeeze_output(out): 89 | """ 90 | Remove single-dimensional entries from array and convert to scalar, 91 | if necessary. 92 | 93 | """ 94 | out = out.squeeze() 95 | if out.ndim == 0: 96 | out = out[()] 97 | return out 98 | 99 | 100 | def _pinv_1d(v, eps=1e-5): 101 | """ 102 | A helper function for computing the pseudoinverse. 103 | 104 | Parameters 105 | ---------- 106 | v : iterable of numbers 107 | This may be thought of as a vector of eigenvalues or singular values. 108 | eps : float 109 | Elements of v smaller than eps are considered negligible. 110 | 111 | Returns 112 | ------- 113 | v_pinv : 1d float ndarray 114 | A vector of pseudo-inverted numbers. 
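For example (illustrative values), ``_pinv_1d([2.0, 1e-10, 4.0])`` gives ``[0.5, 0.0, 0.25]``: the middle entry falls below ``eps`` and is mapped to zero instead of being inverted to a huge number, which keeps the pseudo-inverse stable for rank-deficient covariance matrices.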
115 | 116 | """ 117 | return np.array([0 if abs(x) < eps else 1/x for x in v], dtype=float) 118 | 119 | 120 | def _psd_pinv_decomposed_log_pdet(mat, cond=None, rcond=None, 121 | lower=True, check_finite=True): 122 | """ 123 | Compute a decomposition of the pseudo-inverse and the logarithm of 124 | the pseudo-determinant of a symmetric positive semi-definite 125 | matrix. 126 | 127 | The pseudo-determinant of a matrix is defined as the product of 128 | the non-zero eigenvalues, and coincides with the usual determinant 129 | for a full matrix. 130 | 131 | Parameters 132 | ---------- 133 | mat : array_like 134 | Input array of shape (`m`, `n`) 135 | cond, rcond : float or None 136 | Cutoff for 'small' singular values. 137 | Eigenvalues smaller than ``rcond*largest_eigenvalue`` 138 | are considered zero. 139 | If None or -1, suitable machine precision is used. 140 | lower : bool, optional 141 | Whether the pertinent array data is taken from the lower or upper 142 | triangle of `mat`. (Default: lower) 143 | check_finite : boolean, optional 144 | Whether to check that the input matrix contains only finite numbers. 145 | Disabling may give a performance gain, but may result in problems 146 | (crashes, non-termination) if the inputs do contain infinities or NaNs. 147 | 148 | Returns 149 | ------- 150 | M : array_like 151 | The pseudo-inverse of the input matrix is np.dot(M, M.T). 152 | log_pdet : float 153 | Logarithm of the pseudo-determinant of the matrix. 154 | 155 | """ 156 | # Compute the symmetric eigendecomposition. 157 | # The input covariance matrix is required to be real symmetric 158 | # and positive semidefinite which implies that its eigenvalues 159 | # are all real and non-negative, 160 | # but clip them anyway to avoid numerical issues. 161 | 162 | # TODO: the code to set cond/rcond is identical to that in 163 | # scipy.linalg.{pinvh, pinv2} and if/when this function is subsumed 164 | # into scipy.linalg it should probably be shared between all of 165 | # these routines. 166 | 167 | # Note that eigh takes care of array conversion, chkfinite, 168 | # and assertion that the matrix is square. 169 | s, u = scipy.linalg.eigh(mat, lower=lower, check_finite=check_finite) 170 | 171 | if rcond is not None: 172 | cond = rcond 173 | if cond in [None, -1]: 174 | t = u.dtype.char.lower() 175 | factor = {'f': 1E3, 'd': 1E6} 176 | cond = factor[t] * np.finfo(t).eps 177 | eps = cond * np.max(abs(s)) 178 | 179 | if np.min(s) < -eps: 180 | raise ValueError('the covariance matrix must be positive semidefinite') 181 | 182 | s_pinv = _pinv_1d(s, eps) 183 | U = np.multiply(u, np.sqrt(s_pinv)) 184 | log_pdet = np.sum(np.log(s[s > eps])) 185 | 186 | return U, log_pdet 187 | 188 | 189 | _doc_default_callparams = \ 190 | """mean : array_like, optional 191 | Mean of the distribution (default zero) 192 | cov : array_like, optional 193 | Covariance matrix of the distribution (default one) 194 | """ 195 | 196 | _doc_callparams_note = \ 197 | """Setting the parameter `mean` to `None` is equivalent to having `mean` 198 | be the zero-vector. The parameter `cov` can be a scalar, in which case 199 | the covariance matrix is the identity times that value, a vector of 200 | diagonal entries for the covariance matrix, or a two-dimensional 201 | array_like. 
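For instance (illustrative values), ``cov=2.0`` is shorthand for twice the identity matrix, ``cov=[1.0, 3.0]`` for the diagonal matrix with entries 1 and 3, and ``cov=[[1.0, 0.3], [0.3, 2.0]]`` is used as the full covariance matrix.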
202 | """ 203 | 204 | _doc_frozen_callparams = "" 205 | 206 | _doc_frozen_callparams_note = \ 207 | """See class definition for a detailed description of parameters.""" 208 | 209 | docdict_params = { 210 | '_doc_default_callparams': _doc_default_callparams, 211 | '_doc_callparams_note': _doc_callparams_note 212 | } 213 | 214 | docdict_noparams = { 215 | '_doc_default_callparams': _doc_frozen_callparams, 216 | '_doc_callparams_note': _doc_frozen_callparams_note 217 | } 218 | 219 | 220 | class multivariate_normal_gen(object): 221 | r""" 222 | A multivariate normal random variable. 223 | 224 | The `mean` keyword specifies the mean. The `cov` keyword specifies the 225 | covariance matrix. 226 | 227 | .. versionadded:: 0.14.0 228 | 229 | Methods 230 | ------- 231 | pdf(x, mean=None, cov=1) 232 | Probability density function. 233 | logpdf(x, mean=None, cov=1) 234 | Log of the probability density function. 235 | rvs(mean=None, cov=1) 236 | Draw random samples from a multivariate normal distribution. 237 | entropy() 238 | Compute the differential entropy of the multivariate normal. 239 | 240 | Parameters 241 | ---------- 242 | x : array_like 243 | Quantiles, with the last axis of `x` denoting the components. 244 | %(_doc_default_callparams)s 245 | 246 | Alternatively, the object may be called (as a function) to fix the mean 247 | and covariance parameters, returning a "frozen" multivariate normal 248 | random variable: 249 | 250 | rv = multivariate_normal(mean=None, scale=1) 251 | - Frozen object with the same methods but holding the given 252 | mean and covariance fixed. 253 | 254 | Notes 255 | ----- 256 | %(_doc_callparams_note)s 257 | 258 | The covariance matrix `cov` must be a (symmetric) positive 259 | semi-definite matrix. The determinant and inverse of `cov` are computed 260 | as the pseudo-determinant and pseudo-inverse, respectively, so 261 | that `cov` does not need to have full rank. 262 | 263 | The probability density function for `multivariate_normal` is 264 | 265 | .. math:: 266 | 267 | f(x) = \frac{1}{\sqrt{(2 \pi)^k \det \Sigma}} \exp\left( -\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu) \right), 268 | 269 | where :math:`\mu` is the mean, :math:`\Sigma` the covariance matrix, 270 | and :math:`k` is the dimension of the space where :math:`x` takes values. 271 | 272 | Examples 273 | -------- 274 | >>> from scipy.stats import multivariate_normal 275 | >>> x = np.linspace(0, 5, 10, endpoint=False) 276 | >>> y = multivariate_normal.pdf(x, mean=2.5, cov=0.5); y 277 | array([ 0.00108914, 0.01033349, 0.05946514, 0.20755375, 0.43939129, 278 | 0.56418958, 0.43939129, 0.20755375, 0.05946514, 0.01033349]) 279 | >>> plt.plot(x, y) 280 | 281 | The input quantiles can be any shape of array, as long as the last 282 | axis labels the components. This allows us for instance to 283 | display the frozen pdf for a non-isotropic random variable in 2D as 284 | follows: 285 | 286 | >>> x, y = np.mgrid[-1:1:.01, -1:1:.01] 287 | >>> pos = np.empty(x.shape + (2,)) 288 | >>> pos[:, :, 0] = x; pos[:, :, 1] = y 289 | >>> rv = multivariate_normal([0.5, -0.2], [[2.0, 0.3], [0.3, 0.5]]) 290 | >>> plt.contourf(x, y, rv.pdf(pos)) 291 | 292 | """ 293 | 294 | def __init__(self): 295 | self.__doc__ = doccer.docformat(self.__doc__, docdict_params) 296 | 297 | def __call__(self, mean=None, cov=1): 298 | """ 299 | Create a frozen multivariate normal distribution. 300 | 301 | See `multivariate_normal_frozen` for more information. 
302 | 303 | """ 304 | return multivariate_normal_frozen(mean, cov) 305 | 306 | def _logpdf(self, x, mean, prec_U, log_det_cov): 307 | """ 308 | Parameters 309 | ---------- 310 | x : ndarray 311 | Points at which to evaluate the log of the probability 312 | density function 313 | mean : ndarray 314 | Mean of the distribution 315 | prec_U : ndarray 316 | A decomposition such that np.dot(prec_U, prec_U.T) 317 | is the precision matrix, i.e. inverse of the covariance matrix. 318 | log_det_cov : float 319 | Logarithm of the determinant of the covariance matrix 320 | 321 | Notes 322 | ----- 323 | As this function does no argument checking, it should not be 324 | called directly; use 'logpdf' instead. 325 | 326 | """ 327 | dim = x.shape[-1] 328 | dev = x - mean 329 | maha = np.sum(np.square(np.dot(dev, prec_U)), axis=-1) 330 | return -0.5 * (dim * _LOG_2PI + log_det_cov + maha) 331 | 332 | def logpdf(self, x, mean, cov): 333 | """ 334 | Log of the multivariate normal probability density function. 335 | 336 | Parameters 337 | ---------- 338 | x : array_like 339 | Quantiles, with the last axis of `x` denoting the components. 340 | %(_doc_default_callparams)s 341 | 342 | Notes 343 | ----- 344 | %(_doc_callparams_note)s 345 | 346 | Returns 347 | ------- 348 | pdf : ndarray 349 | Log of the probability density function evaluated at `x` 350 | 351 | """ 352 | dim, mean, cov = _process_parameters(None, mean, cov) 353 | x = _process_quantiles(x, dim) 354 | prec_U, log_det_cov = _psd_pinv_decomposed_log_pdet(cov) 355 | out = self._logpdf(x, mean, prec_U, log_det_cov) 356 | return _squeeze_output(out) 357 | 358 | def pdf(self, x, mean, cov): 359 | """ 360 | Multivariate normal probability density function. 361 | 362 | Parameters 363 | ---------- 364 | x : array_like 365 | Quantiles, with the last axis of `x` denoting the components. 366 | %(_doc_default_callparams)s 367 | 368 | Notes 369 | ----- 370 | %(_doc_callparams_note)s 371 | 372 | Returns 373 | ------- 374 | pdf : ndarray 375 | Probability density function evaluated at `x` 376 | 377 | """ 378 | dim, mean, cov = _process_parameters(None, mean, cov) 379 | x = _process_quantiles(x, dim) 380 | prec_U, log_det_cov = _psd_pinv_decomposed_log_pdet(cov) 381 | out = np.exp(self._logpdf(x, mean, prec_U, log_det_cov)) 382 | return _squeeze_output(out) 383 | 384 | def rvs(self, mean=None, cov=1, size=1): 385 | """ 386 | Draw random samples from a multivariate normal distribution. 387 | 388 | Parameters 389 | ---------- 390 | %(_doc_default_callparams)s 391 | size : integer, optional 392 | Number of samples to draw (default 1). 393 | 394 | Notes 395 | ----- 396 | %(_doc_callparams_note)s 397 | 398 | Returns 399 | ------- 400 | rvs : ndarray or scalar 401 | Random variates of size (`size`, `N`), where `N` is the 402 | dimension of the random variable. 403 | 404 | """ 405 | dim, mean, cov = _process_parameters(None, mean, cov) 406 | out = np.random.multivariate_normal(mean, cov, size) 407 | return _squeeze_output(out) 408 | 409 | def entropy(self, mean=None, cov=1): 410 | """ 411 | Compute the differential entropy of the multivariate normal. 
412 | 413 |         Parameters 414 |         ---------- 415 |         %(_doc_default_callparams)s 416 | 417 |         Notes 418 |         ----- 419 |         %(_doc_callparams_note)s 420 | 421 |         Returns 422 |         ------- 423 |         h : scalar 424 |             Entropy of the multivariate normal distribution 425 | 426 |         """ 427 |         dim, mean, cov = _process_parameters(None, mean, cov) 428 |         return 1/2 * np.log(np.linalg.det(2 * np.pi * np.e * cov)) 429 | 430 | multivariate_normal = multivariate_normal_gen() 431 | 432 | 433 | class multivariate_normal_frozen(object): 434 |     def __init__(self, mean=None, cov=1): 435 |         """ 436 |         Create a frozen multivariate normal distribution. 437 | 438 |         Parameters 439 |         ---------- 440 |         mean : array_like, optional 441 |             Mean of the distribution (default zero) 442 |         cov : array_like, optional 443 |             Covariance matrix of the distribution (default one) 444 | 445 |         Examples 446 |         -------- 447 |         When called with the default parameters, this will create a 1D random 448 |         variable with mean 0 and covariance 1: 449 | 450 |         >>> from scipy.stats import multivariate_normal 451 |         >>> r = multivariate_normal() 452 |         >>> r.mean 453 |         array([ 0.]) 454 |         >>> r.cov 455 |         array([[1.]]) 456 | 457 |         """ 458 |         self.dim, self.mean, self.cov = _process_parameters(None, mean, cov) 459 |         self.prec_U, self._log_det_cov = _psd_pinv_decomposed_log_pdet(self.cov) 460 | 461 |         self._mnorm = multivariate_normal_gen() 462 | 463 |     def logpdf(self, x): 464 |         x = _process_quantiles(x, self.dim) 465 |         out = self._mnorm._logpdf(x, self.mean, self.prec_U, self._log_det_cov) 466 |         return _squeeze_output(out) 467 | 468 |     def pdf(self, x): 469 |         return np.exp(self.logpdf(x)) 470 | 471 |     def rvs(self, size=1): 472 |         return self._mnorm.rvs(self.mean, self.cov, size) 473 | 474 |     def entropy(self): 475 |         """ 476 |         Computes the differential entropy of the multivariate normal. 477 | 478 |         Returns 479 |         ------- 480 |         h : scalar 481 |             Entropy of the multivariate normal distribution 482 | 483 |         """ 484 |         return 1/2 * (self.dim * (_LOG_2PI + 1) + self._log_det_cov) 485 | 486 | 487 | # Set frozen generator docstrings from corresponding docstrings in 488 | # multivariate_normal_gen and fill in default strings in class docstrings 489 | for name in ['logpdf', 'pdf', 'rvs']: 490 |     method = multivariate_normal_gen.__dict__[name] 491 |     method_frozen = multivariate_normal_frozen.__dict__[name] 492 |     method_frozen.__doc__ = doccer.docformat(method.__doc__, docdict_noparams) 493 |     method.__doc__ = doccer.docformat(method.__doc__, docdict_params) 494 | -------------------------------------------------------------------------------- /labs/lab8/anagrams.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | 3 | class MRAnagram(MRJob): 4 | 5 |     def mapper(self, _, line): 6 |         # Convert word into a list of characters, sort them, and convert 7 |         # back to a string. 8 |         letters = list(line) 9 |         letters.sort() 10 |         letters = ''.join(letters) 11 |         # Key is the sorted word, value is the regular word. 12 |         yield letters, line 13 | 14 |     def reducer(self, _, words): 15 |         # Get the list of words containing these letters. 16 |         anagrams = [w for w in words] 17 | 18 |         # Only yield results if there are at least two words which are 19 |         # anagrams of each other.
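20 |         # For example, "dearth", "hatred", and "thread" all map to the same 21 |         # key "adehrt", so they arrive together at a single call of this reducer.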
22 |         if len(anagrams) > 1: 23 |             yield len(anagrams), anagrams 24 | 25 | 26 | if __name__ == "__main__": 27 |     MRAnagram.run() 28 | 29 | 30 | -------------------------------------------------------------------------------- /labs/lab8/friend_affiliations.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | 3 | class MRFriendAffiliations(MRJob): 4 | 5 |     def mapper(self, _, line): 6 |         # Tokenize line. 7 |         tokens = line.split(',') 8 |         tokens = [t.strip() for t in tokens] 9 | 10 |         # First token is the person's name. 11 |         # Second token is their favorite team. 12 |         # Remaining tokens are their friends' names. 13 |         name, team, friends = (tokens[0], tokens[1], tokens[2:]) 14 | 15 |         # Emit (key, value) pairs with friends' names as the keys and 16 |         # (name, team) as the value (same value for all). 17 |         for friend in friends: 18 |             yield friend, (name, team) 19 | 20 |         # Special case: emit a similar (key, value) pair for this person. 21 |         yield name, (name, team) 22 | 23 |     def reducer(self, name, friends): 24 |         # Count the number of Red Sox and Cardinals fans who are friends 25 |         # with this person. 26 |         this_team = None 27 |         red_sox_count = 0 28 |         cardinals_count = 0 29 |         for friend in friends: 30 |             # Keep an eye out for the special case where the friend's name 31 |             # and this person's name are the same -- that tells us which 32 |             # team this person cheers for. 33 |             if friend[0] == name: 34 |                 this_team = friend[1] 35 |             else: 36 |                 if friend[1] == "Red Sox": 37 |                     red_sox_count += 1 38 |                 elif friend[1] == "Cardinals": 39 |                     cardinals_count += 1 40 |                 else: 41 |                     print "ERROR: Unknown team \"{0}\"".format(friend[1]) 42 | 43 |         # Yield results. 44 |         yield name, (this_team, red_sox_count, cardinals_count) 45 | 46 | if __name__ == '__main__': 47 |     MRFriendAffiliations.run() 48 | 49 | -------------------------------------------------------------------------------- /labs/lab8/generate_friends.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | generate_friends.py 5 | 6 | Generates data file "baseball_friends.csv" to be used for lab8 MapReduce 7 | example. 8 | 9 | Reads list of names from "names.txt", randomly assigns team allegiances, 10 | then assigns friendships based on a super simple algorithm, and finally 11 | writes out the file in the following csv format: 12 | 13 | name, team, friend1, friend2, friend3, ... 14 | 15 | """ 16 | 17 | import numpy as np 18 | from numpy.random import binomial 19 | 20 | # Read list of names from file. 21 | names = [line.strip() for line in open("names.txt")] 22 | names = np.unique(names) 23 | 24 | # Randomly generate team affiliations for each person. 25 | team = binomial(1, 0.5, len(names)) 26 | 27 | # Probability that two people who are fans of the same team are friends. 28 | friendliness_same = 0.05 29 | # Probability that two people who are fans of opposite teams are friends. 30 | friendliness_diff = 0.03 31 | 32 | # Create matrix to store friend relationships. 33 | friends = np.zeros([len(names), len(names)]) 34 | for i1 in range(len(names)): 35 |     for i2 in range(i1 + 1, len(names)): 36 |         if team[i1] == team[i2]: 37 |             flip = binomial(1, friendliness_same) 38 |         else: 39 |             flip = binomial(1, friendliness_diff) 40 | 41 |         friends[i1, i2] = flip 42 |         friends[i2, i1] = flip 43 | 44 | # Write output file. 45 | outfile = open("baseball_friends.csv", 'w') 46 | for i in range(len(names)): 47 |     # Get data for this row.
48 |     this_name = names[i] 49 |     this_team = "Red Sox" if team[i] else "Cardinals" 50 |     friend_list = np.array(names)[friends[i,:] == 1] 51 | 52 |     # Write to file. 53 |     outstr = ", ".join((this_name, this_team) + tuple(friend_list)) 54 |     outfile.write(outstr + "\n") 55 | outfile.close() 56 | 57 | -------------------------------------------------------------------------------- /labs/lab8/lab8_mapreduce.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "metadata": { 3 |   "name": "" 4 |  }, 5 |  "nbformat": 3, 6 |  "nbformat_minor": 0, 7 |  "worksheets": [ 8 |   { 9 |    "cells": [ 10 |     { 11 |      "cell_type": "heading", 12 |      "level": 1, 13 |      "metadata": {}, 14 |      "source": [ 15 |       "Lab 8: MapReduce, mrjob, and EC2" 16 |      ] 17 |     }, 18 |     { 19 |      "cell_type": "markdown", 20 |      "metadata": {}, 21 |      "source": [ 22 |       "In this week's lab, we will mostly ignore statistics and instead focus on some practical issues that you will encounter on Homework 4. Section 4 of that homework includes new python techniques (classes, inheritance), an unfamiliar approach to breaking up large computing problems (MapReduce), and code that has to be run outside the friendly confines of an ipython notebook, and finally asks you to put it all to use on Amazon's Elastic Compute Cloud (EC2). This sounds very complicated, but the end result is a simpler algorithm for the problem of calculating similarity scores, as well as the ability to expand to arbitrarily large data sets." 23 |      ] 24 |     }, 25 |     { 26 |      "cell_type": "heading", 27 |      "level": 2, 28 |      "metadata": {}, 29 |      "source": [ 30 |       "1. Classes and generators in python" 31 |      ] 32 |     }, 33 |     { 34 |      "cell_type": "markdown", 35 |      "metadata": {}, 36 |      "source": [ 37 |       "On previous homeworks, nearly all of the coding has been done by writing python functions plus a small amount of code that calls the functions you have written. Included below is the code for the mrjob word_count example that was covered in lecture (the canonical MapReduce example). There are a lot of new features here!\n", 38 |       "\n", 39 |       "The code below counts the number of characters, words, and lines in a text file. This is one of the simplest examples of a problem that can be solved using MapReduce (I even took it from the section \"[Writing your first job](http://mrjob.readthedocs.org/en/latest/guides/quickstart.html#writing-your-first-job)\" in the mrjob documentation). If you try to run the cell in this notebook, it will not work! We will get to running programs with mrjob soon, but for now it will just serve as a reference for some topics we want to cover." 40 |      ] 41 |     }, 42 |     { 43 |      "cell_type": "code", 44 |      "collapsed": true, 45 |      "input": [ 46 |       "from mrjob.job import MRJob\n", 47 |       "\n", 48 |       "class MRWordFrequencyCount(MRJob):\n", 49 |       "\n", 50 |       "    def mapper(self, _, line):\n", 51 |       "        yield \"chars\", len(line)\n", 52 |       "        yield \"words\", len(line.split())\n", 53 |       "        yield \"lines\", 1\n", 54 |       "\n", 55 |       "    def reducer(self, key, values):\n", 56 |       "        yield key, sum(values)\n", 57 |       "\n", 58 |       "if __name__ == '__main__':\n", 59 |       "    MRWordFrequencyCount.run()\n" 60 |      ], 61 |      "language": "python", 62 |      "metadata": {}, 63 |      "outputs": [] 64 |     }, 65 |     { 66 |      "cell_type": "heading", 67 |      "level": 3, 68 |      "metadata": {}, 69 |      "source": [ 70 |       "1.1 Classes" 71 |      ] 72 |     }, 73 |     { 74 |      "cell_type": "markdown", 75 |      "metadata": {}, 76 |      "source": [ 77 |       "Classes are the basis of object-oriented programming in python. 
For all of the problems on previous homework assignments, we have written functions to do calculations, draw figures, etc. To use mrjob, we have to switch gears and use a different style of programming. \n", 78 |       "\n", 79 |       "As you can see in the example above, the MRWordFrequencyCount class is defined with an indented block similar to a function definition, except with class instead of def. Instead of a list of arguments, the item in parentheses (MRJob) is a *base class* that our newly defined class will inherit most of its features from. Even though there is very little code written above for MRWordFrequencyCount, it knows how to do many complex operations (running a mapper and a reducer, submitting jobs to EC2, etc.) because it inherited these abilities from the base class.\n", 80 |       "\n", 81 |       "There are two methods, mapper and reducer, that have been written specifically for MRWordFrequencyCount. These methods are also defined for the MRJob base class, but the methods defined here supersede the inherited ones. A method is similar to a function (as you might guess, since it is also defined with a def statement), but the first argument to a method will always be self, a reference back to the object to which the method belongs. The always-present self argument allows the method to access other members of the same object (both data and methods). However, when you call a method on an instance of the class, you don't have to supply anything for the self argument -- it is passed implicitly. For example, to call the reducer method defined above, you would use:" 82 |      ] 83 |     }, 84 |     { 85 |      "cell_type": "code", 86 |      "collapsed": false, 87 |      "input": [ 88 |       "# Call the reducer method on an instance of MRWordFrequencyCount using some key and values.\n", 89 |       "MRWordFrequencyCount().reducer(my_key, my_values) # Did not specify 'self' argument" 90 |      ], 91 |      "language": "python", 92 |      "metadata": {}, 93 |      "outputs": [] 94 |     }, 95 |     { 96 |      "cell_type": "markdown", 97 |      "metadata": {}, 98 |      "source": [ 99 |       "The next mrjob example -- [Writing your second job](http://mrjob.readthedocs.org/en/latest/guides/quickstart.html#writing-your-second-job) -- processes text to find the most commonly used word. That algorithm involves two MapReduce steps, so it is necessary to write a MRMostUsedWord.steps method to override the inherited method. Notice that self is used repeatedly to specify the function references inside the list returned by the steps method."
100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "collapsed": false, 105 | "input": [ 106 | "import re\n", 107 | "\n", 108 | "WORD_RE = re.compile(r\"[\\w']+\")\n", 109 | "\n", 110 | "\n", 111 | "class MRMostUsedWord(MRJob):\n", 112 | "\n", 113 | " def mapper_get_words(self, _, line):\n", 114 | " # yield each word in the line\n", 115 | " for word in WORD_RE.findall(line):\n", 116 | " yield (word.lower(), 1)\n", 117 | "\n", 118 | " def combiner_count_words(self, word, counts):\n", 119 | " # optimization: sum the words we've seen so far\n", 120 | " yield (word, sum(counts))\n", 121 | "\n", 122 | " def reducer_count_words(self, word, counts):\n", 123 | " # send all (num_occurrences, word) pairs to the same reducer.\n", 124 | " # num_occurrences is so we can easily use Python's max() function.\n", 125 | " yield None, (sum(counts), word)\n", 126 | "\n", 127 | " # discard the key; it is just None\n", 128 | " def reducer_find_max_word(self, _, word_count_pairs):\n", 129 | " # each item of word_count_pairs is (count, word),\n", 130 | " # so yielding one results in key=counts, value=word\n", 131 | " yield max(word_count_pairs)\n", 132 | "\n", 133 | " def steps(self):\n", 134 | " return [\n", 135 | " self.mr(mapper=self.mapper_get_words,\n", 136 | " combiner=self.combiner_count_words,\n", 137 | " reducer=self.reducer_count_words),\n", 138 | " self.mr(reducer=self.reducer_find_max_word)\n", 139 | " ]\n", 140 | "\n", 141 | "\n", 142 | "if __name__ == '__main__':\n", 143 | " MRMostUsedWord.run()" 144 | ], 145 | "language": "python", 146 | "metadata": {}, 147 | "outputs": [] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "[More about classes in python](http://docs.python.org/2/tutorial/classes.html#)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "heading", 158 | "level": 3, 159 | "metadata": {}, 160 | "source": [ 161 | "1.2 Generators" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "Generators are necessary to understand all of those yield statements popping up in the mapper and reducer methods. The main issue, in the case of industrial-strength MapReduce, is that you don't have enough memory to store all of your data at once. This is true even after you have split your data between many compute nodes. So instead of getting an enormous list of data, the mapper and reducer functions both receive and emit generators.\n", 169 | "\n", 170 | "When you run a function, it chugs along until it hits a return statement, at which point it returns some results and then it is done. A generator does its specified calculations until it hits a yield statement. It passes along whatever values it was supposed to yield and then it *pauses* and waits for someone to tell it to continue. It continues until it reaches another yield, and so on.\n", 171 | "\n", 172 | "Not only are mapper and reducer generators, their (key, value) inputs are also generators. This means that for each step of the mapper, it pulls in one (key, value) pair, does some processing, and then emits one or more key value pairs, which move along to a combiner or a shuffler or whatever. This is how MapReduce avoids ever having to load huge datasets into limited memory.\n", 173 | "\n", 174 | "A common stumbling block with generators is the fact that once you have iterated through an entire generator, it is done. You can see an example of this mistake by trying to run the code block below." 
175 |      ] 176 |     }, 177 |     { 178 |      "cell_type": "code", 179 |      "collapsed": false, 180 |      "input": [ 181 |       "# This function converts a list into a generator.\n", 182 |       "def example_generator(items):\n", 183 |       "    for item in items:\n", 184 |       "        yield item\n", 185 |       "    \n", 186 |       "# Create a generator.\n", 187 |       "my_generator = example_generator([0, 1, 2, 3, 4])\n", 188 |       "\n", 189 |       "# Iterating over the generator works great the first time.\n", 190 |       "print \"generator iteration 1\"\n", 191 |       "print \"---------------------\"\n", 192 |       "for value in my_generator:\n", 193 |       "    print value\n", 194 |       "    \n", 195 |       "# ...but it doesn't work the second time.\n", 196 |       "print \"\\n\"\n", 197 |       "print \"generator iteration 2\"\n", 198 |       "print \"---------------------\"\n", 199 |       "for value in my_generator:\n", 200 |       "    print value" 201 |      ], 202 |      "language": "python", 203 |      "metadata": {}, 204 |      "outputs": [] 205 |     }, 206 |     { 207 |      "cell_type": "heading", 208 |      "level": 3, 209 |      "metadata": {}, 210 |      "source": [ 211 |       "1.3 What does \\_\\_name\\_\\_ == '\\_\\_main\\_\\_' mean??" 212 |      ] 213 |     }, 214 |     { 215 |      "cell_type": "markdown", 216 |      "metadata": {}, 217 |      "source": [ 218 |       "Python is *really* into namespaces (see, for example, [The Zen of Python](http://www.python.org/dev/peps/pep-0020/)). The \\_\\_name\\_\\_ attribute tells you what namespace you are in. For example, if we import numpy, then all of the numpy features are in the numpy namespace." 219 |      ] 220 |     }, 221 |     { 222 |      "cell_type": "code", 223 |      "collapsed": false, 224 |      "input": [ 225 |       "import numpy as np\n", 226 |       "print np.__name__\n", 227 |       "\n", 228 |       "import matplotlib.pyplot as plt\n", 229 |       "print plt.__name__" 230 |      ], 231 |      "language": "python", 232 |      "metadata": {}, 233 |      "outputs": [] 234 |     }, 235 |     { 236 |      "cell_type": "markdown", 237 |      "metadata": {}, 238 |      "source": [ 239 |       "If you try to import the above file containing the definition for MRMostUsedWord, then python will interpret the file all the way down until it hits that last if statement. \\_\\_name\\_\\_ will evaluate to MRMostUsedWord (or whatever the name was of the file we imported) and the line inside the if statement will be ignored. On the other hand, if you run this code from the command line, python will interpret it *without* importing it and \\_\\_name\\_\\_ will be the python top level namespace, which is '\\_\\_main\\_\\_', so MRMostUsedWord.run() gets called.\n", 240 |       "\n", 241 |       "In (many) fewer words: it tells you to run the job only when invoked from the command line.\n", 242 |       "\n", 243 |       "Try copying the code for MRMostUsedWord to a file named MRMostUsedWord.py, and then running it on any old text file you might have lying around. The invocation will be something like this (modify based on your particular python installation):" 244 |      ] 245 |     }, 246 |     { 247 |      "cell_type": "code", 248 |      "collapsed": false, 249 |      "input": [ 250 |       "python MRMostUsedWord.py some_file.txt > most_used_word.out" 251 |      ], 252 |      "language": "python", 253 |      "metadata": {}, 254 |      "outputs": [] 255 |     }, 256 |     { 257 |      "cell_type": "heading", 258 |      "level": 2, 259 |      "metadata": {}, 260 |      "source": [ 261 |       "2. Setting up your Amazon Web Services account" 262 |      ] 263 |     }, 264 |     { 265 |      "cell_type": "markdown", 266 |      "metadata": {}, 267 |      "source": [ 268 |       "There is quite a bit of overhead involved in setting up an AWS account and keeping an eye on the jobs that you end up running. 
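Most of the mrjob-specific setup ends up in a single mrjob.conf file. As a rough sketch (the values below are placeholders rather than real credentials, and the Elastic MapReduce Quickstart linked below is the authoritative reference for the format), it is a small piece of YAML:\n\n    runners:\n      emr:\n        aws_access_key_id: YOURACCESSKEY\n        aws_secret_access_key: YOURSECRETKEY\n        aws_region: us-east-1\n\n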
In lab, we will run through an example account activation, including:\n", 269 |       "\n", 270 |       "* Account creation\n", 271 |       "* Signing up for Elastic MapReduce\n", 272 |       "* Storing security credentials in your mrjob.conf file\n", 273 |       "* Redeeming account credits\n", 274 |       "* Billing alerts\n", 275 |       "* Checking on running jobs using the console\n", 276 |       "\n", 277 |       "These documents (also linked from HW4) are very useful: [Instructions for Amazon Setup notebook](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/InstructionsForAmazonEMR.ipynb), [Elastic MapReduce Quickstart](http://pythonhosted.org/mrjob/guides/emr-quickstart.html)\n", 278 |       "\n", 279 |       "Once you have this all set up and working, mrjob makes it *very easy* to run a MapReduce job with EMR. Using the same MRMostUsedWord example as above, the command line invocation to run with EMR is:" 280 |      ] 281 |     }, 282 |     { 283 |      "cell_type": "code", 284 |      "collapsed": false, 285 |      "input": [ 286 |       "python MRMostUsedWord.py -r emr some_file.txt > most_used_word.out" 287 |      ], 288 |      "language": "python", 289 |      "metadata": {}, 290 |      "outputs": [] 291 |     }, 292 |     { 293 |      "cell_type": "heading", 294 |      "level": 2, 295 |      "metadata": {}, 296 |      "source": [ 297 |       "3. MapReduce exercises" 298 |      ] 299 |     }, 300 |     { 301 |      "cell_type": "markdown", 302 |      "metadata": {}, 303 |      "source": [ 304 |       "![MapReduce schematic](https://developers.google.com/appengine/docs/python/images/mapreduce_mapshuffle.png)\n", 305 |       "
\\[Image from [https://developers.google.com/appengine/docs/python/dataprocessing/](https://developers.google.com/appengine/docs/python/dataprocessing/)\\]\n", 306 |       "\n", 307 |       "Below are two practice problems to get the hang of writing MapReduce algorithms. Remember, you will be writing these programs in separate files that you run from the command line. You are welcome to try out EC2, but these are small datasets and it will generally be much faster to run locally." 308 |      ] 309 |     }, 310 |     { 311 |      "cell_type": "heading", 312 |      "level": 3, 313 |      "metadata": {}, 314 |      "source": [ 315 |       "3.1 Anagram finder" 316 |      ] 317 |     }, 318 |     { 319 |      "cell_type": "markdown", 320 |      "metadata": {}, 321 |      "source": [ 322 |       "First, grab the file [word_list.txt](https://raw.github.com/cs109/content/master/labs/lab8/word_list.txt). This contains a list of six-letter words that I dumped from my spellchecker. To keep things simple, all of the words consist of lower-case letters only." 323 |      ] 324 |     }, 325 |     { 326 |      "cell_type": "code", 327 |      "collapsed": false, 328 |      "input": [ 329 |       "word_list = [word.strip() for word in open(\"word_list.txt\").readlines()]\n", 330 |       "print \"{0} words in list\".format(len(word_list))\n", 331 |       "print \"First ten words: {0}\".format(\", \".join(word_list[0:10]))" 332 |      ], 333 |      "language": "python", 334 |      "metadata": {}, 335 |      "outputs": [] 336 |     }, 337 |     { 338 |      "cell_type": "markdown", 339 |      "metadata": {}, 340 |      "source": [ 341 |       "Use mrjob to write a class that finds all anagrams in word_list.txt. \n", 342 |       "\n", 343 |       "**UPDATE**: [My solution to exercise 3.1](https://raw.github.com/cs109/content/master/labs/lab8/anagrams.py)" 344 |      ] 345 |     }, 346 |     { 347 |      "cell_type": "heading", 348 |      "level": 3, 349 |      "metadata": {}, 350 |      "source": [ 351 |       "3.2 Friends don't let friends root for the Cardinals" 352 |      ] 353 |     }, 354 |     { 355 |      "cell_type": "markdown", 356 |      "metadata": {}, 357 |      "source": [ 358 |       "![Cardinals v. Red Sox](http://www.stlcardinalbaseball.com/wp-content/uploads/2013/10/CARDINALS-V-RED-SOX-650x325.jpg)" 359 |      ] 360 |     }, 361 |     { 362 |      "cell_type": "markdown", 363 |      "metadata": {}, 364 |      "source": [ 365 |       "For the next problem, download the file [baseball_friends.csv](https://raw.github.com/cs109/content/master/labs/lab8/baseball_friends.csv). Each row of this csv file contains the following:\n", 366 |       "\n", 367 |       "* A person's name\n", 368 |       "* The team that person is rooting for -- either \"Cardinals\" or \"Red Sox\"\n", 369 |       "* A list of that person's friends, which could have arbitrary length\n", 370 |       "\n", 371 |       "Let's take a look at one line:" 372 |      ] 373 |     }, 374 |     { 375 |      "cell_type": "code", 376 |      "collapsed": false, 377 |      "input": [ 378 |       "friends = open(\"baseball_friends.csv\").readlines()\n", 379 |       "print friends[0].strip()\n", 380 |       "print len(friends[0].split(\",\")) - 2" 381 |      ], 382 |      "language": "python", 383 |      "metadata": {}, 384 |      "outputs": [] 385 |     }, 386 |     { 387 |      "cell_type": "markdown", 388 |      "metadata": {}, 389 |      "source": [ 390 |       "This line tells us that Aaden is a Red Sox fan and he has 65 friends, who are all listed here. 
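(In MapReduce terms -- and this is what the posted solution below does -- each input row fans out into one (friend, (name, team)) pair for every friend listed, plus one \"self\" pair such as (\"Aaden\", (\"Aaden\", \"Red Sox\")), so a single reducer key ends up holding a person's own team along with the team of everyone who lists them as a friend.) 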
For this problem, it's safe to assume that all of the names are unique and that the friendship structure is symmetric (*i.e.* if Alannah shows up in Aaden's friends list, then Aaden will show up in Alannah's friends list).\n", 391 | "\n", 392 | "Write an mrjob class that lists each person's name, their favorite team, the number of Red Sox fans they are friends with, and the number of Cardinals fans they are friends with.\n", 393 | "\n", 394 | "After running that program, we can look at the results to get an idea of the absurdly simple model that I used to generate the input csv file. You might need to modify the code below if the format of your output file doesn't quite match mine." 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "collapsed": false, 400 | "input": [ 401 | "import pandas as pd\n", 402 | "import json\n", 403 | "\n", 404 | "# Read results.\n", 405 | "result_file = \"baseball_friends.out\"\n", 406 | "result = [[json.loads(field) for field in line.strip().split('\\t')] for line in open(result_file)]\n", 407 | "\n", 408 | "# Break out columns.\n", 409 | "names = [x[0] for x in result]\n", 410 | "teams = [x[1][0] for x in result]\n", 411 | "redsox_count = [x[1][1] for x in result]\n", 412 | "cardinals_count = [x[1][2] for x in result]\n", 413 | "\n", 414 | "# Combine in data frame.\n", 415 | "result = pd.DataFrame(index=names, data={'teams': teams, 'redsox_count': redsox_count, \n", 416 | " 'cardinals_count': cardinals_count})" 417 | ], 418 | "language": "python", 419 | "metadata": {}, 420 | "outputs": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "collapsed": false, 425 | "input": [ 426 | "%matplotlib inline\n", 427 | "import matplotlib.pyplot as plt\n", 428 | "from matplotlib import rcParams\n", 429 | "rcParams['figure.figsize'] = (10, 6)\n", 430 | "rcParams['font.size'] = 14\n", 431 | "\n", 432 | "# Average number of friends by affiliation.\n", 433 | "print result.groupby('teams').mean()\n", 434 | "\n", 435 | "# Histogram the affiliations of people who are friends of Red Sox fans.\n", 436 | "plt.hist(result.redsox_count[result.teams == \"Red Sox\"], label=\"Red Sox friend Red Sox\")\n", 437 | "plt.hist(result.cardinals_count[result.teams == \"Red Sox\"], label=\"Red Sox friend Cardinals\")\n", 438 | "plt.xlabel('number of friends')\n", 439 | "plt.ylabel('count')\n", 440 | "plt.legend(loc=0)" 441 | ], 442 | "language": "python", 443 | "metadata": {}, 444 | "outputs": [] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "**UPDATE**: [My solution to exercise 3.2](https://raw.github.com/cs109/content/master/labs/lab8/friend_affiliations.py)" 451 | ] 452 | } 453 | ], 454 | "metadata": {} 455 | } 456 | ] 457 | } -------------------------------------------------------------------------------- /labs/lab8/most_used_word.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | import re 3 | 4 | WORD_RE = re.compile(r"[\w']+") 5 | 6 | class MRMostUsedWord(MRJob): 7 | 8 | def mapper_get_words(self, _, line): 9 | # yield each word in the line 10 | for word in WORD_RE.findall(line): 11 | yield (word.lower(), 1) 12 | 13 | def combiner_count_words(self, word, counts): 14 | # optimization: sum the words we've seen so far 15 | yield (word, sum(counts)) 16 | 17 | def reducer_count_words(self, word, counts): 18 | # send all (num_occurrences, word) pairs to the same reducer. 19 | # num_occurrences is so we can easily use Python's max() function. 
20 | yield None, (sum(counts), word) 21 | 22 | # discard the key, it is just None 23 | def reducer_find_max_word(self, _, word_count_pairs): 24 | # each item of word_count_pairs is (count, word), 25 | # so yielding one results in key=counts, value=word 26 | yield max(word_count_pairs) 27 | 28 | def steps(self): 29 | return [ 30 | self.mr(mapper=self.mapper_get_words, 31 | combiner=self.combiner_count_words, 32 | reducer=self.reducer_count_words), 33 | self.mr(reducer=self.reducer_find_max_word) 34 | ] 35 | 36 | if __name__ == '__main__': 37 | MRMostUsedWord.run() 38 | 39 | -------------------------------------------------------------------------------- /labs/lab8/names.txt: -------------------------------------------------------------------------------- 1 | Sophia 2 | Jacob 3 | Emma 4 | Mason 5 | Isabella 6 | Ethan 7 | Olivia 8 | Noah 9 | Ava 10 | William 11 | Emily 12 | Liam 13 | Abigail 14 | Jayden 15 | Mia 16 | Michael 17 | Madison 18 | Alexander 19 | Elizabeth 20 | Aiden 21 | Chloe 22 | Daniel 23 | Ella 24 | Matthew 25 | Avery 26 | Elijah 27 | Addison 28 | James 29 | Aubrey 30 | Anthony 31 | Lily 32 | Benjamin 33 | Natalie 34 | Joshua 35 | Sofia 36 | Andrew 37 | Charlotte 38 | David 39 | Zoey 40 | Joseph 41 | Grace 42 | Logan 43 | Hannah 44 | Jackson 45 | Amelia 46 | Christopher 47 | Harper 48 | Gabriel 49 | Lillian 50 | Samuel 51 | Samantha 52 | Ryan 53 | Evelyn 54 | Lucas 55 | Victoria 56 | John 57 | Brooklyn 58 | Nathan 59 | Zoe 60 | Isaac 61 | Layla 62 | Dylan 63 | Hailey 64 | Caleb 65 | Leah 66 | Christian 67 | Kaylee 68 | Landon 69 | Anna 70 | Jonathan 71 | Aaliyah 72 | Carter 73 | Gabriella 74 | Luke 75 | Allison 76 | Owen 77 | Nevaeh 78 | Brayden 79 | Alexis 80 | Gavin 81 | Audrey 82 | Wyatt 83 | Savannah 84 | Isaiah 85 | Sarah 86 | Henry 87 | Alyssa 88 | Eli 89 | Claire 90 | Hunter 91 | Taylor 92 | Jack 93 | Riley 94 | Evan 95 | Camila 96 | Jordan 97 | Arianna 98 | Nicholas 99 | Ashley 100 | Tyler 101 | Brianna 102 | Aaron 103 | Sophie 104 | Jeremiah 105 | Peyton 106 | Julian 107 | Bella 108 | Cameron 109 | Khloe 110 | Levi 111 | Genesis 112 | Brandon 113 | Alexa 114 | Angel 115 | Serenity 116 | Austin 117 | Kylie 118 | Connor 119 | Aubree 120 | Adrian 121 | Scarlett 122 | Robert 123 | Stella 124 | Charles 125 | Maya 126 | Thomas 127 | Katherine 128 | Sebastian 129 | Julia 130 | Colton 131 | Lucy 132 | Jaxon 133 | Madelyn 134 | Kevin 135 | Autumn 136 | Zachary 137 | Makayla 138 | Ayden 139 | Kayla 140 | Dominic 141 | Mackenzie 142 | Blake 143 | Lauren 144 | Jose 145 | Gianna 146 | Oliver 147 | Ariana 148 | Justin 149 | Faith 150 | Bentley 151 | Alexandra 152 | Jason 153 | Melanie 154 | Chase 155 | Sydney 156 | Ian 157 | Bailey 158 | Josiah 159 | Caroline 160 | Parker 161 | Naomi 162 | Xavier 163 | Morgan 164 | Adam 165 | Kennedy 166 | Cooper 167 | Ellie 168 | Nathaniel 169 | Jasmine 170 | Grayson 171 | Eva 172 | Jace 173 | Skylar 174 | Carson 175 | Kimberly 176 | Nolan 177 | Violet 178 | Tristan 179 | Molly 180 | Luis 181 | Aria 182 | Brody 183 | Jocelyn 184 | Juan 185 | Trinity 186 | Hudson 187 | London 188 | Bryson 189 | Lydia 190 | Carlos 191 | Madeline 192 | Easton 193 | Reagan 194 | Damian 195 | Piper 196 | Alex 197 | Andrea 198 | Kayden 199 | Annabelle 200 | Ryder 201 | Maria 202 | Jesus 203 | Brooke 204 | Cole 205 | Payton 206 | Micah 207 | Paisley 208 | Vincent 209 | Paige 210 | Max 211 | Ruby 212 | Jaxson 213 | Nora 214 | Eric 215 | Mariah 216 | Asher 217 | Rylee 218 | Hayden 219 | Lilly 220 | Diego 221 | Brielle 222 | Miles 223 | Jade 224 | Steven 225 | Destiny 226 | 
Ivan 227 | Nicole 228 | Elias 229 | Mila 230 | Aidan 231 | Kendall 232 | Maxwell 233 | Liliana 234 | Bryce 235 | Kaitlyn 236 | Antonio 237 | Natalia 238 | Giovanni 239 | Sadie 240 | Timothy 241 | Jordyn 242 | Bryan 243 | Vanessa 244 | Santiago 245 | Mary 246 | Colin 247 | Mya 248 | Richard 249 | Penelope 250 | Braxton 251 | Isabelle 252 | Kaleb 253 | Alice 254 | Kyle 255 | Reese 256 | Kaden 257 | Gabrielle 258 | Preston 259 | Hadley 260 | Miguel 261 | Katelyn 262 | Jonah 263 | Angelina 264 | Lincoln 265 | Rachel 266 | Riley 267 | Isabel 268 | Leo 269 | Eleanor 270 | Victor 271 | Clara 272 | Brady 273 | Brooklynn 274 | Jeremy 275 | Jessica 276 | Mateo 277 | Elena 278 | Brian 279 | Aliyah 280 | Jaden 281 | Vivian 282 | Ashton 283 | Laila 284 | Patrick 285 | Sara 286 | Declan 287 | Amy 288 | Sean 289 | Eliana 290 | Joel 291 | Lyla 292 | Gael 293 | Juliana 294 | Sawyer 295 | Valeria 296 | Alejandro 297 | Adriana 298 | Marcus 299 | Makenzie 300 | Leonardo 301 | Elise 302 | Jesse 303 | Mckenzie 304 | Caden 305 | Quinn 306 | Jake 307 | Delilah 308 | Kaiden 309 | Cora 310 | Wesley 311 | Kylee 312 | Camden 313 | Rebecca 314 | Edward 315 | Gracie 316 | Brantley 317 | Izabella 318 | Roman 319 | Josephine 320 | Axel 321 | Alaina 322 | Silas 323 | Michelle 324 | Jude 325 | Jennifer 326 | Grant 327 | Eden 328 | Cayden 329 | Valentina 330 | Emmanuel 331 | Aurora 332 | George 333 | Catherine 334 | Maddox 335 | Stephanie 336 | Malachi 337 | Valerie 338 | Bradley 339 | Jayla 340 | Alan 341 | Willow 342 | Weston 343 | Daisy 344 | Gage 345 | Alana 346 | Devin 347 | Melody 348 | Greyson 349 | Hazel 350 | Kenneth 351 | Summer 352 | Mark 353 | Melissa 354 | Oscar 355 | Margaret 356 | Tanner 357 | Kinsley 358 | Rylan 359 | Kinley 360 | Nicolas 361 | Ariel 362 | Harrison 363 | Lila 364 | Derek 365 | Giselle 366 | Peyton 367 | Ryleigh 368 | Ezra 369 | Haley 370 | Tucker 371 | Julianna 372 | Emmett 373 | Ivy 374 | Avery 375 | Alivia 376 | Cody 377 | Brynn 378 | Calvin 379 | Keira 380 | Andres 381 | Daniela 382 | Jorge 383 | Aniyah 384 | Abel 385 | Angela 386 | Paul 387 | Kate 388 | Abraham 389 | Londyn 390 | Kai 391 | Hayden 392 | Collin 393 | Harmony 394 | Theodore 395 | Adalyn 396 | Ezekiel 397 | Megan 398 | Omar 399 | Allie 400 | Jayce 401 | Gabriela 402 | Conner 403 | Alayna 404 | Bennett 405 | Presley 406 | Trevor 407 | Jenna 408 | Eduardo 409 | Alexandria 410 | Peter 411 | Ashlyn 412 | Maximus 413 | Adrianna 414 | Jaiden 415 | Jada 416 | Jameson 417 | Fiona 418 | Seth 419 | Norah 420 | Kingston 421 | Emery 422 | Javier 423 | Maci 424 | Travis 425 | Miranda 426 | Garrett 427 | Ximena 428 | Everett 429 | Amaya 430 | Graham 431 | Cecilia 432 | Xander 433 | Ana 434 | Cristian 435 | Shelby 436 | Damien 437 | Katie 438 | Ryker 439 | Hope 440 | Griffin 441 | Callie 442 | Corbin 443 | Jordan 444 | Myles 445 | Luna 446 | Luca 447 | Leilani 448 | Zane 449 | Eliza 450 | Francisco 451 | Mckenna 452 | Ricardo 453 | Angel 454 | Alexis 455 | Genevieve 456 | Stephen 457 | Makenna 458 | Zayden 459 | Isla 460 | Iker 461 | Lola 462 | Drake 463 | Danielle 464 | Lukas 465 | Chelsea 466 | Charlie 467 | Leila 468 | Spencer 469 | Tessa 470 | Zion 471 | Adelyn 472 | Erick 473 | Camille 474 | Josue 475 | Mikayla 476 | Jeffrey 477 | Adeline 478 | Trenton 479 | Adalynn 480 | Chance 481 | Sienna 482 | Paxton 483 | Esther 484 | Elliot 485 | Jacqueline 486 | Fernando 487 | Emerson 488 | Keegan 489 | Arabella 490 | Landen 491 | Maggie 492 | Manuel 493 | Athena 494 | Amir 495 | Lucia 496 | Shane 497 | Lexi 498 | Raymond 499 | Ayla 500 | 
Zander 501 | Diana 502 | Andre 503 | Alexia 504 | Israel 505 | Juliet 506 | Mario 507 | Josie 508 | Cesar 509 | Allyson 510 | Simon 511 | Addyson 512 | King 513 | Delaney 514 | Jaylen 515 | Teagan 516 | Johnathan 517 | Marley 518 | Troy 519 | Amber 520 | Dean 521 | Rose 522 | Clayton 523 | Erin 524 | Dominick 525 | Leslie 526 | Tyson 527 | Kayleigh 528 | Jasper 529 | Amanda 530 | Martin 531 | Kathryn 532 | Kyler 533 | Kelsey 534 | Hector 535 | Emilia 536 | Edgar 537 | Alina 538 | Marco 539 | Kenzie 540 | Cash 541 | Kaydence 542 | Edwin 543 | Alicia 544 | Shawn 545 | Alison 546 | Judah 547 | Paris 548 | Andy 549 | Sabrina 550 | Donovan 551 | Ashlynn 552 | Kameron 553 | Lilliana 554 | Elliott 555 | Sierra 556 | Dante 557 | Cassidy 558 | Braylon 559 | Laura 560 | Anderson 561 | Alondra 562 | Johnny 563 | Iris 564 | Drew 565 | Kyla 566 | Sergio 567 | Christina 568 | Cruz 569 | Carly 570 | Dalton 571 | Jillian 572 | Rafael 573 | Madilyn 574 | Gregory 575 | Kyleigh 576 | Lane 577 | Madeleine 578 | Erik 579 | Cadence 580 | Skyler 581 | Nina 582 | Finn 583 | Evangeline 584 | Reid 585 | Nadia 586 | Gunner 587 | Raegan 588 | Jared 589 | Lyric 590 | Caiden 591 | Giuliana 592 | Holden 593 | Briana 594 | Emilio 595 | Georgia 596 | Fabian 597 | Yaretzi 598 | Aden 599 | Elliana 600 | Brendan 601 | Haylee 602 | Rowan 603 | Fatima 604 | Emiliano 605 | Phoebe 606 | Braden 607 | Selena 608 | Jase 609 | Charlie 610 | Jax 611 | Dakota 612 | Emanuel 613 | Annabella 614 | Lorenzo 615 | Abby 616 | Roberto 617 | Daniella 618 | Amari 619 | Juliette 620 | Angelo 621 | Lilah 622 | Beau 623 | Bianca 624 | Louis 625 | Mariana 626 | Derrick 627 | Miriam 628 | Beckett 629 | Parker 630 | Dawson 631 | Veronica 632 | Felix 633 | Gemma 634 | Pedro 635 | Noelle 636 | Brennan 637 | Cheyenne 638 | Frank 639 | Marissa 640 | Maximiliano 641 | Heaven 642 | Quinn 643 | Vivienne 644 | Dallas 645 | Brynlee 646 | Romeo 647 | Joanna 648 | Braylen 649 | Mallory 650 | Joaquin 651 | Aubrie 652 | Waylon 653 | Journey 654 | Allen 655 | Nyla 656 | Colt 657 | Cali 658 | Ruben 659 | Tatum 660 | Milo 661 | Carmen 662 | Julius 663 | Gia 664 | Grady 665 | Jazmine 666 | August 667 | Heidi 668 | Dakota 669 | Miley 670 | Cohen 671 | Baylee 672 | Brock 673 | Elaina 674 | Kellen 675 | Macy 676 | Brycen 677 | Ainsley 678 | Desmond 679 | Jane 680 | Malik 681 | Raelynn 682 | Colby 683 | Anastasia 684 | Nehemiah 685 | Adelaide 686 | Leland 687 | Ruth 688 | Jett 689 | Camryn 690 | Marcos 691 | Kiara 692 | Taylor 693 | Alessandra 694 | Karter 695 | Hanna 696 | Marshall 697 | Finley 698 | Ty 699 | Maddison 700 | Phillip 701 | Lia 702 | Corey 703 | Bethany 704 | Ali 705 | Karen 706 | Adan 707 | Kelly 708 | Dillon 709 | Malia 710 | Arthur 711 | Jazmin 712 | Maverick 713 | Jayda 714 | Leon 715 | Esmeralda 716 | Brooks 717 | Kira 718 | Tristen 719 | Lena 720 | Titus 721 | Kamryn 722 | Keith 723 | Kamila 724 | Dexter 725 | Karina 726 | Karson 727 | Eloise 728 | Emerson 729 | Kara 730 | Landyn 731 | Elisa 732 | Armando 733 | Rylie 734 | Pablo 735 | Olive 736 | Knox 737 | Nayeli 738 | Enrique 739 | Tiffany 740 | Cade 741 | Macie 742 | Gerardo 743 | Skyler 744 | Reed 745 | Addisyn 746 | Kellan 747 | Angelica 748 | Jayson 749 | Briella 750 | Barrett 751 | Fernanda 752 | Walter 753 | Annie 754 | Dustin 755 | Maliyah 756 | Kolton 757 | Amiyah 758 | Ronald 759 | Jayden 760 | Trent 761 | Charlee 762 | Phoenix 763 | Caitlyn 764 | Ismael 765 | Elle 766 | Julio 767 | Crystal 768 | Danny 769 | Julie 770 | Kason 771 | Imani 772 | Scott 773 | Kendra 774 | Messiah 775 | Talia 
776 | Jay 777 | Angelique 778 | Esteban 779 | Jazlyn 780 | Gideon 781 | Guadalupe 782 | Tate 783 | Alejandra 784 | Abram 785 | Emely 786 | Trey 787 | Lucille 788 | Keaton 789 | Anya 790 | Jakob 791 | April 792 | Jaime 793 | Elsie 794 | Devon 795 | Madelynn 796 | Braydon 797 | Myla 798 | Izaiah 799 | Julissa 800 | Donald 801 | Scarlet 802 | Albert 803 | Helen 804 | Raul 805 | Breanna 806 | Darius 807 | Kyra 808 | Archer 809 | Madisyn 810 | Colten 811 | Rosalie 812 | Damon 813 | Brittany 814 | River 815 | Brylee 816 | Gustavo 817 | Jayleen 818 | Philip 819 | Arielle 820 | Atticus 821 | Karla 822 | Walker 823 | Kailey 824 | Matteo 825 | Arya 826 | Randy 827 | Sarai 828 | Saul 829 | Harley 830 | Rocco 831 | Miracle 832 | Davis 833 | Kaelyn 834 | Enzo 835 | Kali 836 | Noel 837 | Cynthia 838 | Orion 839 | Daphne 840 | Jamari 841 | Aleah 842 | Remington 843 | Caitlin 844 | Bruce 845 | Cassandra 846 | Darren 847 | Holly 848 | Larry 849 | Janelle 850 | Mathew 851 | Marilyn 852 | Russell 853 | Katelynn 854 | Dennis 855 | Kaylie 856 | Tony 857 | Itzel 858 | Chris 859 | Carolina 860 | Porter 861 | Bristol 862 | Rodrigo 863 | Haven 864 | Armani 865 | Michaela 866 | Zaiden 867 | Monica 868 | Kade 869 | June 870 | Ari 871 | Janiyah 872 | Hugo 873 | Camilla 874 | Zachariah 875 | Jamie 876 | Kamden 877 | Rebekah 878 | Mohamed 879 | Audrina 880 | Quentin 881 | Dayana 882 | Solomon 883 | Lana 884 | Curtis 885 | Serena 886 | Leonel 887 | Tiana 888 | Issac 889 | Nylah 890 | Khalil 891 | Braelyn 892 | Alberto 893 | Savanna 894 | Jerry 895 | Skye 896 | Alec 897 | Raelyn 898 | Gianni 899 | Madalyn 900 | Moises 901 | Sasha 902 | Gunnar 903 | Perla 904 | Adriel 905 | Bridget 906 | Lawrence 907 | Aniya 908 | Alijah 909 | Rowan 910 | Chandler 911 | Logan 912 | Ronan 913 | Mckinley 914 | Prince 915 | Averie 916 | Payton 917 | Jaylah 918 | Arturo 919 | Aylin 920 | Jimmy 921 | Joselyn 922 | Orlando 923 | Nia 924 | Ricky 925 | Hayley 926 | Mitchell 927 | Lilian 928 | Maximilian 929 | Adelynn 930 | Cason 931 | Jaliyah 932 | Malcolm 933 | Kassidy 934 | Muhammad 935 | Kaylin 936 | Kasen 937 | Kadence 938 | Marvin 939 | Celeste 940 | Jalen 941 | Jaelyn 942 | Cyrus 943 | Zariah 944 | Mauricio 945 | Tatiana 946 | Warren 947 | Jimena 948 | Jonas 949 | Lilyana 950 | Kendrick 951 | Anaya 952 | Rhys 953 | Catalina 954 | Dane 955 | Viviana 956 | Ryland 957 | Cataleya 958 | Pierce 959 | Sloane 960 | Johan 961 | Courtney 962 | Rory 963 | Johanna 964 | Uriel 965 | Amari 966 | Major 967 | Melany 968 | Bryant 969 | Anabelle 970 | Reece 971 | Francesca 972 | Casey 973 | Ada 974 | Ibrahim 975 | Alanna 976 | Nikolas 977 | Priscilla 978 | Arjun 979 | Danna 980 | Sullivan 981 | Angie 982 | Finnegan 983 | Kailyn 984 | Alfredo 985 | Lacey 986 | Royce 987 | Sage 988 | Ahmed 989 | Lillie 990 | Amare 991 | Brinley 992 | Lance 993 | Caylee 994 | Ramon 995 | Joy 996 | Jamison 997 | Kenley 998 | Brayan 999 | Vera 1000 | Brenden 1001 | Bailee 1002 | Dominik 1003 | Amira 1004 | Case 1005 | Aileen 1006 | Kristopher 1007 | Aspen 1008 | Maurice 1009 | Emmalyn 1010 | Mekhi 1011 | Erica 1012 | Kobe 1013 | Gracelyn 1014 | Zackary 1015 | Kennedi 1016 | Rhett 1017 | Skyla 1018 | Jensen 1019 | Annalise 1020 | Jaxton 1021 | Danica 1022 | Deandre 1023 | Dylan 1024 | Isaias 1025 | Kiley 1026 | Channing 1027 | Gwendolyn 1028 | Yahir 1029 | Jasmin 1030 | Ezequiel 1031 | Lauryn 1032 | Tobias 1033 | Aleena 1034 | Talon 1035 | Justice 1036 | Sam 1037 | Annabel 1038 | Justice 1039 | Tenley 1040 | Kash 1041 | Dahlia 1042 | Nash 1043 | Gloria 1044 | Alvin 1045 | Lexie 
1046 | Jacoby 1047 | Lindsey 1048 | Ace 1049 | Hallie 1050 | Nico 1051 | Sylvia 1052 | Quinton 1053 | Elyse 1054 | Cannon 1055 | Annika 1056 | Franklin 1057 | Maeve 1058 | Raiden 1059 | Marlee 1060 | Joe 1061 | Aryanna 1062 | Lawson 1063 | Kenya 1064 | Beckham 1065 | Lorelei 1066 | Gary 1067 | Selah 1068 | Aldo 1069 | Kaliyah 1070 | Raylan 1071 | Adele 1072 | Frederick 1073 | Natasha 1074 | London 1075 | Brenda 1076 | Boston 1077 | Erika 1078 | Carl 1079 | Alyson 1080 | Byron 1081 | Braylee 1082 | Ernesto 1083 | Emilee 1084 | Moshe 1085 | Raven 1086 | Terry 1087 | Ariella 1088 | Eddie 1089 | Blakely 1090 | Kane 1091 | Liana 1092 | Moses 1093 | Jaycee 1094 | Finley 1095 | Sawyer 1096 | Salvador 1097 | Anahi 1098 | Reese 1099 | Jaelynn 1100 | Kelvin 1101 | Elsa 1102 | Cullen 1103 | Farrah 1104 | Madden 1105 | Cameron 1106 | Wade 1107 | Evelynn 1108 | Clark 1109 | Luciana 1110 | Mohammed 1111 | Zara 1112 | Kieran 1113 | Madilynn 1114 | Jagger 1115 | Eve 1116 | Dorian 1117 | Kaia 1118 | Korbin 1119 | Helena 1120 | Nelson 1121 | Anne 1122 | Roy 1123 | Estrella 1124 | Asa 1125 | Leighton 1126 | Matias 1127 | Nataly 1128 | Nasir 1129 | Whitney 1130 | Nickolas 1131 | Lainey 1132 | Roger 1133 | Amara 1134 | Alonzo 1135 | Anabella 1136 | Jaxen 1137 | Malaysia 1138 | Skylar 1139 | Samara 1140 | Callen 1141 | Zoie 1142 | Malakai 1143 | Amani 1144 | Douglas 1145 | Phoenix 1146 | Ahmad 1147 | Dulce 1148 | Uriah 1149 | Paola 1150 | Conor 1151 | Marie 1152 | Kristian 1153 | Aisha 1154 | Carmelo 1155 | Harlow 1156 | Blaine 1157 | Virginia 1158 | Kayson 1159 | Ember 1160 | Bentlee 1161 | Regina 1162 | Braeden 1163 | Jaylee 1164 | Julien 1165 | Anika 1166 | Nathanael 1167 | Ally 1168 | Aarav 1169 | Kayden 1170 | Keagan 1171 | Alani 1172 | Lucian 1173 | Miah 1174 | Morgan 1175 | Yareli 1176 | Chad 1177 | Journee 1178 | Terrance 1179 | Kiera 1180 | Benson 1181 | Nathalie 1182 | Noe 1183 | Mikaela 1184 | Rodney 1185 | Jaylynn 1186 | Francis 1187 | Litzy 1188 | Layne 1189 | Charley 1190 | Mohammad 1191 | Claudia 1192 | Zayne 1193 | Aliya 1194 | Tatum 1195 | Madyson 1196 | Brett 1197 | Cecelia 1198 | Wilson 1199 | Liberty 1200 | Kian 1201 | Braelynn 1202 | Marc 1203 | Evie 1204 | Rohan 1205 | Rosemary 1206 | Dayton 1207 | Myah 1208 | Braiden 1209 | Lizbeth 1210 | Harper 1211 | Giana 1212 | Luciano 1213 | Ryan 1214 | Nikolai 1215 | Teresa 1216 | Kamari 1217 | Ciara 1218 | Camron 1219 | Isis 1220 | Joey 1221 | Lea 1222 | Santino 1223 | Shayla 1224 | Ellis 1225 | Jazlynn 1226 | Layton 1227 | Rosa 1228 | Xzavier 1229 | Gracelynn 1230 | Jefferson 1231 | Desiree 1232 | Winston 1233 | Elisabeth 1234 | Guillermo 1235 | Isabela 1236 | Demetrius 1237 | Arely 1238 | Bowen 1239 | Mariam 1240 | Daxton 1241 | Abbigail 1242 | Melvin 1243 | Emersyn 1244 | Soren 1245 | Brenna 1246 | Neil 1247 | Kaylynn 1248 | Sylas 1249 | Nova 1250 | Jon 1251 | Raquel 1252 | Raphael 1253 | Dana 1254 | Rex 1255 | Laney 1256 | Yusuf 1257 | Laylah 1258 | Shaun 1259 | Siena 1260 | Brodie 1261 | Amelie 1262 | Tommy 1263 | Clarissa 1264 | Harley 1265 | Lilianna 1266 | Quincy 1267 | Lylah 1268 | Dax 1269 | Halle 1270 | Trace 1271 | Madalynn 1272 | Adonis 1273 | Maleah 1274 | Bently 1275 | Sherlyn 1276 | Giovani 1277 | Linda 1278 | Jeffery 1279 | Shiloh 1280 | Odin 1281 | Jessie 1282 | Luka 1283 | Kenia 1284 | Kylan 1285 | Greta 1286 | Willie 1287 | Marina 1288 | Lewis 1289 | Melina 1290 | Tripp 1291 | Amiya 1292 | Vihaan 1293 | Bria 1294 | Davion 1295 | Natalee 1296 | Kendall 1297 | Sariah 1298 | Arian 1299 | Mollie 1300 | Cory 1301 | Nancy 1302 | 
Jamarion 1303 | Christine 1304 | Jonathon 1305 | Felicity 1306 | Nixon 1307 | Zuri 1308 | Rayan 1309 | Irene 1310 | Emery 1311 | Simone 1312 | Jermaine 1313 | Amya 1314 | Reginald 1315 | Matilda 1316 | Tomas 1317 | Colette 1318 | Emmitt 1319 | Kristen 1320 | Ayaan 1321 | Paityn 1322 | Zechariah 1323 | Alayah 1324 | Billy 1325 | Janiya 1326 | Hamza 1327 | Kallie 1328 | Micheal 1329 | Mira 1330 | Urijah 1331 | Hailee 1332 | Aryan 1333 | Kathleen 1334 | Lee 1335 | Meredith 1336 | Jasiah 1337 | Janessa 1338 | Landry 1339 | Noemi 1340 | Crosby 1341 | Aiyana 1342 | Mathias 1343 | Aliana 1344 | Toby 1345 | Leia 1346 | Tristian 1347 | Mariyah 1348 | Will 1349 | Tori 1350 | Felipe 1351 | Alissa 1352 | Triston 1353 | Ivanna 1354 | Eden 1355 | Joslyn 1356 | Terrell 1357 | Sandra 1358 | Deacon 1359 | Maryam 1360 | Matthias 1361 | Saniyah 1362 | Jamal 1363 | Kassandra 1364 | Makai 1365 | Danika 1366 | Maxim 1367 | Denise 1368 | Sterling 1369 | Jemma 1370 | Hank 1371 | River 1372 | Gerald 1373 | Charleigh 1374 | Alessandro 1375 | Emelia 1376 | Jaydon 1377 | Kristina 1378 | Hayes 1379 | Armani 1380 | Niko 1381 | Beatrice 1382 | Branson 1383 | Jaylene 1384 | Flynn 1385 | Karlee 1386 | Kody 1387 | Blake 1388 | Marlon 1389 | Cara 1390 | Mayson 1391 | Addilyn 1392 | Allan 1393 | Amina 1394 | Augustus 1395 | Ansley 1396 | Jessie 1397 | Kaitlynn 1398 | Neymar 1399 | Iliana 1400 | Adrien 1401 | Mckayla 1402 | Aydan 1403 | Adelina 1404 | Leonard 1405 | Briley 1406 | Sincere 1407 | Elaine 1408 | Kyson 1409 | Lailah 1410 | Terrence 1411 | Mercedes 1412 | Jerome 1413 | Chaya 1414 | Jadiel 1415 | Lindsay 1416 | Kole 1417 | Hattie 1418 | Aron 1419 | Lisa 1420 | Aydin 1421 | Marisol 1422 | Omari 1423 | Patricia 1424 | Ronnie 1425 | Bryanna 1426 | Zain 1427 | Taliyah 1428 | Vicente 1429 | Adrienne 1430 | Bobby 1431 | Emmy 1432 | Yosef 1433 | Millie 1434 | Alexzander 1435 | Paislee 1436 | Harry 1437 | Charli 1438 | Kale 1439 | Kourtney 1440 | Rogelio 1441 | Leyla 1442 | Casen 1443 | Maia 1444 | Ray 1445 | Willa 1446 | Clay 1447 | Milan 1448 | Masen 1449 | Paula 1450 | Sage 1451 | Ayleen 1452 | Ulises 1453 | Clare 1454 | Kymani 1455 | Kensley 1456 | Chaim 1457 | Reyna 1458 | Javon 1459 | Martha 1460 | Brent 1461 | Adley 1462 | Jadon 1463 | Elianna 1464 | Elisha 1465 | Emilie 1466 | Stanley 1467 | Karsyn 1468 | Jovanni 1469 | Yasmin 1470 | Princeton 1471 | Lorelai 1472 | Alonso 1473 | Amirah 1474 | Darian 1475 | Aryana 1476 | Conrad 1477 | Livia 1478 | Dwayne 1479 | Alena 1480 | Eugene 1481 | Kiana 1482 | Gauge 1483 | Celia 1484 | Rene 1485 | Kailee 1486 | Kareem 1487 | Rylan 1488 | Roland 1489 | Ellen 1490 | Ben 1491 | Galilea 1492 | Vincenzo 1493 | Kynlee 1494 | Abdullah 1495 | Leanna 1496 | Camren 1497 | Renata 1498 | Kenny 1499 | Mae 1500 | Brentley 1501 | Ayanna 1502 | Memphis 1503 | Chanel 1504 | Blaze 1505 | Lesly 1506 | Edison 1507 | Cindy 1508 | Osvaldo 1509 | Carla 1510 | Teagan 1511 | Pearl 1512 | Westin 1513 | Jaylin 1514 | Deshawn 1515 | Kimora 1516 | Rayden 1517 | Angeline 1518 | Cedric 1519 | Carlee 1520 | Marquis 1521 | Aubri 1522 | Samir 1523 | Edith 1524 | Steve 1525 | Alia 1526 | Draven 1527 | Frances 1528 | Jairo 1529 | Corinne 1530 | Giovanny 1531 | Jocelynn 1532 | Brennen 1533 | Cherish 1534 | Bronson 1535 | Wendy 1536 | Crew 1537 | Carolyn 1538 | Davin 1539 | Lina 1540 | Kolten 1541 | Tabitha 1542 | Ronin 1543 | Winter 1544 | Ariel 1545 | Abril 1546 | Semaj 1547 | Bryn 1548 | Alden 1549 | Jolie 1550 | Isiah 1551 | Yaritza 1552 | Lennox 1553 | Casey 1554 | Davian 1555 | Zion 1556 | Jaylin 1557 | 
Lillianna 1558 | Cain 1559 | Jordynn 1560 | Wayne 1561 | Zariyah 1562 | Craig 1563 | Audriana 1564 | Lamar 1565 | Jayde 1566 | Leonidas 1567 | Jaida 1568 | Cristopher 1569 | Salma 1570 | Otto 1571 | Diamond 1572 | Bo 1573 | Malaya 1574 | Darrell 1575 | Kimber 1576 | Kolby 1577 | Ryann 1578 | Marcelo 1579 | Abbie 1580 | Bruno 1581 | Paloma 1582 | Fletcher 1583 | Destinee 1584 | Justus 1585 | Kaleigh 1586 | Alfonso 1587 | Asia 1588 | Theo 1589 | Demi 1590 | Tyrone 1591 | Yamileth 1592 | Aidyn 1593 | Deborah 1594 | Harvey 1595 | Elin 1596 | Rudy 1597 | Kaiya 1598 | Brendon 1599 | Mara 1600 | Tristin 1601 | Averi 1602 | Dominique 1603 | Nola 1604 | Kaeden 1605 | Tara 1606 | Samson 1607 | Taryn 1608 | Kyree 1609 | Emmalee 1610 | Jovani 1611 | Aubrianna 1612 | Lionel 1613 | Janae 1614 | Amos 1615 | Kyndall 1616 | Giancarlo 1617 | Jewel 1618 | Misael 1619 | Zaniyah 1620 | Callum 1621 | Kaya 1622 | Quintin 1623 | Sonia 1624 | Valentino 1625 | Alaya 1626 | Gavyn 1627 | Heather 1628 | Lennon 1629 | Nathaly 1630 | Jamir 1631 | Shannon 1632 | Kamron 1633 | Ariah 1634 | Zavier 1635 | Avah 1636 | Arlo 1637 | Giada 1638 | Junior 1639 | Lilith 1640 | Killian 1641 | Samiyah 1642 | Leandro 1643 | Sharon 1644 | Konnor 1645 | Coraline 1646 | Hezekiah 1647 | Eileen 1648 | Jordyn 1649 | Julianne 1650 | Markus 1651 | Milania 1652 | Ramiro 1653 | Chana 1654 | Callan 1655 | Regan 1656 | Chace 1657 | Krystal 1658 | Johnathon 1659 | Rihanna 1660 | Lyric 1661 | Sidney 1662 | Fisher 1663 | Hadassah 1664 | Rashad 1665 | Macey 1666 | Kamryn 1667 | Mina 1668 | Legend 1669 | Paulina 1670 | Duncan 1671 | Rayne 1672 | Harold 1673 | Kaitlin 1674 | Camilo 1675 | Maritza 1676 | Hendrix 1677 | Susan 1678 | Seamus 1679 | Raina 1680 | Coleman 1681 | Hana 1682 | Vance 1683 | Keyla 1684 | Rylee 1685 | Temperance 1686 | Elian 1687 | Aimee 1688 | Jaeden 1689 | Alisson 1690 | Jamie 1691 | Charlize 1692 | Krish 1693 | Kendal 1694 | Abdiel 1695 | Lara 1696 | Antoine 1697 | Roselyn 1698 | Camdyn 1699 | Alannah 1700 | Van 1701 | Alma 1702 | Branden 1703 | Dixie 1704 | Cayson 1705 | Larissa 1706 | Gibson 1707 | Patience 1708 | Javion 1709 | Taraji 1710 | Izayah 1711 | Sky 1712 | Darwin 1713 | Zaria 1714 | Jamar 1715 | Aleigha 1716 | Mike 1717 | Alyvia 1718 | Randall 1719 | Aviana 1720 | Brecken 1721 | Bryleigh 1722 | Hassan 1723 | Elliot 1724 | Thiago 1725 | Jenny 1726 | Heath 1727 | Luz 1728 | Arnav 1729 | Ali 1730 | Kingsley 1731 | Alisha 1732 | Kyrie 1733 | Ayana 1734 | Xavi 1735 | Campbell 1736 | Damari 1737 | Karis 1738 | Deangelo 1739 | Lilyanna 1740 | Jionni 1741 | Azaria 1742 | Joziah 1743 | Blair 1744 | Makhi 1745 | Micah 1746 | Vaughn 1747 | Moriah 1748 | Zeke 1749 | Myra 1750 | Konner 1751 | Lilia 1752 | Ean 1753 | Aliza 1754 | Frankie 1755 | Giovanna 1756 | Yael 1757 | Karissa 1758 | Benton 1759 | Saniya 1760 | Oakley 1761 | Emory 1762 | Efrain 1763 | Estella 1764 | Marcel 1765 | Juniper 1766 | Rolando 1767 | Kairi 1768 | Maxton 1769 | Kenna 1770 | Jaycob 1771 | Meghan 1772 | Keenan 1773 | Abrielle 1774 | Rowen 1775 | Elissa 1776 | Yousef 1777 | Rachael 1778 | Ishaan 1779 | Emmaline 1780 | Jedidiah 1781 | Jolene 1782 | Remy 1783 | Joyce 1784 | Todd 1785 | Britney 1786 | Reagan 1787 | Carlie 1788 | Bodhi 1789 | Haylie 1790 | Damarion 1791 | Judith 1792 | Juelz 1793 | Renee 1794 | Valentin 1795 | Saanvi 1796 | Austyn 1797 | Yesenia 1798 | Broderick 1799 | Barbara 1800 | Anders 1801 | Dallas 1802 | Alvaro 1803 | Jaqueline 1804 | Mustafa 1805 | Karma 1806 | Thaddeus 1807 | America 1808 | Brenton 1809 | Sariyah 1810 | Cale 1811 | 
Azalea 1812 | Clinton 1813 | Everly 1814 | Derick 1815 | Ingrid 1816 | Jorden 1817 | Lillyana 1818 | Gilberto 1819 | Emmalynn 1820 | Jabari 1821 | Marianna 1822 | Rey 1823 | Brisa 1824 | Salvatore 1825 | Kaelynn 1826 | Freddy 1827 | Leona 1828 | Donte 1829 | Libby 1830 | Ernest 1831 | Deanna 1832 | Aaden 1833 | Mattie 1834 | Axton 1835 | Miya 1836 | Blaise 1837 | Kai 1838 | Lucca 1839 | Annalee 1840 | Maximo 1841 | Nahla 1842 | Sidney 1843 | Dorothy 1844 | Dario 1845 | Kaylyn 1846 | Rodolfo 1847 | Rayna 1848 | Trevon 1849 | Araceli 1850 | Camryn 1851 | Cambria 1852 | Deegan 1853 | Evalyn 1854 | Sonny 1855 | Haleigh 1856 | Cassius 1857 | Thalia 1858 | Truman 1859 | Jakayla 1860 | Brice 1861 | Maliah 1862 | Brogan 1863 | Saige 1864 | Hugh 1865 | Avianna 1866 | Yehuda 1867 | Charity 1868 | Agustin 1869 | Kaylen 1870 | Eliot 1871 | Raylee 1872 | Stefan 1873 | Tamia 1874 | Zaid 1875 | Aubrielle 1876 | Bridger 1877 | Bayleigh 1878 | Damion 1879 | Carley 1880 | Eliseo 1881 | Kailynn 1882 | Houston 1883 | Katrina 1884 | Johann 1885 | Belen 1886 | Leroy 1887 | Karlie 1888 | Sheldon 1889 | Natalya 1890 | Dariel 1891 | Alaysia 1892 | Darryl 1893 | Celine 1894 | Isai 1895 | Milana 1896 | Tyrell 1897 | Monroe 1898 | Alfred 1899 | Estelle 1900 | Demarcus 1901 | Meadow 1902 | Kohen 1903 | Audrianna 1904 | Ignacio 1905 | Cristina 1906 | Rylen 1907 | Harlee 1908 | Santos 1909 | Jazzlyn 1910 | Cael 1911 | Scarlette 1912 | Davon 1913 | Zahra 1914 | Kaysen 1915 | Akira 1916 | Mack 1917 | Ann 1918 | Darien 1919 | Collins 1920 | Ross 1921 | Kendyl 1922 | Titan 1923 | Anabel 1924 | Tyree 1925 | Azariah 1926 | Ameer 1927 | Carissa 1928 | Zaire 1929 | Milena 1930 | Aditya 1931 | Tia 1932 | Briggs 1933 | Alisa 1934 | Immanuel 1935 | Bree 1936 | Malaki 1937 | Carleigh 1938 | Turner 1939 | Cheyanne 1940 | Bradyn 1941 | Sarahi 1942 | Graysen 1943 | Laurel 1944 | Kase 1945 | Kylah 1946 | Reuben 1947 | Tinley 1948 | Yandel 1949 | Kora 1950 | Gaige 1951 | Marisa 1952 | Jaidyn 1953 | Esme 1954 | Franco 1955 | Sloan 1956 | Trystan 1957 | Cailyn 1958 | Maison 1959 | Gisselle 1960 | Simeon 1961 | Kasey 1962 | Anton 1963 | Kyndal 1964 | Darnell 1965 | Marlene 1966 | Emory 1967 | Riya 1968 | Roderick 1969 | Annabell 1970 | Deon 1971 | Aubriana 1972 | Devan 1973 | Izabelle 1974 | Graeme 1975 | Kirsten 1976 | Howard 1977 | Aya 1978 | Jael 1979 | Dalilah 1980 | Kael 1981 | Devyn 1982 | Karsen 1983 | Geraldine 1984 | Jarrett 1985 | Analia 1986 | Apollo 1987 | Hayleigh 1988 | Denzel 1989 | Landry 1990 | Foster 1991 | Sofie 1992 | Gilbert 1993 | Tess 1994 | Jaylon 1995 | Ashtyn 1996 | Kylen 1997 | Jessa 1998 | Augustine 1999 | Katalina 2000 | Dangelo -------------------------------------------------------------------------------- /labs/lab8/word_count.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | 3 | class MRWordFrequencyCount(MRJob): 4 | 5 | def mapper(self, _, line): 6 | yield "chars", len(line) 7 | yield "words", len(line.split()) 8 | yield "lines", 1 9 | 10 | def reducer(self, key, values): 11 | yield key, sum(values) 12 | 13 | if __name__ == '__main__': 14 | MRWordFrequencyCount.run() 15 | -------------------------------------------------------------------------------- /lec_04_scraping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": 
{}, 13 | "source": [ 14 | "# Scraping Demo\n", 15 | "Companion to Lecture 4 of Harvard [CS109: Data Science](http://cs109.org)\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "import requests\n", 23 | "from pattern import web\n", 24 | "from BeautifulSoup import BeautifulSoup" 25 | ], 26 | "language": "python", 27 | "metadata": {}, 28 | "outputs": [], 29 | "prompt_number": 1 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Task\n", 36 | "\n", 37 | "Find and print the movie title, list of genres, runtime, and score of all movies on [this page](http://www.imdb.com/search/title?at=0&sort=num_votes,desc&start=1&title_type=feature&year=1950,2012)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Two ways of making get requests" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "#### 1. Explicit URL" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "collapsed": false, 57 | "input": [ 58 | "url = 'http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2012'\n", 59 | "r = requests.get(url)\n", 60 | "print r.url" 61 | ], 62 | "language": "python", 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "output_type": "stream", 67 | "stream": "stdout", 68 | "text": [ 69 | "http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2012\n" 70 | ] 71 | } 72 | ], 73 | "prompt_number": 7 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "#### 2. Base URL with GET dictionary" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "collapsed": false, 85 | "input": [ 86 | "url = 'http://www.imdb.com/search/title'\n", 87 | "params = dict(sort='num_votes,desc', start=1, title_type='feature', year='1950,2012')\n", 88 | "r = requests.get(url, params=params)\n", 89 | "print r.url # notice it constructs the full url for you" 90 | ], 91 | "language": "python", 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "output_type": "stream", 96 | "stream": "stdout", 97 | "text": [ 98 | "http://www.imdb.com/search/title?sort=num_votes%2Cdesc&start=1&title_type=feature&year=1950%2C2012\n" 99 | ] 100 | } 101 | ], 102 | "prompt_number": 8 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "# Using Pattern" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "collapsed": false, 114 | "input": [ 115 | "#selection in pattern follows the rules of CSS\n", 116 | "\n", 117 | "dom = web.Element(r.text)\n", 118 | "for movie in dom.by_tag('td.title'): \n", 119 | " title = movie.by_tag('a')[0].content\n", 120 | " genres = movie.by_tag('span.genre')[0].by_tag('a')\n", 121 | " genres = [g.content for g in genres]\n", 122 | " runtime = movie.by_tag('span.runtime')[0].content\n", 123 | " rating = movie.by_tag('span.value')[0].content\n", 124 | " print title, genres, runtime, rating" 125 | ], 126 | "language": "python", 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "output_type": "stream", 131 | "stream": "stdout", 132 | "text": [ 133 | "The Shawshank Redemption [u'Crime', u'Drama'] 142 mins. 9.3\n", 134 | "The Dark Knight [u'Action', u'Crime', u'Drama', u'Thriller'] 152 mins. 9.0\n", 135 | "Inception [u'Action', u'Adventure', u'Mystery', u'Sci-Fi', u'Thriller'] 148 mins. 8.8\n", 136 | "Pulp Fiction [u'Crime', u'Drama', u'Thriller'] 154 mins. 9.0\n", 137 | "Fight Club [u'Drama'] 139 mins. 
8.9\n", 138 | "The Lord of the Rings: The Fellowship of the Ring" 139 | ] 140 | }, 141 | { 142 | "output_type": "stream", 143 | "stream": "stdout", 144 | "text": [ 145 | " [u'Action', u'Adventure', u'Fantasy'] 178 mins. 8.8\n", 146 | "The Matrix [u'Action', u'Adventure', u'Sci-Fi'] 136 mins. 8.7\n", 147 | "The Lord of the Rings: The Return of the King [u'Action', u'Adventure', u'Fantasy'] 201 mins. 8.9\n", 148 | "The Godfather [u'Crime', u'Drama'] 175 mins. 9.2\n", 149 | "Forrest Gump [u'Drama', u'Romance'] 142 mins. 8.7\n", 150 | "The Dark Knight Rises [u'Action', u'Crime', u'Thriller'] 165 mins. 8.6\n", 151 | "The Lord of the Rings: The Two Towers" 152 | ] 153 | }, 154 | { 155 | "output_type": "stream", 156 | "stream": "stdout", 157 | "text": [ 158 | " [u'Action', u'Adventure', u'Fantasy'] 179 mins. 8.7\n", 159 | "Se7en [u'Crime', u'Mystery', u'Thriller'] 127 mins. 8.7\n", 160 | "Avatar [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 162 mins. 7.9\n", 161 | "Batman Begins [u'Action', u'Adventure', u'Crime', u'Drama'] 140 mins. 8.3\n", 162 | "Gladiator [u'Action', u'Adventure', u'Drama'] 155 mins. 8.5\n", 163 | "Star Wars" 164 | ] 165 | }, 166 | { 167 | "output_type": "stream", 168 | "stream": "stdout", 169 | "text": [ 170 | " [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 121 mins. 8.8\n", 171 | "The Avengers [u'Action', u'Fantasy'] 143 mins. 8.2\n", 172 | "Memento [u'Mystery', u'Thriller'] 113 mins. 8.6\n", 173 | "American Beauty [u'Drama'] 122 mins. 8.5\n", 174 | "Schindler's List [u'Biography', u'Drama', u'History', u'War'] 195 mins. 8.9\n", 175 | "Saving Private Ryan [u'Action', u'Drama', u'War'] 169 mins. 8.6\n", 176 | "The Departed" 177 | ] 178 | }, 179 | { 180 | "output_type": "stream", 181 | "stream": "stdout", 182 | "text": [ 183 | " [u'Crime', u'Drama', u'Thriller'] 151 mins. 8.5\n", 184 | "The Silence of the Lambs [u'Crime', u'Drama', u'Thriller'] 118 mins. 8.7\n", 185 | "Pirates of the Caribbean: The Curse of the Black Pearl [u'Action', u'Adventure', u'Fantasy'] 143 mins. 8.0\n", 186 | "Star Wars: Episode V - The Empire Strikes Back" 187 | ] 188 | }, 189 | { 190 | "output_type": "stream", 191 | "stream": "stdout", 192 | "text": [ 193 | " [u'Action', u'Adventure', u'Sci-Fi'] 124 mins. 8.8\n", 194 | "Titanic [u'Drama', u'Romance'] 194 mins. 7.6\n", 195 | "V for Vendetta [u'Action', u'Crime', u'Fantasy', u'Mystery', u'Sci-Fi', u'Thriller'] 132 mins. 8.2\n", 196 | "Inglourious Basterds [u'Adventure', u'Drama', u'War'] 153 mins. 8.3\n", 197 | "The Prestige [u'Drama', u'Mystery', u'Thriller'] 130 mins. 8.4\n", 198 | "American History X" 199 | ] 200 | }, 201 | { 202 | "output_type": "stream", 203 | "stream": "stdout", 204 | "text": [ 205 | " [u'Crime', u'Drama'] 119 mins. 8.6\n", 206 | "The Godfather: Part II [u'Crime', u'Drama'] 200 mins. 9.0\n", 207 | "The Usual Suspects [u'Crime', u'Mystery', u'Thriller'] 106 mins. 8.7\n", 208 | "Braveheart [u'Action', u'Biography', u'Drama', u'History', u'War'] 177 mins. 8.4\n", 209 | "Terminator 2: Judgment Day" 210 | ] 211 | }, 212 | { 213 | "output_type": "stream", 214 | "stream": "stdout", 215 | "text": [ 216 | " [u'Action', u'Sci-Fi', u'Thriller'] 137 mins. 8.6\n", 217 | "The Sixth Sense [u'Drama', u'Mystery', u'Thriller'] 107 mins. 8.2\n", 218 | "Kill Bill: Vol. 1 [u'Action', u'Crime'] 111 mins. 8.2\n", 219 | "Goodfellas [u'Biography', u'Crime', u'Drama', u'Thriller'] 146 mins. 8.8\n", 220 | "Sin City [u'Crime', u'Thriller'] 124 mins. 
8.2\n", 221 | "Léon: The Professional" 222 | ] 223 | }, 224 | { 225 | "output_type": "stream", 226 | "stream": "stdout", 227 | "text": [ 228 | " [u'Crime', u'Drama', u'Thriller'] 110 mins. 8.6\n", 229 | "Django Unchained [u'Adventure', u'Drama', u'Western'] 165 mins. 8.5\n", 230 | "One Flew Over the Cuckoo's Nest [u'Drama'] 133 mins. 8.8\n", 231 | "The Green Mile [u'Crime', u'Drama', u'Fantasy', u'Mystery'] 189 mins. 8.5\n", 232 | "Raiders of the Lost Ark [u'Action', u'Adventure'] 115 mins. 8.6\n", 233 | "Eternal Sunshine of the Spotless Mind" 234 | ] 235 | }, 236 | { 237 | "output_type": "stream", 238 | "stream": "stdout", 239 | "text": [ 240 | " [u'Drama', u'Romance', u'Sci-Fi'] 108 mins. 8.4\n", 241 | "Shutter Island [u'Drama', u'Thriller'] 138 mins. 8.0\n", 242 | "Iron Man [u'Action', u'Adventure', u'Sci-Fi'] 126 mins. 7.9\n", 243 | "Back to the Future [u'Adventure', u'Comedy', u'Sci-Fi'] 116 mins. 8.5\n", 244 | "WALL·E [u'Animation', u'Adventure', u'Family', u'Romance', u'Sci-Fi'] 98 mins. 8.5\n", 245 | "300" 246 | ] 247 | }, 248 | { 249 | "output_type": "stream", 250 | "stream": "stdout", 251 | "text": [ 252 | " [u'Action', u'Fantasy', u'History', u'War'] 117 mins. 7.7\n" 253 | ] 254 | } 255 | ], 256 | "prompt_number": 9 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "# Using BeautifulSoup" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "collapsed": false, 268 | "input": [ 269 | "bs = BeautifulSoup(r.text)\n", 270 | "for movie in bs.findAll('td', 'title'):\n", 271 | " title = movie.find('a').contents[0]\n", 272 | " genres = movie.find('span', 'genre').findAll('a')\n", 273 | " genres = [g.contents[0] for g in genres]\n", 274 | " runtime = movie.find('span', 'runtime').contents[0]\n", 275 | " rating = movie.find('span', 'value').contents[0]\n", 276 | " print title, genres, runtime, rating\n" 277 | ], 278 | "language": "python", 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "output_type": "stream", 283 | "stream": "stdout", 284 | "text": [ 285 | "The Shawshank Redemption [u'Crime', u'Drama'] 142 mins. 9.3\n", 286 | "The Dark Knight [u'Action', u'Crime', u'Drama', u'Thriller'] 152 mins. 9.0\n", 287 | "Inception [u'Action', u'Adventure', u'Mystery', u'Sci-Fi', u'Thriller'] 148 mins. 8.8\n", 288 | "Pulp Fiction [u'Crime', u'Drama', u'Thriller'] 154 mins. 9.0\n", 289 | "Fight Club [u'Drama'] 139 mins. 8.9\n", 290 | "The Lord of the Rings: The Fellowship of the Ring [u'Action', u'Adventure', u'Fantasy'] 178 mins. 8.8\n", 291 | "The Matrix [u'Action', u'Adventure', u'Sci-Fi'] 136 mins. 8.7\n", 292 | "The Lord of the Rings: The Return of the King [u'Action', u'Adventure', u'Fantasy'] 201 mins. 8.9\n", 293 | "The Godfather [u'Crime', u'Drama'] 175 mins. 9.2\n", 294 | "Forrest Gump" 295 | ] 296 | }, 297 | { 298 | "output_type": "stream", 299 | "stream": "stdout", 300 | "text": [ 301 | " [u'Drama', u'Romance'] 142 mins. 8.7\n", 302 | "The Dark Knight Rises [u'Action', u'Crime', u'Thriller'] 165 mins. 8.6\n", 303 | "The Lord of the Rings: The Two Towers [u'Action', u'Adventure', u'Fantasy'] 179 mins. 8.7\n", 304 | "Se7en [u'Crime', u'Mystery', u'Thriller'] 127 mins. 8.7\n", 305 | "Avatar [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 162 mins. 7.9\n", 306 | "Batman Begins [u'Action', u'Adventure', u'Crime', u'Drama'] 140 mins. 8.3\n", 307 | "Gladiator [u'Action', u'Adventure', u'Drama'] 155 mins. 8.5\n", 308 | "Star Wars [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 121 mins. 
8.8\n", 309 | "The Avengers [u'Action', u'Fantasy'] 143 mins. 8.2\n", 310 | "Memento" 311 | ] 312 | }, 313 | { 314 | "output_type": "stream", 315 | "stream": "stdout", 316 | "text": [ 317 | " [u'Mystery', u'Thriller'] 113 mins. 8.6\n", 318 | "American Beauty [u'Drama'] 122 mins. 8.5\n", 319 | "Schindler's List [u'Biography', u'Drama', u'History', u'War'] 195 mins. 8.9\n", 320 | "Saving Private Ryan [u'Action', u'Drama', u'War'] 169 mins. 8.6\n", 321 | "The Departed [u'Crime', u'Drama', u'Thriller'] 151 mins. 8.5\n", 322 | "The Silence of the Lambs [u'Crime', u'Drama', u'Thriller'] 118 mins. 8.7\n", 323 | "Pirates of the Caribbean: The Curse of the Black Pearl [u'Action', u'Adventure', u'Fantasy'] 143 mins. 8.0\n", 324 | "Star Wars: Episode V - The Empire Strikes Back [u'Action', u'Adventure', u'Sci-Fi'] 124 mins. 8.8\n", 325 | "Titanic [u'Drama', u'Romance'] 194 mins. 7.6\n", 326 | "V for Vendetta" 327 | ] 328 | }, 329 | { 330 | "output_type": "stream", 331 | "stream": "stdout", 332 | "text": [ 333 | " [u'Action', u'Crime', u'Fantasy', u'Mystery', u'Sci-Fi', u'Thriller'] 132 mins. 8.2\n", 334 | "Inglourious Basterds [u'Adventure', u'Drama', u'War'] 153 mins. 8.3\n", 335 | "The Prestige [u'Drama', u'Mystery', u'Thriller'] 130 mins. 8.4\n", 336 | "American History X [u'Crime', u'Drama'] 119 mins. 8.6\n", 337 | "The Godfather: Part II [u'Crime', u'Drama'] 200 mins. 9.0\n", 338 | "The Usual Suspects [u'Crime', u'Mystery', u'Thriller'] 106 mins. 8.7\n", 339 | "Braveheart [u'Action', u'Biography', u'Drama', u'History', u'War'] 177 mins. 8.4\n", 340 | "Terminator 2: Judgment Day [u'Action', u'Sci-Fi', u'Thriller'] 137 mins. 8.6\n", 341 | "The Sixth Sense [u'Drama', u'Mystery', u'Thriller'] 107 mins. 8.2\n", 342 | "Kill Bill: Vol. 1" 343 | ] 344 | }, 345 | { 346 | "output_type": "stream", 347 | "stream": "stdout", 348 | "text": [ 349 | " [u'Action', u'Crime'] 111 mins. 8.2\n", 350 | "Goodfellas [u'Biography', u'Crime', u'Drama', u'Thriller'] 146 mins. 8.8\n", 351 | "Sin City [u'Crime', u'Thriller'] 124 mins. 8.2\n", 352 | "Léon: The Professional [u'Crime', u'Drama', u'Thriller'] 110 mins. 8.6\n", 353 | "Django Unchained [u'Adventure', u'Drama', u'Western'] 165 mins. 8.5\n", 354 | "One Flew Over the Cuckoo's Nest [u'Drama'] 133 mins. 8.8\n", 355 | "The Green Mile [u'Crime', u'Drama', u'Fantasy', u'Mystery'] 189 mins. 8.5\n", 356 | "Raiders of the Lost Ark [u'Action', u'Adventure'] 115 mins. 8.6\n", 357 | "Eternal Sunshine of the Spotless Mind [u'Drama', u'Romance', u'Sci-Fi'] 108 mins. 8.4\n", 358 | "Shutter Island" 359 | ] 360 | }, 361 | { 362 | "output_type": "stream", 363 | "stream": "stdout", 364 | "text": [ 365 | " [u'Drama', u'Thriller'] 138 mins. 8.0\n", 366 | "Iron Man [u'Action', u'Adventure', u'Sci-Fi'] 126 mins. 7.9\n", 367 | "Back to the Future [u'Adventure', u'Comedy', u'Sci-Fi'] 116 mins. 8.5\n", 368 | "WALL·E [u'Animation', u'Adventure', u'Family', u'Romance', u'Sci-Fi'] 98 mins. 8.5\n", 369 | "300 [u'Action', u'Fantasy', u'History', u'War'] 117 mins. 
7.7\n" 370 | ] 371 | } 372 | ], 373 | "prompt_number": 5 374 | } 375 | ], 376 | "metadata": {} 377 | } 378 | ] 379 | } -------------------------------------------------------------------------------- /matplotlib_examples/imdb.tsv: -------------------------------------------------------------------------------- 1 | Rank	Title	Year	Director	Country 2 | 1 Citizen Kane 1941 Welles, Orson USA 3 | 2 Bicycle Thieves 1948 De Sica, Vittorio Italy 4 | 3 Psycho 1960 Hitchcock, Alfred USA 5 | 4 The Godfather 1972 Coppola, Francis Ford USA 6 | 5 2001: A Space Odyssey 1968 Kubrick, Stanley UK / USA 7 | 6 The Rules of the Game 1939 Renoir, Jean France 8 | 7 Singin' in the Rain 1952 Kelly, Gene / Donen, Stanley USA 9 | 8 Bonnie and Clyde 1967 Penn, Arthur USA 10 | 9 The Searchers 1956 Ford, John USA 11 | 10 Casablanca 1942 Curtiz, Michael USA 12 | 11 Seven Samurai 1954 Kurosawa, Akira Japan 13 | 12 Annie Hall 1977 Allen, Woody USA 14 | 13 Chinatown 1974 Polanski, Roman USA 15 | 14 Pulp Fiction 1994 Tarantino, Quentin USA 16 | 15 Raging Bull 1980 Scorsese, Martin USA 17 | 16 Rashômon 1950 Kurosawa, Akira Japan 18 | 17 Star Wars: Episode IV - 1977 Lucas, George USA 19 | 18 The 400 Blows 1959 Truffaut, François France 20 | 19 Breathless 1960 Godard, Jean-Luc France 21 | 20 E.T. The Extra-Terrestrial 1982 Spielberg, Steven USA 22 | 21 Vertigo 1958 Hitchcock, Alfred USA 23 | 22 Schindler's List 1993 Spielberg, Steven USA 24 | 23 8 1/2 1963 Fellini, Federico Italy / France 25 | 24 Metropolis 1927 Lang, Fritz Germany 26 | 25 Lawrence of Arabia 1962 Lean, David UK 27 | 26 GoodFellas 1990 Scorsese, Martin USA 28 | 27 Some Like It Hot 1959 Wilder, Billy USA 29 | 28 La Dolce Vita 1960 Fellini, Federico Italy / France 30 | 29 M 1931 Lang, Fritz Germany 31 | 30 On the Waterfront 1954 Kazan, Elia USA 32 | 31 All About Eve 1950 Mankiewicz, Joseph L. USA 33 | 32 Gone with the Wind 1939 Fleming, Victor USA 34 | 33 Taxi Driver 1976 Scorsese, Martin USA 35 | 34 The Conformist 1970 Bertolucci, Bernardo Italy / France 36 | 35 The Wizard of Oz 1939 Fleming, Victor / Vidor, King USA 37 | 36 Dr.
Strangelove 1964 Kubrick, Stanley UK 38 | 37 The Godfather Part II 1974 Coppola, Francis Ford USA 39 | 38 Raise the Red Lantern 1991 Zhang, Yimou China / Hong Kong 40 | 39 Tokyo Story 1953 Ozu, Yasujiro Japan 41 | 40 Sunset Boulevard 1950 Wilder, Billy USA 42 | 41 The World of Apu 1959 Ray, Satyajit India 43 | 42 Double Indemnity 1944 Wilder, Billy USA 44 | 43 Belle de jour 1967 Buñuel, Luis France / Italy 45 | 44 Persona 1966 Bergman, Ingmar Sweden 46 | 45 Jules and Jim 1962 Truffaut, François France 47 | 46 Apocalypse Now 1979 Coppola, Francis Ford USA 48 | 47 The Graduate 1967 Nichols, Mike USA 49 | 48 Nashville 1975 Altman, Robert USA 50 | 49 L'Avventura 1960 Antonioni, Michelangelo Italy / France 51 | 50 It's a Wonderful Life 1946 Capra, Frank USA -------------------------------------------------------------------------------- /skeleton.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mrjob.job import MRJob 4 | from itertools import combinations, permutations 5 | 6 | from scipy.stats.stats import pearsonr 7 | 8 | 9 | class RestaurantSimilarities(MRJob): 10 | 11 | def steps(self): 12 | "the steps in the map-reduce process" 13 | thesteps = [ 14 | self.mr(mapper=self.line_mapper, reducer=self.users_items_collector), 15 | self.mr(mapper=self.pair_items_mapper, reducer=self.calc_sim_collector) 16 | ] 17 | return thesteps 18 | 19 | def line_mapper(self,_,line): 20 | "this is the complete implementation" 21 | user_id,business_id,stars,business_avg,user_avg=line.split(',') 22 | yield user_id, (business_id,stars,business_avg,user_avg) 23 | 24 | 25 | def users_items_collector(self, user_id, values): 26 | """ 27 | #iterate over the list of tuples yielded in the previous mapper 28 | #and append them to an array of rating information 29 | """ 30 | pass 31 | 32 | 33 | def pair_items_mapper(self, user_id, values): 34 | """ 35 | ignoring the user_id key, take all combinations of business pairs 36 | and yield as key the pair id, and as value the pair rating information 37 | """ 38 | pass #your code here 39 | 40 | def calc_sim_collector(self, key, values): 41 | """ 42 | Pick up the information from the previous yield as shown. Compute 43 | the pearson correlation and yield the final information as in the 44 | last line here. 45 | """ 46 | (rest1, rest2), common_ratings = key, values 47 | #your code here 48 | yield (rest1, rest2), (rho, n_common) 49 | 50 | 51 | #Below MUST be there for things to work 52 | if __name__ == '__main__': 53 | RestaurantSimilarities.run() 54 | --------------------------------------------------------------------------------
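For reference, a minimal sketch of one way the three stubs in skeleton.py might be completed is below. It is an illustration, not the official solution: it assumes ratings are centered by each user's average before correlating (the docstrings only call for a Pearson correlation over the common raters of a pair), and it keeps the same mrjob-0.4-era self.mr() step API that the skeleton itself uses.

# A possible completion of skeleton.py's stubs (a sketch under the assumptions
# stated above, not the official solution).
from mrjob.job import MRJob
from itertools import combinations
from scipy.stats.stats import pearsonr


class RestaurantSimilaritiesSketch(MRJob):

    def steps(self):
        "the steps in the map-reduce process"
        return [
            self.mr(mapper=self.line_mapper, reducer=self.users_items_collector),
            self.mr(mapper=self.pair_items_mapper, reducer=self.calc_sim_collector)
        ]

    def line_mapper(self, _, line):
        "parse one CSV line into a (user, rating-info) pair"
        user_id, business_id, stars, business_avg, user_avg = line.split(',')
        yield user_id, (business_id, stars, business_avg, user_avg)

    def users_items_collector(self, user_id, values):
        "gather all of one user's rating tuples into a single list"
        yield user_id, list(values)

    def pair_items_mapper(self, user_id, values):
        "emit every pair of businesses this user rated, keyed by the sorted pair"
        # sorting first means (a, b) and (b, a) produce the same reducer key
        for r1, r2 in combinations(sorted(values), 2):
            yield (r1[0], r2[0]), (r1, r2)

    def calc_sim_collector(self, key, values):
        "Pearson correlation of user-mean-centered ratings over common raters"
        (rest1, rest2), common_ratings = key, values
        diffs1, diffs2 = [], []
        for r1, r2 in common_ratings:
            diffs1.append(float(r1[1]) - float(r1[3]))  # stars minus user average
            diffs2.append(float(r2[1]) - float(r2[3]))
        n_common = len(diffs1)
        # pearsonr returns (rho, p-value) and needs at least two points
        rho = pearsonr(diffs1, diffs2)[0] if n_common > 1 else 0.0
        yield (rest1, rest2), (rho, n_common)


if __name__ == '__main__':
    RestaurantSimilaritiesSketch.run()

Assuming a comma-separated ratings file in the five-column format line_mapper expects (the name ratings.csv here is only a placeholder), the job runs locally with: python restaurant_similarities_sketch.py ratings.csv. mrjob then emits one ((rest1, rest2), (rho, n_common)) record per restaurant pair.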