├── .gitignore ├── HW0.ipynb ├── HW0_solutions.ipynb ├── HW1.ipynb ├── HW1_solutions.ipynb ├── HW2.ipynb ├── HW2_solutions.ipynb ├── HW3.ipynb ├── HW3_solutions.ipynb ├── HW4.ipynb ├── HW4_solutions.ipynb ├── HW5.ipynb ├── HW5_solutions.ipynb ├── InstructionsForAmazonEMR.ipynb ├── LICENSE ├── README.md ├── computesim.py ├── computesim2.py ├── imdb_top_10000.txt ├── labs ├── lab10 │ ├── Lab_10.ipynb │ ├── final_lab_images.pdf │ ├── svm.csv │ ├── test.csv │ ├── train.csv │ └── yeastall_public.txt ├── lab2 │ ├── Lab_2_A_Johanna.ipynb │ ├── Lab_2_A_Live.ipynb │ ├── Lab_2_A_Live_Ray_Final.ipynb │ ├── Lab_2_B.ipynb │ ├── Lab_2_B_Live.ipynb │ ├── README.md │ ├── cs109style.py │ └── custom.css ├── lab3 │ ├── Italy.png │ ├── data │ │ └── olive.csv │ ├── lab3.ipynb │ └── lab3full.ipynb ├── lab4 │ ├── Lab4.ipynb │ ├── Lab4full.ipynb │ └── data │ │ ├── US_Unemployment_Oct2012.csv │ │ ├── census_demographics.csv │ │ ├── chall-damage.png │ │ ├── chall-table.png │ │ ├── chall.txt │ │ ├── images │ │ ├── images │ │ │ ├── checks │ │ │ │ ├── th (1).jpeg │ │ │ │ ├── th (10).jpeg │ │ │ │ ├── th (11).jpeg │ │ │ │ ├── th (12).jpeg │ │ │ │ ├── th (13).jpeg │ │ │ │ ├── th (14).jpeg │ │ │ │ ├── th (15).jpeg │ │ │ │ ├── th (16).jpeg │ │ │ │ ├── th (17).jpeg │ │ │ │ ├── th (18).jpeg │ │ │ │ ├── th (19).jpeg │ │ │ │ ├── th (2).jpeg │ │ │ │ ├── th (20).jpeg │ │ │ │ ├── th (21).jpeg │ │ │ │ ├── th (22).jpeg │ │ │ │ ├── th (23).jpeg │ │ │ │ ├── th (24).jpeg │ │ │ │ ├── th (25).jpeg │ │ │ │ ├── th (26).jpeg │ │ │ │ ├── th (27).jpeg │ │ │ │ ├── th (28).jpeg │ │ │ │ ├── th (29).jpeg │ │ │ │ ├── th (3).jpeg │ │ │ │ ├── th (30).jpeg │ │ │ │ ├── th (31).jpeg │ │ │ │ ├── th (32).jpeg │ │ │ │ ├── th (33).jpeg │ │ │ │ ├── th (34).jpeg │ │ │ │ ├── th (35).jpeg │ │ │ │ ├── th (4).jpeg │ │ │ │ ├── th (5).jpeg │ │ │ │ ├── th (6).jpeg │ │ │ │ ├── th (7).jpeg │ │ │ │ ├── th (8).jpeg │ │ │ │ ├── th (9).jpeg │ │ │ │ └── th.jpeg │ │ │ └── dollars │ │ │ │ ├── th (1).jpeg │ │ │ │ ├── th (10).jpeg │ │ │ │ ├── th (11).jpeg │ │ │ │ ├── th (12).jpeg │ │ │ │ ├── th (13).jpeg │ │ │ │ ├── th (14).jpeg │ │ │ │ ├── th (15).jpeg │ │ │ │ ├── th (16).jpeg │ │ │ │ ├── th (17).jpeg │ │ │ │ ├── th (18).jpeg │ │ │ │ ├── th (19).jpeg │ │ │ │ ├── th (2).jpeg │ │ │ │ ├── th (20).jpeg │ │ │ │ ├── th (21).jpeg │ │ │ │ ├── th (22).jpeg │ │ │ │ ├── th (23).jpeg │ │ │ │ ├── th (24).jpeg │ │ │ │ ├── th (25).jpeg │ │ │ │ ├── th (26).jpeg │ │ │ │ ├── th (27).jpeg │ │ │ │ ├── th (28).jpeg │ │ │ │ ├── th (29).jpeg │ │ │ │ ├── th (3).jpeg │ │ │ │ ├── th (30).jpeg │ │ │ │ ├── th (31).jpeg │ │ │ │ ├── th (32).jpeg │ │ │ │ ├── th (33).jpeg │ │ │ │ ├── th (34).jpeg │ │ │ │ ├── th (35).jpeg │ │ │ │ ├── th (36).jpeg │ │ │ │ ├── th (37).jpeg │ │ │ │ ├── th (38).jpeg │ │ │ │ ├── th (39).jpeg │ │ │ │ ├── th (4).jpeg │ │ │ │ ├── th (40).jpeg │ │ │ │ ├── th (41).jpeg │ │ │ │ ├── th (42).jpeg │ │ │ │ ├── th (43).jpeg │ │ │ │ ├── th (44).jpeg │ │ │ │ ├── th (45).jpeg │ │ │ │ ├── th (46).jpeg │ │ │ │ ├── th (47).jpeg │ │ │ │ ├── th (48).jpeg │ │ │ │ ├── th (49).jpeg │ │ │ │ ├── th (5).jpeg │ │ │ │ ├── th (50).jpeg │ │ │ │ ├── th (6).jpeg │ │ │ │ ├── th (7).jpeg │ │ │ │ ├── th (8).jpeg │ │ │ │ ├── th (9).jpeg │ │ │ │ └── th.jpeg │ │ └── query_bing_images.py │ │ ├── myclusters.csv │ │ ├── partisan_voting.csv │ │ ├── pcavsfit.png │ │ └── shuttle.png ├── lab5 │ ├── Lab5.ipynb │ └── data │ │ ├── bias-variance-error.png │ │ ├── lc-hb.png │ │ ├── lc-hv.png │ │ ├── olive.csv │ │ └── reg-bias-variance.png ├── lab6 │ ├── BayesLinear.ipynb │ └── _multivariate.py ├── lab7 │ └── 
GibbsSampler.ipynb ├── lab8 │ ├── anagrams.py │ ├── baseball_friends.csv │ ├── friend_affiliations.py │ ├── generate_friends.py │ ├── lab8_mapreduce.ipynb │ ├── most_used_word.py │ ├── names.txt │ ├── word_count.py │ └── word_list.txt └── lab9 │ ├── lab_9.ipynb │ ├── lab_9_with_answers.ipynb │ └── linkedin_alexander_lex.csv ├── lec_03_statistical_graphs.ipynb ├── lec_03_statistical_graphs_mpl_default.ipynb ├── lec_04_scraping.ipynb ├── lec_04_wrangling.ipynb ├── lec_10_cross_val.ipynb ├── matplotlib_examples ├── geographic_plots.ipynb ├── imdb.tsv ├── scatter_plots.ipynb └── torn.csv └── skeleton.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | 37 | #OSX 38 | __MACOSX/ 39 | .DS_Store 40 | 41 | #Ipython 42 | .ipynb_checkpoints/ 43 | -------------------------------------------------------------------------------- /HW0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "source": [ 12 | "# Homework 0\n", 13 | "\n", 14 | "### Due Tuesday, September 10 (but no submission is required)\n", 15 | "\n", 16 | "---\n", 17 | "\n", 18 | "Welcome to CS109 / STAT121 / AC209 / E-109 (http://cs109.org/). In this class, we will be using a variety of tools that will require some initial configuration. To ensure everything goes smoothly moving forward, we will set up the majority of those tools in this homework. While some of this will likely be dull, doing it now will enable us to do more exciting work in the weeks that follow without getting bogged down in further software configuration. This homework will not be graded; however, it is essential that you complete it promptly, since it will enable us to set up your accounts. You do not have to hand anything in, with the exception of filling out the online survey. \n", 19 | "\n", 20 | "## Class Survey, Piazza, and Introduction\n", 21 | "\n", 22 | "**Class Survey**\n", 23 | "\n", 24 | "Please complete the mandatory course survey located [here](https://docs.google.com/spreadsheet/viewform?formkey=dFg1ZFJwLWJ6ZWhWR1JJb0tES3lGMEE6MA#gid=0). It should only take a few moments of your time. Once you fill in the survey we will sign you up to the course forum on Piazza and the dropbox system that you will use to hand in the homework. It is imperative that you fill out the survey on time as we use the provided information to sign you up for these services. \n", 25 | "\n", 26 | "**Piazza**\n", 27 | "\n", 28 | "Go to [Piazza](https://piazza.com/harvard/fall2013/cs109/home) and sign up for the class using your Harvard e-mail address. \n", 29 | "\n", 30 | "You will use Piazza as a forum for discussion, to find team members, to arrange appointments, and to ask questions. Piazza should be your primary form of communication with the staff. Use the staff e-mail (staff@cs109.org) only for individual requests, e.g., to excuse yourself from a mandatory guest lecture. 
All readings, homeworks, and project descriptions will be announced on Piazza first. \n", 31 | "\n", 32 | "**Introduction**\n", 33 | "\n", 34 | "Once you are signed up to the Piazza course forum, introduce yourself to your classmates and course staff with a follow-up post in the introduction thread. Include your name/nickname, your affiliation, why you are taking this course, and tell us something interesting about yourself (e.g., an industry job, an unusual hobby, past travels, or a cool project you did, etc.). Also tell us whether you have experience with data science. \n", 35 | "\n", 36 | "## Programming expectations\n", 37 | "\n", 38 | "All the assignments and labs for this class will use Python and, for the most part, the browser-based IPython notebook format you are currently viewing. Knowledge of Python is not a prerequisite for this course, **provided you are comfortable learning on your own as needed**. While we have strived to make the programming component of this course straightforward, we will not devote much time to teaching programming or Python syntax. Basically, you should feel comfortable with:\n", 39 | "\n", 40 | "* How to look up Python syntax on Google and StackOverflow.\n", 41 | "* Basic programming concepts like functions, loops, arrays, dictionaries, strings, and if statements.\n", 42 | "* How to learn new libraries by reading documentation.\n", 43 | "* Asking questions on StackOverflow or Piazza.\n", 44 | "\n", 45 | "There are many online tutorials to introduce you to scientific Python programming. [Here is one](https://github.com/jrjohansson/scientific-python-lectures) that is very nice. Lectures 1-4 are most relevant to this class.\n", 46 | "\n", 47 | "## Getting Python\n", 48 | "\n", 49 | "You will be using Python throughout the course, including many popular 3rd party Python libraries for scientific computing. [Anaconda](http://continuum.io/downloads) is an easy-to-install bundle of Python and most of these libraries. We recommend that you use Anaconda for this course.\n", 50 | "\n", 51 | "Please visit [this page](https://github.com/cs109/content/wiki/Installing-Python) and follow the instructions to set up Python.\n", 52 | "\n", 53 | "\n", 54 | "\n", 55 | "## Hello, Python\n", 56 | "\n", 57 | "The IPython notebook is an application to build interactive computational notebooks. You'll be using them to complete labs and homework. Once you've set up Python, please download this page, and open it with IPython by typing\n", 58 | "\n", 59 | "```\n", 60 | "ipython notebook \n", 61 | "```\n", 62 | "\n", 63 | "For the rest of the assignment, use your local copy of this page, running on IPython.\n", 64 | "\n", 65 | "Notebooks are composed of many \"cells\", which can contain text (like this one), or code (like the one below). Double click on the cell below, and evaluate it by clicking the \"play\" button above, or by hitting shift + enter" 66 | ], 67 | "cell_type": "markdown", 68 | "metadata": {} 69 | }, 70 | { 71 | "cell_type": "code", 72 | "language": "python", 73 | "outputs": [], 74 | "collapsed": false, 75 | "prompt_number": 1, 76 | "input": [ 77 | "x = [10, 20, 30, 40, 50]\n", 78 | "for item in x:\n", 79 | " print \"Item is \", item" 80 | ], 81 | "metadata": {} 82 | }, 83 | { 84 | "source": [ 85 | "## Python Libraries\n", 86 | "\n", 87 | "We will be using several different libraries throughout this course. 
If you've successfully completed the [installation instructions](https://github.com/cs109/content/wiki/Installing-Python), all of the following statements should run." 88 | ], 89 | "cell_type": "markdown", 90 | "metadata": {} 91 | }, 92 | { 93 | "cell_type": "code", 94 | "language": "python", 95 | "outputs": [], 96 | "collapsed": false, 97 | "prompt_number": 2, 98 | "input": [ 99 | "#IPython is what you are using now to run the notebook\n", 100 | "import IPython\n", 101 | "print \"IPython version: %6.6s (need at least 1.0)\" % IPython.__version__\n", 102 | "\n", 103 | "# Numpy is a library for working with Arrays\n", 104 | "import numpy as np\n", 105 | "print \"Numpy version: %6.6s (need at least 1.7.1)\" % np.__version__\n", 106 | "\n", 107 | "# SciPy implements many different numerical algorithms\n", 108 | "import scipy as sp\n", 109 | "print \"SciPy version: %6.6s (need at least 0.12.0)\" % sp.__version__\n", 110 | "\n", 111 | "# Pandas makes working with data tables easier\n", 112 | "import pandas as pd\n", 113 | "print \"Pandas version: %6.6s (need at least 0.11.0)\" % pd.__version__\n", 114 | "\n", 115 | "# Module for plotting\n", 116 | "import matplotlib\n", 117 | "print \"Matplotlib version: %6.6s (need at least 1.2.1)\" % matplotlib.__version__\n", 118 | "\n", 119 | "# SciKit Learn implements several Machine Learning algorithms\n", 120 | "import sklearn\n", 121 | "print \"Scikit-Learn version: %6.6s (need at least 0.13.1)\" % sklearn.__version__\n", 122 | "\n", 123 | "# Requests is a library for getting data from the Web\n", 124 | "import requests\n", 125 | "print \"requests version: %6.6s (need at least 1.2.3)\" % requests.__version__\n", 126 | "\n", 127 | "# Networkx is a library for working with networks\n", 128 | "import networkx as nx\n", 129 | "print \"NetworkX version: %6.6s (need at least 1.7)\" % nx.__version__\n", 130 | "\n", 131 | "#BeautifulSoup is a library to parse HTML and XML documents\n", 132 | "import BeautifulSoup\n", 133 | "print \"BeautifulSoup version: %6.6s (need at least 3.2)\" % BeautifulSoup.__version__\n", 134 | "\n", 135 | "#MrJob is a library to run map reduce jobs on Amazon's computers\n", 136 | "import mrjob\n", 137 | "print \"Mr Job version: %6.6s (need at least 0.4)\" % mrjob.__version__\n", 138 | "\n", 139 | "#Pattern has lots of tools for working with data from the internet\n", 140 | "import pattern\n", 141 | "print \"Pattern version: %6.6s (need at least 2.6)\" % pattern.__version__" 142 | ], 143 | "metadata": {} 144 | }, 145 | { 146 | "source": [ 147 | "If any of these libraries are missing or out of date, you will need to [install them](https://github.com/cs109/content/wiki/Installing-Python#installing-additional-libraries) and restart IPython." 148 | ], 149 | "cell_type": "markdown", 150 | "metadata": {} 151 | }, 152 | { 153 | "source": [ 154 | "## Hello matplotlib" 155 | ], 156 | "cell_type": "markdown", 157 | "metadata": {} 158 | }, 159 | { 160 | "source": [ 161 | "The notebook integrates nicely with Matplotlib, the primary plotting package for Python. 
This should embed a figure of a sine wave:" 162 | ], 163 | "cell_type": "markdown", 164 | "metadata": {} 165 | }, 166 | { 167 | "cell_type": "code", 168 | "language": "python", 169 | "outputs": [], 170 | "collapsed": false, 171 | "prompt_number": 3, 172 | "input": [ 173 | "#this line prepares IPython for working with matplotlib\n", 174 | "%matplotlib inline \n", 175 | "\n", 176 | "# this actually imports matplotlib\n", 177 | "import matplotlib.pyplot as plt \n", 178 | "\n", 179 | "x = np.linspace(0, 10, 30) #array of 30 points from 0 to 10\n", 180 | "y = np.sin(x)\n", 181 | "z = y + np.random.normal(size=30) * .2\n", 182 | "plt.plot(x, y, 'ro-', label='A sine wave')\n", 183 | "plt.plot(x, z, 'b-', label='Noisy sine')\n", 184 | "plt.legend(loc = 'lower right')\n", 185 | "plt.xlabel(\"X axis\")\n", 186 | "plt.ylabel(\"Y axis\") " 187 | ], 188 | "metadata": {} 189 | }, 190 | { 191 | "source": [ 192 | "If that last cell complained about the `%matplotlib` line, you need to update IPython to v1.0, and restart the notebook. See the [installation page](https://github.com/cs109/content/wiki/Installing-Python)" 193 | ], 194 | "cell_type": "markdown", 195 | "metadata": {} 196 | }, 197 | { 198 | "source": [ 199 | "## Hello Numpy\n", 200 | "\n", 201 | "The Numpy array processing library is the basis of nearly all numerical computing in Python. Here's a 30-second crash course. For more details, consult Chapter 4 of Python for Data Analysis, or the [Numpy User's Guide](http://docs.scipy.org/doc/numpy-dev/user/index.html)" 202 | ], 203 | "cell_type": "markdown", 204 | "metadata": {} 205 | }, 206 | { 207 | "cell_type": "code", 208 | "language": "python", 209 | "outputs": [], 210 | "collapsed": false, 211 | "prompt_number": 4, 212 | "input": [ 213 | "print \"Make a 3 row x 4 column array of random numbers\"\n", 214 | "x = np.random.random((3, 4))\n", 215 | "print x\n", 216 | "print\n", 217 | "\n", 218 | "print \"Add 1 to every element\"\n", 219 | "x = x + 1\n", 220 | "print x\n", 221 | "print\n", 222 | "\n", 223 | "print \"Get the element at row 1, column 2\"\n", 224 | "print x[1, 2]\n", 225 | "print\n", 226 | "\n", 227 | "# The colon syntax is called \"slicing\" the array. \n", 228 | "print \"Get the first row\"\n", 229 | "print x[0, :]\n", 230 | "print\n", 231 | "\n", 232 | "print \"Get every 2nd column of the first row\"\n", 233 | "print x[0, ::2]\n", 234 | "print" 235 | ], 236 | "metadata": {} 237 | }, 238 | { 239 | "source": [ 240 | "Print the maximum, minimum, and mean of the array. This does **not** require writing a loop. In the code cell below, type `x.m` and hit the TAB key to find built-in operations for common array statistics like this" 241 | ], 242 | "cell_type": "markdown", 243 | "metadata": {} 244 | }, 245 | { 246 | "cell_type": "code", 247 | "language": "python", 248 | "outputs": [], 249 | "collapsed": false, 250 | "prompt_number": 5, 251 | "input": [ 252 | "#your code here\n" 253 | ], 254 | "metadata": {} 255 | }, 256 | { 257 | "source": [ 258 | "Call the `x.max` function again, but use the `axis` keyword to print the maximum of each row in x.\n",
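"\n",
"(A quick reminder of how the `axis` keyword works -- shown here with `sum`, so the `max` exercise is still yours to solve: `axis=0` aggregates down the columns of a 2-d array, and `axis=1` aggregates across its rows.)\n",
"\n",
"```\n",
"x = np.arange(12).reshape(3, 4)\n",
"x.sum(axis=0)   # column sums -> array([12, 15, 18, 21])\n",
"x.sum(axis=1)   # row sums    -> array([ 6, 22, 38])\n",
"```"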
259 | ], 260 | "cell_type": "markdown", 261 | "metadata": {} 262 | }, 263 | { 264 | "cell_type": "code", 265 | "language": "python", 266 | "outputs": [], 267 | "collapsed": false, 268 | "prompt_number": 6, 269 | "input": [ 270 | "#your code here\n" 271 | ], 272 | "metadata": {} 273 | }, 274 | { 275 | "source": [ 276 | "Here's a way to quickly simulate 500 \"fair\" coin tosses (where the probability of getting Heads is 50%, or 0.5)" 277 | ], 278 | "cell_type": "markdown", 279 | "metadata": {} 280 | }, 281 | { 282 | "cell_type": "code", 283 | "language": "python", 284 | "outputs": [], 285 | "collapsed": false, 286 | "prompt_number": 7, 287 | "input": [ 288 | "x = np.random.binomial(500, .5)\n", 289 | "print \"number of heads:\", x" 290 | ], 291 | "metadata": {} 292 | }, 293 | { 294 | "source": [ 295 | "Repeat this simulation 500 times, and use the [plt.hist() function](http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.hist) to plot a histogram of the number of Heads (1s) in each simulation" 296 | ], 297 | "cell_type": "markdown", 298 | "metadata": {} 299 | }, 300 | { 301 | "cell_type": "code", 302 | "language": "python", 303 | "outputs": [], 304 | "collapsed": false, 305 | "prompt_number": 8, 306 | "input": [ 307 | "#your code here\n" 308 | ], 309 | "metadata": {} 310 | }, 311 | { 312 | "source": [ 313 | "## The Monty Hall Problem\n", 314 | "\n", 315 | "\n", 316 | "Here's a fun and perhaps surprising statistical riddle, and a good way to get some practice writing Python functions\n", 317 | "\n", 318 | "In a game show, contestants try to guess which of 3 closed doors contains a cash prize (goats are behind the other two doors). Of course, the odds of choosing the correct door are 1 in 3. As a twist, the host of the show occasionally opens a door after a contestant makes his or her choice. This door is always one of the two the contestant did not pick, and is also always one of the goat doors (note that it is always possible to do this, since there are two goat doors). At this point, the contestant has the option of keeping his or her original choice, or switching to the other unopened door. The question is: is there any benefit to switching doors? The answer surprises many people who haven't heard the question before.\n", 319 | "\n", 320 | "We can answer the problem by running simulations in Python. We'll do it in several parts.\n", 321 | "\n",
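"One handy NumPy building block for the functions below (a hint, not a requirement -- any correct approach is fine): `np.random.randint(0, 3, size=5)` draws five random integers from {0, 1, 2}, e.g. `array([0, 2, 1, 1, 0])`.\n",
"\n",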
322 | "First, write a function called `simulate_prizedoor`. This function will simulate the location of the prize in many games -- see the detailed specification below:" 323 | ], 324 | "cell_type": "markdown", 325 | "metadata": {} 326 | }, 327 | { 328 | "cell_type": "code", 329 | "language": "python", 330 | "outputs": [], 331 | "collapsed": false, 332 | "prompt_number": 9, 333 | "input": [ 334 | "\"\"\"\n", 335 | "Function\n", 336 | "--------\n", 337 | "simulate_prizedoor\n", 338 | "\n", 339 | "Generate a random array of 0s, 1s, and 2s, representing\n", 340 | "hiding a prize behind door 0, door 1, or door 2\n", 341 | "\n", 342 | "Parameters\n", 343 | "----------\n", 344 | "nsim : int\n", 345 | " The number of simulations to run\n", 346 | "\n", 347 | "Returns\n", 348 | "-------\n", 349 | "sims : array\n", 350 | " Random array of 0s, 1s, and 2s\n", 351 | "\n", 352 | "Example\n", 353 | "-------\n", 354 | ">>> print simulate_prizedoor(3)\n", 355 | "array([0, 0, 2])\n", 356 | "\"\"\"\n", 357 | "def simulate_prizedoor(nsim):\n", 358 | " #compute here\n", 359 | " return answer\n", 360 | "#your code here\n" 361 | ], 362 | "metadata": {} 363 | }, 364 | { 365 | "source": [ 366 | "Next, write a function that simulates the contestant's guesses for `nsim` simulations. Call this function `simulate_guess`. The specs:" 367 | ], 368 | "cell_type": "markdown", 369 | "metadata": {} 370 | }, 371 | { 372 | "cell_type": "code", 373 | "language": "python", 374 | "outputs": [], 375 | "collapsed": false, 376 | "prompt_number": 10, 377 | "input": [ 378 | "\"\"\"\n", 379 | "Function\n", 380 | "--------\n", 381 | "simulate_guess\n", 382 | "\n", 383 | "Return any strategy for guessing which door a prize is behind. This\n", 384 | "could be a random strategy, one that always guesses 2, whatever.\n", 385 | "\n", 386 | "Parameters\n", 387 | "----------\n", 388 | "nsim : int\n", 389 | " The number of simulations to generate guesses for\n", 390 | "\n", 391 | "Returns\n", 392 | "-------\n", 393 | "guesses : array\n", 394 | " An array of guesses. Each guess is a 0, 1, or 2\n", 395 | "\n", 396 | "Example\n", 397 | "-------\n", 398 | ">>> print simulate_guess(5)\n", 399 | "array([0, 0, 0, 0, 0])\n", 400 | "\"\"\"\n", 401 | "#your code here\n" 402 | ], 403 | "metadata": {} 404 | }, 405 | { 406 | "source": [ 407 | "Next, write a function, `goat_door`, to simulate randomly revealing one of the goat doors that a contestant didn't pick." 408 | ], 409 | "cell_type": "markdown", 410 | "metadata": {} 411 | }, 412 | { 413 | "cell_type": "code", 414 | "language": "python", 415 | "outputs": [], 416 | "collapsed": false, 417 | "prompt_number": 11, 418 | "input": [ 419 | "\"\"\"\n", 420 | "Function\n", 421 | "--------\n", 422 | "goat_door\n", 423 | "\n", 424 | "Simulate the opening of a \"goat door\" that doesn't contain the prize,\n", 425 | "and is different from the contestant's guess\n", 426 | "\n", 427 | "Parameters\n", 428 | "----------\n", 429 | "prizedoors : array\n", 430 | " The door that the prize is behind in each simulation\n", 431 | "guesses : array\n", 432 | " The door that the contestant guessed in each simulation\n", 433 | "\n", 434 | "Returns\n", 435 | "-------\n", 436 | "goats : array\n", 437 | " The goat door that is opened for each simulation. 
Each item is 0, 1, or 2, and is different\n", 438 | " from both prizedoors and guesses\n", 439 | "\n", 440 | "Examples\n", 441 | "--------\n", 442 | ">>> print goat_door(np.array([0, 1, 2]), np.array([1, 1, 1]))\n", 443 | ">>> array([2, 2, 0])\n", 444 | "\"\"\"\n", 445 | "#your code here\n" 446 | ], 447 | "metadata": {} 448 | }, 449 | { 450 | "source": [ 451 | "Write a function, `switch_guess`, that represents the strategy of always switching a guess after the goat door is opened." 452 | ], 453 | "cell_type": "markdown", 454 | "metadata": {} 455 | }, 456 | { 457 | "cell_type": "code", 458 | "language": "python", 459 | "outputs": [], 460 | "collapsed": false, 461 | "prompt_number": 12, 462 | "input": [ 463 | "\"\"\"\n", 464 | "Function\n", 465 | "--------\n", 466 | "switch_guess\n", 467 | "\n", 468 | "The strategy that always switches a guess after the goat door is opened\n", 469 | "\n", 470 | "Parameters\n", 471 | "----------\n", 472 | "guesses : array\n", 473 | " Array of original guesses, for each simulation\n", 474 | "goatdoors : array\n", 475 | " Array of revealed goat doors for each simulation\n", 476 | "\n", 477 | "Returns\n", 478 | "-------\n", 479 | "The new door after switching. Should be different from both guesses and goatdoors\n", 480 | "\n", 481 | "Examples\n", 482 | "--------\n", 483 | ">>> print switch_guess(np.array([0, 1, 2]), np.array([1, 2, 1]))\n", 484 | ">>> array([2, 0, 0])\n", 485 | "\"\"\"\n", 486 | "#your code here\n" 487 | ], 488 | "metadata": {} 489 | }, 490 | { 491 | "source": [ 492 | "Last function: write a `win_percentage` function that takes an array of `guesses` and `prizedoors`, and returns the percent of correct guesses" 493 | ], 494 | "cell_type": "markdown", 495 | "metadata": {} 496 | }, 497 | { 498 | "cell_type": "code", 499 | "language": "python", 500 | "outputs": [], 501 | "collapsed": false, 502 | "prompt_number": 13, 503 | "input": [ 504 | "\"\"\"\n", 505 | "Function\n", 506 | "--------\n", 507 | "win_percentage\n", 508 | "\n", 509 | "Calculate the percent of times that a simulation of guesses is correct\n", 510 | "\n", 511 | "Parameters\n", 512 | "-----------\n", 513 | "guesses : array\n", 514 | " Guesses for each simulation\n", 515 | "prizedoors : array\n", 516 | " Location of prize for each simulation\n", 517 | "\n", 518 | "Returns\n", 519 | "--------\n", 520 | "percentage : number between 0 and 100\n", 521 | " The win percentage\n", 522 | "\n", 523 | "Examples\n", 524 | "---------\n", 525 | ">>> print win_percentage(np.array([0, 1, 2]), np.array([0, 0, 0]))\n", 526 | "33.333\n", 527 | "\"\"\"\n", 528 | "#your code here\n" 529 | ], 530 | "metadata": {} 531 | }, 532 | { 533 | "source": [ 534 | "Now, put it together. Simulate 10000 games where the contestant keeps his or her original guess, and 10000 games where the contestant switches his or her door after a goat door is revealed. Compute the percentage of the time the contestant wins under either strategy. Is one strategy better than the other?" 535 | ], 536 | "cell_type": "markdown", 537 | "metadata": {} 538 | }, 539 | { 540 | "cell_type": "code", 541 | "language": "python", 542 | "outputs": [], 543 | "collapsed": false, 544 | "prompt_number": 14, 545 | "input": [ 546 | "#your code here\n" 547 | ], 548 | "metadata": {} 549 | }, 550 | { 551 | "source": [ 552 | "Many people find this answer counter-intuitive (famously, PhD mathematicians have incorrectly claimed the result must be wrong. Clearly, none of them knew Python). 
\n", 553 | "\n", 554 | "One of the best ways to build intuition about why opening a goat door affects the odds is to re-run the experiment with 100 doors and one prize. If the game show host opens 98 goat doors after you make your initial selection, would you want to keep your first pick or switch? Can you generalize your simulation code to handle the case of `n` doors?" 555 | ], 556 | "cell_type": "markdown", 557 | "metadata": {} 558 | } 559 | ], 560 | "metadata": {} 561 | } 562 | ] 563 | } -------------------------------------------------------------------------------- /InstructionsForAmazonEMR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "#Instructions for Amazon Setup" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Getting an Amazon account and your credits:\n", 22 | "\n", 23 | "For the class, Amazon will be providing each of you with $100 of free AWS credits. \n", 24 | "\n", 25 | "First, you must register for an AWS account, during which you will be required to enter your own personal credit card information. Once you have registered, we will provide you with a $100 credit code. It is important that you understand that once the provided credit code is used up, your credit card will be charged for any additional AWS usage, so be sure to keep track of your usage.\n", 26 | "\n", 27 | "The following steps will guide you through the registration process:\n", 28 | "\n", 29 | "1. [Sign up for AWS](https://aws-portal.amazon.com/gp/aws/developer/registration/index.html) using your personal Amazon account or by creating a new AWS account.\n", 30 | "2. After signing up for AWS, [sign up for EC2](https://aws-portal.amazon.com/gp/aws/developer/subscription/index.html?productCode=AmazonEC2), which will include registration for Elastic MapReduce and other similar services. Some of these other services may carry a cost if you decide to use them for your own personal use.\n", 31 | "3. Wait for an email from us with your AWS credit code.\n", 32 | "4. Log in to your AWS Account page. Click Payment Method. At the bottom of the page, click Redeem/View AWS Credits. Then, enter your code and click redeem.\n", 33 | "5. As mentioned in class, you may want to set up a [billing alert](https://portal.aws.amazon.com/gp/aws/developer/account/index.html?ie=UTF8&action=billing-alerts&) using this link.\n", 34 | "\n", 35 | "You can manage your account via the [AWS Console](https://aws.amazon.com/console).\n", 36 | "\n", 37 | "####Get set up to run mrjob on EMR\n", 38 | "\n", 39 | "You can find out more about MRJob [here](http://packages.python.org/mrjob/index.html).\n", 40 | "\n", 41 | "To set yourself up to use `mrjob` on Amazon, after getting your Amazon credits and setting up an AWS account, read the following [QuickStart](http://pythonhosted.org/mrjob/guides/emr-quickstart.html). If you follow the instructions in there, you should have set up your access key, optionally set up ssh tunnel access, and written your access keys to the ~/.mrjob.conf file. You could also set\n", 42 | "\n", 43 | " MRJOB_CONF=/home/you/yourpath/fileName.txt\n", 44 | " \n", 45 | "with the appropriate syntax in bash/csh/zsh/command.exe.\n", 46 | "\n", 47 | "Use Region `us-east-1` when prompted for choosing a region. 
This might sometimes show up as Virginia. It's OK to use another one, but beware that if you use different regions at different times, you might forget to make sure your services are shut down, and you will then incur a cost.\n", 48 | "\n", 49 | "**Note**: Just a reminder, with these keys ANYONE can send a job to Amazon under your guise (and you will be charged). It should be fairly obvious that you therefore do not want to distribute these keys. If at any time your keys are compromised, you can log into [your account](http://aws.amazon.com/account), click on \"Security Credentials\", create a new pair, and deactivate the current pair.\n", 50 | "\n", 51 | "If you decide to use AWS for your final project, a configuration file is preferable, to avoid repeated reconfiguration. However, you can also use the command line to configure MRJob. \n", 52 | "\n", 53 | "Type the following two commands in your terminal:\n", 54 | "\n", 55 | "* export AWS_ACCESS_KEY_ID=xxxxxx\n", 56 | "* export AWS_SECRET_ACCESS_KEY=yyyyyy\n", 57 | "\n", 58 | "where xxxxxx and yyyyyy are your Access Key ID and Secret Access Key, respectively (or use the Windows or csh equivalents).\n", 59 | "\n", 60 | "By default, a single \u201csmall standard on-demand\u201d instance will be used for computation. However, these settings can be modified via any of the previously mentioned configuration methods using the \u201cec2_instance_type\u201d and \u201cnum_ec2_instances\u201d flags. See [here](http://packages.python.org/mrjob/configs-runners.html#on-emr) for more details on these flags as well as others. \n", 61 | "\n", 62 | "### Testing:\n", 63 | "\n", 64 | "\n", 65 | "At this point it is a good idea to try running the scripts at [the mrjob quickstart](http://pythonhosted.org/mrjob/guides/quickstart.html). Note that EMR is billed by the hour, so run as many tests as you can (or as much of your code as you can) in batches of 1 hour, so you can have more credits left over for your own future use.\n", 66 | "\n", 67 | "\n", 68 | "**Important**: Please always make sure that your code is bug-free before actually submitting it to Amazon. Try to run the job locally first and see if it produces the desired result. Then, if this worked, you are ready to proceed to the cloud. The homework problems are small and your free credit should provide you with a lot of room for running and testing on Amazon. However, it is your responsibility to make sure the jobs terminate properly and do not cause excessive costs. You can always monitor your currently running jobs using [this overview at region US-EAST-1](https://console.aws.amazon.com/elasticmapreduce/home?region=us-east-1) of your MapReduce job flows.\n",
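"\n",
"As a concrete illustration of the configuration file discussed above, here is a minimal sketch of what `~/.mrjob.conf` might contain (YAML; xxxxxx and yyyyyy are the same placeholders as before, to be replaced with your own keys):\n",
"\n",
"```\n",
"runners:\n",
"  emr:\n",
"    aws_access_key_id: xxxxxx\n",
"    aws_secret_access_key: yyyyyy\n",
"```"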
69 | ] 70 | } 71 | ], 72 | "metadata": {} 73 | } 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Harvard CS 109: Data Science 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Welcome to CS109: Data Science 2 | ======= 3 | 4 | ## Assignments 5 | 6 | * [Homework 0](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW0.ipynb): Hello, world ([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW0_solutions.ipynb)) 7 | * [Homework 1](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW1.ipynb): Which of two things is larger? ([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW1_solutions.ipynb)) 8 | * [Homework 2](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW2.ipynb): Desperately Seeking Silver ([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW2_solutions.ipynb)) 9 | * [Homework 3](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW3.ipynb): Bayesian Tomatoes ([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW3_solutions.ipynb)) 10 | * [Homework 4](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW4.ipynb): Do We Really Need Chocolate Recommendations? 
([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW4_solutions.ipynb)) 11 | * [Homework 5](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW5.ipynb): Networks and Congress ([solutions](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/HW5_solutions.ipynb)) 12 | 13 | ## Lecture Supplements 14 | 15 | * [A gallery of statistical graphs with matplotlib](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/lec_03_statistical_graphs.ipynb) (see also the version with [default matplotlib styles](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/lec_03_statistical_graphs_mpl_default.ipynb)) 16 | * [A rubric for data wrangling and exploratory data analysis](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/lec_04_wrangling.ipynb) 17 | * [Web Scraping and Parsing Demo](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/lec_04_scraping.ipynb) 18 | * [Cross Validation: The Right and Wrong Way](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/lec_10_cross_val.ipynb) 19 | 20 | ## Labs 21 | 22 | * [Lab 2: Web Scraping](https://github.com/cs109/content/tree/master/labs/lab2) 23 | * [Lab 3: EDA, Pandas, Matplotlib](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab3/lab3full.ipynb) 24 | * [Lab 4: Scikit-Learn, Regression, PCA](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab4/Lab4full.ipynb) 25 | * [Lab 5: Bias, Variance, Cross-Validation](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab5/Lab5.ipynb) 26 | * [Lab 6: Bayes, Linear Regression, and Metropolis Sampling](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab6/BayesLinear.ipynb) 27 | * [Lab 7: Gibbs Sampling](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab7/GibbsSampler.ipynb) 28 | * [Lab 8: MapReduce](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab8/lab8_mapreduce.ipynb) 29 | * [Lab 9: Networks](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab9/lab_9.ipynb) 30 | * [Lab 10: Support Vector Machines](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/labs/lab10/Lab_10.ipynb) 31 | 32 | 33 | ## Other Resources 34 | 35 | * [Setting up Python](https://github.com/cs109/content/wiki/Installing-Python) 36 | -------------------------------------------------------------------------------- /computesim.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mrjob.job import MRJob 4 | from itertools import combinations, permutations 5 | from math import sqrt 6 | 7 | from scipy.stats.stats import pearsonr 8 | 9 | class RestaurantSimilarities(MRJob): 10 | 11 | def steps(self): 12 | thesteps = [ 13 | self.mr(mapper=self.line_mapper, reducer=self.users_items_collector), 14 | self.mr(mapper=self.pair_items_mapper, reducer=self.calc_sim_collector) 15 | ] 16 | return thesteps 17 | 18 | def line_mapper(self,_,line): 19 | user_id,business_id,stars,business_avg,user_avg=line.split(',') 20 | yield user_id, (business_id,stars,business_avg,user_avg) 21 | 22 | def users_items_collector(self, user_id, values): 23 | ratings=[] 24 | for business_id,stars,business_avg,user_avg in values: 25 | ratings.append((business_id,(stars, user_avg))) 26 | yield user_id, ratings 27 | 28 | def pair_items_mapper(self, user_id, values): 29 | ratings = 
values 30 | for biz1tuple, biz2tuple in combinations(ratings, 2): 31 | biz1, biz1r=biz1tuple 32 | biz2, biz2r=biz2tuple 33 | if biz1 <= biz2 : 34 | yield (biz1, biz2), (biz1r, biz2r) 35 | else: 36 | yield (biz2, biz1), (biz2r, biz1r) 37 | 38 | def calc_sim_collector(self, key, values): 39 | (rest1, rest2), common_ratings = key, values 40 | diff1=[] 41 | diff2=[] 42 | n_common=0 43 | 44 | 45 | for rt1, rt2 in common_ratings: 46 | diff1.append(float(rt1[0])-float(rt1[1])) 47 | diff2.append(float(rt2[0])-float(rt2[1])) 48 | n_common=n_common+1 49 | if n_common==0: 50 | rho=0. 51 | else: 52 | rho=pearsonr(diff1, diff2)[0] 53 | if np.isnan(rho): 54 | rho=0. 55 | yield (rest1, rest2), (rho, n_common) 56 | 57 | 58 | #Below MUST be there for things to work! 59 | if __name__ == '__main__': 60 | RestaurantSimilarities.run() 61 | -------------------------------------------------------------------------------- /computesim2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mrjob.job import MRJob 4 | from itertools import combinations, permutations 5 | from math import sqrt 6 | import mrjob 7 | 8 | from scipy.stats.stats import pearsonr 9 | 10 | class RestaurantSimilarities(MRJob): 11 | 12 | def steps(self): 13 | thesteps = [ 14 | self.mr(mapper=self.line_mapper, reducer=self.users_items_collector), 15 | self.mr(mapper=self.pair_items_mapper, reducer=self.calc_sim_collector), 16 | self.mr(mapper=self.ranking_mapper, reducer=self.top_similar_collector) 17 | ] 18 | return thesteps 19 | 20 | def line_mapper(self,_,line): 21 | user_id,business_id,stars,business_avg,user_avg=line.split(',') 22 | yield user_id, (business_id,stars,business_avg,user_avg) 23 | 24 | def users_items_collector(self, user_id, values): 25 | ratings=[] 26 | for business_id,stars,business_avg,user_avg in values: 27 | ratings.append((business_id,(stars, user_avg))) 28 | yield user_id, ratings 29 | 30 | def pair_items_mapper(self, user_id, values): 31 | ratings = values 32 | for biz1tuple, biz2tuple in combinations(ratings, 2): 33 | biz1, biz1r=biz1tuple 34 | biz2, biz2r=biz2tuple 35 | if biz1 <= biz2 : 36 | yield (biz1, biz2), (biz1r, biz2r) 37 | else: 38 | yield (biz2, biz1), (biz2r, biz1r) 39 | 40 | def calc_sim_collector(self, key, values): 41 | (rest1, rest2), common_ratings = key, values 42 | diff1=[] 43 | diff2=[] 44 | n_common=0 45 | 46 | 47 | for rt1, rt2 in common_ratings: 48 | diff1.append(float(rt1[0])-float(rt1[1])) 49 | diff2.append(float(rt2[0])-float(rt2[1])) 50 | n_common=n_common+1 51 | if n_common==0: 52 | rho=0. 53 | else: 54 | rho=pearsonr(diff1, diff2)[0] 55 | if np.isnan(rho): 56 | rho=0. 57 | yield (rest1, rest2), (rho, n_common) 58 | 59 | def ranking_mapper(self, restaurants, values): 60 | sim, n_common = values 61 | rest1, rest2 = restaurants 62 | if int(n_common) > 0: 63 | yield (rest1), (sim, rest2, n_common) 64 | 65 | def top_similar_collector(self, key, values): 66 | rest1 = key 67 | for sim, rest2, n_common in sorted(values, reverse=True): 68 | yield None, (rest1, rest2, sim, n_common) 69 | 70 | #Below MUST be there for things to work! 
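# Usage sketch (the input/output file names below are hypothetical):
#   test locally first:  python computesim2.py ratings_subset.csv > similarities.txt
#   then run on EMR:     python computesim2.py -r emr ratings_subset.csv > similarities.txt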
71 | if __name__ == '__main__': 72 | RestaurantSimilarities.run() 73 | -------------------------------------------------------------------------------- /labs/lab10/final_lab_images.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab10/final_lab_images.pdf -------------------------------------------------------------------------------- /labs/lab10/svm.csv: -------------------------------------------------------------------------------- 1 | -1.214,-1.178,0 2 | -4.231,-5.235,0 3 | -2.318,-7.908,0 4 | -8.435,-1.324,0 5 | -9.992,-3.456,0 6 | 1.123,8.435,1 7 | 4.389,3.123,1 8 | 1.113,1.888,1 9 | 5.312,9.123,1 10 | 4.124,8.432,1 -------------------------------------------------------------------------------- /labs/lab10/test.csv: -------------------------------------------------------------------------------- 1 | -1.06E-01,-8.15E-02,-1.00E+00 1.78E-01,-3.46E-01,-1.00E+00 1.02E-01,7.18E-01,1.00E+00 6.94E-01,6.23E-01,-1.00E+00 2.35E-02,7.27E-01,1.00E+00 -3.20E-01,-8.34E-01,-1.00E+00 -1.87E-01,5.39E-01,1.00E+00 -6.37E-01,1.53E-01,1.00E+00 -4.74E-01,8.54E-01,1.00E+00 -3.56E-02,-2.72E-01,-1.00E+00 -1.49E-01,1.62E-01,-1.00E+00 -1.81E-01,-1.29E-01,-1.00E+00 -6.02E-01,9.26E-01,1.00E+00 6.98E-01,7.95E-01,-1.00E+00 8.82E-01,-2.01E-01,1.00E+00 -9.24E-01,3.87E-01,1.00E+00 -7.66E-01,-1.13E-02,1.00E+00 1.36E-01,3.17E-02,-1.00E+00 -1.55E-01,-3.31E-01,-1.00E+00 4.85E-01,2.99E-01,-1.00E+00 -6.03E-01,3.33E-01,1.00E+00 -5.73E-01,8.28E-01,1.00E+00 -6.35E-01,-4.75E-01,-1.00E+00 9.09E-01,-7.85E-01,1.00E+00 2.52E-01,-8.94E-01,1.00E+00 -5.18E-01,9.60E-01,1.00E+00 -3.86E-01,-3.18E-01,-1.00E+00 8.23E-01,-1.28E-01,1.00E+00 8.22E-01,-8.77E-01,1.00E+00 -5.04E-01,9.80E-01,1.00E+00 5.34E-01,8.21E-01,-1.00E+00 -8.95E-01,-2.40E-01,1.00E+00 3.43E-01,4.75E-01,-1.00E+00 7.09E-01,5.62E-01,-1.00E+00 -1.00E+00,6.05E-02,1.00E+00 5.24E-01,7.35E-01,-1.00E+00 -5.60E-01,7.56E-01,1.00E+00 6.98E-01,-6.72E-01,1.00E+00 4.90E-01,7.85E-01,-1.00E+00 -3.27E-01,3.43E-01,1.00E+00 -2.93E-03,-4.15E-01,-1.00E+00 -6.31E-01,3.53E-01,1.00E+00 9.14E-01,5.93E-01,-1.00E+00 2.18E-01,3.97E-02,-1.00E+00 -6.16E-01,-8.87E-01,-1.00E+00 -5.29E-01,2.87E-02,1.00E+00 -4.07E-01,1.05E+00,1.00E+00 -2.30E-01,7.14E-02,-1.00E+00 -5.02E-01,8.34E-01,1.00E+00 -5.08E-01,7.93E-01,1.00E+00 -7.91E-01,1.88E-01,1.00E+00 -3.83E-01,8.25E-01,1.00E+00 8.22E-01,4.01E-01,-1.00E+00 9.86E-01,-3.29E-01,1.00E+00 -1.40E-02,-1.52E-01,-1.00E+00 -5.42E-02,9.14E-01,1.00E+00 -1.07E+00,-7.20E-01,-1.00E+00 -2.43E-01,-1.04E+00,1.00E+00 -3.24E-01,-2.83E-01,-1.00E+00 2.48E-01,-2.56E-01,-1.00E+00 -1.72E-01,-8.49E-01,1.00E+00 -4.17E-01,-3.93E-01,-1.00E+00 -3.48E-01,-5.74E-01,-1.00E+00 -8.52E-01,-7.23E-01,-1.00E+00 -7.25E-01,-3.74E-01,-1.00E+00 3.45E-01,-2.23E-02,-1.00E+00 7.42E-01,7.41E-01,-1.00E+00 -1.37E-01,-3.47E-01,-1.00E+00 1.06E-01,6.34E-01,1.00E+00 3.32E-01,-5.66E-01,1.00E+00 -4.18E-01,9.49E-01,1.00E+00 -4.05E-01,-6.13E-01,-1.00E+00 -7.97E-01,9.07E-01,1.00E+00 8.76E-01,3.60E-01,-1.00E+00 5.44E-01,-1.81E-01,1.00E+00 7.54E-02,-5.11E-01,-1.00E+00 5.64E-01,7.72E-01,-1.00E+00 8.17E-01,5.26E-01,-1.00E+00 -3.77E-01,1.06E-01,1.00E+00 4.36E-01,1.50E-01,-1.00E+00 3.97E-01,-5.49E-01,1.00E+00 -2.74E-01,6.02E-01,1.00E+00 -9.89E-01,1.58E-01,1.00E+00 -5.16E-01,-8.25E-01,-1.00E+00 9.81E-01,5.46E-01,-1.00E+00 7.78E-01,-8.93E-01,1.00E+00 -2.59E-01,-6.44E-01,-1.00E+00 -2.38E-01,-9.07E-01,-1.00E+00 -6.04E-01,8.82E-02,1.00E+00 -2.80E-01,-1.51E-02,-1.00E+00 -2.04E-01,7.98E-01,1.00E+00 
-1.64E-01,4.36E-01,1.00E+00 7.44E-01,4.11E-01,-1.00E+00 -3.32E-01,-4.59E-01,-1.00E+00 -2.77E-02,-3.61E-01,-1.00E+00 7.07E-01,7.54E-01,-1.00E+00 -8.23E-01,-3.03E-01,1.00E+00 -9.85E-01,-3.84E-01,1.00E+00 -4.91E-01,7.03E-01,1.00E+00 -5.22E-01,3.00E-01,1.00E+00 -5.70E-01,1.04E-01,1.00E+00 -3.24E-01,7.22E-01,1.00E+00 9.20E-01,-3.26E-01,1.00E+00 8.18E-01,3.49E-01,-1.00E+00 -7.13E-01,-4.91E-01,-1.00E+00 5.37E-01,1.05E+00,-1.00E+00 4.89E-02,1.25E-01,-1.00E+00 4.00E-01,-1.34E-01,-1.00E+00 7.30E-01,-3.66E-01,1.00E+00 -8.46E-01,8.70E-01,1.00E+00 8.31E-01,5.85E-01,-1.00E+00 4.46E-01,2.83E-01,-1.00E+00 6.35E-01,8.53E-01,-1.00E+00 1.35E-01,8.40E-01,1.00E+00 -5.81E-01,-1.47E-02,1.00E+00 -4.27E-01,4.92E-01,1.00E+00 -3.09E-02,1.08E+00,1.00E+00 4.87E-01,-7.50E-01,1.00E+00 -5.06E-01,-9.10E-01,-1.00E+00 2.49E-01,3.83E-01,-1.00E+00 -4.43E-01,-7.64E-01,-1.00E+00 -1.05E-01,-8.83E-01,1.00E+00 -2.84E-01,-5.72E-01,-1.00E+00 1.01E+00,3.72E-01,1.00E+00 6.18E-02,-6.08E-01,1.00E+00 8.71E-02,3.61E-01,-1.00E+00 8.72E-01,4.14E-01,-1.00E+00 -4.22E-01,5.21E-01,1.00E+00 -5.30E-01,6.99E-01,1.00E+00 5.90E-01,4.47E-01,-1.00E+00 8.40E-01,-8.50E-01,1.00E+00 9.19E-02,-2.32E-01,-1.00E+00 -8.22E-02,-4.02E-01,-1.00E+00 9.73E-01,6.42E-01,-1.00E+00 4.36E-01,2.76E-01,-1.00E+00 5.25E-01,-5.44E-01,1.00E+00 5.55E-01,3.83E-01,-1.00E+00 9.56E-01,-8.02E-01,1.00E+00 -7.71E-01,4.33E-01,1.00E+00 8.90E-01,-5.31E-01,1.00E+00 -2.62E-01,6.66E-02,1.00E+00 6.42E-01,-1.82E-01,1.00E+00 -4.15E-01,7.78E-01,1.00E+00 4.67E-01,8.18E-01,-1.00E+00 4.78E-01,-2.49E-01,1.00E+00 -8.26E-01,9.44E-01,1.00E+00 -9.46E-01,-4.28E-01,1.00E+00 5.07E-01,-8.09E-01,1.00E+00 -5.36E-01,2.72E-02,1.00E+00 4.10E-01,4.30E-01,-1.00E+00 -2.62E-01,-5.80E-01,-1.00E+00 3.19E-01,2.16E-01,-1.00E+00 1.48E-01,-8.15E-01,1.00E+00 6.30E-01,7.00E-01,-1.00E+00 -9.54E-01,9.26E-01,1.00E+00 -2.45E-01,1.82E-02,-1.00E+00 5.81E-01,3.16E-01,-1.00E+00 4.10E-02,-4.57E-01,-1.00E+00 5.84E-01,6.83E-01,-1.00E+00 -4.87E-01,8.65E-01,1.00E+00 8.26E-01,-4.98E-02,1.00E+00 5.22E-01,-8.89E-01,1.00E+00 8.39E-01,-8.26E-01,1.00E+00 -6.30E-01,-1.49E-01,-1.00E+00 8.52E-01,-1.06E+00,1.00E+00 3.83E-01,6.79E-01,-1.00E+00 -5.70E-02,6.06E-01,1.00E+00 3.04E-01,-1.07E+00,1.00E+00 -8.61E-01,-2.26E-01,1.00E+00 -8.63E-01,2.03E-01,1.00E+00 5.59E-01,1.54E-01,-1.00E+00 -4.18E-01,-3.27E-01,-1.00E+00 1.22E-01,-2.49E-01,-1.00E+00 7.47E-01,5.55E-01,-1.00E+00 2.06E-01,5.74E-01,1.00E+00 -8.91E-01,4.99E-01,1.00E+00 -6.86E-01,-4.69E-01,-1.00E+00 -1.98E-01,-1.36E-01,-1.00E+00 6.17E-01,1.29E-01,-1.00E+00 -7.93E-01,3.61E-01,1.00E+00 -1.01E+00,7.85E-02,1.00E+00 -8.62E-01,-5.79E-01,-1.00E+00 -1.97E-01,2.75E-01,-1.00E+00 5.76E-01,9.37E-01,1.00E+00 -6.14E-01,-9.41E-01,-1.00E+00 -5.52E-01,-2.68E-01,-1.00E+00 1.60E-01,-3.42E-01,-1.00E+00 -3.97E-01,6.57E-01,1.00E+00 -5.79E-01,-8.73E-01,-1.00E+00 8.32E-01,-8.53E-02,1.00E+00 -2.99E-01,-4.29E-01,-1.00E+00 -1.49E-01,6.54E-01,1.00E+00 -7.45E-01,-7.19E-01,-1.00E+00 1.56E-01,9.21E-01,1.00E+00 5.29E-01,9.17E-01,-1.00E+00 8.28E-02,-6.28E-01,1.00E+00 -9.40E-01,-6.63E-01,-1.00E+00 7.14E-01,-2.60E-01,1.00E+00 -1.11E-02,-8.43E-01,1.00E+00 5.43E-01,1.18E-01,-1.00E+00 7.34E-01,-8.91E-01,1.00E+00 3.79E-01,-1.16E-01,-1.00E+00 -1.67E-01,-4.11E-01,-1.00E+00 -7.82E-01,3.77E-01,1.00E+00 2.72E-01,8.11E-01,1.00E+00 -8.68E-01,-6.76E-01,-1.00E+00 -1.81E-01,6.81E-01,1.00E+00 -4.45E-02,4.41E-04,-1.00E+00 4.29E-01,8.29E-01,-1.00E+00 -8.38E-01,-7.70E-02,1.00E+00 7.00E-01,-2.08E-01,1.00E+00 7.74E-01,5.12E-01,-1.00E+00 -6.88E-01,7.93E-01,1.00E+00 -4.25E-01,-8.50E-01,-1.00E+00 4.44E-01,-2.42E-01,1.00E+00 -2.19E-03,6.97E-01,1.00E+00 
3.25E-01,-1.86E-01,-1.00E+00 2.71E-01,-8.52E-01,1.00E+00 2.08E-01,-8.29E-01,1.00E+00 -3.38E-01,-8.94E-01,-1.00E+00 -2.43E-02,-5.51E-01,-1.00E+00 2.55E-01,-2.88E-01,-1.00E+00 -7.17E-01,4.20E-04,1.00E+00 1.32E-01,-4.60E-01,-1.00E+00 3.45E-01,-1.29E-01,-1.00E+00 8.24E-02,-9.73E-01,1.00E+00 5.33E-01,2.95E-01,-1.00E+00 -3.39E-01,9.20E-01,1.00E+00 5.51E-01,-8.46E-01,1.00E+00 -4.11E-01,5.12E-01,1.00E+00 4.63E-01,-7.36E-01,1.00E+00 5.76E-01,-5.90E-01,1.00E+00 -6.32E-01,-9.80E-01,-1.00E+00 -1.68E-01,-5.29E-01,-1.00E+00 7.20E-01,-1.04E+00,1.00E+00 7.50E-01,-5.38E-01,1.00E+00 2.52E-01,-9.61E-01,1.00E+00 -7.25E-01,7.44E-02,1.00E+00 -7.20E-01,-5.57E-01,-1.00E+00 -9.54E-01,4.77E-01,1.00E+00 7.11E-01,-9.90E-01,1.00E+00 2.91E-01,-4.43E-01,1.00E+00 3.20E-01,-4.01E-01,1.00E+00 2.34E-01,6.37E-01,1.00E+00 -1.96E-01,-9.90E-01,1.00E+00 -4.38E-01,1.17E-02,1.00E+00 -3.55E-01,8.20E-01,1.00E+00 3.47E-01,-5.45E-01,1.00E+00 8.36E-01,3.44E-01,-1.00E+00 -7.14E-01,-6.41E-01,-1.00E+00 -------------------------------------------------------------------------------- /labs/lab10/train.csv: -------------------------------------------------------------------------------- 1 | -7.7947021e-01,8.3822138e-01,1.0000000e+00 2 | 1.5563491e-01,8.9537743e-01,1.0000000e+00 3 | -5.9907703e-02,-7.1777995e-01,1.0000000e+00 4 | 2.0759636e-01,7.5893338e-01,1.0000000e+00 5 | -1.9598312e-01,-3.7548716e-01,-1.0000000e+00 6 | 5.8848947e-01,-8.4255381e-01,1.0000000e+00 7 | 7.1985874e-03,-5.4831650e-01,-1.0000000e+00 8 | 7.3883852e-01,-6.0339369e-01,1.0000000e+00 9 | 7.0464808e-01,-2.0420052e-02,1.0000000e+00 10 | 9.6992666e-01,6.4137120e-01,-1.0000000e+00 11 | 4.3543099e-01,7.4477254e-01,-1.0000000e+00 12 | -8.4425822e-01,7.4235423e-01,1.0000000e+00 13 | 5.9142471e-01,-5.4602118e-01,1.0000000e+00 14 | -6.9093124e-02,3.7659995e-02,-1.0000000e+00 15 | -9.5154865e-01,-7.3305502e-01,-1.0000000e+00 16 | -1.2988138e-01,7.5676096e-01,1.0000000e+00 17 | -4.9534647e-01,-5.6627908e-01,-1.0000000e+00 18 | -9.0399413e-01,5.0922150e-01,1.0000000e+00 19 | 2.9235128e-01,1.6089015e-01,-1.0000000e+00 20 | 6.4798552e-01,-7.7933769e-01,1.0000000e+00 21 | 3.7595574e-01,7.8203087e-02,-1.0000000e+00 22 | 2.4588993e-01,4.5146739e-03,-1.0000000e+00 23 | -4.5719155e-01,4.2390461e-01,1.0000000e+00 24 | -4.4127876e-01,7.0571892e-01,1.0000000e+00 25 | 5.0744669e-01,7.5872586e-01,-1.0000000e+00 -------------------------------------------------------------------------------- /labs/lab2/Lab_2_A_Live.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "import cs109style\n", 15 | "cs109style.customize_mpl()\n", 16 | "cs109style.customize_css()\n", 17 | "\n", 18 | "# special IPython command to prepare the notebook for matplotlib\n", 19 | "%matplotlib inline \n", 20 | "\n", 21 | "from collections import defaultdict\n", 22 | "\n", 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "import requests\n", 26 | "from pattern import web\n", 27 | "\n" 28 | ], 29 | "language": "python", 30 | "metadata": {}, 31 | "outputs": [] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## Fetching population data from Wikipedia\n", 38 | "\n", 39 | "In this example we will fetch data about countries and their population from Wikipedia.\n", 40 | "\n", 41 | 
"http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population has several tables for individual countries, subcontinents as well as different years. We will combine the data for all countries and all years in a single panda dataframe and visualize the change in population for different countries.\n", 42 | "\n", 43 | "###We will go through the following steps:\n", 44 | "* fetching html with embedded data\n", 45 | "* parsing html to extract the data\n", 46 | "* collecting the data in a panda dataframe\n", 47 | "* displaying the data\n", 48 | "\n", 49 | "To give you some starting points for your homework, we will also show the different sub-steps that can be taken to reach the presented solution." 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Fetching the Wikipedia site" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "collapsed": false, 62 | "input": [ 63 | "url = 'http://en.wikipedia.org/wiki/List_of_countries_by_past_and_future_population'\n", 64 | "website_html = requests.get(url).text\n", 65 | "#print website_html" 66 | ], 67 | "language": "python", 68 | "metadata": {}, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Parsing html data" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "collapsed": false, 81 | "input": [ 82 | "def get_population_html_tables(html):\n", 83 | " \"\"\"Parse html and return html tables of wikipedia population data.\"\"\"\n", 84 | "\n", 85 | " dom = web.Element(html)\n", 86 | "\n", 87 | " ### 0. step: look at html source!\n", 88 | " \n", 89 | " #### 1. step: get all tables\n", 90 | "\n", 91 | " #### 2. step: get all tables we care about\n", 92 | "\n", 93 | " return tbls\n", 94 | "\n", 95 | "tables = get_population_html_tables(website_html)\n", 96 | "print \"table length: %d\" %len(tables)\n", 97 | "for t in tables:\n", 98 | " print t.attributes\n" 99 | ], 100 | "language": "python", 101 | "metadata": {}, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "collapsed": false, 107 | "input": [ 108 | "def table_type(tbl):\n", 109 | " ### Extract the table type\n", 110 | "\n", 111 | "# group the tables by type\n", 112 | "tables_by_type = defaultdict(list) # defaultdicts have a default value that is inserted when a new key is accessed\n", 113 | "for tbl in tables:\n", 114 | " tables_by_type[table_type(tbl)].append(tbl)\n", 115 | "\n", 116 | "print tables_by_type" 117 | ], 118 | "language": "python", 119 | "metadata": {}, 120 | "outputs": [] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "## Extracting data and filling it into a dictionary" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "collapsed": false, 132 | "input": [ 133 | "def get_countries_population(tables):\n", 134 | " \"\"\"Extract population data for countries from all tables and store it in dictionary.\"\"\"\n", 135 | " \n", 136 | " result = defaultdict(dict)\n", 137 | "\n", 138 | " # 1. step: try to extract data for a single table\n", 139 | "\n", 140 | " # 2. 
140 | " # 2. step: iterate over all tables, extract headings and actual data and combine data into single dict\n", 141 | " \n", 142 | " return result\n", 143 | "\n", 144 | "\n", 145 | "result = get_countries_population(tables_by_type['Country or territory'])\n", 146 | "print result" 147 | ], 148 | "language": "python", 149 | "metadata": {}, 150 | "outputs": [] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## Creating a dataframe from a dictionary" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "collapsed": false, 162 | "input": [ 163 | "# create dataframe\n", 164 | "\n", 165 | "df = pd.DataFrame.from_dict(result, orient='index')\n", 166 | "# sort based on year\n", 167 | "df.sort(axis=1,inplace=True)\n", 168 | "print df\n" 169 | ], 170 | "language": "python", 171 | "metadata": {}, 172 | "outputs": [] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## Some data accessing functions for a pandas dataframe" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "collapsed": false, 184 | "input": [ 185 | "subtable = df.iloc[0:2, 0:2]\n", 186 | "print \"subtable\"\n", 187 | "print subtable\n", 188 | "print \"\"\n", 189 | "\n", 190 | "column = df[1955]\n", 191 | "print \"column\"\n", 192 | "print column\n", 193 | "print \"\"\n", 194 | "\n", 195 | "row = df.ix[0] #row 0\n", 196 | "print \"row\"\n", 197 | "print row\n", 198 | "print \"\"\n", 199 | "\n", 200 | "rows = df.ix[:2] #rows 0,1\n", 201 | "print \"rows\"\n", 202 | "print rows\n", 203 | "print \"\"\n", 204 | "\n", 205 | "element = df.ix[0,1955] #element\n", 206 | "print \"element\"\n", 207 | "print element\n", 208 | "print \"\"\n", 209 | "\n", 210 | "# max along column\n", 211 | "print \"max\"\n", 212 | "print df[1950].max()\n", 213 | "print \"\"\n", 214 | "\n", 215 | "# axes\n", 216 | "print \"axes\"\n", 217 | "print df.axes\n", 218 | "print \"\"\n", 219 | "\n", 220 | "row = df.ix[0]\n", 221 | "print \"row info\"\n", 222 | "print row.name\n", 223 | "print row.index\n", 224 | "print \"\"\n", 225 | "\n", 226 | "countries = df.index\n", 227 | "print \"countries\"\n", 228 | "print countries\n", 229 | "print \"\"\n", 230 | "\n", 231 | "print \"Austria\"\n", 232 | "print df.ix['Austria']" 233 | ], 234 | "language": "python", 235 | "metadata": {}, 236 | "outputs": [] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "## Plotting population of 4 countries" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "collapsed": false, 248 | "input": [ 249 | "plotCountries = ['Austria', 'Germany', 'United States', 'France']\n", 250 | " \n", 251 | "for country in plotCountries:\n", 252 | " row = df.ix[country]\n", 253 | " plt.plot(row.index, row, label=row.name ) \n", 254 | " \n", 255 | "plt.ylim(ymin=0) # start y axis at 0\n", 256 | "\n", 257 | "plt.xticks(rotation=70)\n", 258 | "plt.legend(loc='best')\n", 259 | "plt.xlabel(\"Year\")\n", 260 | "plt.ylabel(\"# people (million)\")\n", 261 | "plt.title(\"Population of countries\")" 262 | ], 263 | "language": "python", 264 | "metadata": {}, 265 | "outputs": [] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Plot 5 most populous countries from 2010 and 2050" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "collapsed": false, 277 | "input": [ 278 | "def plot_populous(df, year):\n", 279 | " # sort table depending on data value in year column\n", 280 | " df_by_year = df.sort(year, ascending=False)\n", 281 | " \n",
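" # plot the time series of the five most populous countries for that year\n",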
282 | " plt.figure()\n", 283 | " for i in range(5): \n", 284 | " row = df_by_year.ix[i]\n", 285 | " plt.plot(row.index, row, label=row.name ) \n", 286 | " \n", 287 | " plt.ylim(ymin=0)\n", 288 | " \n", 289 | " plt.xticks(rotation=70)\n", 290 | " plt.legend(loc='best')\n", 291 | " plt.xlabel(\"Year\")\n", 292 | " plt.ylabel(\"# people (million)\")\n", 293 | " plt.title(\"Most populous countries in %d\" % year)\n", 294 | "\n", 295 | "plot_populous(df, 2010)\n", 296 | "plot_populous(df, 2050)" 297 | ], 298 | "language": "python", 299 | "metadata": {}, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "collapsed": false, 305 | "input": [], 306 | "language": "python", 307 | "metadata": {}, 308 | "outputs": [] 309 | } 310 | ], 311 | "metadata": {} 312 | } 313 | ] 314 | } -------------------------------------------------------------------------------- /labs/lab2/Lab_2_B_Live.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "# Setup\n", 15 | "import pattern.web as web\n", 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "import matplotlib.pyplot as plt\n", 19 | "\n", 20 | "from cs109style import customize_mpl, customize_css\n", 21 | "customize_mpl()\n", 22 | "customize_css()\n", 23 | "%pylab inline" 24 | ], 25 | "language": "python", 26 | "metadata": {}, 27 | "outputs": [] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Example 2: extracting reddit titles, upvotes, downvotes, and submission time\n", 34 | "\n", 35 | "### We'll operate in two phases:\n", 36 | "* first, find all the URLs to comment pages on the first few front pages of reddit.\n", 37 | "* second, extract information from each comments page" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "collapsed": false, 43 | "input": [ 44 | "def get_links_from_front_pages(n):\n", 45 | " 'find URLs of comments pages, linked from the n first few pages of reddit'\n", 46 | " url = web.URL('http://www.reddit.com/')\n", 47 | " comment_pages = []\n", 48 | " for page_idx in range(n):\n", 49 | " dom = web.DOM(url.download(cached=False))\n", 50 | " \n", 51 | " ### Extract comments pages\n", 52 | " \n", 53 | " ### find the next page link - reddit has 25 links per page\n", 54 | "\n", 55 | " # use set() to remove repeated URLs\n", 56 | " return list(set(comment_pages))\n", 57 | "\n", 58 | " \n", 59 | "print len(get_links_from_front_pages(6))" 60 | ], 61 | "language": "python", 62 | "metadata": {}, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "collapsed": false, 68 | "input": [ 69 | "def info_from_comments_pages(links):\n", 70 | " 'fetch title, upvotes, downvotes, time of submission from a sequence of links'\n", 71 | " results = []\n", 72 | " for urltext in links:\n", 73 | " url = web.URL(urltext)\n", 74 | " print \"fetching info for\", url\n", 75 | " try:\n", 76 | " dom = web.DOM(url.download(cached=False))\n", 77 | " \n", 78 | " ### Extract title, upvotes, downvotes, submission time\n", 79 | " \n", 80 | " results.append((title, upvotes, downvotes, pd.to_datetime(time)))\n", 81 | " except KeyboardInterrupt:\n", 82 | " # allow us to interrupt the kernel and still continue\n", 83 | " break\n", 84 | " except:\n", 85 | " pass # some things that look like comment pages don't have the information above\n", 86 | " 
return results" 87 | ], 88 | "language": "python", 89 | "metadata": {}, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "collapsed": false, 95 | "input": [ 96 | "comments_pages = get_links_from_front_pages(5)\n", 97 | "print \"Fetching info for\", len(comments_pages), \"pages\"\n", 98 | "pages = info_from_comments_pages(comments_pages)\n", 99 | "titles, upvotes, downvotes, dates = zip(*pages) # zip(*seq) transposes a sequence of sequences.\n", 100 | "df = pd.DataFrame({'title' : titles, 'upvotes' : upvotes, 'downvotes' : downvotes, 'date' : dates}, index=dates)\n", 101 | "print df" 102 | ], 103 | "language": "python", 104 | "metadata": {}, 105 | "outputs": [] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "collapsed": false, 110 | "input": [ 111 | "df.sort('date', inplace=True)\n", 112 | "df['upvotes'].plot(c='g')\n", 113 | "df['downvotes'].plot(c='r')\n", 114 | "(df['upvotes'] - df['downvotes']).plot(c='k')\n" 115 | ], 116 | "language": "python", 117 | "metadata": {}, 118 | "outputs": [] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "collapsed": false, 123 | "input": [], 124 | "language": "python", 125 | "metadata": {}, 126 | "outputs": [] 127 | } 128 | ], 129 | "metadata": {} 130 | } 131 | ] 132 | } -------------------------------------------------------------------------------- /labs/lab2/README.md: -------------------------------------------------------------------------------- 1 | # Files for Lab 2 of CS109 2 | 3 | ## Part A 4 | 5 | * [Lab_2_A_Live.ipynb](http://nbviewer.ipython.org/github/cs109/content/blob/master/labs/lab2/Lab_2_A_Live.ipynb) - starting point for lab 6 | * [Lab_2_A_Johanna.ipynb](http://nbviewer.ipython.org/github/cs109/content/blob/master/labs/lab2/Lab_2_A_Johanna.ipynb) - Johanna's original writeup 7 | * [Lab_2_A_Live_Ray_Final.ipynb](http://nbviewer.ipython.org/github/cs109/content/blob/master/labs/lab2/Lab_2_A_Live_Ray_Final.ipynb) - Ray's in-class reconstruction 8 | 9 | ## Part B 10 | 11 | * [Lab_2_B.ipynb](http://nbviewer.ipython.org/github/cs109/content/blob/master/labs/lab2/Lab_2_B.ipynb) - Reddit example 12 | -------------------------------------------------------------------------------- /labs/lab2/cs109style.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from IPython.core.display import HTML 4 | from matplotlib import rcParams 5 | 6 | #colorbrewer2 Dark2 qualitative color table 7 | dark2_colors = [(0.10588235294117647, 0.6196078431372549, 0.4666666666666667), 8 | (0.8509803921568627, 0.37254901960784315, 0.00784313725490196), 9 | (0.4588235294117647, 0.4392156862745098, 0.7019607843137254), 10 | (0.9058823529411765, 0.1607843137254902, 0.5411764705882353), 11 | (0.4, 0.6509803921568628, 0.11764705882352941), 12 | (0.9019607843137255, 0.6705882352941176, 0.00784313725490196), 13 | (0.6509803921568628, 0.4627450980392157, 0.11372549019607843), 14 | (0.4, 0.4, 0.4)] 15 | 16 | def customize_mpl(): 17 | """Tweak matplotlib visual style""" 18 | print("Setting custom matplotlib visual style") 19 | 20 | rcParams['figure.figsize'] = (10, 6) 21 | rcParams['figure.dpi'] = 150 22 | rcParams['axes.color_cycle'] = dark2_colors 23 | rcParams['lines.linewidth'] = 2 24 | rcParams['axes.grid'] = True 25 | rcParams['axes.facecolor'] = '#eeeeee' 26 | rcParams['font.size'] = 14 27 | rcParams['patch.edgecolor'] = 'none' 28 | 29 | 30 | def customize_css(): 31 | print("Setting custom CSS for the IPython Notebook") 32 | styles = open('custom.css', 'r').read() 33 | 
return HTML(styles) 34 | -------------------------------------------------------------------------------- /labs/lab2/custom.css: -------------------------------------------------------------------------------- 1 | 61 | 76 | -------------------------------------------------------------------------------- /labs/lab3/Italy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab3/Italy.png -------------------------------------------------------------------------------- /labs/lab4/data/US_Unemployment_Oct2012.csv: -------------------------------------------------------------------------------- 1 | State,Unemployment 2 | AL,7.1 3 | AK,6.8 4 | AZ,8.1 5 | AR,7.2 6 | CA,10.1 7 | CO,7.7 8 | CT,8.4 9 | DE,7.1 10 | FL,8.2 11 | GA,8.8 12 | HI,5.4 13 | ID,6.6 14 | IL,8.8 15 | IN,8.4 16 | IA,5.1 17 | KS,5.6 18 | KY,8.1 19 | LA,5.9 20 | ME,7.2 21 | MD,6.8 22 | MA,6.7 23 | MI,9.1 24 | MN,5.6 25 | MS,9.1 26 | MO,6.7 27 | MT,5.8 28 | NE,3.9 29 | NV,10.3 30 | NH,5.7 31 | NJ,9.6 32 | NM,6.8 33 | NY,8.4 34 | NC,9.4 35 | ND,3.2 36 | OH,6.9 37 | OK,5.2 38 | OR,8.5 39 | PA,8 40 | RI,10.1 41 | SC,8.8 42 | SD,4.4 43 | TN,7.8 44 | TX,6.4 45 | UT,5.5 46 | VT,5 47 | VA,5.8 48 | WA,7.8 49 | WV,7.5 50 | WI,6.8 51 | WY,5.1 -------------------------------------------------------------------------------- /labs/lab4/data/census_demographics.csv: -------------------------------------------------------------------------------- 1 | state,per_black,per_hisp,per_white,educ_hs,educ_coll,average_income,median_income,pop_density,vote_pop,older_pop,per_older,per_vote 2 | ALABAMA,26.5,4.0,66.8,81.4,21.7,22984,42081,94.4,3001712.5,672383.6,0.14,0.625 3 | ALASKA,3.6,5.8,63.7,90.7,27.0,30726,66521,1.2,475548.444,58540.158,0.081,0.658 4 | ARIZONA,4.5,30.1,57.4,85.0,26.3,25680,50448,56.3,3934880.535,920515.71,0.142,0.607 5 | ARKANSAS,15.6,6.6,74.2,81.9,19.1,21274,39267,56.0,1798043.148,428944.934,0.146,0.612 6 | CALIFORNIA,6.6,38.1,39.7,80.7,30.1,29188,60883,239.1,24009747.944,4409953.704,0.117,0.637 7 | COLORADO,4.3,20.9,69.7,89.3,35.9,30151,56456,48.5,3310567.012,578197.948,0.113,0.647 8 | CONNECTICUT,11.1,13.8,70.9,88.4,35.2,36775,67740,738.1,2263008.088,515622.096,0.144,0.632 9 | DELAWARE,21.9,8.4,65.1,87.0,27.7,29007,57599,460.8,568773.645,133348.845,0.147,0.627 10 | DISTRICT OF COLUMBIA,50.7,9.5,35.3,86.5,49.2,42078,58526,9856.5,442485.136,70451.544,0.114,0.716 11 | FLORIDA,16.5,22.9,57.5,85.3,25.9,26551,47661,350.6,11701330.788,3354127.392,0.176,0.614 12 | GEORGIA,31.0,9.1,55.5,83.5,27.2,25134,49347,168.4,6242473.56,1079673.1,0.11,0.636 13 | HAWAII,2.0,9.2,22.9,89.8,29.4,28882,66420,211.8,867505.11,202097.07,0.147,0.631 14 | IDAHO,0.8,11.5,83.6,88.2,24.3,22518,46423,19.0,954160.97,202878.08,0.128,0.602 15 | ILLINOIS,14.8,16.2,63.3,86.2,30.3,28782,55735,231.1,8133370.424,1634395.639,0.127,0.632 16 | INDIANA,9.4,6.2,81.3,86.2,22.4,24058,47697,181.0,4060042.406,860233.704,0.132,0.623 17 | IOWA,3.1,5.2,88.4,89.9,24.5,25335,48872,54.5,1880257.726,456284.041,0.149,0.614 18 | KANSAS,6.1,10.8,77.8,89.2,29.3,25907,49424,34.9,1765811.37,381874.654,0.133,0.615 19 | KENTUCKY,8.0,3.2,86.1,81.0,20.3,22515,41576,109.9,2757063.636,589863.06,0.135,0.631 20 | LOUISIANA,32.4,4.4,60.1,81.0,20.9,23094,43445,104.9,2886721.516,571854.5,0.125,0.631 21 | MAINE,1.3,1.4,94.3,89.8,26.5,25385,46933,43.1,842071.192,216494.644,0.163,0.634 22 | MARYLAND,30.0,8.4,54.4,87.8,35.7,34849,70647,594.8,3753418.116,728536.125,0.125,0.644 23 | 
MASSACHUSETTS,7.8,9.9,76.4,88.7,38.3,33966,64509,839.4,4262135.792,922255.04,0.14,0.647 24 | MICHIGAN,14.3,4.5,76.4,88.0,25.0,25135,48432,174.8,6192369.249,1392542.367,0.141,0.627 25 | MINNESOTA,5.4,4.9,82.8,91.3,31.4,29582,57243,66.6,3367262.43,700176.791,0.131,0.63 26 | MISSISSIPPI,37.3,2.9,57.7,79.6,19.5,19977,37881,63.2,1840720.416,387206.56,0.13,0.618 27 | MISSOURI,11.7,3.7,80.8,86.2,25.0,24724,46262,87.1,3744658.624,853517.696,0.142,0.623 28 | MONTANA,0.5,3.1,87.5,91.0,27.9,23836,43872,6.8,623874.375,151726.248,0.152,0.625 29 | NEBRASKA,4.7,9.5,81.8,90.0,27.7,25229,49342,23.8,1131381.574,250599.176,0.136,0.614 30 | NEVADA,8.6,27.1,53.6,84.3,21.8,27589,55726,24.6,1718416.182,340415.25,0.125,0.631 31 | NEW HAMPSHIRE,1.3,2.9,92.2,90.9,32.9,31422,63277,147.0,854189.712,184547.16,0.14,0.648 32 | NEW JERSEY,14.6,18.1,58.9,87.3,34.6,34858,69811,1195.5,5566148.805,1208498.235,0.137,0.631 33 | NEW MEXICO,2.5,46.7,40.2,82.7,25.5,22966,43820,17.0,1280567.76,283182.464,0.136,0.615 34 | NEW YORK,17.5,18.0,58.0,84.4,32.1,30948,55603,411.2,12516121.671,2666731.989,0.137,0.643 35 | NORTH CAROLINA,22.0,8.6,65.0,83.6,26.1,24745,45570,196.1,6093189.031,1274644.932,0.132,0.631 36 | NORTH DAKOTA,1.3,2.2,88.6,89.4,26.3,25803,46781,9.7,434296.82,98486.208,0.144,0.635 37 | OHIO,12.4,3.2,81.0,87.4,24.1,25113,47358,282.3,7204049.424,1650927.993,0.143,0.624 38 | OKLAHOMA,7.7,9.2,68.2,85.4,22.6,23094,42979,54.7,2335568.928,519436.596,0.137,0.616 39 | OREGON,2.0,12.0,78.1,88.6,28.6,26171,49260,39.9,2454758.606,553675.837,0.143,0.634 40 | PENNSYLVANIA,11.3,5.9,79.2,87.4,26.4,27049,50398,283.9,7989789.522,1987890.216,0.156,0.627 41 | RHODE ISLAND,7.2,12.8,76.5,83.7,30.3,28707,54902,1018.1,677038.488,154541.394,0.147,0.644 42 | SOUTH CAROLINA,28.1,5.3,64.0,83.0,24.0,23443,43939,153.9,2938556.44,659771.43,0.141,0.628 43 | SOUTH DAKOTA,1.4,2.9,84.4,89.3,25.3,24110,46369,10.7,501865.938,118667.808,0.144,0.609 44 | TENNESSEE,16.9,4.7,75.4,82.5,22.7,23722,43314,153.9,4034112.39,877259.361,0.137,0.63 45 | TEXAS,12.2,38.1,44.8,80.0,25.8,24870,49646,96.3,16021000.944,2695841.505,0.105,0.624 46 | UTAH,1.3,13.2,80.1,90.6,29.4,23139,56330,33.6,1679064.312,259184.424,0.092,0.596 47 | VERMONT,1.1,1.6,94.2,90.6,33.3,27478,51841,67.9,406553.719,93964.65,0.15,0.649 48 | VIRGINIA,19.8,8.2,64.5,86.1,33.8,32145,61406,202.6,5230406.184,1012075.5,0.125,0.646 49 | WASHINGTON,3.8,11.6,72.1,89.6,31.0,29733,57244,101.2,4378054.358,867414.826,0.127,0.641 50 | WEST VIRGINIA,3.5,1.3,93.0,81.9,17.3,21232,38380,77.1,1170734.684,300568.968,0.162,0.631 51 | WISCONSIN,6.5,6.1,83.1,89.4,25.8,26624,51598,105.0,3592701.443,793935.613,0.139,0.629 52 | WYOMING,1.1,9.1,85.5,91.3,23.6,27860,53802,5.8,361348.488,72156.066,0.127,0.636 53 | -------------------------------------------------------------------------------- /labs/lab4/data/chall-damage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/chall-damage.png -------------------------------------------------------------------------------- /labs/lab4/data/chall-table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/chall-table.png -------------------------------------------------------------------------------- /labs/lab4/data/chall.txt: -------------------------------------------------------------------------------- 1 | 66 0 2 
| 70 1 3 | 69 0 4 | 68 0 5 | 67 0 6 | 72 0 7 | 73 0 8 | 70 0 9 | 57 1 10 | 63 1 11 | 70 1 12 | 78 0 13 | 67 0 14 | 53 1 15 | 67 0 16 | 75 0 17 | 70 0 18 | 81 0 19 | 76 0 20 | 79 0 21 | 75 1 22 | 76 0 23 | 58 1 24 | -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (1).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (1).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (10).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (10).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (11).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (11).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (12).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (12).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (13).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (13).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (14).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (14).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (15).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (15).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (16).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (16).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (17).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (17).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (18).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (18).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (19).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (19).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (2).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (2).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (20).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (20).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (21).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (21).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (22).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (22).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (23).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (23).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (24).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (24).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (25).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (25).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (26).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (26).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (27).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (27).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (28).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (28).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (29).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (29).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (3).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (3).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (30).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (30).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (31).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (31).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (32).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (32).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (33).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (33).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (34).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (34).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (35).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (35).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (4).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (4).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (5).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (5).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (6).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (6).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (7).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (7).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (8).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (8).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th (9).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th (9).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/checks/th.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/checks/th.jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (1).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (1).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (10).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (10).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (11).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (11).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (12).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (12).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (13).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (13).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (14).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (14).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (15).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (15).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (16).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (16).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (17).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (17).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (18).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (18).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (19).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (19).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (2).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (2).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (20).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (20).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (21).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (21).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (22).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (22).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (23).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (23).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (24).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (24).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (25).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (25).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (26).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (26).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (27).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (27).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (28).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (28).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (29).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (29).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (3).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (3).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (30).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (30).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (31).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (31).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (32).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (32).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (33).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (33).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (34).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (34).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (35).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (35).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (36).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (36).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (37).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (37).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (38).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (38).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (39).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (39).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (4).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (4).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (40).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (40).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (41).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (41).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (42).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (42).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (43).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (43).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (44).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (44).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (45).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (45).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (46).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (46).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (47).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (47).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (48).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (48).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (49).jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (49).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (5).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (5).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (50).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (50).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (6).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (6).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (7).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (7).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (8).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (8).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th (9).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th (9).jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/images/dollars/th.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/images/images/dollars/th.jpeg -------------------------------------------------------------------------------- /labs/lab4/data/images/query_bing_images.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import re 4 | import urllib2 5 | import os 6 | import sys 7 | 8 | 9 | def get_soup(url): 10 | return BeautifulSoup(requests.get(url).text) 11 | 12 | query = sys.argv[1] 13 | image_type = '_'.join(query.split()) 14 | print query, image_type 15 | url = "http://www.bing.com/images/search?q=" + query + "&qft=+filterui:color2-bw+filterui:imagesize-large&FORM=R5IR3" 16 | 17 | soup = get_soup(url) 18 | images = [a['src'] for a in soup.find_all("img", {"src": re.compile("mm.bing.net")})] 19 | 20 | for img in images: 21 | raw_img = urllib2.urlopen(img).read() 22 | cntr = len([i for i in os.listdir("images") if 
image_type in i]) + 1 23 | f = open("images/" + image_type + "_"+ str(cntr) + ".jpg", 'wb') 24 | f.write(raw_img) 25 | f.close() 26 | -------------------------------------------------------------------------------- /labs/lab4/data/myclusters.csv: -------------------------------------------------------------------------------- 1 | State,Cluster 2 | AL,1 3 | AK,6 4 | AZ,4 5 | AR,1 6 | CA,5 7 | CO,4 8 | CT,7 9 | DE,7 10 | FL,8 11 | GA,1 12 | HI,5 13 | ID,6 14 | IL,7 15 | IN,8 16 | IA,8 17 | KS,6 18 | KY,1 19 | LA,1 20 | ME,8 21 | MD,7 22 | MA,7 23 | MI,8 24 | MN,8 25 | MS,1 26 | MO,1 27 | MT,4 28 | NE,6 29 | NV,4 30 | NH,8 31 | NJ,7 32 | NM,4 33 | NY,7 34 | NC,1 35 | ND,6 36 | OH,8 37 | OK,1 38 | OR,5 39 | PA,8 40 | RI,7 41 | SC,1 42 | SD,6 43 | TN,1 44 | TX,1 45 | UT,6 46 | VT,5 47 | VA,8 48 | WA,5 49 | WV,1 50 | WI,8 51 | WY,6 -------------------------------------------------------------------------------- /labs/lab4/data/partisan_voting.csv: -------------------------------------------------------------------------------- 1 | State,PVI 2 | Alabama,R+13 3 | Alaska,R+13 4 | Arizona,R+6 5 | Arkansas,R+9 6 | California,D+7 7 | Colorado,EVEN 8 | Connecticut,D+7 9 | Delaware,D+7 10 | District of Columbia,D+39 11 | Florida,R+2 12 | Georgia,R+7 13 | Hawaii,D+12 14 | Idaho,R+17 15 | Illinois,D+8 16 | Indiana,R+6 17 | Iowa,D+1 18 | Kansas,R+12 19 | Kentucky,R+10 20 | Louisiana,R+10 21 | Maine,D+5 22 | Maryland,D+9 23 | Massachusetts,D+12 24 | Michigan,D+4 25 | Minnesota,D+2 26 | Mississippi,R+10 27 | Missouri,R+3 28 | Montana,R+7 29 | Nebraska,R+13 30 | Nevada,D+1 31 | New Hampshire,D+2 32 | New Jersey,D+4 33 | New Mexico,D+2 34 | New York,D+10 35 | North Carolina,R+4 36 | North Dakota,R+10 37 | Ohio,R+1 38 | Oklahoma,R+17 39 | Oregon,D+4 40 | Pennsylvania,D+2 41 | Rhode Island,D+11 42 | South Carolina,R+8 43 | South Dakota,R+9 44 | Tennessee,R+9 45 | Texas,R+10 46 | Utah,R+20 47 | Vermont,D+13 48 | Virginia,R+2 49 | Washington,D+5 50 | West Virginia,R+8 51 | Wisconsin,D+2 52 | Wyoming,R+20 53 | -------------------------------------------------------------------------------- /labs/lab4/data/pcavsfit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/pcavsfit.png -------------------------------------------------------------------------------- /labs/lab4/data/shuttle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab4/data/shuttle.png -------------------------------------------------------------------------------- /labs/lab5/data/bias-variance-error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab5/data/bias-variance-error.png -------------------------------------------------------------------------------- /labs/lab5/data/lc-hb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab5/data/lc-hb.png -------------------------------------------------------------------------------- /labs/lab5/data/lc-hv.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab5/data/lc-hv.png -------------------------------------------------------------------------------- /labs/lab5/data/reg-bias-variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cs109/content/a573a8ebcb503f9aae24885750c0fe244a67c68c/labs/lab5/data/reg-bias-variance.png -------------------------------------------------------------------------------- /labs/lab6/_multivariate.py: -------------------------------------------------------------------------------- 1 | # 2 | # Author: Joris Vankerschaver 2013 3 | # 4 | from __future__ import division, print_function, absolute_import 5 | 6 | from scipy.misc import doccer 7 | from functools import wraps 8 | import numpy as np 9 | import scipy.linalg 10 | 11 | __all__ = ['multivariate_normal'] 12 | 13 | 14 | _LOG_2PI = np.log(2 * np.pi) 15 | 16 | 17 | def _process_parameters(dim, mean, cov): 18 | """ 19 | Infer dimensionality from mean or covariance matrix, ensure that 20 | mean and covariance are full vector resp. matrix. 21 | 22 | """ 23 | 24 | # Try to infer dimensionality 25 | if dim is None: 26 | if mean is None: 27 | if cov is None: 28 | dim = 1 29 | else: 30 | cov = np.asarray(cov, dtype=float) 31 | if cov.ndim < 2: 32 | dim = 1 33 | else: 34 | dim = cov.shape[0] 35 | else: 36 | mean = np.asarray(mean, dtype=float) 37 | dim = mean.size 38 | else: 39 | if not np.isscalar(dim): 40 | raise ValueError("Dimension of random variable must be a scalar.") 41 | 42 | # Check input sizes and return full arrays for mean and cov if necessary 43 | if mean is None: 44 | mean = np.zeros(dim) 45 | mean = np.asarray(mean, dtype=float) 46 | 47 | if cov is None: 48 | cov = 1.0 49 | cov = np.asarray(cov, dtype=float) 50 | 51 | if dim == 1: 52 | mean.shape = (1,) 53 | cov.shape = (1, 1) 54 | 55 | if mean.ndim != 1 or mean.shape[0] != dim: 56 | raise ValueError("Array 'mean' must be vector of length %d." % dim) 57 | if cov.ndim == 0: 58 | cov = cov * np.eye(dim) 59 | elif cov.ndim == 1: 60 | cov = np.diag(cov) 61 | else: 62 | if cov.shape != (dim, dim): 63 | raise ValueError("Array 'cov' must be at most two-dimensional," 64 | " but cov.ndim = %d" % cov.ndim) 65 | 66 | return dim, mean, cov 67 | 68 | 69 | def _process_quantiles(x, dim): 70 | """ 71 | Adjust quantiles array so that last axis labels the components of 72 | each data point. 73 | 74 | """ 75 | x = np.asarray(x, dtype=float) 76 | 77 | if x.ndim == 0: 78 | x = x[np.newaxis] 79 | elif x.ndim == 1: 80 | if dim == 1: 81 | x = x[:, np.newaxis] 82 | else: 83 | x = x[np.newaxis, :] 84 | 85 | return x 86 | 87 | 88 | def _squeeze_output(out): 89 | """ 90 | Remove single-dimensional entries from array and convert to scalar, 91 | if necessary. 92 | 93 | """ 94 | out = out.squeeze() 95 | if out.ndim == 0: 96 | out = out[()] 97 | return out 98 | 99 | 100 | def _pinv_1d(v, eps=1e-5): 101 | """ 102 | A helper function for computing the pseudoinverse. 103 | 104 | Parameters 105 | ---------- 106 | v : iterable of numbers 107 | This may be thought of as a vector of eigenvalues or singular values. 108 | eps : float 109 | Elements of v smaller than eps are considered negligible. 110 | 111 | Returns 112 | ------- 113 | v_pinv : 1d float ndarray 114 | A vector of pseudo-inverted numbers. 
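For example (illustrative values), ``_pinv_1d([2.0, 1e-10, 4.0])`` gives ``[0.5, 0.0, 0.25]``: the middle entry falls below ``eps`` and is mapped to zero instead of being inverted to a huge number, which keeps the pseudo-inverse stable for rank-deficient covariance matrices.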
115 | 116 | """ 117 | return np.array([0 if abs(x) < eps else 1/x for x in v], dtype=float) 118 | 119 | 120 | def _psd_pinv_decomposed_log_pdet(mat, cond=None, rcond=None, 121 | lower=True, check_finite=True): 122 | """ 123 | Compute a decomposition of the pseudo-inverse and the logarithm of 124 | the pseudo-determinant of a symmetric positive semi-definite 125 | matrix. 126 | 127 | The pseudo-determinant of a matrix is defined as the product of 128 | the non-zero eigenvalues, and coincides with the usual determinant 129 | for a full matrix. 130 | 131 | Parameters 132 | ---------- 133 | mat : array_like 134 | Input array of shape (`m`, `n`) 135 | cond, rcond : float or None 136 | Cutoff for 'small' singular values. 137 | Eigenvalues smaller than ``rcond*largest_eigenvalue`` 138 | are considered zero. 139 | If None or -1, suitable machine precision is used. 140 | lower : bool, optional 141 | Whether the pertinent array data is taken from the lower or upper 142 | triangle of `mat`. (Default: lower) 143 | check_finite : boolean, optional 144 | Whether to check that the input matrix contains only finite numbers. 145 | Disabling may give a performance gain, but may result in problems 146 | (crashes, non-termination) if the inputs do contain infinities or NaNs. 147 | 148 | Returns 149 | ------- 150 | M : array_like 151 | The pseudo-inverse of the input matrix is np.dot(M, M.T). 152 | log_pdet : float 153 | Logarithm of the pseudo-determinant of the matrix. 154 | 155 | """ 156 | # Compute the symmetric eigendecomposition. 157 | # The input covariance matrix is required to be real symmetric 158 | # and positive semidefinite which implies that its eigenvalues 159 | # are all real and non-negative, 160 | # but clip them anyway to avoid numerical issues. 161 | 162 | # TODO: the code to set cond/rcond is identical to that in 163 | # scipy.linalg.{pinvh, pinv2} and if/when this function is subsumed 164 | # into scipy.linalg it should probably be shared between all of 165 | # these routines. 166 | 167 | # Note that eigh takes care of array conversion, chkfinite, 168 | # and assertion that the matrix is square. 169 | s, u = scipy.linalg.eigh(mat, lower=lower, check_finite=check_finite) 170 | 171 | if rcond is not None: 172 | cond = rcond 173 | if cond in [None, -1]: 174 | t = u.dtype.char.lower() 175 | factor = {'f': 1E3, 'd': 1E6} 176 | cond = factor[t] * np.finfo(t).eps 177 | eps = cond * np.max(abs(s)) 178 | 179 | if np.min(s) < -eps: 180 | raise ValueError('the covariance matrix must be positive semidefinite') 181 | 182 | s_pinv = _pinv_1d(s, eps) 183 | U = np.multiply(u, np.sqrt(s_pinv)) 184 | log_pdet = np.sum(np.log(s[s > eps])) 185 | 186 | return U, log_pdet 187 | 188 | 189 | _doc_default_callparams = \ 190 | """mean : array_like, optional 191 | Mean of the distribution (default zero) 192 | cov : array_like, optional 193 | Covariance matrix of the distribution (default one) 194 | """ 195 | 196 | _doc_callparams_note = \ 197 | """Setting the parameter `mean` to `None` is equivalent to having `mean` 198 | be the zero-vector. The parameter `cov` can be a scalar, in which case 199 | the covariance matrix is the identity times that value, a vector of 200 | diagonal entries for the covariance matrix, or a two-dimensional 201 | array_like. 
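For instance (illustrative values), ``cov=2.0`` is shorthand for twice the identity matrix, ``cov=[1.0, 3.0]`` for the diagonal matrix with entries 1 and 3, and ``cov=[[1.0, 0.3], [0.3, 2.0]]`` is used as the full covariance matrix.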
202 | """ 203 | 204 | _doc_frozen_callparams = "" 205 | 206 | _doc_frozen_callparams_note = \ 207 | """See class definition for a detailed description of parameters.""" 208 | 209 | docdict_params = { 210 | '_doc_default_callparams': _doc_default_callparams, 211 | '_doc_callparams_note': _doc_callparams_note 212 | } 213 | 214 | docdict_noparams = { 215 | '_doc_default_callparams': _doc_frozen_callparams, 216 | '_doc_callparams_note': _doc_frozen_callparams_note 217 | } 218 | 219 | 220 | class multivariate_normal_gen(object): 221 | r""" 222 | A multivariate normal random variable. 223 | 224 | The `mean` keyword specifies the mean. The `cov` keyword specifies the 225 | covariance matrix. 226 | 227 | .. versionadded:: 0.14.0 228 | 229 | Methods 230 | ------- 231 | pdf(x, mean=None, cov=1) 232 | Probability density function. 233 | logpdf(x, mean=None, cov=1) 234 | Log of the probability density function. 235 | rvs(mean=None, cov=1) 236 | Draw random samples from a multivariate normal distribution. 237 | entropy() 238 | Compute the differential entropy of the multivariate normal. 239 | 240 | Parameters 241 | ---------- 242 | x : array_like 243 | Quantiles, with the last axis of `x` denoting the components. 244 | %(_doc_default_callparams)s 245 | 246 | Alternatively, the object may be called (as a function) to fix the mean 247 | and covariance parameters, returning a "frozen" multivariate normal 248 | random variable: 249 | 250 | rv = multivariate_normal(mean=None, scale=1) 251 | - Frozen object with the same methods but holding the given 252 | mean and covariance fixed. 253 | 254 | Notes 255 | ----- 256 | %(_doc_callparams_note)s 257 | 258 | The covariance matrix `cov` must be a (symmetric) positive 259 | semi-definite matrix. The determinant and inverse of `cov` are computed 260 | as the pseudo-determinant and pseudo-inverse, respectively, so 261 | that `cov` does not need to have full rank. 262 | 263 | The probability density function for `multivariate_normal` is 264 | 265 | .. math:: 266 | 267 | f(x) = \frac{1}{\sqrt{(2 \pi)^k \det \Sigma}} \exp\left( -\frac{1}{2} (x - \mu)^T \Sigma^{-1} (x - \mu) \right), 268 | 269 | where :math:`\mu` is the mean, :math:`\Sigma` the covariance matrix, 270 | and :math:`k` is the dimension of the space where :math:`x` takes values. 271 | 272 | Examples 273 | -------- 274 | >>> from scipy.stats import multivariate_normal 275 | >>> x = np.linspace(0, 5, 10, endpoint=False) 276 | >>> y = multivariate_normal.pdf(x, mean=2.5, cov=0.5); y 277 | array([ 0.00108914, 0.01033349, 0.05946514, 0.20755375, 0.43939129, 278 | 0.56418958, 0.43939129, 0.20755375, 0.05946514, 0.01033349]) 279 | >>> plt.plot(x, y) 280 | 281 | The input quantiles can be any shape of array, as long as the last 282 | axis labels the components. This allows us for instance to 283 | display the frozen pdf for a non-isotropic random variable in 2D as 284 | follows: 285 | 286 | >>> x, y = np.mgrid[-1:1:.01, -1:1:.01] 287 | >>> pos = np.empty(x.shape + (2,)) 288 | >>> pos[:, :, 0] = x; pos[:, :, 1] = y 289 | >>> rv = multivariate_normal([0.5, -0.2], [[2.0, 0.3], [0.3, 0.5]]) 290 | >>> plt.contourf(x, y, rv.pdf(pos)) 291 | 292 | """ 293 | 294 | def __init__(self): 295 | self.__doc__ = doccer.docformat(self.__doc__, docdict_params) 296 | 297 | def __call__(self, mean=None, cov=1): 298 | """ 299 | Create a frozen multivariate normal distribution. 300 | 301 | See `multivariate_normal_frozen` for more information. 
302 | 303 | """ 304 | return multivariate_normal_frozen(mean, cov) 305 | 306 | def _logpdf(self, x, mean, prec_U, log_det_cov): 307 | """ 308 | Parameters 309 | ---------- 310 | x : ndarray 311 | Points at which to evaluate the log of the probability 312 | density function 313 | mean : ndarray 314 | Mean of the distribution 315 | prec_U : ndarray 316 | A decomposition such that np.dot(prec_U, prec_U.T) 317 | is the precision matrix, i.e. inverse of the covariance matrix. 318 | log_det_cov : float 319 | Logarithm of the determinant of the covariance matrix 320 | 321 | Notes 322 | ----- 323 | As this function does no argument checking, it should not be 324 | called directly; use 'logpdf' instead. 325 | 326 | """ 327 | dim = x.shape[-1] 328 | dev = x - mean 329 | maha = np.sum(np.square(np.dot(dev, prec_U)), axis=-1) 330 | return -0.5 * (dim * _LOG_2PI + log_det_cov + maha) 331 | 332 | def logpdf(self, x, mean, cov): 333 | """ 334 | Log of the multivariate normal probability density function. 335 | 336 | Parameters 337 | ---------- 338 | x : array_like 339 | Quantiles, with the last axis of `x` denoting the components. 340 | %(_doc_default_callparams)s 341 | 342 | Notes 343 | ----- 344 | %(_doc_callparams_note)s 345 | 346 | Returns 347 | ------- 348 | pdf : ndarray 349 | Log of the probability density function evaluated at `x` 350 | 351 | """ 352 | dim, mean, cov = _process_parameters(None, mean, cov) 353 | x = _process_quantiles(x, dim) 354 | prec_U, log_det_cov = _psd_pinv_decomposed_log_pdet(cov) 355 | out = self._logpdf(x, mean, prec_U, log_det_cov) 356 | return _squeeze_output(out) 357 | 358 | def pdf(self, x, mean, cov): 359 | """ 360 | Multivariate normal probability density function. 361 | 362 | Parameters 363 | ---------- 364 | x : array_like 365 | Quantiles, with the last axis of `x` denoting the components. 366 | %(_doc_default_callparams)s 367 | 368 | Notes 369 | ----- 370 | %(_doc_callparams_note)s 371 | 372 | Returns 373 | ------- 374 | pdf : ndarray 375 | Probability density function evaluated at `x` 376 | 377 | """ 378 | dim, mean, cov = _process_parameters(None, mean, cov) 379 | x = _process_quantiles(x, dim) 380 | prec_U, log_det_cov = _psd_pinv_decomposed_log_pdet(cov) 381 | out = np.exp(self._logpdf(x, mean, prec_U, log_det_cov)) 382 | return _squeeze_output(out) 383 | 384 | def rvs(self, mean=None, cov=1, size=1): 385 | """ 386 | Draw random samples from a multivariate normal distribution. 387 | 388 | Parameters 389 | ---------- 390 | %(_doc_default_callparams)s 391 | size : integer, optional 392 | Number of samples to draw (default 1). 393 | 394 | Notes 395 | ----- 396 | %(_doc_callparams_note)s 397 | 398 | Returns 399 | ------- 400 | rvs : ndarray or scalar 401 | Random variates of size (`size`, `N`), where `N` is the 402 | dimension of the random variable. 403 | 404 | """ 405 | dim, mean, cov = _process_parameters(None, mean, cov) 406 | out = np.random.multivariate_normal(mean, cov, size) 407 | return _squeeze_output(out) 408 | 409 | def entropy(self, mean=None, cov=1): 410 | """ 411 | Compute the differential entropy of the multivariate normal. 
412 | 413 |         Parameters 414 |         ---------- 415 |         %(_doc_default_callparams)s 416 | 417 |         Notes 418 |         ----- 419 |         %(_doc_callparams_note)s 420 | 421 |         Returns 422 |         ------- 423 |         h : scalar 424 |             Entropy of the multivariate normal distribution 425 | 426 |         """ 427 |         dim, mean, cov = _process_parameters(None, mean, cov) 428 |         return 1/2 * np.log(np.linalg.det(2 * np.pi * np.e * cov)) 429 | 430 | multivariate_normal = multivariate_normal_gen() 431 | 432 | 433 | class multivariate_normal_frozen(object): 434 |     def __init__(self, mean=None, cov=1): 435 |         """ 436 |         Create a frozen multivariate normal distribution. 437 | 438 |         Parameters 439 |         ---------- 440 |         mean : array_like, optional 441 |             Mean of the distribution (default zero) 442 |         cov : array_like, optional 443 |             Covariance matrix of the distribution (default one) 444 | 445 |         Examples 446 |         -------- 447 |         When called with the default parameters, this will create a 1D random 448 |         variable with mean 0 and covariance 1: 449 | 450 |         >>> from scipy.stats import multivariate_normal 451 |         >>> r = multivariate_normal() 452 |         >>> r.mean 453 |         array([ 0.]) 454 |         >>> r.cov 455 |         array([[1.]]) 456 | 457 |         """ 458 |         self.dim, self.mean, self.cov = _process_parameters(None, mean, cov) 459 |         self.prec_U, self._log_det_cov = _psd_pinv_decomposed_log_pdet(self.cov) 460 | 461 |         self._mnorm = multivariate_normal_gen() 462 | 463 |     def logpdf(self, x): 464 |         x = _process_quantiles(x, self.dim) 465 |         out = self._mnorm._logpdf(x, self.mean, self.prec_U, self._log_det_cov) 466 |         return _squeeze_output(out) 467 | 468 |     def pdf(self, x): 469 |         return np.exp(self.logpdf(x)) 470 | 471 |     def rvs(self, size=1): 472 |         return self._mnorm.rvs(self.mean, self.cov, size) 473 | 474 |     def entropy(self): 475 |         """ 476 |         Computes the differential entropy of the multivariate normal. 477 | 478 |         Returns 479 |         ------- 480 |         h : scalar 481 |             Entropy of the multivariate normal distribution 482 | 483 |         """ 484 |         return 1/2 * (self.dim * (_LOG_2PI + 1) + self._log_det_cov) 485 | 486 | 487 | # Set frozen generator docstrings from corresponding docstrings in 488 | # multivariate_normal_gen and fill in default strings in class docstrings 489 | for name in ['logpdf', 'pdf', 'rvs']: 490 |     method = multivariate_normal_gen.__dict__[name] 491 |     method_frozen = multivariate_normal_frozen.__dict__[name] 492 |     method_frozen.__doc__ = doccer.docformat(method.__doc__, docdict_noparams) 493 |     method.__doc__ = doccer.docformat(method.__doc__, docdict_params) 494 | -------------------------------------------------------------------------------- /labs/lab8/anagrams.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | 3 | class MRAnagram(MRJob): 4 | 5 |     def mapper(self, _, line): 6 |         # Convert word into a list of characters, sort them, and convert 7 |         # back to a string. 8 |         letters = list(line) 9 |         letters.sort() 10 |         letters = ''.join(letters) 11 |         # Key is the sorted word, value is the regular word. 12 |         yield letters, line 13 | 14 |     def reducer(self, _, words): 15 |         # Get the list of words containing these letters. 16 |         anagrams = [w for w in words] 17 | 18 |         # Only yield results if there are at least two words which are 19 |         # anagrams of each other.
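20 |         # For example, "dearth", "hatred", and "thread" all map to the same 21 |         # key "adehrt", so they arrive together at a single call of this reducer.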
22 |         if len(anagrams) > 1: 23 |             yield len(anagrams), anagrams 24 | 25 | 26 | if __name__ == "__main__": 27 |     MRAnagram.run() 28 | 29 | 30 | -------------------------------------------------------------------------------- /labs/lab8/friend_affiliations.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | 3 | class MRFriendAffiliations(MRJob): 4 | 5 |     def mapper(self, _, line): 6 |         # Tokenize line. 7 |         tokens = line.split(',') 8 |         tokens = [t.strip() for t in tokens] 9 | 10 |         # First token is the person's name. 11 |         # Second token is their favorite team. 12 |         # Remaining tokens are their friends' names. 13 |         name, team, friends = (tokens[0], tokens[1], tokens[2:]) 14 | 15 |         # Emit (key, value) pairs with friends' names as the keys and 16 |         # (name, team) as the value (same value for all). 17 |         for friend in friends: 18 |             yield friend, (name, team) 19 | 20 |         # Special case: emit a similar (key, value) pair for this person. 21 |         yield name, (name, team) 22 | 23 |     def reducer(self, name, friends): 24 |         # Count the number of Red Sox and Cardinals fans who are friends 25 |         # with this person. 26 |         this_team = None 27 |         red_sox_count = 0 28 |         cardinals_count = 0 29 |         for friend in friends: 30 |             # Keep an eye out for the special case where the friend's name 31 |             # and this person's name are the same -- that tells us which 32 |             # team this person cheers for. 33 |             if friend[0] == name: 34 |                 this_team = friend[1] 35 |             else: 36 |                 if friend[1] == "Red Sox": 37 |                     red_sox_count += 1 38 |                 elif friend[1] == "Cardinals": 39 |                     cardinals_count += 1 40 |                 else: 41 |                     print "ERROR: Unknown team \"{0}\"".format(friend[1]) 42 | 43 |         # Yield results. 44 |         yield name, (this_team, red_sox_count, cardinals_count) 45 | 46 | if __name__ == '__main__': 47 |     MRFriendAffiliations.run() 48 | 49 | -------------------------------------------------------------------------------- /labs/lab8/generate_friends.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | generate_friends.py 5 | 6 | Generates data file "baseball_friends.csv" to be used for lab8 MapReduce 7 | example. 8 | 9 | Reads list of names from "names.txt", randomly assigns team allegiances, 10 | then assigns friendships based on a super simple algorithm, and finally 11 | writes out the file in the following csv format: 12 | 13 | name, team, friend1, friend2, friend3, ... 14 | 15 | """ 16 | 17 | import numpy as np 18 | from numpy.random import binomial 19 | 20 | # Read list of names from file. 21 | names = [line.strip() for line in open("names.txt")] 22 | names = np.unique(names) 23 | 24 | # Randomly generate team affiliations for each person. 25 | team = binomial(1, 0.5, len(names)) 26 | 27 | # Probability that two people who are fans of the same team are friends. 28 | friendliness_same = 0.05 29 | # Probability that two people who are fans of opposite teams are friends. 30 | friendliness_diff = 0.03 31 | 32 | # Create matrix to store friend relationships. 33 | friends = np.zeros([len(names), len(names)]) 34 | for i1 in range(len(names)): 35 |     for i2 in range(i1 + 1, len(names)): 36 |         if team[i1] == team[i2]: 37 |             flip = binomial(1, friendliness_same) 38 |         else: 39 |             flip = binomial(1, friendliness_diff) 40 | 41 |         friends[i1, i2] = flip 42 |         friends[i2, i1] = flip 43 | 44 | # Write output file. 45 | outfile = open("baseball_friends.csv", 'w') 46 | for i in range(len(names)): 47 |     # Get data for this row.
48 |     this_name = names[i] 49 |     this_team = "Red Sox" if team[i] else "Cardinals" 50 |     friend_list = np.array(names)[friends[i,:] == 1] 51 | 52 |     # Write to file. 53 |     outstr = ", ".join((this_name, this_team) + tuple(friend_list)) 54 |     outfile.write(outstr + "\n") 55 | outfile.close() 56 | 57 | -------------------------------------------------------------------------------- /labs/lab8/lab8_mapreduce.ipynb: -------------------------------------------------------------------------------- 1 | { 2 |  "metadata": { 3 |   "name": "" 4 |  }, 5 |  "nbformat": 3, 6 |  "nbformat_minor": 0, 7 |  "worksheets": [ 8 |   { 9 |    "cells": [ 10 |     { 11 |      "cell_type": "heading", 12 |      "level": 1, 13 |      "metadata": {}, 14 |      "source": [ 15 |       "Lab 8: MapReduce, mrjob, and EC2" 16 |      ] 17 |     }, 18 |     { 19 |      "cell_type": "markdown", 20 |      "metadata": {}, 21 |      "source": [ 22 |       "In this week's lab, we will mostly ignore statistics and instead focus on some practical issues that you will encounter on Homework 4. Section 4 of that homework includes new python techniques (classes, inheritance), an unfamiliar approach to breaking up large computing problems (MapReduce), and code that has to be run outside the friendly confines of an ipython notebook, and finally asks you to put it all to use on Amazon's Elastic Compute Cloud (EC2). This sounds very complicated, but the end result is a simpler algorithm for the problem of calculating similarity scores, as well as the ability to expand to arbitrarily large data sets." 23 |      ] 24 |     }, 25 |     { 26 |      "cell_type": "heading", 27 |      "level": 2, 28 |      "metadata": {}, 29 |      "source": [ 30 |       "1. Classes and generators in python" 31 |      ] 32 |     }, 33 |     { 34 |      "cell_type": "markdown", 35 |      "metadata": {}, 36 |      "source": [ 37 |       "On previous homeworks, nearly all of the coding has been done by writing python functions plus a small amount of code that calls the functions you have written. Included below is the code for the mrjob word_count example that was covered in lecture (the canonical MapReduce example). There are a lot of new features here!\n", 38 |       "\n", 39 |       "The code below counts the number of characters, words, and lines in a text file. This is one of the simplest examples of a problem that can be solved using MapReduce (I even took it from the section \"[Writing your first job](http://mrjob.readthedocs.org/en/latest/guides/quickstart.html#writing-your-first-job)\" in the mrjob documentation). If you try to run the cell in this notebook, it will not work! We will get to running programs with mrjob soon, but for now it will just serve as a reference for some topics we want to cover." 40 |      ] 41 |     }, 42 |     { 43 |      "cell_type": "code", 44 |      "collapsed": true, 45 |      "input": [ 46 |       "from mrjob.job import MRJob\n", 47 |       "\n", 48 |       "class MRWordFrequencyCount(MRJob):\n", 49 |       "\n", 50 |       "    def mapper(self, _, line):\n", 51 |       "        yield \"chars\", len(line)\n", 52 |       "        yield \"words\", len(line.split())\n", 53 |       "        yield \"lines\", 1\n", 54 |       "\n", 55 |       "    def reducer(self, key, values):\n", 56 |       "        yield key, sum(values)\n", 57 |       "\n", 58 |       "if __name__ == '__main__':\n", 59 |       "    MRWordFrequencyCount.run()\n" 60 |      ], 61 |      "language": "python", 62 |      "metadata": {}, 63 |      "outputs": [] 64 |     }, 65 |     { 66 |      "cell_type": "heading", 67 |      "level": 3, 68 |      "metadata": {}, 69 |      "source": [ 70 |       "1.1 Classes" 71 |      ] 72 |     }, 73 |     { 74 |      "cell_type": "markdown", 75 |      "metadata": {}, 76 |      "source": [ 77 |       "Classes are the basis of object-oriented programming in python. 
For all of the problems on previous homework assignments, we have written functions to do calculations, draw figures, etc. To use mrjob, we have to switch gears and use a different style of programming. \n", 78 |       "\n", 79 |       "As you can see in the example above, the MRWordFrequencyCount class is defined with an indented block similar to a function definition, except with class instead of def. Instead of a list of arguments, the item in parentheses (MRJob) is a *base class* that our newly defined class will inherit most of its features from. Even though there is very little code written above for MRWordFrequencyCount, it knows how to do many complex operations (running a mapper and a reducer, submitting jobs to EC2, etc.) because it inherited these abilities from the base class.\n", 80 |       "\n", 81 |       "There are two methods, mapper and reducer, that have been written specifically for MRWordFrequencyCount. These methods are also defined for the MRJob base class, but the methods defined here supersede the inherited ones. A method is similar to a function (as you might guess, since it is also defined with a def statement), but the first argument to a method will always be self, a reference back to the object to which the method belongs. The always-present self argument allows the method to access other members of the same object (both data and methods). However, when you call a method on an instance of the class, you don't have to supply anything for the self argument -- it is passed implicitly. For example, to call the reducer method defined above, you would use:" 82 |      ] 83 |     }, 84 |     { 85 |      "cell_type": "code", 86 |      "collapsed": false, 87 |      "input": [ 88 |       "# Call the reducer method on an instance of MRWordFrequencyCount using some key and values.\n", 89 |       "MRWordFrequencyCount().reducer(my_key, my_values) # Did not specify 'self' argument" 90 |      ], 91 |      "language": "python", 92 |      "metadata": {}, 93 |      "outputs": [] 94 |     }, 95 |     { 96 |      "cell_type": "markdown", 97 |      "metadata": {}, 98 |      "source": [ 99 |       "The next mrjob example -- [Writing your second job](http://mrjob.readthedocs.org/en/latest/guides/quickstart.html#writing-your-second-job) -- processes text to find the most commonly used word. That algorithm involves two MapReduce steps, so it is necessary to write a MRMostUsedWord.steps method to override the inherited method. Notice that self is used repeatedly to specify the function references inside the list returned by the steps method."
100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "collapsed": false, 105 | "input": [ 106 | "import re\n", 107 | "\n", 108 | "WORD_RE = re.compile(r\"[\\w']+\")\n", 109 | "\n", 110 | "\n", 111 | "class MRMostUsedWord(MRJob):\n", 112 | "\n", 113 | " def mapper_get_words(self, _, line):\n", 114 | " # yield each word in the line\n", 115 | " for word in WORD_RE.findall(line):\n", 116 | " yield (word.lower(), 1)\n", 117 | "\n", 118 | " def combiner_count_words(self, word, counts):\n", 119 | " # optimization: sum the words we've seen so far\n", 120 | " yield (word, sum(counts))\n", 121 | "\n", 122 | " def reducer_count_words(self, word, counts):\n", 123 | " # send all (num_occurrences, word) pairs to the same reducer.\n", 124 | " # num_occurrences is so we can easily use Python's max() function.\n", 125 | " yield None, (sum(counts), word)\n", 126 | "\n", 127 | " # discard the key; it is just None\n", 128 | " def reducer_find_max_word(self, _, word_count_pairs):\n", 129 | " # each item of word_count_pairs is (count, word),\n", 130 | " # so yielding one results in key=counts, value=word\n", 131 | " yield max(word_count_pairs)\n", 132 | "\n", 133 | " def steps(self):\n", 134 | " return [\n", 135 | " self.mr(mapper=self.mapper_get_words,\n", 136 | " combiner=self.combiner_count_words,\n", 137 | " reducer=self.reducer_count_words),\n", 138 | " self.mr(reducer=self.reducer_find_max_word)\n", 139 | " ]\n", 140 | "\n", 141 | "\n", 142 | "if __name__ == '__main__':\n", 143 | " MRMostUsedWord.run()" 144 | ], 145 | "language": "python", 146 | "metadata": {}, 147 | "outputs": [] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "[More about classes in python](http://docs.python.org/2/tutorial/classes.html#)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "heading", 158 | "level": 3, 159 | "metadata": {}, 160 | "source": [ 161 | "1.2 Generators" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "Generators are necessary to understand all of those yield statements popping up in the mapper and reducer methods. The main issue, in the case of industrial-strength MapReduce, is that you don't have enough memory to store all of your data at once. This is true even after you have split your data between many compute nodes. So instead of getting an enormous list of data, the mapper and reducer functions both receive and emit generators.\n", 169 | "\n", 170 | "When you run a function, it chugs along until it hits a return statement, at which point it returns some results and then it is done. A generator does its specified calculations until it hits a yield statement. It passes along whatever values it was supposed to yield and then it *pauses* and waits for someone to tell it to continue. It continues until it reaches another yield, and so on.\n", 171 | "\n", 172 | "Not only are mapper and reducer generators, their (key, value) inputs are also generators. This means that for each step of the mapper, it pulls in one (key, value) pair, does some processing, and then emits one or more key value pairs, which move along to a combiner or a shuffler or whatever. This is how MapReduce avoids ever having to load huge datasets into limited memory.\n", 173 | "\n", 174 | "A common stumbling block with generators is the fact that once you have iterated through an entire generator, it is done. You can see an example of this mistake by trying to run the code block below." 
175 |      ] 176 |     }, 177 |     { 178 |      "cell_type": "code", 179 |      "collapsed": false, 180 |      "input": [ 181 |       "# This function converts a list into a generator.\n", 182 |       "def example_generator(items):\n", 183 |       "    for item in items:\n", 184 |       "        yield item\n", 185 |       "    \n", 186 |       "# Create a generator.\n", 187 |       "my_generator = example_generator([0, 1, 2, 3, 4])\n", 188 |       "\n", 189 |       "# Iterating over the generator works great the first time.\n", 190 |       "print \"generator iteration 1\"\n", 191 |       "print \"---------------------\"\n", 192 |       "for value in my_generator:\n", 193 |       "    print value\n", 194 |       "    \n", 195 |       "# ...but it doesn't work the second time.\n", 196 |       "print \"\\n\"\n", 197 |       "print \"generator iteration 2\"\n", 198 |       "print \"---------------------\"\n", 199 |       "for value in my_generator:\n", 200 |       "    print value" 201 |      ], 202 |      "language": "python", 203 |      "metadata": {}, 204 |      "outputs": [] 205 |     }, 206 |     { 207 |      "cell_type": "heading", 208 |      "level": 3, 209 |      "metadata": {}, 210 |      "source": [ 211 |       "1.3 What does \\_\\_name\\_\\_ == '\\_\\_main\\_\\_' mean??" 212 |      ] 213 |     }, 214 |     { 215 |      "cell_type": "markdown", 216 |      "metadata": {}, 217 |      "source": [ 218 |       "Python is *really* into namespaces (see, for example, [The Zen of Python](http://www.python.org/dev/peps/pep-0020/)). The \\_\\_name\\_\\_ attribute tells you what namespace you are in. For example, if we import numpy, then all of the numpy features are in the numpy namespace." 219 |      ] 220 |     }, 221 |     { 222 |      "cell_type": "code", 223 |      "collapsed": false, 224 |      "input": [ 225 |       "import numpy as np\n", 226 |       "print np.__name__\n", 227 |       "\n", 228 |       "import matplotlib.pyplot as plt\n", 229 |       "print plt.__name__" 230 |      ], 231 |      "language": "python", 232 |      "metadata": {}, 233 |      "outputs": [] 234 |     }, 235 |     { 236 |      "cell_type": "markdown", 237 |      "metadata": {}, 238 |      "source": [ 239 |       "If you try to import the above file containing the definition for MRMostUsedWord, then python will interpret the file all the way down until it hits that last if statement. \\_\\_name\\_\\_ will evaluate to MRMostUsedWord (or whatever the name was of the file we imported) and the line inside the if statement will be ignored. On the other hand, if you run this code from the command line, python will interpret it *without* importing it and \\_\\_name\\_\\_ will be the python top level namespace, which is '\\_\\_main\\_\\_', so MRMostUsedWord.run() gets called.\n", 240 |       "\n", 241 |       "In (many) fewer words: it tells you to run the job only when invoked from the command line.\n", 242 |       "\n", 243 |       "Try copying the code for MRMostUsedWord to a file named MRMostUsedWord.py, and then running it on any old text file you might have lying around. The invocation will be something like this (modify based on your particular python installation):" 244 |      ] 245 |     }, 246 |     { 247 |      "cell_type": "code", 248 |      "collapsed": false, 249 |      "input": [ 250 |       "python MRMostUsedWord.py some_file.txt > most_used_word.out" 251 |      ], 252 |      "language": "python", 253 |      "metadata": {}, 254 |      "outputs": [] 255 |     }, 256 |     { 257 |      "cell_type": "heading", 258 |      "level": 2, 259 |      "metadata": {}, 260 |      "source": [ 261 |       "2. Setting up your Amazon Web Services account" 262 |      ] 263 |     }, 264 |     { 265 |      "cell_type": "markdown", 266 |      "metadata": {}, 267 |      "source": [ 268 |       "There is quite a bit of overhead involved in setting up an AWS account and keeping an eye on the jobs that you end up running. 
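Most of the mrjob-specific setup ends up in a single mrjob.conf file. As a rough sketch (the values below are placeholders rather than real credentials, and the Elastic MapReduce Quickstart linked below is the authoritative reference for the format), it is a small piece of YAML:\n\n    runners:\n      emr:\n        aws_access_key_id: YOURACCESSKEY\n        aws_secret_access_key: YOURSECRETKEY\n        aws_region: us-east-1\n\n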
In lab, we will run through an example account activation, including:\n", 269 |       "\n", 270 |       "* Account creation\n", 271 |       "* Signing up for Elastic MapReduce\n", 272 |       "* Storing security credentials in your mrjob.conf file\n", 273 |       "* Redeeming account credits\n", 274 |       "* Billing alerts\n", 275 |       "* Checking on running jobs using the console\n", 276 |       "\n", 277 |       "These documents (also linked from HW4) are very useful: [Instructions for Amazon Setup notebook](http://nbviewer.ipython.org/urls/raw.github.com/cs109/content/master/InstructionsForAmazonEMR.ipynb), [Elastic MapReduce Quickstart](http://pythonhosted.org/mrjob/guides/emr-quickstart.html)\n", 278 |       "\n", 279 |       "Once you have this all set up and working, mrjob makes it *very easy* to run a MapReduce job with EMR. Using the same MRMostUsedWord example as above, the command line invocation to run with EMR is:" 280 |      ] 281 |     }, 282 |     { 283 |      "cell_type": "code", 284 |      "collapsed": false, 285 |      "input": [ 286 |       "python MRMostUsedWord.py -r emr some_file.txt > most_used_word.out" 287 |      ], 288 |      "language": "python", 289 |      "metadata": {}, 290 |      "outputs": [] 291 |     }, 292 |     { 293 |      "cell_type": "heading", 294 |      "level": 2, 295 |      "metadata": {}, 296 |      "source": [ 297 |       "3. MapReduce exercises" 298 |      ] 299 |     }, 300 |     { 301 |      "cell_type": "markdown", 302 |      "metadata": {}, 303 |      "source": [ 304 |       "![MapReduce schematic](https://developers.google.com/appengine/docs/python/images/mapreduce_mapshuffle.png)\n", 305 |       "
\\[Image from [https://developers.google.com/appengine/docs/python/dataprocessing/](https://developers.google.com/appengine/docs/python/dataprocessing/)\\]\n", 306 |       "\n", 307 |       "Below are two practice problems to get the hang of writing MapReduce algorithms. Remember, you will be writing these programs in separate files that you run from the command line. You are welcome to try out EC2, but these are small datasets and it will generally be much faster to run locally." 308 |      ] 309 |     }, 310 |     { 311 |      "cell_type": "heading", 312 |      "level": 3, 313 |      "metadata": {}, 314 |      "source": [ 315 |       "3.1 Anagram finder" 316 |      ] 317 |     }, 318 |     { 319 |      "cell_type": "markdown", 320 |      "metadata": {}, 321 |      "source": [ 322 |       "First, grab the file [word_list.txt](https://raw.github.com/cs109/content/master/labs/lab8/word_list.txt). This contains a list of six-letter words that I dumped from my spellchecker. To keep things simple, all of the words consist of lower-case letters only." 323 |      ] 324 |     }, 325 |     { 326 |      "cell_type": "code", 327 |      "collapsed": false, 328 |      "input": [ 329 |       "word_list = [word.strip() for word in open(\"word_list.txt\").readlines()]\n", 330 |       "print \"{0} words in list\".format(len(word_list))\n", 331 |       "print \"First ten words: {0}\".format(\", \".join(word_list[0:10]))" 332 |      ], 333 |      "language": "python", 334 |      "metadata": {}, 335 |      "outputs": [] 336 |     }, 337 |     { 338 |      "cell_type": "markdown", 339 |      "metadata": {}, 340 |      "source": [ 341 |       "Use mrjob to write a class that finds all anagrams in word_list.txt. \n", 342 |       "\n", 343 |       "**UPDATE**: [My solution to exercise 3.1](https://raw.github.com/cs109/content/master/labs/lab8/anagrams.py)" 344 |      ] 345 |     }, 346 |     { 347 |      "cell_type": "heading", 348 |      "level": 3, 349 |      "metadata": {}, 350 |      "source": [ 351 |       "3.2 Friends don't let friends root for the Cardinals" 352 |      ] 353 |     }, 354 |     { 355 |      "cell_type": "markdown", 356 |      "metadata": {}, 357 |      "source": [ 358 |       "![Cardinals v. Red Sox](http://www.stlcardinalbaseball.com/wp-content/uploads/2013/10/CARDINALS-V-RED-SOX-650x325.jpg)" 359 |      ] 360 |     }, 361 |     { 362 |      "cell_type": "markdown", 363 |      "metadata": {}, 364 |      "source": [ 365 |       "For the next problem, download the file [baseball_friends.csv](https://raw.github.com/cs109/content/master/labs/lab8/baseball_friends.csv). Each row of this csv file contains the following:\n", 366 |       "\n", 367 |       "* A person's name\n", 368 |       "* The team that person is rooting for -- either \"Cardinals\" or \"Red Sox\"\n", 369 |       "* A list of that person's friends, which could have arbitrary length\n", 370 |       "\n", 371 |       "Let's take a look at one line:" 372 |      ] 373 |     }, 374 |     { 375 |      "cell_type": "code", 376 |      "collapsed": false, 377 |      "input": [ 378 |       "friends = open(\"baseball_friends.csv\").readlines()\n", 379 |       "print friends[0].strip()\n", 380 |       "print len(friends[0].split(\",\")) - 2" 381 |      ], 382 |      "language": "python", 383 |      "metadata": {}, 384 |      "outputs": [] 385 |     }, 386 |     { 387 |      "cell_type": "markdown", 388 |      "metadata": {}, 389 |      "source": [ 390 |       "This line tells us that Aaden is a Red Sox fan and he has 65 friends, who are all listed here. 
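(In MapReduce terms -- and this is what the posted solution below does -- each input row fans out into one (friend, (name, team)) pair for every friend listed, plus one \"self\" pair such as (\"Aaden\", (\"Aaden\", \"Red Sox\")), so a single reducer key ends up holding a person's own team along with the team of everyone who lists them as a friend.) 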
For this problem, it's safe to assume that all of the names are unique and that the friendship structure is symmetric (*i.e.* if Alannah shows up in Aaden's friends list, then Aaden will show up in Alannah's friends list).\n", 391 | "\n", 392 | "Write an mrjob class that lists each person's name, their favorite team, the number of Red Sox fans they are friends with, and the number of Cardinals fans they are friends with.\n", 393 | "\n", 394 | "After running that program, we can look at the results to get an idea of the absurdly simple model that I used to generate the input csv file. You might need to modify the code below if the format of your output file doesn't quite match mine." 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "collapsed": false, 400 | "input": [ 401 | "import pandas as pd\n", 402 | "import json\n", 403 | "\n", 404 | "# Read results.\n", 405 | "result_file = \"baseball_friends.out\"\n", 406 | "result = [[json.loads(field) for field in line.strip().split('\\t')] for line in open(result_file)]\n", 407 | "\n", 408 | "# Break out columns.\n", 409 | "names = [x[0] for x in result]\n", 410 | "teams = [x[1][0] for x in result]\n", 411 | "redsox_count = [x[1][1] for x in result]\n", 412 | "cardinals_count = [x[1][2] for x in result]\n", 413 | "\n", 414 | "# Combine in data frame.\n", 415 | "result = pd.DataFrame(index=names, data={'teams': teams, 'redsox_count': redsox_count, \n", 416 | " 'cardinals_count': cardinals_count})" 417 | ], 418 | "language": "python", 419 | "metadata": {}, 420 | "outputs": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "collapsed": false, 425 | "input": [ 426 | "%matplotlib inline\n", 427 | "import matplotlib.pyplot as plt\n", 428 | "from matplotlib import rcParams\n", 429 | "rcParams['figure.figsize'] = (10, 6)\n", 430 | "rcParams['font.size'] = 14\n", 431 | "\n", 432 | "# Average number of friends by affiliation.\n", 433 | "print result.groupby('teams').mean()\n", 434 | "\n", 435 | "# Histogram the affiliations of people who are friends of Red Sox fans.\n", 436 | "plt.hist(result.redsox_count[result.teams == \"Red Sox\"], label=\"Red Sox friend Red Sox\")\n", 437 | "plt.hist(result.cardinals_count[result.teams == \"Red Sox\"], label=\"Red Sox friend Cardinals\")\n", 438 | "plt.xlabel('number of friends')\n", 439 | "plt.ylabel('count')\n", 440 | "plt.legend(loc=0)" 441 | ], 442 | "language": "python", 443 | "metadata": {}, 444 | "outputs": [] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "**UPDATE**: [My solution to exercise 3.2](https://raw.github.com/cs109/content/master/labs/lab8/friend_affiliations.py)" 451 | ] 452 | } 453 | ], 454 | "metadata": {} 455 | } 456 | ] 457 | } -------------------------------------------------------------------------------- /labs/lab8/most_used_word.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | import re 3 | 4 | WORD_RE = re.compile(r"[\w']+") 5 | 6 | class MRMostUsedWord(MRJob): 7 | 8 | def mapper_get_words(self, _, line): 9 | # yield each word in the line 10 | for word in WORD_RE.findall(line): 11 | yield (word.lower(), 1) 12 | 13 | def combiner_count_words(self, word, counts): 14 | # optimization: sum the words we've seen so far 15 | yield (word, sum(counts)) 16 | 17 | def reducer_count_words(self, word, counts): 18 | # send all (num_occurrences, word) pairs to the same reducer. 19 | # num_occurrences is so we can easily use Python's max() function. 
20 | yield None, (sum(counts), word) 21 | 22 | # discard the key, it is just None 23 | def reducer_find_max_word(self, _, word_count_pairs): 24 | # each item of word_count_pairs is (count, word), 25 | # so yielding one results in key=counts, value=word 26 | yield max(word_count_pairs) 27 | 28 | def steps(self): 29 | return [ 30 | self.mr(mapper=self.mapper_get_words, 31 | combiner=self.combiner_count_words, 32 | reducer=self.reducer_count_words), 33 | self.mr(reducer=self.reducer_find_max_word) 34 | ] 35 | 36 | if __name__ == '__main__': 37 | MRMostUsedWord.run() 38 | 39 | -------------------------------------------------------------------------------- /labs/lab8/names.txt: -------------------------------------------------------------------------------- 1 | Sophia 2 | Jacob 3 | Emma 4 | Mason 5 | Isabella 6 | Ethan 7 | Olivia 8 | Noah 9 | Ava 10 | William 11 | Emily 12 | Liam 13 | Abigail 14 | Jayden 15 | Mia 16 | Michael 17 | Madison 18 | Alexander 19 | Elizabeth 20 | Aiden 21 | Chloe 22 | Daniel 23 | Ella 24 | Matthew 25 | Avery 26 | Elijah 27 | Addison 28 | James 29 | Aubrey 30 | Anthony 31 | Lily 32 | Benjamin 33 | Natalie 34 | Joshua 35 | Sofia 36 | Andrew 37 | Charlotte 38 | David 39 | Zoey 40 | Joseph 41 | Grace 42 | Logan 43 | Hannah 44 | Jackson 45 | Amelia 46 | Christopher 47 | Harper 48 | Gabriel 49 | Lillian 50 | Samuel 51 | Samantha 52 | Ryan 53 | Evelyn 54 | Lucas 55 | Victoria 56 | John 57 | Brooklyn 58 | Nathan 59 | Zoe 60 | Isaac 61 | Layla 62 | Dylan 63 | Hailey 64 | Caleb 65 | Leah 66 | Christian 67 | Kaylee 68 | Landon 69 | Anna 70 | Jonathan 71 | Aaliyah 72 | Carter 73 | Gabriella 74 | Luke 75 | Allison 76 | Owen 77 | Nevaeh 78 | Brayden 79 | Alexis 80 | Gavin 81 | Audrey 82 | Wyatt 83 | Savannah 84 | Isaiah 85 | Sarah 86 | Henry 87 | Alyssa 88 | Eli 89 | Claire 90 | Hunter 91 | Taylor 92 | Jack 93 | Riley 94 | Evan 95 | Camila 96 | Jordan 97 | Arianna 98 | Nicholas 99 | Ashley 100 | Tyler 101 | Brianna 102 | Aaron 103 | Sophie 104 | Jeremiah 105 | Peyton 106 | Julian 107 | Bella 108 | Cameron 109 | Khloe 110 | Levi 111 | Genesis 112 | Brandon 113 | Alexa 114 | Angel 115 | Serenity 116 | Austin 117 | Kylie 118 | Connor 119 | Aubree 120 | Adrian 121 | Scarlett 122 | Robert 123 | Stella 124 | Charles 125 | Maya 126 | Thomas 127 | Katherine 128 | Sebastian 129 | Julia 130 | Colton 131 | Lucy 132 | Jaxon 133 | Madelyn 134 | Kevin 135 | Autumn 136 | Zachary 137 | Makayla 138 | Ayden 139 | Kayla 140 | Dominic 141 | Mackenzie 142 | Blake 143 | Lauren 144 | Jose 145 | Gianna 146 | Oliver 147 | Ariana 148 | Justin 149 | Faith 150 | Bentley 151 | Alexandra 152 | Jason 153 | Melanie 154 | Chase 155 | Sydney 156 | Ian 157 | Bailey 158 | Josiah 159 | Caroline 160 | Parker 161 | Naomi 162 | Xavier 163 | Morgan 164 | Adam 165 | Kennedy 166 | Cooper 167 | Ellie 168 | Nathaniel 169 | Jasmine 170 | Grayson 171 | Eva 172 | Jace 173 | Skylar 174 | Carson 175 | Kimberly 176 | Nolan 177 | Violet 178 | Tristan 179 | Molly 180 | Luis 181 | Aria 182 | Brody 183 | Jocelyn 184 | Juan 185 | Trinity 186 | Hudson 187 | London 188 | Bryson 189 | Lydia 190 | Carlos 191 | Madeline 192 | Easton 193 | Reagan 194 | Damian 195 | Piper 196 | Alex 197 | Andrea 198 | Kayden 199 | Annabelle 200 | Ryder 201 | Maria 202 | Jesus 203 | Brooke 204 | Cole 205 | Payton 206 | Micah 207 | Paisley 208 | Vincent 209 | Paige 210 | Max 211 | Ruby 212 | Jaxson 213 | Nora 214 | Eric 215 | Mariah 216 | Asher 217 | Rylee 218 | Hayden 219 | Lilly 220 | Diego 221 | Brielle 222 | Miles 223 | Jade 224 | Steven 225 | Destiny 226 | 
Ivan 227 | Nicole 228 | Elias 229 | Mila 230 | Aidan 231 | Kendall 232 | Maxwell 233 | Liliana 234 | Bryce 235 | Kaitlyn 236 | Antonio 237 | Natalia 238 | Giovanni 239 | Sadie 240 | Timothy 241 | Jordyn 242 | Bryan 243 | Vanessa 244 | Santiago 245 | Mary 246 | Colin 247 | Mya 248 | Richard 249 | Penelope 250 | Braxton 251 | Isabelle 252 | Kaleb 253 | Alice 254 | Kyle 255 | Reese 256 | Kaden 257 | Gabrielle 258 | Preston 259 | Hadley 260 | Miguel 261 | Katelyn 262 | Jonah 263 | Angelina 264 | Lincoln 265 | Rachel 266 | Riley 267 | Isabel 268 | Leo 269 | Eleanor 270 | Victor 271 | Clara 272 | Brady 273 | Brooklynn 274 | Jeremy 275 | Jessica 276 | Mateo 277 | Elena 278 | Brian 279 | Aliyah 280 | Jaden 281 | Vivian 282 | Ashton 283 | Laila 284 | Patrick 285 | Sara 286 | Declan 287 | Amy 288 | Sean 289 | Eliana 290 | Joel 291 | Lyla 292 | Gael 293 | Juliana 294 | Sawyer 295 | Valeria 296 | Alejandro 297 | Adriana 298 | Marcus 299 | Makenzie 300 | Leonardo 301 | Elise 302 | Jesse 303 | Mckenzie 304 | Caden 305 | Quinn 306 | Jake 307 | Delilah 308 | Kaiden 309 | Cora 310 | Wesley 311 | Kylee 312 | Camden 313 | Rebecca 314 | Edward 315 | Gracie 316 | Brantley 317 | Izabella 318 | Roman 319 | Josephine 320 | Axel 321 | Alaina 322 | Silas 323 | Michelle 324 | Jude 325 | Jennifer 326 | Grant 327 | Eden 328 | Cayden 329 | Valentina 330 | Emmanuel 331 | Aurora 332 | George 333 | Catherine 334 | Maddox 335 | Stephanie 336 | Malachi 337 | Valerie 338 | Bradley 339 | Jayla 340 | Alan 341 | Willow 342 | Weston 343 | Daisy 344 | Gage 345 | Alana 346 | Devin 347 | Melody 348 | Greyson 349 | Hazel 350 | Kenneth 351 | Summer 352 | Mark 353 | Melissa 354 | Oscar 355 | Margaret 356 | Tanner 357 | Kinsley 358 | Rylan 359 | Kinley 360 | Nicolas 361 | Ariel 362 | Harrison 363 | Lila 364 | Derek 365 | Giselle 366 | Peyton 367 | Ryleigh 368 | Ezra 369 | Haley 370 | Tucker 371 | Julianna 372 | Emmett 373 | Ivy 374 | Avery 375 | Alivia 376 | Cody 377 | Brynn 378 | Calvin 379 | Keira 380 | Andres 381 | Daniela 382 | Jorge 383 | Aniyah 384 | Abel 385 | Angela 386 | Paul 387 | Kate 388 | Abraham 389 | Londyn 390 | Kai 391 | Hayden 392 | Collin 393 | Harmony 394 | Theodore 395 | Adalyn 396 | Ezekiel 397 | Megan 398 | Omar 399 | Allie 400 | Jayce 401 | Gabriela 402 | Conner 403 | Alayna 404 | Bennett 405 | Presley 406 | Trevor 407 | Jenna 408 | Eduardo 409 | Alexandria 410 | Peter 411 | Ashlyn 412 | Maximus 413 | Adrianna 414 | Jaiden 415 | Jada 416 | Jameson 417 | Fiona 418 | Seth 419 | Norah 420 | Kingston 421 | Emery 422 | Javier 423 | Maci 424 | Travis 425 | Miranda 426 | Garrett 427 | Ximena 428 | Everett 429 | Amaya 430 | Graham 431 | Cecilia 432 | Xander 433 | Ana 434 | Cristian 435 | Shelby 436 | Damien 437 | Katie 438 | Ryker 439 | Hope 440 | Griffin 441 | Callie 442 | Corbin 443 | Jordan 444 | Myles 445 | Luna 446 | Luca 447 | Leilani 448 | Zane 449 | Eliza 450 | Francisco 451 | Mckenna 452 | Ricardo 453 | Angel 454 | Alexis 455 | Genevieve 456 | Stephen 457 | Makenna 458 | Zayden 459 | Isla 460 | Iker 461 | Lola 462 | Drake 463 | Danielle 464 | Lukas 465 | Chelsea 466 | Charlie 467 | Leila 468 | Spencer 469 | Tessa 470 | Zion 471 | Adelyn 472 | Erick 473 | Camille 474 | Josue 475 | Mikayla 476 | Jeffrey 477 | Adeline 478 | Trenton 479 | Adalynn 480 | Chance 481 | Sienna 482 | Paxton 483 | Esther 484 | Elliot 485 | Jacqueline 486 | Fernando 487 | Emerson 488 | Keegan 489 | Arabella 490 | Landen 491 | Maggie 492 | Manuel 493 | Athena 494 | Amir 495 | Lucia 496 | Shane 497 | Lexi 498 | Raymond 499 | Ayla 500 | 
Zander 501 | Diana 502 | Andre 503 | Alexia 504 | Israel 505 | Juliet 506 | Mario 507 | Josie 508 | Cesar 509 | Allyson 510 | Simon 511 | Addyson 512 | King 513 | Delaney 514 | Jaylen 515 | Teagan 516 | Johnathan 517 | Marley 518 | Troy 519 | Amber 520 | Dean 521 | Rose 522 | Clayton 523 | Erin 524 | Dominick 525 | Leslie 526 | Tyson 527 | Kayleigh 528 | Jasper 529 | Amanda 530 | Martin 531 | Kathryn 532 | Kyler 533 | Kelsey 534 | Hector 535 | Emilia 536 | Edgar 537 | Alina 538 | Marco 539 | Kenzie 540 | Cash 541 | Kaydence 542 | Edwin 543 | Alicia 544 | Shawn 545 | Alison 546 | Judah 547 | Paris 548 | Andy 549 | Sabrina 550 | Donovan 551 | Ashlynn 552 | Kameron 553 | Lilliana 554 | Elliott 555 | Sierra 556 | Dante 557 | Cassidy 558 | Braylon 559 | Laura 560 | Anderson 561 | Alondra 562 | Johnny 563 | Iris 564 | Drew 565 | Kyla 566 | Sergio 567 | Christina 568 | Cruz 569 | Carly 570 | Dalton 571 | Jillian 572 | Rafael 573 | Madilyn 574 | Gregory 575 | Kyleigh 576 | Lane 577 | Madeleine 578 | Erik 579 | Cadence 580 | Skyler 581 | Nina 582 | Finn 583 | Evangeline 584 | Reid 585 | Nadia 586 | Gunner 587 | Raegan 588 | Jared 589 | Lyric 590 | Caiden 591 | Giuliana 592 | Holden 593 | Briana 594 | Emilio 595 | Georgia 596 | Fabian 597 | Yaretzi 598 | Aden 599 | Elliana 600 | Brendan 601 | Haylee 602 | Rowan 603 | Fatima 604 | Emiliano 605 | Phoebe 606 | Braden 607 | Selena 608 | Jase 609 | Charlie 610 | Jax 611 | Dakota 612 | Emanuel 613 | Annabella 614 | Lorenzo 615 | Abby 616 | Roberto 617 | Daniella 618 | Amari 619 | Juliette 620 | Angelo 621 | Lilah 622 | Beau 623 | Bianca 624 | Louis 625 | Mariana 626 | Derrick 627 | Miriam 628 | Beckett 629 | Parker 630 | Dawson 631 | Veronica 632 | Felix 633 | Gemma 634 | Pedro 635 | Noelle 636 | Brennan 637 | Cheyenne 638 | Frank 639 | Marissa 640 | Maximiliano 641 | Heaven 642 | Quinn 643 | Vivienne 644 | Dallas 645 | Brynlee 646 | Romeo 647 | Joanna 648 | Braylen 649 | Mallory 650 | Joaquin 651 | Aubrie 652 | Waylon 653 | Journey 654 | Allen 655 | Nyla 656 | Colt 657 | Cali 658 | Ruben 659 | Tatum 660 | Milo 661 | Carmen 662 | Julius 663 | Gia 664 | Grady 665 | Jazmine 666 | August 667 | Heidi 668 | Dakota 669 | Miley 670 | Cohen 671 | Baylee 672 | Brock 673 | Elaina 674 | Kellen 675 | Macy 676 | Brycen 677 | Ainsley 678 | Desmond 679 | Jane 680 | Malik 681 | Raelynn 682 | Colby 683 | Anastasia 684 | Nehemiah 685 | Adelaide 686 | Leland 687 | Ruth 688 | Jett 689 | Camryn 690 | Marcos 691 | Kiara 692 | Taylor 693 | Alessandra 694 | Karter 695 | Hanna 696 | Marshall 697 | Finley 698 | Ty 699 | Maddison 700 | Phillip 701 | Lia 702 | Corey 703 | Bethany 704 | Ali 705 | Karen 706 | Adan 707 | Kelly 708 | Dillon 709 | Malia 710 | Arthur 711 | Jazmin 712 | Maverick 713 | Jayda 714 | Leon 715 | Esmeralda 716 | Brooks 717 | Kira 718 | Tristen 719 | Lena 720 | Titus 721 | Kamryn 722 | Keith 723 | Kamila 724 | Dexter 725 | Karina 726 | Karson 727 | Eloise 728 | Emerson 729 | Kara 730 | Landyn 731 | Elisa 732 | Armando 733 | Rylie 734 | Pablo 735 | Olive 736 | Knox 737 | Nayeli 738 | Enrique 739 | Tiffany 740 | Cade 741 | Macie 742 | Gerardo 743 | Skyler 744 | Reed 745 | Addisyn 746 | Kellan 747 | Angelica 748 | Jayson 749 | Briella 750 | Barrett 751 | Fernanda 752 | Walter 753 | Annie 754 | Dustin 755 | Maliyah 756 | Kolton 757 | Amiyah 758 | Ronald 759 | Jayden 760 | Trent 761 | Charlee 762 | Phoenix 763 | Caitlyn 764 | Ismael 765 | Elle 766 | Julio 767 | Crystal 768 | Danny 769 | Julie 770 | Kason 771 | Imani 772 | Scott 773 | Kendra 774 | Messiah 775 | Talia 
776 | Jay 777 | Angelique 778 | Esteban 779 | Jazlyn 780 | Gideon 781 | Guadalupe 782 | Tate 783 | Alejandra 784 | Abram 785 | Emely 786 | Trey 787 | Lucille 788 | Keaton 789 | Anya 790 | Jakob 791 | April 792 | Jaime 793 | Elsie 794 | Devon 795 | Madelynn 796 | Braydon 797 | Myla 798 | Izaiah 799 | Julissa 800 | Donald 801 | Scarlet 802 | Albert 803 | Helen 804 | Raul 805 | Breanna 806 | Darius 807 | Kyra 808 | Archer 809 | Madisyn 810 | Colten 811 | Rosalie 812 | Damon 813 | Brittany 814 | River 815 | Brylee 816 | Gustavo 817 | Jayleen 818 | Philip 819 | Arielle 820 | Atticus 821 | Karla 822 | Walker 823 | Kailey 824 | Matteo 825 | Arya 826 | Randy 827 | Sarai 828 | Saul 829 | Harley 830 | Rocco 831 | Miracle 832 | Davis 833 | Kaelyn 834 | Enzo 835 | Kali 836 | Noel 837 | Cynthia 838 | Orion 839 | Daphne 840 | Jamari 841 | Aleah 842 | Remington 843 | Caitlin 844 | Bruce 845 | Cassandra 846 | Darren 847 | Holly 848 | Larry 849 | Janelle 850 | Mathew 851 | Marilyn 852 | Russell 853 | Katelynn 854 | Dennis 855 | Kaylie 856 | Tony 857 | Itzel 858 | Chris 859 | Carolina 860 | Porter 861 | Bristol 862 | Rodrigo 863 | Haven 864 | Armani 865 | Michaela 866 | Zaiden 867 | Monica 868 | Kade 869 | June 870 | Ari 871 | Janiyah 872 | Hugo 873 | Camilla 874 | Zachariah 875 | Jamie 876 | Kamden 877 | Rebekah 878 | Mohamed 879 | Audrina 880 | Quentin 881 | Dayana 882 | Solomon 883 | Lana 884 | Curtis 885 | Serena 886 | Leonel 887 | Tiana 888 | Issac 889 | Nylah 890 | Khalil 891 | Braelyn 892 | Alberto 893 | Savanna 894 | Jerry 895 | Skye 896 | Alec 897 | Raelyn 898 | Gianni 899 | Madalyn 900 | Moises 901 | Sasha 902 | Gunnar 903 | Perla 904 | Adriel 905 | Bridget 906 | Lawrence 907 | Aniya 908 | Alijah 909 | Rowan 910 | Chandler 911 | Logan 912 | Ronan 913 | Mckinley 914 | Prince 915 | Averie 916 | Payton 917 | Jaylah 918 | Arturo 919 | Aylin 920 | Jimmy 921 | Joselyn 922 | Orlando 923 | Nia 924 | Ricky 925 | Hayley 926 | Mitchell 927 | Lilian 928 | Maximilian 929 | Adelynn 930 | Cason 931 | Jaliyah 932 | Malcolm 933 | Kassidy 934 | Muhammad 935 | Kaylin 936 | Kasen 937 | Kadence 938 | Marvin 939 | Celeste 940 | Jalen 941 | Jaelyn 942 | Cyrus 943 | Zariah 944 | Mauricio 945 | Tatiana 946 | Warren 947 | Jimena 948 | Jonas 949 | Lilyana 950 | Kendrick 951 | Anaya 952 | Rhys 953 | Catalina 954 | Dane 955 | Viviana 956 | Ryland 957 | Cataleya 958 | Pierce 959 | Sloane 960 | Johan 961 | Courtney 962 | Rory 963 | Johanna 964 | Uriel 965 | Amari 966 | Major 967 | Melany 968 | Bryant 969 | Anabelle 970 | Reece 971 | Francesca 972 | Casey 973 | Ada 974 | Ibrahim 975 | Alanna 976 | Nikolas 977 | Priscilla 978 | Arjun 979 | Danna 980 | Sullivan 981 | Angie 982 | Finnegan 983 | Kailyn 984 | Alfredo 985 | Lacey 986 | Royce 987 | Sage 988 | Ahmed 989 | Lillie 990 | Amare 991 | Brinley 992 | Lance 993 | Caylee 994 | Ramon 995 | Joy 996 | Jamison 997 | Kenley 998 | Brayan 999 | Vera 1000 | Brenden 1001 | Bailee 1002 | Dominik 1003 | Amira 1004 | Case 1005 | Aileen 1006 | Kristopher 1007 | Aspen 1008 | Maurice 1009 | Emmalyn 1010 | Mekhi 1011 | Erica 1012 | Kobe 1013 | Gracelyn 1014 | Zackary 1015 | Kennedi 1016 | Rhett 1017 | Skyla 1018 | Jensen 1019 | Annalise 1020 | Jaxton 1021 | Danica 1022 | Deandre 1023 | Dylan 1024 | Isaias 1025 | Kiley 1026 | Channing 1027 | Gwendolyn 1028 | Yahir 1029 | Jasmin 1030 | Ezequiel 1031 | Lauryn 1032 | Tobias 1033 | Aleena 1034 | Talon 1035 | Justice 1036 | Sam 1037 | Annabel 1038 | Justice 1039 | Tenley 1040 | Kash 1041 | Dahlia 1042 | Nash 1043 | Gloria 1044 | Alvin 1045 | Lexie 
1046 | Jacoby 1047 | Lindsey 1048 | Ace 1049 | Hallie 1050 | Nico 1051 | Sylvia 1052 | Quinton 1053 | Elyse 1054 | Cannon 1055 | Annika 1056 | Franklin 1057 | Maeve 1058 | Raiden 1059 | Marlee 1060 | Joe 1061 | Aryanna 1062 | Lawson 1063 | Kenya 1064 | Beckham 1065 | Lorelei 1066 | Gary 1067 | Selah 1068 | Aldo 1069 | Kaliyah 1070 | Raylan 1071 | Adele 1072 | Frederick 1073 | Natasha 1074 | London 1075 | Brenda 1076 | Boston 1077 | Erika 1078 | Carl 1079 | Alyson 1080 | Byron 1081 | Braylee 1082 | Ernesto 1083 | Emilee 1084 | Moshe 1085 | Raven 1086 | Terry 1087 | Ariella 1088 | Eddie 1089 | Blakely 1090 | Kane 1091 | Liana 1092 | Moses 1093 | Jaycee 1094 | Finley 1095 | Sawyer 1096 | Salvador 1097 | Anahi 1098 | Reese 1099 | Jaelynn 1100 | Kelvin 1101 | Elsa 1102 | Cullen 1103 | Farrah 1104 | Madden 1105 | Cameron 1106 | Wade 1107 | Evelynn 1108 | Clark 1109 | Luciana 1110 | Mohammed 1111 | Zara 1112 | Kieran 1113 | Madilynn 1114 | Jagger 1115 | Eve 1116 | Dorian 1117 | Kaia 1118 | Korbin 1119 | Helena 1120 | Nelson 1121 | Anne 1122 | Roy 1123 | Estrella 1124 | Asa 1125 | Leighton 1126 | Matias 1127 | Nataly 1128 | Nasir 1129 | Whitney 1130 | Nickolas 1131 | Lainey 1132 | Roger 1133 | Amara 1134 | Alonzo 1135 | Anabella 1136 | Jaxen 1137 | Malaysia 1138 | Skylar 1139 | Samara 1140 | Callen 1141 | Zoie 1142 | Malakai 1143 | Amani 1144 | Douglas 1145 | Phoenix 1146 | Ahmad 1147 | Dulce 1148 | Uriah 1149 | Paola 1150 | Conor 1151 | Marie 1152 | Kristian 1153 | Aisha 1154 | Carmelo 1155 | Harlow 1156 | Blaine 1157 | Virginia 1158 | Kayson 1159 | Ember 1160 | Bentlee 1161 | Regina 1162 | Braeden 1163 | Jaylee 1164 | Julien 1165 | Anika 1166 | Nathanael 1167 | Ally 1168 | Aarav 1169 | Kayden 1170 | Keagan 1171 | Alani 1172 | Lucian 1173 | Miah 1174 | Morgan 1175 | Yareli 1176 | Chad 1177 | Journee 1178 | Terrance 1179 | Kiera 1180 | Benson 1181 | Nathalie 1182 | Noe 1183 | Mikaela 1184 | Rodney 1185 | Jaylynn 1186 | Francis 1187 | Litzy 1188 | Layne 1189 | Charley 1190 | Mohammad 1191 | Claudia 1192 | Zayne 1193 | Aliya 1194 | Tatum 1195 | Madyson 1196 | Brett 1197 | Cecelia 1198 | Wilson 1199 | Liberty 1200 | Kian 1201 | Braelynn 1202 | Marc 1203 | Evie 1204 | Rohan 1205 | Rosemary 1206 | Dayton 1207 | Myah 1208 | Braiden 1209 | Lizbeth 1210 | Harper 1211 | Giana 1212 | Luciano 1213 | Ryan 1214 | Nikolai 1215 | Teresa 1216 | Kamari 1217 | Ciara 1218 | Camron 1219 | Isis 1220 | Joey 1221 | Lea 1222 | Santino 1223 | Shayla 1224 | Ellis 1225 | Jazlynn 1226 | Layton 1227 | Rosa 1228 | Xzavier 1229 | Gracelynn 1230 | Jefferson 1231 | Desiree 1232 | Winston 1233 | Elisabeth 1234 | Guillermo 1235 | Isabela 1236 | Demetrius 1237 | Arely 1238 | Bowen 1239 | Mariam 1240 | Daxton 1241 | Abbigail 1242 | Melvin 1243 | Emersyn 1244 | Soren 1245 | Brenna 1246 | Neil 1247 | Kaylynn 1248 | Sylas 1249 | Nova 1250 | Jon 1251 | Raquel 1252 | Raphael 1253 | Dana 1254 | Rex 1255 | Laney 1256 | Yusuf 1257 | Laylah 1258 | Shaun 1259 | Siena 1260 | Brodie 1261 | Amelie 1262 | Tommy 1263 | Clarissa 1264 | Harley 1265 | Lilianna 1266 | Quincy 1267 | Lylah 1268 | Dax 1269 | Halle 1270 | Trace 1271 | Madalynn 1272 | Adonis 1273 | Maleah 1274 | Bently 1275 | Sherlyn 1276 | Giovani 1277 | Linda 1278 | Jeffery 1279 | Shiloh 1280 | Odin 1281 | Jessie 1282 | Luka 1283 | Kenia 1284 | Kylan 1285 | Greta 1286 | Willie 1287 | Marina 1288 | Lewis 1289 | Melina 1290 | Tripp 1291 | Amiya 1292 | Vihaan 1293 | Bria 1294 | Davion 1295 | Natalee 1296 | Kendall 1297 | Sariah 1298 | Arian 1299 | Mollie 1300 | Cory 1301 | Nancy 1302 | 
Jamarion 1303 | Christine 1304 | Jonathon 1305 | Felicity 1306 | Nixon 1307 | Zuri 1308 | Rayan 1309 | Irene 1310 | Emery 1311 | Simone 1312 | Jermaine 1313 | Amya 1314 | Reginald 1315 | Matilda 1316 | Tomas 1317 | Colette 1318 | Emmitt 1319 | Kristen 1320 | Ayaan 1321 | Paityn 1322 | Zechariah 1323 | Alayah 1324 | Billy 1325 | Janiya 1326 | Hamza 1327 | Kallie 1328 | Micheal 1329 | Mira 1330 | Urijah 1331 | Hailee 1332 | Aryan 1333 | Kathleen 1334 | Lee 1335 | Meredith 1336 | Jasiah 1337 | Janessa 1338 | Landry 1339 | Noemi 1340 | Crosby 1341 | Aiyana 1342 | Mathias 1343 | Aliana 1344 | Toby 1345 | Leia 1346 | Tristian 1347 | Mariyah 1348 | Will 1349 | Tori 1350 | Felipe 1351 | Alissa 1352 | Triston 1353 | Ivanna 1354 | Eden 1355 | Joslyn 1356 | Terrell 1357 | Sandra 1358 | Deacon 1359 | Maryam 1360 | Matthias 1361 | Saniyah 1362 | Jamal 1363 | Kassandra 1364 | Makai 1365 | Danika 1366 | Maxim 1367 | Denise 1368 | Sterling 1369 | Jemma 1370 | Hank 1371 | River 1372 | Gerald 1373 | Charleigh 1374 | Alessandro 1375 | Emelia 1376 | Jaydon 1377 | Kristina 1378 | Hayes 1379 | Armani 1380 | Niko 1381 | Beatrice 1382 | Branson 1383 | Jaylene 1384 | Flynn 1385 | Karlee 1386 | Kody 1387 | Blake 1388 | Marlon 1389 | Cara 1390 | Mayson 1391 | Addilyn 1392 | Allan 1393 | Amina 1394 | Augustus 1395 | Ansley 1396 | Jessie 1397 | Kaitlynn 1398 | Neymar 1399 | Iliana 1400 | Adrien 1401 | Mckayla 1402 | Aydan 1403 | Adelina 1404 | Leonard 1405 | Briley 1406 | Sincere 1407 | Elaine 1408 | Kyson 1409 | Lailah 1410 | Terrence 1411 | Mercedes 1412 | Jerome 1413 | Chaya 1414 | Jadiel 1415 | Lindsay 1416 | Kole 1417 | Hattie 1418 | Aron 1419 | Lisa 1420 | Aydin 1421 | Marisol 1422 | Omari 1423 | Patricia 1424 | Ronnie 1425 | Bryanna 1426 | Zain 1427 | Taliyah 1428 | Vicente 1429 | Adrienne 1430 | Bobby 1431 | Emmy 1432 | Yosef 1433 | Millie 1434 | Alexzander 1435 | Paislee 1436 | Harry 1437 | Charli 1438 | Kale 1439 | Kourtney 1440 | Rogelio 1441 | Leyla 1442 | Casen 1443 | Maia 1444 | Ray 1445 | Willa 1446 | Clay 1447 | Milan 1448 | Masen 1449 | Paula 1450 | Sage 1451 | Ayleen 1452 | Ulises 1453 | Clare 1454 | Kymani 1455 | Kensley 1456 | Chaim 1457 | Reyna 1458 | Javon 1459 | Martha 1460 | Brent 1461 | Adley 1462 | Jadon 1463 | Elianna 1464 | Elisha 1465 | Emilie 1466 | Stanley 1467 | Karsyn 1468 | Jovanni 1469 | Yasmin 1470 | Princeton 1471 | Lorelai 1472 | Alonso 1473 | Amirah 1474 | Darian 1475 | Aryana 1476 | Conrad 1477 | Livia 1478 | Dwayne 1479 | Alena 1480 | Eugene 1481 | Kiana 1482 | Gauge 1483 | Celia 1484 | Rene 1485 | Kailee 1486 | Kareem 1487 | Rylan 1488 | Roland 1489 | Ellen 1490 | Ben 1491 | Galilea 1492 | Vincenzo 1493 | Kynlee 1494 | Abdullah 1495 | Leanna 1496 | Camren 1497 | Renata 1498 | Kenny 1499 | Mae 1500 | Brentley 1501 | Ayanna 1502 | Memphis 1503 | Chanel 1504 | Blaze 1505 | Lesly 1506 | Edison 1507 | Cindy 1508 | Osvaldo 1509 | Carla 1510 | Teagan 1511 | Pearl 1512 | Westin 1513 | Jaylin 1514 | Deshawn 1515 | Kimora 1516 | Rayden 1517 | Angeline 1518 | Cedric 1519 | Carlee 1520 | Marquis 1521 | Aubri 1522 | Samir 1523 | Edith 1524 | Steve 1525 | Alia 1526 | Draven 1527 | Frances 1528 | Jairo 1529 | Corinne 1530 | Giovanny 1531 | Jocelynn 1532 | Brennen 1533 | Cherish 1534 | Bronson 1535 | Wendy 1536 | Crew 1537 | Carolyn 1538 | Davin 1539 | Lina 1540 | Kolten 1541 | Tabitha 1542 | Ronin 1543 | Winter 1544 | Ariel 1545 | Abril 1546 | Semaj 1547 | Bryn 1548 | Alden 1549 | Jolie 1550 | Isiah 1551 | Yaritza 1552 | Lennox 1553 | Casey 1554 | Davian 1555 | Zion 1556 | Jaylin 1557 | 
Lillianna 1558 | Cain 1559 | Jordynn 1560 | Wayne 1561 | Zariyah 1562 | Craig 1563 | Audriana 1564 | Lamar 1565 | Jayde 1566 | Leonidas 1567 | Jaida 1568 | Cristopher 1569 | Salma 1570 | Otto 1571 | Diamond 1572 | Bo 1573 | Malaya 1574 | Darrell 1575 | Kimber 1576 | Kolby 1577 | Ryann 1578 | Marcelo 1579 | Abbie 1580 | Bruno 1581 | Paloma 1582 | Fletcher 1583 | Destinee 1584 | Justus 1585 | Kaleigh 1586 | Alfonso 1587 | Asia 1588 | Theo 1589 | Demi 1590 | Tyrone 1591 | Yamileth 1592 | Aidyn 1593 | Deborah 1594 | Harvey 1595 | Elin 1596 | Rudy 1597 | Kaiya 1598 | Brendon 1599 | Mara 1600 | Tristin 1601 | Averi 1602 | Dominique 1603 | Nola 1604 | Kaeden 1605 | Tara 1606 | Samson 1607 | Taryn 1608 | Kyree 1609 | Emmalee 1610 | Jovani 1611 | Aubrianna 1612 | Lionel 1613 | Janae 1614 | Amos 1615 | Kyndall 1616 | Giancarlo 1617 | Jewel 1618 | Misael 1619 | Zaniyah 1620 | Callum 1621 | Kaya 1622 | Quintin 1623 | Sonia 1624 | Valentino 1625 | Alaya 1626 | Gavyn 1627 | Heather 1628 | Lennon 1629 | Nathaly 1630 | Jamir 1631 | Shannon 1632 | Kamron 1633 | Ariah 1634 | Zavier 1635 | Avah 1636 | Arlo 1637 | Giada 1638 | Junior 1639 | Lilith 1640 | Killian 1641 | Samiyah 1642 | Leandro 1643 | Sharon 1644 | Konnor 1645 | Coraline 1646 | Hezekiah 1647 | Eileen 1648 | Jordyn 1649 | Julianne 1650 | Markus 1651 | Milania 1652 | Ramiro 1653 | Chana 1654 | Callan 1655 | Regan 1656 | Chace 1657 | Krystal 1658 | Johnathon 1659 | Rihanna 1660 | Lyric 1661 | Sidney 1662 | Fisher 1663 | Hadassah 1664 | Rashad 1665 | Macey 1666 | Kamryn 1667 | Mina 1668 | Legend 1669 | Paulina 1670 | Duncan 1671 | Rayne 1672 | Harold 1673 | Kaitlin 1674 | Camilo 1675 | Maritza 1676 | Hendrix 1677 | Susan 1678 | Seamus 1679 | Raina 1680 | Coleman 1681 | Hana 1682 | Vance 1683 | Keyla 1684 | Rylee 1685 | Temperance 1686 | Elian 1687 | Aimee 1688 | Jaeden 1689 | Alisson 1690 | Jamie 1691 | Charlize 1692 | Krish 1693 | Kendal 1694 | Abdiel 1695 | Lara 1696 | Antoine 1697 | Roselyn 1698 | Camdyn 1699 | Alannah 1700 | Van 1701 | Alma 1702 | Branden 1703 | Dixie 1704 | Cayson 1705 | Larissa 1706 | Gibson 1707 | Patience 1708 | Javion 1709 | Taraji 1710 | Izayah 1711 | Sky 1712 | Darwin 1713 | Zaria 1714 | Jamar 1715 | Aleigha 1716 | Mike 1717 | Alyvia 1718 | Randall 1719 | Aviana 1720 | Brecken 1721 | Bryleigh 1722 | Hassan 1723 | Elliot 1724 | Thiago 1725 | Jenny 1726 | Heath 1727 | Luz 1728 | Arnav 1729 | Ali 1730 | Kingsley 1731 | Alisha 1732 | Kyrie 1733 | Ayana 1734 | Xavi 1735 | Campbell 1736 | Damari 1737 | Karis 1738 | Deangelo 1739 | Lilyanna 1740 | Jionni 1741 | Azaria 1742 | Joziah 1743 | Blair 1744 | Makhi 1745 | Micah 1746 | Vaughn 1747 | Moriah 1748 | Zeke 1749 | Myra 1750 | Konner 1751 | Lilia 1752 | Ean 1753 | Aliza 1754 | Frankie 1755 | Giovanna 1756 | Yael 1757 | Karissa 1758 | Benton 1759 | Saniya 1760 | Oakley 1761 | Emory 1762 | Efrain 1763 | Estella 1764 | Marcel 1765 | Juniper 1766 | Rolando 1767 | Kairi 1768 | Maxton 1769 | Kenna 1770 | Jaycob 1771 | Meghan 1772 | Keenan 1773 | Abrielle 1774 | Rowen 1775 | Elissa 1776 | Yousef 1777 | Rachael 1778 | Ishaan 1779 | Emmaline 1780 | Jedidiah 1781 | Jolene 1782 | Remy 1783 | Joyce 1784 | Todd 1785 | Britney 1786 | Reagan 1787 | Carlie 1788 | Bodhi 1789 | Haylie 1790 | Damarion 1791 | Judith 1792 | Juelz 1793 | Renee 1794 | Valentin 1795 | Saanvi 1796 | Austyn 1797 | Yesenia 1798 | Broderick 1799 | Barbara 1800 | Anders 1801 | Dallas 1802 | Alvaro 1803 | Jaqueline 1804 | Mustafa 1805 | Karma 1806 | Thaddeus 1807 | America 1808 | Brenton 1809 | Sariyah 1810 | Cale 1811 | 
Azalea 1812 | Clinton 1813 | Everly 1814 | Derick 1815 | Ingrid 1816 | Jorden 1817 | Lillyana 1818 | Gilberto 1819 | Emmalynn 1820 | Jabari 1821 | Marianna 1822 | Rey 1823 | Brisa 1824 | Salvatore 1825 | Kaelynn 1826 | Freddy 1827 | Leona 1828 | Donte 1829 | Libby 1830 | Ernest 1831 | Deanna 1832 | Aaden 1833 | Mattie 1834 | Axton 1835 | Miya 1836 | Blaise 1837 | Kai 1838 | Lucca 1839 | Annalee 1840 | Maximo 1841 | Nahla 1842 | Sidney 1843 | Dorothy 1844 | Dario 1845 | Kaylyn 1846 | Rodolfo 1847 | Rayna 1848 | Trevon 1849 | Araceli 1850 | Camryn 1851 | Cambria 1852 | Deegan 1853 | Evalyn 1854 | Sonny 1855 | Haleigh 1856 | Cassius 1857 | Thalia 1858 | Truman 1859 | Jakayla 1860 | Brice 1861 | Maliah 1862 | Brogan 1863 | Saige 1864 | Hugh 1865 | Avianna 1866 | Yehuda 1867 | Charity 1868 | Agustin 1869 | Kaylen 1870 | Eliot 1871 | Raylee 1872 | Stefan 1873 | Tamia 1874 | Zaid 1875 | Aubrielle 1876 | Bridger 1877 | Bayleigh 1878 | Damion 1879 | Carley 1880 | Eliseo 1881 | Kailynn 1882 | Houston 1883 | Katrina 1884 | Johann 1885 | Belen 1886 | Leroy 1887 | Karlie 1888 | Sheldon 1889 | Natalya 1890 | Dariel 1891 | Alaysia 1892 | Darryl 1893 | Celine 1894 | Isai 1895 | Milana 1896 | Tyrell 1897 | Monroe 1898 | Alfred 1899 | Estelle 1900 | Demarcus 1901 | Meadow 1902 | Kohen 1903 | Audrianna 1904 | Ignacio 1905 | Cristina 1906 | Rylen 1907 | Harlee 1908 | Santos 1909 | Jazzlyn 1910 | Cael 1911 | Scarlette 1912 | Davon 1913 | Zahra 1914 | Kaysen 1915 | Akira 1916 | Mack 1917 | Ann 1918 | Darien 1919 | Collins 1920 | Ross 1921 | Kendyl 1922 | Titan 1923 | Anabel 1924 | Tyree 1925 | Azariah 1926 | Ameer 1927 | Carissa 1928 | Zaire 1929 | Milena 1930 | Aditya 1931 | Tia 1932 | Briggs 1933 | Alisa 1934 | Immanuel 1935 | Bree 1936 | Malaki 1937 | Carleigh 1938 | Turner 1939 | Cheyanne 1940 | Bradyn 1941 | Sarahi 1942 | Graysen 1943 | Laurel 1944 | Kase 1945 | Kylah 1946 | Reuben 1947 | Tinley 1948 | Yandel 1949 | Kora 1950 | Gaige 1951 | Marisa 1952 | Jaidyn 1953 | Esme 1954 | Franco 1955 | Sloan 1956 | Trystan 1957 | Cailyn 1958 | Maison 1959 | Gisselle 1960 | Simeon 1961 | Kasey 1962 | Anton 1963 | Kyndal 1964 | Darnell 1965 | Marlene 1966 | Emory 1967 | Riya 1968 | Roderick 1969 | Annabell 1970 | Deon 1971 | Aubriana 1972 | Devan 1973 | Izabelle 1974 | Graeme 1975 | Kirsten 1976 | Howard 1977 | Aya 1978 | Jael 1979 | Dalilah 1980 | Kael 1981 | Devyn 1982 | Karsen 1983 | Geraldine 1984 | Jarrett 1985 | Analia 1986 | Apollo 1987 | Hayleigh 1988 | Denzel 1989 | Landry 1990 | Foster 1991 | Sofie 1992 | Gilbert 1993 | Tess 1994 | Jaylon 1995 | Ashtyn 1996 | Kylen 1997 | Jessa 1998 | Augustine 1999 | Katalina 2000 | Dangelo -------------------------------------------------------------------------------- /labs/lab8/word_count.py: -------------------------------------------------------------------------------- 1 | from mrjob.job import MRJob 2 | 3 | class MRWordFrequencyCount(MRJob): 4 | 5 | def mapper(self, _, line): 6 | yield "chars", len(line) 7 | yield "words", len(line.split()) 8 | yield "lines", 1 9 | 10 | def reducer(self, key, values): 11 | yield key, sum(values) 12 | 13 | if __name__ == '__main__': 14 | MRWordFrequencyCount.run() 15 | -------------------------------------------------------------------------------- /lec_04_scraping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": 
{}, 13 | "source": [ 14 | "# Scraping Demo\n", 15 | "Companion to Lecture 4 of Harvard [CS109: Data Science](http://cs109.org)\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "import requests\n", 23 | "from pattern import web\n", 24 | "from BeautifulSoup import BeautifulSoup" 25 | ], 26 | "language": "python", 27 | "metadata": {}, 28 | "outputs": [], 29 | "prompt_number": 1 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Task\n", 36 | "\n", 37 | "Find and print the movie title, list of genres, runtime, and score of all movies on [this page](http://www.imdb.com/search/title?at=0&sort=num_votes,desc&start=1&title_type=feature&year=1950,2012)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Two ways of making get requests" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "#### 1. Explicit URL" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "collapsed": false, 57 | "input": [ 58 | "url = 'http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2012'\n", 59 | "r = requests.get(url)\n", 60 | "print r.url" 61 | ], 62 | "language": "python", 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "output_type": "stream", 67 | "stream": "stdout", 68 | "text": [ 69 | "http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1950,2012\n" 70 | ] 71 | } 72 | ], 73 | "prompt_number": 7 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "#### 2. Base URL with GET dictionary" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "collapsed": false, 85 | "input": [ 86 | "url = 'http://www.imdb.com/search/title'\n", 87 | "params = dict(sort='num_votes,desc', start=1, title_type='feature', year='1950,2012')\n", 88 | "r = requests.get(url, params=params)\n", 89 | "print r.url # notice it constructs the full url for you" 90 | ], 91 | "language": "python", 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "output_type": "stream", 96 | "stream": "stdout", 97 | "text": [ 98 | "http://www.imdb.com/search/title?sort=num_votes%2Cdesc&start=1&title_type=feature&year=1950%2C2012\n" 99 | ] 100 | } 101 | ], 102 | "prompt_number": 8 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "# Using Pattern" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "collapsed": false, 114 | "input": [ 115 | "#selection in pattern follows the rules of CSS\n", 116 | "\n", 117 | "dom = web.Element(r.text)\n", 118 | "for movie in dom.by_tag('td.title'): \n", 119 | " title = movie.by_tag('a')[0].content\n", 120 | " genres = movie.by_tag('span.genre')[0].by_tag('a')\n", 121 | " genres = [g.content for g in genres]\n", 122 | " runtime = movie.by_tag('span.runtime')[0].content\n", 123 | " rating = movie.by_tag('span.value')[0].content\n", 124 | " print title, genres, runtime, rating" 125 | ], 126 | "language": "python", 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "output_type": "stream", 131 | "stream": "stdout", 132 | "text": [ 133 | "The Shawshank Redemption [u'Crime', u'Drama'] 142 mins. 9.3\n", 134 | "The Dark Knight [u'Action', u'Crime', u'Drama', u'Thriller'] 152 mins. 9.0\n", 135 | "Inception [u'Action', u'Adventure', u'Mystery', u'Sci-Fi', u'Thriller'] 148 mins. 8.8\n", 136 | "Pulp Fiction [u'Crime', u'Drama', u'Thriller'] 154 mins. 9.0\n", 137 | "Fight Club [u'Drama'] 139 mins. 
8.9\n", 138 | "The Lord of the Rings: The Fellowship of the Ring" 139 | ] 140 | }, 141 | { 142 | "output_type": "stream", 143 | "stream": "stdout", 144 | "text": [ 145 | " [u'Action', u'Adventure', u'Fantasy'] 178 mins. 8.8\n", 146 | "The Matrix [u'Action', u'Adventure', u'Sci-Fi'] 136 mins. 8.7\n", 147 | "The Lord of the Rings: The Return of the King [u'Action', u'Adventure', u'Fantasy'] 201 mins. 8.9\n", 148 | "The Godfather [u'Crime', u'Drama'] 175 mins. 9.2\n", 149 | "Forrest Gump [u'Drama', u'Romance'] 142 mins. 8.7\n", 150 | "The Dark Knight Rises [u'Action', u'Crime', u'Thriller'] 165 mins. 8.6\n", 151 | "The Lord of the Rings: The Two Towers" 152 | ] 153 | }, 154 | { 155 | "output_type": "stream", 156 | "stream": "stdout", 157 | "text": [ 158 | " [u'Action', u'Adventure', u'Fantasy'] 179 mins. 8.7\n", 159 | "Se7en [u'Crime', u'Mystery', u'Thriller'] 127 mins. 8.7\n", 160 | "Avatar [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 162 mins. 7.9\n", 161 | "Batman Begins [u'Action', u'Adventure', u'Crime', u'Drama'] 140 mins. 8.3\n", 162 | "Gladiator [u'Action', u'Adventure', u'Drama'] 155 mins. 8.5\n", 163 | "Star Wars" 164 | ] 165 | }, 166 | { 167 | "output_type": "stream", 168 | "stream": "stdout", 169 | "text": [ 170 | " [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 121 mins. 8.8\n", 171 | "The Avengers [u'Action', u'Fantasy'] 143 mins. 8.2\n", 172 | "Memento [u'Mystery', u'Thriller'] 113 mins. 8.6\n", 173 | "American Beauty [u'Drama'] 122 mins. 8.5\n", 174 | "Schindler's List [u'Biography', u'Drama', u'History', u'War'] 195 mins. 8.9\n", 175 | "Saving Private Ryan [u'Action', u'Drama', u'War'] 169 mins. 8.6\n", 176 | "The Departed" 177 | ] 178 | }, 179 | { 180 | "output_type": "stream", 181 | "stream": "stdout", 182 | "text": [ 183 | " [u'Crime', u'Drama', u'Thriller'] 151 mins. 8.5\n", 184 | "The Silence of the Lambs [u'Crime', u'Drama', u'Thriller'] 118 mins. 8.7\n", 185 | "Pirates of the Caribbean: The Curse of the Black Pearl [u'Action', u'Adventure', u'Fantasy'] 143 mins. 8.0\n", 186 | "Star Wars: Episode V - The Empire Strikes Back" 187 | ] 188 | }, 189 | { 190 | "output_type": "stream", 191 | "stream": "stdout", 192 | "text": [ 193 | " [u'Action', u'Adventure', u'Sci-Fi'] 124 mins. 8.8\n", 194 | "Titanic [u'Drama', u'Romance'] 194 mins. 7.6\n", 195 | "V for Vendetta [u'Action', u'Crime', u'Fantasy', u'Mystery', u'Sci-Fi', u'Thriller'] 132 mins. 8.2\n", 196 | "Inglourious Basterds [u'Adventure', u'Drama', u'War'] 153 mins. 8.3\n", 197 | "The Prestige [u'Drama', u'Mystery', u'Thriller'] 130 mins. 8.4\n", 198 | "American History X" 199 | ] 200 | }, 201 | { 202 | "output_type": "stream", 203 | "stream": "stdout", 204 | "text": [ 205 | " [u'Crime', u'Drama'] 119 mins. 8.6\n", 206 | "The Godfather: Part II [u'Crime', u'Drama'] 200 mins. 9.0\n", 207 | "The Usual Suspects [u'Crime', u'Mystery', u'Thriller'] 106 mins. 8.7\n", 208 | "Braveheart [u'Action', u'Biography', u'Drama', u'History', u'War'] 177 mins. 8.4\n", 209 | "Terminator 2: Judgment Day" 210 | ] 211 | }, 212 | { 213 | "output_type": "stream", 214 | "stream": "stdout", 215 | "text": [ 216 | " [u'Action', u'Sci-Fi', u'Thriller'] 137 mins. 8.6\n", 217 | "The Sixth Sense [u'Drama', u'Mystery', u'Thriller'] 107 mins. 8.2\n", 218 | "Kill Bill: Vol. 1 [u'Action', u'Crime'] 111 mins. 8.2\n", 219 | "Goodfellas [u'Biography', u'Crime', u'Drama', u'Thriller'] 146 mins. 8.8\n", 220 | "Sin City [u'Crime', u'Thriller'] 124 mins. 
8.2\n", 221 | "Léon: The Professional" 222 | ] 223 | }, 224 | { 225 | "output_type": "stream", 226 | "stream": "stdout", 227 | "text": [ 228 | " [u'Crime', u'Drama', u'Thriller'] 110 mins. 8.6\n", 229 | "Django Unchained [u'Adventure', u'Drama', u'Western'] 165 mins. 8.5\n", 230 | "One Flew Over the Cuckoo's Nest [u'Drama'] 133 mins. 8.8\n", 231 | "The Green Mile [u'Crime', u'Drama', u'Fantasy', u'Mystery'] 189 mins. 8.5\n", 232 | "Raiders of the Lost Ark [u'Action', u'Adventure'] 115 mins. 8.6\n", 233 | "Eternal Sunshine of the Spotless Mind" 234 | ] 235 | }, 236 | { 237 | "output_type": "stream", 238 | "stream": "stdout", 239 | "text": [ 240 | " [u'Drama', u'Romance', u'Sci-Fi'] 108 mins. 8.4\n", 241 | "Shutter Island [u'Drama', u'Thriller'] 138 mins. 8.0\n", 242 | "Iron Man [u'Action', u'Adventure', u'Sci-Fi'] 126 mins. 7.9\n", 243 | "Back to the Future [u'Adventure', u'Comedy', u'Sci-Fi'] 116 mins. 8.5\n", 244 | "WALL·E [u'Animation', u'Adventure', u'Family', u'Romance', u'Sci-Fi'] 98 mins. 8.5\n", 245 | "300" 246 | ] 247 | }, 248 | { 249 | "output_type": "stream", 250 | "stream": "stdout", 251 | "text": [ 252 | " [u'Action', u'Fantasy', u'History', u'War'] 117 mins. 7.7\n" 253 | ] 254 | } 255 | ], 256 | "prompt_number": 9 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "# Using BeautifulSoup" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "collapsed": false, 268 | "input": [ 269 | "bs = BeautifulSoup(r.text)\n", 270 | "for movie in bs.findAll('td', 'title'):\n", 271 | " title = movie.find('a').contents[0]\n", 272 | " genres = movie.find('span', 'genre').findAll('a')\n", 273 | " genres = [g.contents[0] for g in genres]\n", 274 | " runtime = movie.find('span', 'runtime').contents[0]\n", 275 | " rating = movie.find('span', 'value').contents[0]\n", 276 | " print title, genres, runtime, rating\n" 277 | ], 278 | "language": "python", 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "output_type": "stream", 283 | "stream": "stdout", 284 | "text": [ 285 | "The Shawshank Redemption [u'Crime', u'Drama'] 142 mins. 9.3\n", 286 | "The Dark Knight [u'Action', u'Crime', u'Drama', u'Thriller'] 152 mins. 9.0\n", 287 | "Inception [u'Action', u'Adventure', u'Mystery', u'Sci-Fi', u'Thriller'] 148 mins. 8.8\n", 288 | "Pulp Fiction [u'Crime', u'Drama', u'Thriller'] 154 mins. 9.0\n", 289 | "Fight Club [u'Drama'] 139 mins. 8.9\n", 290 | "The Lord of the Rings: The Fellowship of the Ring [u'Action', u'Adventure', u'Fantasy'] 178 mins. 8.8\n", 291 | "The Matrix [u'Action', u'Adventure', u'Sci-Fi'] 136 mins. 8.7\n", 292 | "The Lord of the Rings: The Return of the King [u'Action', u'Adventure', u'Fantasy'] 201 mins. 8.9\n", 293 | "The Godfather [u'Crime', u'Drama'] 175 mins. 9.2\n", 294 | "Forrest Gump" 295 | ] 296 | }, 297 | { 298 | "output_type": "stream", 299 | "stream": "stdout", 300 | "text": [ 301 | " [u'Drama', u'Romance'] 142 mins. 8.7\n", 302 | "The Dark Knight Rises [u'Action', u'Crime', u'Thriller'] 165 mins. 8.6\n", 303 | "The Lord of the Rings: The Two Towers [u'Action', u'Adventure', u'Fantasy'] 179 mins. 8.7\n", 304 | "Se7en [u'Crime', u'Mystery', u'Thriller'] 127 mins. 8.7\n", 305 | "Avatar [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 162 mins. 7.9\n", 306 | "Batman Begins [u'Action', u'Adventure', u'Crime', u'Drama'] 140 mins. 8.3\n", 307 | "Gladiator [u'Action', u'Adventure', u'Drama'] 155 mins. 8.5\n", 308 | "Star Wars [u'Action', u'Adventure', u'Fantasy', u'Sci-Fi'] 121 mins. 
8.8\n", 309 | "The Avengers [u'Action', u'Fantasy'] 143 mins. 8.2\n", 310 | "Memento" 311 | ] 312 | }, 313 | { 314 | "output_type": "stream", 315 | "stream": "stdout", 316 | "text": [ 317 | " [u'Mystery', u'Thriller'] 113 mins. 8.6\n", 318 | "American Beauty [u'Drama'] 122 mins. 8.5\n", 319 | "Schindler's List [u'Biography', u'Drama', u'History', u'War'] 195 mins. 8.9\n", 320 | "Saving Private Ryan [u'Action', u'Drama', u'War'] 169 mins. 8.6\n", 321 | "The Departed [u'Crime', u'Drama', u'Thriller'] 151 mins. 8.5\n", 322 | "The Silence of the Lambs [u'Crime', u'Drama', u'Thriller'] 118 mins. 8.7\n", 323 | "Pirates of the Caribbean: The Curse of the Black Pearl [u'Action', u'Adventure', u'Fantasy'] 143 mins. 8.0\n", 324 | "Star Wars: Episode V - The Empire Strikes Back [u'Action', u'Adventure', u'Sci-Fi'] 124 mins. 8.8\n", 325 | "Titanic [u'Drama', u'Romance'] 194 mins. 7.6\n", 326 | "V for Vendetta" 327 | ] 328 | }, 329 | { 330 | "output_type": "stream", 331 | "stream": "stdout", 332 | "text": [ 333 | " [u'Action', u'Crime', u'Fantasy', u'Mystery', u'Sci-Fi', u'Thriller'] 132 mins. 8.2\n", 334 | "Inglourious Basterds [u'Adventure', u'Drama', u'War'] 153 mins. 8.3\n", 335 | "The Prestige [u'Drama', u'Mystery', u'Thriller'] 130 mins. 8.4\n", 336 | "American History X [u'Crime', u'Drama'] 119 mins. 8.6\n", 337 | "The Godfather: Part II [u'Crime', u'Drama'] 200 mins. 9.0\n", 338 | "The Usual Suspects [u'Crime', u'Mystery', u'Thriller'] 106 mins. 8.7\n", 339 | "Braveheart [u'Action', u'Biography', u'Drama', u'History', u'War'] 177 mins. 8.4\n", 340 | "Terminator 2: Judgment Day [u'Action', u'Sci-Fi', u'Thriller'] 137 mins. 8.6\n", 341 | "The Sixth Sense [u'Drama', u'Mystery', u'Thriller'] 107 mins. 8.2\n", 342 | "Kill Bill: Vol. 1" 343 | ] 344 | }, 345 | { 346 | "output_type": "stream", 347 | "stream": "stdout", 348 | "text": [ 349 | " [u'Action', u'Crime'] 111 mins. 8.2\n", 350 | "Goodfellas [u'Biography', u'Crime', u'Drama', u'Thriller'] 146 mins. 8.8\n", 351 | "Sin City [u'Crime', u'Thriller'] 124 mins. 8.2\n", 352 | "Léon: The Professional [u'Crime', u'Drama', u'Thriller'] 110 mins. 8.6\n", 353 | "Django Unchained [u'Adventure', u'Drama', u'Western'] 165 mins. 8.5\n", 354 | "One Flew Over the Cuckoo's Nest [u'Drama'] 133 mins. 8.8\n", 355 | "The Green Mile [u'Crime', u'Drama', u'Fantasy', u'Mystery'] 189 mins. 8.5\n", 356 | "Raiders of the Lost Ark [u'Action', u'Adventure'] 115 mins. 8.6\n", 357 | "Eternal Sunshine of the Spotless Mind [u'Drama', u'Romance', u'Sci-Fi'] 108 mins. 8.4\n", 358 | "Shutter Island" 359 | ] 360 | }, 361 | { 362 | "output_type": "stream", 363 | "stream": "stdout", 364 | "text": [ 365 | " [u'Drama', u'Thriller'] 138 mins. 8.0\n", 366 | "Iron Man [u'Action', u'Adventure', u'Sci-Fi'] 126 mins. 7.9\n", 367 | "Back to the Future [u'Adventure', u'Comedy', u'Sci-Fi'] 116 mins. 8.5\n", 368 | "WALL·E [u'Animation', u'Adventure', u'Family', u'Romance', u'Sci-Fi'] 98 mins. 8.5\n", 369 | "300 [u'Action', u'Fantasy', u'History', u'War'] 117 mins. 
7.7\n" 370 | ] 371 | } 372 | ], 373 | "prompt_number": 5 374 | } 375 | ], 376 | "metadata": {} 377 | } 378 | ] 379 | } -------------------------------------------------------------------------------- /matplotlib_examples/imdb.tsv: -------------------------------------------------------------------------------- 1 | Rank	Title	Year	Director	Country 2 | 1 Citizen Kane 1941 Welles, Orson USA 3 | 2 Bicycle Thieves 1948 De Sica, Vittorio Italy 4 | 3 Psycho 1960 Hitchcock, Alfred USA 5 | 4 The Godfather 1972 Coppola, Francis Ford USA 6 | 5 2001: A Space Odyssey 1968 Kubrick, Stanley UK / USA 7 | 6 The Rules of the Game 1939 Renoir, Jean France 8 | 7 Singin' in the Rain 1952 Kelly, Gene / Donen, Stanley USA 9 | 8 Bonnie and Clyde 1967 Penn, Arthur USA 10 | 9 The Searchers 1956 Ford, John USA 11 | 10 Casablanca 1942 Curtiz, Michael USA 12 | 11 Seven Samurai 1954 Kurosawa, Akira Japan 13 | 12 Annie Hall 1977 Allen, Woody USA 14 | 13 Chinatown 1974 Polanski, Roman USA 15 | 14 Pulp Fiction 1994 Tarantino, Quentin USA 16 | 15 Raging Bull 1980 Scorsese, Martin USA 17 | 16 Rashômon 1950 Kurosawa, Akira Japan 18 | 17 Star Wars: Episode IV - 1977 Lucas, George USA 19 | 18 The 400 Blows 1959 Truffaut, François France 20 | 19 Breathless 1960 Godard, Jean-Luc France 21 | 20 E.T. The Extra-Terrestrial 1982 Spielberg, Steven USA 22 | 21 Vertigo 1958 Hitchcock, Alfred USA 23 | 22 Schindler's List 1993 Spielberg, Steven USA 24 | 23 8 1/2 1963 Fellini, Federico Italy / France 25 | 24 Metropolis 1927 Lang, Fritz Germany 26 | 25 Lawrence of Arabia 1962 Lean, David UK 27 | 26 GoodFellas 1990 Scorsese, Martin USA 28 | 27 Some Like It Hot 1959 Wilder, Billy USA 29 | 28 La Dolce Vita 1960 Fellini, Federico Italy / France 30 | 29 M 1931 Lang, Fritz Germany 31 | 30 On the Waterfront 1954 Kazan, Elia USA 32 | 31 All About Eve 1950 Mankiewicz, Joseph L. USA 33 | 32 Gone with the Wind 1939 Fleming, Victor USA 34 | 33 Taxi Driver 1976 Scorsese, Martin USA 35 | 34 The Conformist 1970 Bertolucci, Bernardo Italy / France 36 | 35 The Wizard of Oz 1939 Fleming, Victor / Vidor, King USA 37 | 36 Dr.
Strangelove 1964 Kubrick, Stanley UK 38 | 37 The Godfather Part II 1974 Coppola, Francis Ford USA 39 | 38 Raise the Red Lantern 1991 Zhang, Yimou China / Hong Kong 40 | 39 Tokyo Story 1953 Ozu, Yasujiro Japan 41 | 40 Sunset Boulevard 1950 Wilder, Billy USA 42 | 41 The World of Apu 1959 Ray, Satyajit India 43 | 42 Double Indemnity 1944 Wilder, Billy USA 44 | 43 Belle de jour 1967 Buñuel, Luis France / Italy 45 | 44 Persona 1966 Bergman, Ingmar Sweden 46 | 45 Jules and Jim 1962 Truffaut, François France 47 | 46 Apocalypse Now 1979 Coppola, Francis Ford USA 48 | 47 The Graduate 1967 Nichols, Mike USA 49 | 48 Nashville 1975 Altman, Robert USA 50 | 49 L'Avventura 1960 Antonioni, Michelangelo Italy / France 51 | 50 It's a Wonderful Life 1946 Capra, Frank USA -------------------------------------------------------------------------------- /skeleton.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mrjob.job import MRJob 4 | from itertools import combinations, permutations 5 | 6 | from scipy.stats.stats import pearsonr 7 | 8 | 9 | class RestaurantSimilarities(MRJob): 10 | 11 | def steps(self): 12 | "the steps in the map-reduce process" 13 | thesteps = [ 14 | self.mr(mapper=self.line_mapper, reducer=self.users_items_collector), 15 | self.mr(mapper=self.pair_items_mapper, reducer=self.calc_sim_collector) 16 | ] 17 | return thesteps 18 | 19 | def line_mapper(self,_,line): 20 | "this is the complete implementation" 21 | user_id,business_id,stars,business_avg,user_avg=line.split(',') 22 | yield user_id, (business_id,stars,business_avg,user_avg) 23 | 24 | 25 | def users_items_collector(self, user_id, values): 26 | """ 27 | #iterate over the list of tuples yielded in the previous mapper 28 | #and append them to an array of rating information 29 | """ 30 | pass 31 | 32 | 33 | def pair_items_mapper(self, user_id, values): 34 | """ 35 | ignoring the user_id key, take all combinations of business pairs 36 | and yield as key the pair id, and as value the pair rating information 37 | """ 38 | pass #your code here 39 | 40 | def calc_sim_collector(self, key, values): 41 | """ 42 | Pick up the information from the previous yield as shown. Compute 43 | the pearson correlation and yield the final information as in the 44 | last line here. 45 | """ 46 | (rest1, rest2), common_ratings = key, values 47 | #your code here 48 | yield (rest1, rest2), (rho, n_common) 49 | 50 | 51 | #Below MUST be there for things to work 52 | if __name__ == '__main__': 53 | RestaurantSimilarities.run() 54 | --------------------------------------------------------------------------------
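For reference, a minimal sketch of one way the three stubs in skeleton.py might be completed is below. It is an illustration, not the official solution: it assumes ratings are centered by each user's average before correlating (the docstrings only call for a Pearson correlation over the common raters of a pair), and it keeps the same mrjob-0.4-era self.mr() step API that the skeleton itself uses.

# A possible completion of skeleton.py's stubs (a sketch under the assumptions
# stated above, not the official solution).
from mrjob.job import MRJob
from itertools import combinations
from scipy.stats.stats import pearsonr


class RestaurantSimilaritiesSketch(MRJob):

    def steps(self):
        "the steps in the map-reduce process"
        return [
            self.mr(mapper=self.line_mapper, reducer=self.users_items_collector),
            self.mr(mapper=self.pair_items_mapper, reducer=self.calc_sim_collector)
        ]

    def line_mapper(self, _, line):
        "parse one CSV line into a (user, rating-info) pair"
        user_id, business_id, stars, business_avg, user_avg = line.split(',')
        yield user_id, (business_id, stars, business_avg, user_avg)

    def users_items_collector(self, user_id, values):
        "gather all of one user's rating tuples into a single list"
        yield user_id, list(values)

    def pair_items_mapper(self, user_id, values):
        "emit every pair of businesses this user rated, keyed by the sorted pair"
        # sorting first means (a, b) and (b, a) produce the same reducer key
        for r1, r2 in combinations(sorted(values), 2):
            yield (r1[0], r2[0]), (r1, r2)

    def calc_sim_collector(self, key, values):
        "Pearson correlation of user-mean-centered ratings over common raters"
        (rest1, rest2), common_ratings = key, values
        diffs1, diffs2 = [], []
        for r1, r2 in common_ratings:
            diffs1.append(float(r1[1]) - float(r1[3]))  # stars minus user average
            diffs2.append(float(r2[1]) - float(r2[3]))
        n_common = len(diffs1)
        # pearsonr returns (rho, p-value) and needs at least two points
        rho = pearsonr(diffs1, diffs2)[0] if n_common > 1 else 0.0
        yield (rest1, rest2), (rho, n_common)


if __name__ == '__main__':
    RestaurantSimilaritiesSketch.run()

Assuming a comma-separated ratings file in the five-column format line_mapper expects (the name ratings.csv here is only a placeholder), the job runs locally with: python restaurant_similarities_sketch.py ratings.csv. mrjob then emits one ((rest1, rest2), (rho, n_common)) record per restaurant pair.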